mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Merged PR 12923: LSH indexing to replace short list
This PR adds the FAISS LSH index to the Marian as a CPU-side optimization in the output layer of transformer models via KNN search. * It allows to replace the potentially harmful short-list with a ML free approximation of the final matrix multiply. With increasing number of K neighbors and the size of the chosen hash in bits the approximation becomes more accurate, but also slower. For model dimensions of 512, the sweet spot seems to be around k=100-150 and nbits=1024-1536 * Enable during CPU-side decoding via `--output-approx-knn k nbits` * Add a `lambda` node that allows to create custom new nodes, most useful for CPU use right now.
This commit is contained in:
parent
77a420740c
commit
5a439245b1
@ -138,6 +138,9 @@ else(MSVC)
|
||||
set(INTRINSICS "-msse4.1")
|
||||
endif()
|
||||
|
||||
add_definitions(-DFINTEGER=long)
|
||||
set(EXT_LIBS ${EXT_LIBS} faiss)
|
||||
|
||||
if(USE_FBGEMM)
|
||||
set(EXT_LIBS ${EXT_LIBS} fbgemm dl)
|
||||
add_definitions(-DUSE_FBGEMM=1)
|
||||
|
2
src/3rd_party/CMakeLists.txt
vendored
2
src/3rd_party/CMakeLists.txt
vendored
@ -5,6 +5,8 @@ add_subdirectory(./yaml-cpp)
|
||||
add_subdirectory(./SQLiteCpp)
|
||||
add_subdirectory(./pathie-cpp)
|
||||
add_subdirectory(./zlib)
|
||||
add_subdirectory(./faiss)
|
||||
include_directories(./faiss)
|
||||
|
||||
if(USE_FBGEMM)
|
||||
# @TODO: find out if this is somehow harmful. This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
|
||||
|
10
src/3rd_party/faiss/CMakeLists.txt
vendored
Normal file
10
src/3rd_party/faiss/CMakeLists.txt
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# adding a new file require explicittly modifing the CMakeLists.txt
|
||||
|
||||
include_directories("impl")
|
||||
|
||||
FILE(GLOB FaissCppSources *.cpp impl/*.cpp utils/*.cpp)
|
||||
|
||||
if (NOT TARGET faiss)
|
||||
add_library(faiss STATIC ${FaissCppSources})
|
||||
endif()
|
||||
|
120
src/3rd_party/faiss/Index.cpp
vendored
Normal file
120
src/3rd_party/faiss/Index.cpp
vendored
Normal file
@ -0,0 +1,120 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include "Index.h"
|
||||
#include "common/logging.h"
|
||||
#include <cstring>
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
Index::~Index ()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
void Index::train(idx_t /*n*/, const float* /*x*/) {
|
||||
// does nothing by default
|
||||
}
|
||||
|
||||
|
||||
void Index::range_search (idx_t , const float *, float,
|
||||
RangeSearchResult *) const
|
||||
{
|
||||
ABORT ("range search not implemented");
|
||||
}
|
||||
|
||||
void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k)
|
||||
{
|
||||
float * distances = new float[n * k];
|
||||
ScopeDeleter<float> del(distances);
|
||||
search (n, x, k, distances, labels);
|
||||
}
|
||||
|
||||
void Index::add_with_ids(
|
||||
idx_t /*n*/,
|
||||
const float* /*x*/,
|
||||
const idx_t* /*xids*/) {
|
||||
ABORT ("add_with_ids not implemented for this type of index");
|
||||
}
|
||||
|
||||
size_t Index::remove_ids(const IDSelector& /*sel*/) {
|
||||
ABORT ("remove_ids not implemented for this type of index");
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
void Index::reconstruct (idx_t, float * ) const {
|
||||
ABORT ("reconstruct not implemented for this type of index");
|
||||
}
|
||||
|
||||
|
||||
void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const {
|
||||
for (idx_t i = 0; i < ni; i++) {
|
||||
reconstruct (i0 + i, recons + i * d);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k,
|
||||
float *distances, idx_t *labels,
|
||||
float *recons) const {
|
||||
search (n, x, k, distances, labels);
|
||||
for (idx_t i = 0; i < n; ++i) {
|
||||
for (idx_t j = 0; j < k; ++j) {
|
||||
idx_t ij = i * k + j;
|
||||
idx_t key = labels[ij];
|
||||
float* reconstructed = recons + ij * d;
|
||||
if (key < 0) {
|
||||
// Fill with NaNs
|
||||
memset(reconstructed, -1, sizeof(*reconstructed) * d);
|
||||
} else {
|
||||
reconstruct (key, reconstructed);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Index::compute_residual (const float * x,
|
||||
float * residual, idx_t key) const {
|
||||
reconstruct (key, residual);
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
residual[i] = x[i] - residual[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Index::compute_residual_n (idx_t n, const float* xs,
|
||||
float* residuals,
|
||||
const idx_t* keys) const {
|
||||
//#pragma omp parallel for
|
||||
for (idx_t i = 0; i < n; ++i) {
|
||||
compute_residual(&xs[i * d], &residuals[i * d], keys[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
size_t Index::sa_code_size () const
|
||||
{
|
||||
ABORT ("standalone codec not implemented for this type of index");
|
||||
}
|
||||
|
||||
void Index::sa_encode (idx_t, const float *,
|
||||
uint8_t *) const
|
||||
{
|
||||
ABORT ("standalone codec not implemented for this type of index");
|
||||
}
|
||||
|
||||
void Index::sa_decode (idx_t, const uint8_t *,
|
||||
float *) const
|
||||
{
|
||||
ABORT ("standalone codec not implemented for this type of index");
|
||||
}
|
||||
|
||||
}
|
233
src/3rd_party/faiss/Index.h
vendored
Normal file
233
src/3rd_party/faiss/Index.h
vendored
Normal file
@ -0,0 +1,233 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#ifndef FAISS_INDEX_H
|
||||
#define FAISS_INDEX_H
|
||||
|
||||
#include "utils/misc.h"
|
||||
#include <cstdio>
|
||||
#include <typeinfo>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#define FAISS_VERSION_MAJOR 1
|
||||
#define FAISS_VERSION_MINOR 6
|
||||
#define FAISS_VERSION_PATCH 3
|
||||
|
||||
/**
|
||||
* @namespace faiss
|
||||
*
|
||||
* Throughout the library, vectors are provided as float * pointers.
|
||||
* Most algorithms can be optimized when several vectors are processed
|
||||
* (added/searched) together in a batch. In this case, they are passed
|
||||
* in as a matrix. When n vectors of size d are provided as float * x,
|
||||
* component j of vector i is
|
||||
*
|
||||
* x[ i * d + j ]
|
||||
*
|
||||
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
|
||||
* always compact. When specifying the size of the matrix, we call it
|
||||
* an n*d matrix, which implies a row-major storage.
|
||||
*/
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/// Forward declarations see AuxIndexStructures.h
|
||||
struct IDSelector;
|
||||
struct RangeSearchResult;
|
||||
struct DistanceComputer;
|
||||
|
||||
/** Abstract structure for an index, supports adding vectors and searching them.
|
||||
*
|
||||
* All vectors provided at add or search time are 32-bit float arrays,
|
||||
* although the internal representation may vary.
|
||||
*/
|
||||
struct Index {
|
||||
using idx_t = int64_t; ///< all indices are this type
|
||||
using component_t = float;
|
||||
using distance_t = float;
|
||||
|
||||
int d; ///< vector dimension
|
||||
idx_t ntotal; ///< total nb of indexed vectors
|
||||
bool verbose; ///< verbosity level
|
||||
|
||||
/// set if the Index does not require training, or if training is
|
||||
/// done already
|
||||
bool is_trained;
|
||||
|
||||
/// type of metric this index uses for search
|
||||
MetricType metric_type;
|
||||
float metric_arg; ///< argument of the metric type
|
||||
|
||||
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
|
||||
d(d),
|
||||
ntotal(0),
|
||||
verbose(false),
|
||||
is_trained(true),
|
||||
metric_type (metric),
|
||||
metric_arg(0) {}
|
||||
|
||||
virtual ~Index ();
|
||||
|
||||
|
||||
/** Perform training on a representative set of vectors
|
||||
*
|
||||
* @param n nb of training vectors
|
||||
* @param x training vecors, size n * d
|
||||
*/
|
||||
virtual void train(idx_t n, const float* x);
|
||||
|
||||
/** Add n vectors of dimension d to the index.
|
||||
*
|
||||
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
|
||||
* This function slices the input vectors in chuncks smaller than
|
||||
* blocksize_add and calls add_core.
|
||||
* @param x input matrix, size n * d
|
||||
*/
|
||||
virtual void add (idx_t n, const float *x) = 0;
|
||||
|
||||
/** Same as add, but stores xids instead of sequential ids.
|
||||
*
|
||||
* The default implementation fails with an assertion, as it is
|
||||
* not supported by all indexes.
|
||||
*
|
||||
* @param xids if non-null, ids to store for the vectors (size n)
|
||||
*/
|
||||
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
|
||||
|
||||
/** query n vectors of dimension d to the index.
|
||||
*
|
||||
* return at most k vectors. If there are not enough results for a
|
||||
* query, the result array is padded with -1s.
|
||||
*
|
||||
* @param x input vectors to search, size n * d
|
||||
* @param labels output labels of the NNs, size n*k
|
||||
* @param distances output pairwise distances, size n*k
|
||||
*/
|
||||
virtual void search (idx_t n, const float *x, idx_t k,
|
||||
float *distances, idx_t *labels) const = 0;
|
||||
|
||||
/** query n vectors of dimension d to the index.
|
||||
*
|
||||
* return all vectors with distance < radius. Note that many
|
||||
* indexes do not implement the range_search (only the k-NN search
|
||||
* is mandatory).
|
||||
*
|
||||
* @param x input vectors to search, size n * d
|
||||
* @param radius search radius
|
||||
* @param result result table
|
||||
*/
|
||||
virtual void range_search (idx_t n, const float *x, float radius,
|
||||
RangeSearchResult *result) const;
|
||||
|
||||
/** return the indexes of the k vectors closest to the query x.
|
||||
*
|
||||
* This function is identical as search but only return labels of neighbors.
|
||||
* @param x input vectors to search, size n * d
|
||||
* @param labels output labels of the NNs, size n*k
|
||||
*/
|
||||
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
|
||||
|
||||
/// removes all elements from the database.
|
||||
virtual void reset() = 0;
|
||||
|
||||
/** removes IDs from the index. Not supported by all
|
||||
* indexes. Returns the number of elements removed.
|
||||
*/
|
||||
virtual size_t remove_ids (const IDSelector & sel);
|
||||
|
||||
/** Reconstruct a stored vector (or an approximation if lossy coding)
|
||||
*
|
||||
* this function may not be defined for some indexes
|
||||
* @param key id of the vector to reconstruct
|
||||
* @param recons reconstucted vector (size d)
|
||||
*/
|
||||
virtual void reconstruct (idx_t key, float * recons) const;
|
||||
|
||||
/** Reconstruct vectors i0 to i0 + ni - 1
|
||||
*
|
||||
* this function may not be defined for some indexes
|
||||
* @param recons reconstucted vector (size ni * d)
|
||||
*/
|
||||
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
|
||||
|
||||
/** Similar to search, but also reconstructs the stored vectors (or an
|
||||
* approximation in the case of lossy coding) for the search results.
|
||||
*
|
||||
* If there are not enough results for a query, the resulting arrays
|
||||
* is padded with -1s.
|
||||
*
|
||||
* @param recons reconstructed vectors size (n, k, d)
|
||||
**/
|
||||
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
|
||||
float *distances, idx_t *labels,
|
||||
float *recons) const;
|
||||
|
||||
/** Computes a residual vector after indexing encoding.
|
||||
*
|
||||
* The residual vector is the difference between a vector and the
|
||||
* reconstruction that can be decoded from its representation in
|
||||
* the index. The residual can be used for multiple-stage indexing
|
||||
* methods, like IndexIVF's methods.
|
||||
*
|
||||
* @param x input vector, size d
|
||||
* @param residual output residual vector, size d
|
||||
* @param key encoded index, as returned by search and assign
|
||||
*/
|
||||
virtual void compute_residual (const float * x,
|
||||
float * residual, idx_t key) const;
|
||||
|
||||
/** Computes a residual vector after indexing encoding (batch form).
|
||||
* Equivalent to calling compute_residual for each vector.
|
||||
*
|
||||
* The residual vector is the difference between a vector and the
|
||||
* reconstruction that can be decoded from its representation in
|
||||
* the index. The residual can be used for multiple-stage indexing
|
||||
* methods, like IndexIVF's methods.
|
||||
*
|
||||
* @param n number of vectors
|
||||
* @param xs input vectors, size (n x d)
|
||||
* @param residuals output residual vectors, size (n x d)
|
||||
* @param keys encoded index, as returned by search and assign
|
||||
*/
|
||||
virtual void compute_residual_n (idx_t n, const float* xs,
|
||||
float* residuals,
|
||||
const idx_t* keys) const;
|
||||
|
||||
/* The standalone codec interface */
|
||||
|
||||
/** size of the produced codes in bytes */
|
||||
virtual size_t sa_code_size () const;
|
||||
|
||||
/** encode a set of vectors
|
||||
*
|
||||
* @param n number of vectors
|
||||
* @param x input vectors, size n * d
|
||||
* @param bytes output encoded vectors, size n * sa_code_size()
|
||||
*/
|
||||
virtual void sa_encode (idx_t n, const float *x,
|
||||
uint8_t *bytes) const;
|
||||
|
||||
/** encode a set of vectors
|
||||
*
|
||||
* @param n number of vectors
|
||||
* @param bytes input encoded vectors, size n * sa_code_size()
|
||||
* @param x output vectors, size n * d
|
||||
*/
|
||||
virtual void sa_decode (idx_t n, const uint8_t *bytes,
|
||||
float *x) const;
|
||||
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
224
src/3rd_party/faiss/IndexLSH.cpp
vendored
Normal file
224
src/3rd_party/faiss/IndexLSH.cpp
vendored
Normal file
@ -0,0 +1,224 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/IndexLSH.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include <faiss/utils/hamming.h>
|
||||
#include "common/logging.h"
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/***************************************************************
|
||||
* IndexLSH
|
||||
***************************************************************/
|
||||
|
||||
|
||||
IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds):
|
||||
Index(d), nbits(nbits), rotate_data(rotate_data),
|
||||
train_thresholds (train_thresholds), rrot(d, nbits)
|
||||
{
|
||||
is_trained = !train_thresholds;
|
||||
|
||||
bytes_per_vec = (nbits + 7) / 8;
|
||||
|
||||
if (rotate_data) {
|
||||
rrot.init(5);
|
||||
} else {
|
||||
ABORT_UNLESS(d >= nbits, "d >= nbits");
|
||||
}
|
||||
}
|
||||
|
||||
IndexLSH::IndexLSH ():
|
||||
nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const
|
||||
{
|
||||
|
||||
float *xt = nullptr;
|
||||
if (rotate_data) {
|
||||
// also applies bias if exists
|
||||
xt = rrot.apply (n, x);
|
||||
} else if (d != nbits) {
|
||||
assert (nbits < d);
|
||||
xt = new float [nbits * n];
|
||||
float *xp = xt;
|
||||
for (idx_t i = 0; i < n; i++) {
|
||||
const float *xl = x + i * d;
|
||||
for (int j = 0; j < nbits; j++)
|
||||
*xp++ = xl [j];
|
||||
}
|
||||
}
|
||||
|
||||
if (train_thresholds) {
|
||||
|
||||
if (xt == NULL) {
|
||||
xt = new float [nbits * n];
|
||||
memcpy (xt, x, sizeof(*x) * n * nbits);
|
||||
}
|
||||
|
||||
float *xp = xt;
|
||||
for (idx_t i = 0; i < n; i++)
|
||||
for (int j = 0; j < nbits; j++)
|
||||
*xp++ -= thresholds [j];
|
||||
}
|
||||
|
||||
return xt ? xt : x;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void IndexLSH::train (idx_t n, const float *x)
|
||||
{
|
||||
if (train_thresholds) {
|
||||
thresholds.resize (nbits);
|
||||
train_thresholds = false;
|
||||
const float *xt = apply_preprocess (n, x);
|
||||
ScopeDeleter<float> del (xt == x ? nullptr : xt);
|
||||
train_thresholds = true;
|
||||
|
||||
float * transposed_x = new float [n * nbits];
|
||||
ScopeDeleter<float> del2 (transposed_x);
|
||||
|
||||
for (idx_t i = 0; i < n; i++)
|
||||
for (idx_t j = 0; j < nbits; j++)
|
||||
transposed_x [j * n + i] = xt [i * nbits + j];
|
||||
|
||||
for (idx_t i = 0; i < nbits; i++) {
|
||||
float *xi = transposed_x + i * n;
|
||||
// std::nth_element
|
||||
std::sort (xi, xi + n);
|
||||
if (n % 2 == 1)
|
||||
thresholds [i] = xi [n / 2];
|
||||
else
|
||||
thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2;
|
||||
|
||||
}
|
||||
}
|
||||
is_trained = true;
|
||||
}
|
||||
|
||||
|
||||
void IndexLSH::add (idx_t n, const float *x)
|
||||
{
|
||||
ABORT_UNLESS (is_trained, "is_trained");
|
||||
codes.resize ((ntotal + n) * bytes_per_vec);
|
||||
|
||||
sa_encode (n, x, &codes[ntotal * bytes_per_vec]);
|
||||
|
||||
ntotal += n;
|
||||
}
|
||||
|
||||
|
||||
void IndexLSH::search (
|
||||
idx_t n,
|
||||
const float *x,
|
||||
idx_t k,
|
||||
float *distances,
|
||||
idx_t *labels) const
|
||||
{
|
||||
ABORT_UNLESS (is_trained, "is_trained");
|
||||
const float *xt = apply_preprocess (n, x);
|
||||
ScopeDeleter<float> del (xt == x ? nullptr : xt);
|
||||
|
||||
uint8_t * qcodes = new uint8_t [n * bytes_per_vec];
|
||||
ScopeDeleter<uint8_t> del2 (qcodes);
|
||||
|
||||
fvecs2bitvecs (xt, qcodes, nbits, n);
|
||||
|
||||
int * idistances = new int [n * k];
|
||||
ScopeDeleter<int> del3 (idistances);
|
||||
|
||||
int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances};
|
||||
|
||||
hammings_knn_hc (&res, qcodes, codes.data(),
|
||||
ntotal, bytes_per_vec, true);
|
||||
|
||||
|
||||
// convert distances to floats
|
||||
for (int i = 0; i < k * n; i++)
|
||||
distances[i] = idistances[i];
|
||||
|
||||
}
|
||||
|
||||
|
||||
void IndexLSH::transfer_thresholds (LinearTransform *vt) {
|
||||
if (!train_thresholds) return;
|
||||
ABORT_UNLESS (nbits == vt->d_out, "nbits == vt->d_out");
|
||||
if (!vt->have_bias) {
|
||||
vt->b.resize (nbits, 0);
|
||||
vt->have_bias = true;
|
||||
}
|
||||
for (int i = 0; i < nbits; i++)
|
||||
vt->b[i] -= thresholds[i];
|
||||
train_thresholds = false;
|
||||
thresholds.clear();
|
||||
}
|
||||
|
||||
void IndexLSH::reset() {
|
||||
codes.clear();
|
||||
ntotal = 0;
|
||||
}
|
||||
|
||||
|
||||
size_t IndexLSH::sa_code_size () const
|
||||
{
|
||||
return bytes_per_vec;
|
||||
}
|
||||
|
||||
void IndexLSH::sa_encode (idx_t n, const float *x,
|
||||
uint8_t *bytes) const
|
||||
{
|
||||
ABORT_UNLESS (is_trained, "is_trained");
|
||||
const float *xt = apply_preprocess (n, x);
|
||||
ScopeDeleter<float> del (xt == x ? nullptr : xt);
|
||||
fvecs2bitvecs (xt, bytes, nbits, n);
|
||||
}
|
||||
|
||||
void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes,
|
||||
float *x) const
|
||||
{
|
||||
float *xt = x;
|
||||
ScopeDeleter<float> del;
|
||||
if (rotate_data || nbits != d) {
|
||||
xt = new float [n * nbits];
|
||||
del.set(xt);
|
||||
}
|
||||
bitvecs2fvecs (bytes, xt, nbits, n);
|
||||
|
||||
if (train_thresholds) {
|
||||
float *xp = xt;
|
||||
for (idx_t i = 0; i < n; i++) {
|
||||
for (int j = 0; j < nbits; j++) {
|
||||
*xp++ += thresholds [j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rotate_data) {
|
||||
rrot.reverse_transform (n, xt, x);
|
||||
} else if (nbits != d) {
|
||||
for (idx_t i = 0; i < n; i++) {
|
||||
memcpy (x + i * d, xt + i * nbits,
|
||||
nbits * sizeof(xt[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace faiss
|
90
src/3rd_party/faiss/IndexLSH.h
vendored
Normal file
90
src/3rd_party/faiss/IndexLSH.h
vendored
Normal file
@ -0,0 +1,90 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#ifndef INDEX_LSH_H
|
||||
#define INDEX_LSH_H
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <faiss/Index.h>
|
||||
#include <faiss/VectorTransform.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
/** The sign of each vector component is put in a binary signature */
|
||||
struct IndexLSH:Index {
|
||||
typedef unsigned char uint8_t;
|
||||
|
||||
int nbits; ///< nb of bits per vector
|
||||
int bytes_per_vec; ///< nb of 8-bits per encoded vector
|
||||
bool rotate_data; ///< whether to apply a random rotation to input
|
||||
bool train_thresholds; ///< whether we train thresholds or use 0
|
||||
|
||||
RandomRotationMatrix rrot; ///< optional random rotation
|
||||
|
||||
std::vector <float> thresholds; ///< thresholds to compare with
|
||||
|
||||
/// encoded dataset
|
||||
std::vector<uint8_t> codes;
|
||||
|
||||
IndexLSH (
|
||||
idx_t d, int nbits,
|
||||
bool rotate_data = true,
|
||||
bool train_thresholds = false);
|
||||
|
||||
/** Preprocesses and resizes the input to the size required to
|
||||
* binarize the data
|
||||
*
|
||||
* @param x input vectors, size n * d
|
||||
* @return output vectors, size n * bits. May be the same pointer
|
||||
* as x, otherwise it should be deleted by the caller
|
||||
*/
|
||||
const float *apply_preprocess (idx_t n, const float *x) const;
|
||||
|
||||
void train(idx_t n, const float* x) override;
|
||||
|
||||
void add(idx_t n, const float* x) override;
|
||||
|
||||
void search(
|
||||
idx_t n,
|
||||
const float* x,
|
||||
idx_t k,
|
||||
float* distances,
|
||||
idx_t* labels) const override;
|
||||
|
||||
void reset() override;
|
||||
|
||||
/// transfer the thresholds to a pre-processing stage (and unset
|
||||
/// train_thresholds)
|
||||
void transfer_thresholds (LinearTransform * vt);
|
||||
|
||||
~IndexLSH() override {}
|
||||
|
||||
IndexLSH ();
|
||||
|
||||
/* standalone codec interface.
|
||||
*
|
||||
* The vectors are decoded to +/- 1 (not 0, 1) */
|
||||
|
||||
size_t sa_code_size () const override;
|
||||
|
||||
void sa_encode (idx_t n, const float *x,
|
||||
uint8_t *bytes) const override;
|
||||
|
||||
void sa_decode (idx_t n, const uint8_t *bytes,
|
||||
float *x) const override;
|
||||
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
21
src/3rd_party/faiss/LICENSE
vendored
Normal file
21
src/3rd_party/faiss/LICENSE
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) Facebook, Inc. and its affiliates.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
1
src/3rd_party/faiss/README
vendored
Normal file
1
src/3rd_party/faiss/README
vendored
Normal file
@ -0,0 +1 @@
|
||||
This is code extracted from the original FAISS repository: https://github.com/facebookresearch/faiss
|
726
src/3rd_party/faiss/VectorTransform.cpp
vendored
Normal file
726
src/3rd_party/faiss/VectorTransform.cpp
vendored
Normal file
@ -0,0 +1,726 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/VectorTransform.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#include <faiss/utils/random.h>
|
||||
#include "common/logging.h"
|
||||
|
||||
using namespace faiss;
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
// this is to keep the clang syntax checker happy
|
||||
#ifndef FINTEGER
|
||||
#define FINTEGER int
|
||||
#endif
|
||||
|
||||
|
||||
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
|
||||
|
||||
int sgemm_ (
|
||||
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
|
||||
n, FINTEGER *k, const float *alpha, const float *a,
|
||||
FINTEGER *lda, const float *b,
|
||||
FINTEGER *ldb, float *beta,
|
||||
float *c, FINTEGER *ldc);
|
||||
|
||||
int dgemm_ (
|
||||
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
|
||||
n, FINTEGER *k, const double *alpha, const double *a,
|
||||
FINTEGER *lda, const double *b,
|
||||
FINTEGER *ldb, double *beta,
|
||||
double *c, FINTEGER *ldc);
|
||||
|
||||
int ssyrk_ (
|
||||
const char *uplo, const char *trans, FINTEGER *n, FINTEGER *k,
|
||||
float *alpha, float *a, FINTEGER *lda,
|
||||
float *beta, float *c, FINTEGER *ldc);
|
||||
|
||||
/* Lapack functions from http://www.netlib.org/clapack/old/single/ */
|
||||
|
||||
int ssyev_ (
|
||||
const char *jobz, const char *uplo, FINTEGER *n, float *a,
|
||||
FINTEGER *lda, float *w, float *work, FINTEGER *lwork,
|
||||
FINTEGER *info);
|
||||
|
||||
int dsyev_ (
|
||||
const char *jobz, const char *uplo, FINTEGER *n, double *a,
|
||||
FINTEGER *lda, double *w, double *work, FINTEGER *lwork,
|
||||
FINTEGER *info);
|
||||
|
||||
int sgesvd_(
|
||||
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
|
||||
float *a, FINTEGER *lda, float *s, float *u, FINTEGER *ldu, float *vt,
|
||||
FINTEGER *ldvt, float *work, FINTEGER *lwork, FINTEGER *info);
|
||||
|
||||
|
||||
int dgesvd_(
|
||||
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
|
||||
double *a, FINTEGER *lda, double *s, double *u, FINTEGER *ldu, double *vt,
|
||||
FINTEGER *ldvt, double *work, FINTEGER *lwork, FINTEGER *info);
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
extern "C" {
|
||||
/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
|
||||
|
||||
int sgeqrf_(FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
|
||||
float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
|
||||
|
||||
int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
|
||||
FINTEGER *lda, float *tau, float *work,
|
||||
FINTEGER *lwork, FINTEGER *info);
|
||||
|
||||
}
|
||||
|
||||
void matrix_qr(int m, int n, float *a)
|
||||
{
|
||||
ABORT_UNLESS(m >= n, "m >= n");
|
||||
FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
|
||||
std::vector<float> tau(ki);
|
||||
FINTEGER lwork = -1, info;
|
||||
float work_size;
|
||||
|
||||
sgeqrf_(&mi, &ni, a, &mi, tau.data(),
|
||||
&work_size, &lwork, &info);
|
||||
lwork = size_t(work_size);
|
||||
std::vector<float> work(lwork);
|
||||
|
||||
sgeqrf_(&mi, &ni, a, &mi,
|
||||
tau.data(), work.data(), &lwork, &info);
|
||||
|
||||
sorgqr_(&mi, &ni, &ki, a, &mi, tau.data(),
|
||||
work.data(), &lwork, &info);
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
const float *fvecs_maybe_subsample(
|
||||
size_t d, size_t *n, size_t nmax, const float *x,
|
||||
bool verbose = false, int64_t seed = 1234)
|
||||
{
|
||||
|
||||
if (*n <= nmax) return x; // nothing to do
|
||||
|
||||
size_t n2 = nmax;
|
||||
if (verbose) {
|
||||
printf(" Input training set too big (max size is %ld), sampling "
|
||||
"%ld / %ld vectors\n", nmax, n2, *n);
|
||||
}
|
||||
std::vector<int> subset(*n);
|
||||
rand_perm(subset.data(), *n, seed);
|
||||
float *x_subset = new float[n2 * d];
|
||||
for (int64_t i = 0; i < n2; i++)
|
||||
memcpy(&x_subset[i * d],
|
||||
&x[subset[i] * size_t(d)],
|
||||
sizeof(x[0]) * d);
|
||||
*n = n2;
|
||||
return x_subset;
|
||||
}
|
||||
|
||||
#ifdef __SSE__
|
||||
// reads 0 <= d < 4 floats as __m128
|
||||
static inline __m128 masked_read(int d, const float *x)
|
||||
{
|
||||
assert(0 <= d && d < 4);
|
||||
__attribute__((__aligned__(16))) float buf[4] = { 0, 0, 0, 0 };
|
||||
switch (d) {
|
||||
case 3:
|
||||
buf[2] = x[2];
|
||||
case 2:
|
||||
buf[1] = x[1];
|
||||
case 1:
|
||||
buf[0] = x[0];
|
||||
}
|
||||
return _mm_load_ps(buf);
|
||||
// cannot use AVX2 _mm_mask_set1_epi32
|
||||
}
|
||||
|
||||
float fvec_norm_L2sqr(const float * x,
|
||||
size_t d)
|
||||
{
|
||||
__m128 mx;
|
||||
__m128 msum1 = _mm_setzero_ps();
|
||||
|
||||
while (d >= 4) {
|
||||
mx = _mm_loadu_ps(x); x += 4;
|
||||
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
mx = masked_read(d, x);
|
||||
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
|
||||
|
||||
msum1 = _mm_hadd_ps(msum1, msum1);
|
||||
msum1 = _mm_hadd_ps(msum1, msum1);
|
||||
return _mm_cvtss_f32(msum1);
|
||||
}
|
||||
#else
|
||||
// scalar implementation
|
||||
float fvec_norm_L2sqr(const float *x, size_t d)
|
||||
{
|
||||
return fvec_norm_L2sqr_ref(x, d);
|
||||
}
|
||||
#endif
|
||||
|
||||
void fvec_renorm_L2(size_t d, size_t nx, float * __restrict x)
|
||||
{
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = 0; i < nx; i++) {
|
||||
float * __restrict xi = x + i * d;
|
||||
|
||||
float nr = fvec_norm_L2sqr(xi, d);
|
||||
|
||||
if (nr > 0) {
|
||||
size_t j;
|
||||
const float inv_nr = 1.0 / sqrtf(nr);
|
||||
for (j = 0; j < d; j++)
|
||||
xi[j] *= inv_nr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*********************************************
|
||||
* VectorTransform
|
||||
*********************************************/
|
||||
|
||||
|
||||
|
||||
float * VectorTransform::apply (Index::idx_t n, const float * x) const
|
||||
{
|
||||
float * xt = new float[n * d_out];
|
||||
apply_noalloc (n, x, xt);
|
||||
return xt;
|
||||
}
|
||||
|
||||
|
||||
void VectorTransform::train (idx_t, const float *) {
|
||||
// does nothing by default
|
||||
}
|
||||
|
||||
|
||||
void VectorTransform::reverse_transform (
|
||||
idx_t , const float *,
|
||||
float *) const
|
||||
{
|
||||
ABORT ("reverse transform not implemented");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*********************************************
|
||||
* LinearTransform
|
||||
*********************************************/
|
||||
|
||||
/// both d_in > d_out and d_out < d_in are supported
|
||||
LinearTransform::LinearTransform (int d_in, int d_out,
|
||||
bool have_bias):
|
||||
VectorTransform (d_in, d_out), have_bias (have_bias),
|
||||
is_orthonormal (false), verbose (false)
|
||||
{
|
||||
is_trained = false; // will be trained when A and b are initialized
|
||||
}
|
||||
|
||||
void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
|
||||
float * xt) const
|
||||
{
|
||||
ABORT_UNLESS(is_trained, "Transformation not trained yet");
|
||||
|
||||
float c_factor;
|
||||
if (have_bias) {
|
||||
ABORT_UNLESS(b.size() == d_out, "Bias not initialized");
|
||||
float * xi = xt;
|
||||
for (int i = 0; i < n; i++)
|
||||
for(int j = 0; j < d_out; j++)
|
||||
*xi++ = b[j];
|
||||
c_factor = 1.0;
|
||||
} else {
|
||||
c_factor = 0.0;
|
||||
}
|
||||
|
||||
ABORT_UNLESS(A.size() == d_out * d_in,
|
||||
"Transformation matrix not initialized");
|
||||
|
||||
float one = 1;
|
||||
FINTEGER nbiti = d_out, ni = n, di = d_in;
|
||||
sgemm_ ("Transposed", "Not transposed",
|
||||
&nbiti, &ni, &di,
|
||||
&one, A.data(), &di, x, &di, &c_factor, xt, &nbiti);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void LinearTransform::transform_transpose (idx_t n, const float * y,
|
||||
float *x) const
|
||||
{
|
||||
if (have_bias) { // allocate buffer to store bias-corrected data
|
||||
float *y_new = new float [n * d_out];
|
||||
const float *yr = y;
|
||||
float *yw = y_new;
|
||||
for (idx_t i = 0; i < n; i++) {
|
||||
for (int j = 0; j < d_out; j++) {
|
||||
*yw++ = *yr++ - b [j];
|
||||
}
|
||||
}
|
||||
y = y_new;
|
||||
}
|
||||
|
||||
{
|
||||
FINTEGER dii = d_in, doi = d_out, ni = n;
|
||||
float one = 1.0, zero = 0.0;
|
||||
sgemm_ ("Not", "Not", &dii, &ni, &doi,
|
||||
&one, A.data (), &dii, y, &doi, &zero, x, &dii);
|
||||
}
|
||||
|
||||
if (have_bias) delete [] y;
|
||||
}
|
||||
|
||||
void LinearTransform::set_is_orthonormal ()
|
||||
{
|
||||
if (d_out > d_in) {
|
||||
// not clear what we should do in this case
|
||||
is_orthonormal = false;
|
||||
return;
|
||||
}
|
||||
if (d_out == 0) { // borderline case, unnormalized matrix
|
||||
is_orthonormal = true;
|
||||
return;
|
||||
}
|
||||
|
||||
double eps = 4e-5;
|
||||
ABORT_UNLESS(A.size() >= d_out * d_in, "A.size() >= d_out * d_in");
|
||||
{
|
||||
std::vector<float> ATA(d_out * d_out);
|
||||
FINTEGER dii = d_in, doi = d_out;
|
||||
float one = 1.0, zero = 0.0;
|
||||
|
||||
sgemm_ ("Transposed", "Not", &doi, &doi, &dii,
|
||||
&one, A.data (), &dii,
|
||||
A.data(), &dii,
|
||||
&zero, ATA.data(), &doi);
|
||||
|
||||
is_orthonormal = true;
|
||||
for (long i = 0; i < d_out; i++) {
|
||||
for (long j = 0; j < d_out; j++) {
|
||||
float v = ATA[i + j * d_out];
|
||||
if (i == j) v-= 1;
|
||||
if (fabs(v) > eps) {
|
||||
is_orthonormal = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void LinearTransform::reverse_transform (idx_t n, const float * xt,
|
||||
float *x) const
|
||||
{
|
||||
if (is_orthonormal) {
|
||||
transform_transpose (n, xt, x);
|
||||
} else {
|
||||
ABORT("reverse transform not implemented for non-orthonormal matrices");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void LinearTransform::print_if_verbose (
|
||||
const char*name, const std::vector<double> &mat,
|
||||
int n, int d) const
|
||||
{
|
||||
if (!verbose) return;
|
||||
printf("matrix %s: %d*%d [\n", name, n, d);
|
||||
ABORT_UNLESS(mat.size() >= n * d, "mat.size() >= n * d");
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (int j = 0; j < d; j++) {
|
||||
printf("%10.5g ", mat[i * d + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
/*********************************************
|
||||
* RandomRotationMatrix
|
||||
*********************************************/
|
||||
|
||||
void RandomRotationMatrix::init (int seed)
|
||||
{
|
||||
|
||||
if(d_out <= d_in) {
|
||||
A.resize (d_out * d_in);
|
||||
float *q = A.data();
|
||||
float_randn(q, d_out * d_in, seed);
|
||||
matrix_qr(d_in, d_out, q);
|
||||
} else {
|
||||
// use tight-frame transformation
|
||||
A.resize (d_out * d_out);
|
||||
float *q = A.data();
|
||||
float_randn(q, d_out * d_out, seed);
|
||||
matrix_qr(d_out, d_out, q);
|
||||
// remove columns
|
||||
int i, j;
|
||||
for (i = 0; i < d_out; i++) {
|
||||
for(j = 0; j < d_in; j++) {
|
||||
q[i * d_in + j] = q[i * d_out + j];
|
||||
}
|
||||
}
|
||||
A.resize(d_in * d_out);
|
||||
}
|
||||
is_orthonormal = true;
|
||||
is_trained = true;
|
||||
}
|
||||
|
||||
void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/)
|
||||
{
|
||||
// initialize with some arbitrary seed
|
||||
init (12345);
|
||||
}
|
||||
|
||||
|
||||
/*********************************************
|
||||
* PCAMatrix
|
||||
*********************************************/
|
||||
|
||||
PCAMatrix::PCAMatrix (int d_in, int d_out,
|
||||
float eigen_power, bool random_rotation):
|
||||
LinearTransform(d_in, d_out, true),
|
||||
eigen_power(eigen_power), random_rotation(random_rotation)
|
||||
{
|
||||
is_trained = false;
|
||||
max_points_per_d = 1000;
|
||||
balanced_bins = 0;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
/// Compute the eigenvalue decomposition of symmetric matrix cov,
|
||||
/// dimensions d_in-by-d_in. Output eigenvectors in cov.
|
||||
|
||||
void eig(size_t d_in, double *cov, double *eigenvalues, int verbose)
|
||||
{
|
||||
{ // compute eigenvalues and vectors
|
||||
FINTEGER info = 0, lwork = -1, di = d_in;
|
||||
double workq;
|
||||
|
||||
dsyev_ ("Vectors as well", "Upper",
|
||||
&di, cov, &di, eigenvalues, &workq, &lwork, &info);
|
||||
lwork = FINTEGER(workq);
|
||||
double *work = new double[lwork];
|
||||
|
||||
dsyev_ ("Vectors as well", "Upper",
|
||||
&di, cov, &di, eigenvalues, work, &lwork, &info);
|
||||
|
||||
delete [] work;
|
||||
|
||||
if (info != 0) {
|
||||
fprintf (stderr, "WARN ssyev info returns %d, "
|
||||
"a very bad PCA matrix is learnt\n",
|
||||
int(info));
|
||||
// do not throw exception, as the matrix could still be useful
|
||||
}
|
||||
|
||||
|
||||
if(verbose && d_in <= 10) {
|
||||
printf("info=%ld new eigvals=[", long(info));
|
||||
for(int j = 0; j < d_in; j++) printf("%g ", eigenvalues[j]);
|
||||
printf("]\n");
|
||||
|
||||
double *ci = cov;
|
||||
printf("eigenvecs=\n");
|
||||
for(int i = 0; i < d_in; i++) {
|
||||
for(int j = 0; j < d_in; j++)
|
||||
printf("%10.4g ", *ci++);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// revert order of eigenvectors & values
|
||||
|
||||
for(int i = 0; i < d_in / 2; i++) {
|
||||
|
||||
std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]);
|
||||
double *v1 = cov + i * d_in;
|
||||
double *v2 = cov + (d_in - 1 - i) * d_in;
|
||||
for(int j = 0; j < d_in; j++)
|
||||
std::swap(v1[j], v2[j]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void PCAMatrix::train (Index::idx_t n, const float *x)
|
||||
{
|
||||
const float * x_in = x;
|
||||
|
||||
x = fvecs_maybe_subsample (d_in, (size_t*)&n,
|
||||
max_points_per_d * d_in, x, verbose);
|
||||
|
||||
ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
|
||||
|
||||
// compute mean
|
||||
mean.clear(); mean.resize(d_in, 0.0);
|
||||
if (have_bias) { // we may want to skip the bias
|
||||
const float *xi = x;
|
||||
for (int i = 0; i < n; i++) {
|
||||
for(int j = 0; j < d_in; j++)
|
||||
mean[j] += *xi++;
|
||||
}
|
||||
for(int j = 0; j < d_in; j++)
|
||||
mean[j] /= n;
|
||||
}
|
||||
if(verbose) {
|
||||
printf("mean=[");
|
||||
for(int j = 0; j < d_in; j++) printf("%g ", mean[j]);
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
if(n >= d_in) {
|
||||
// compute covariance matrix, store it in PCA matrix
|
||||
PCAMat.resize(d_in * d_in);
|
||||
float * cov = PCAMat.data();
|
||||
{ // initialize with mean * mean^T term
|
||||
float *ci = cov;
|
||||
for(int i = 0; i < d_in; i++) {
|
||||
for(int j = 0; j < d_in; j++)
|
||||
*ci++ = - n * mean[i] * mean[j];
|
||||
}
|
||||
}
|
||||
{
|
||||
FINTEGER di = d_in, ni = n;
|
||||
float one = 1.0;
|
||||
ssyrk_ ("Up", "Non transposed",
|
||||
&di, &ni, &one, (float*)x, &di, &one, cov, &di);
|
||||
|
||||
}
|
||||
if(verbose && d_in <= 10) {
|
||||
float *ci = cov;
|
||||
printf("cov=\n");
|
||||
for(int i = 0; i < d_in; i++) {
|
||||
for(int j = 0; j < d_in; j++)
|
||||
printf("%10g ", *ci++);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<double> covd (d_in * d_in);
|
||||
for (size_t i = 0; i < d_in * d_in; i++) covd [i] = cov [i];
|
||||
|
||||
std::vector<double> eigenvaluesd (d_in);
|
||||
|
||||
eig (d_in, covd.data (), eigenvaluesd.data (), verbose);
|
||||
|
||||
for (size_t i = 0; i < d_in * d_in; i++) PCAMat [i] = covd [i];
|
||||
eigenvalues.resize (d_in);
|
||||
|
||||
for (size_t i = 0; i < d_in; i++)
|
||||
eigenvalues [i] = eigenvaluesd [i];
|
||||
|
||||
|
||||
} else {
|
||||
|
||||
std::vector<float> xc (n * d_in);
|
||||
|
||||
for (size_t i = 0; i < n; i++)
|
||||
for(size_t j = 0; j < d_in; j++)
|
||||
xc [i * d_in + j] = x [i * d_in + j] - mean[j];
|
||||
|
||||
// compute Gram matrix
|
||||
std::vector<float> gram (n * n);
|
||||
{
|
||||
FINTEGER di = d_in, ni = n;
|
||||
float one = 1.0, zero = 0.0;
|
||||
ssyrk_ ("Up", "Transposed",
|
||||
&ni, &di, &one, xc.data(), &di, &zero, gram.data(), &ni);
|
||||
}
|
||||
|
||||
if(verbose && d_in <= 10) {
|
||||
float *ci = gram.data();
|
||||
printf("gram=\n");
|
||||
for(int i = 0; i < n; i++) {
|
||||
for(int j = 0; j < n; j++)
|
||||
printf("%10g ", *ci++);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<double> gramd (n * n);
|
||||
for (size_t i = 0; i < n * n; i++)
|
||||
gramd [i] = gram [i];
|
||||
|
||||
std::vector<double> eigenvaluesd (n);
|
||||
|
||||
// eig will fill in only the n first eigenvals
|
||||
|
||||
eig (n, gramd.data (), eigenvaluesd.data (), verbose);
|
||||
|
||||
PCAMat.resize(d_in * n);
|
||||
|
||||
for (size_t i = 0; i < n * n; i++)
|
||||
gram [i] = gramd [i];
|
||||
|
||||
eigenvalues.resize (d_in);
|
||||
// fill in only the n first ones
|
||||
for (size_t i = 0; i < n; i++)
|
||||
eigenvalues [i] = eigenvaluesd [i];
|
||||
|
||||
{ // compute PCAMat = x' * v
|
||||
FINTEGER di = d_in, ni = n;
|
||||
float one = 1.0;
|
||||
|
||||
sgemm_ ("Non", "Non Trans",
|
||||
&di, &ni, &ni,
|
||||
&one, xc.data(), &di, gram.data(), &ni,
|
||||
&one, PCAMat.data(), &di);
|
||||
}
|
||||
|
||||
if(verbose && d_in <= 10) {
|
||||
float *ci = PCAMat.data();
|
||||
printf("PCAMat=\n");
|
||||
for(int i = 0; i < n; i++) {
|
||||
for(int j = 0; j < d_in; j++)
|
||||
printf("%10g ", *ci++);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
fvec_renorm_L2 (d_in, n, PCAMat.data());
|
||||
|
||||
}
|
||||
|
||||
prepare_Ab();
|
||||
is_trained = true;
|
||||
}
|
||||
|
||||
void PCAMatrix::copy_from (const PCAMatrix & other)
|
||||
{
|
||||
ABORT_UNLESS(other.is_trained, "other.is_trained");
|
||||
mean = other.mean;
|
||||
eigenvalues = other.eigenvalues;
|
||||
PCAMat = other.PCAMat;
|
||||
prepare_Ab ();
|
||||
is_trained = true;
|
||||
}
|
||||
|
||||
void PCAMatrix::prepare_Ab ()
|
||||
{
|
||||
ABORT_UNLESS(
|
||||
d_out * d_in <= PCAMat.size(),
|
||||
"PCA matrix cannot output %d dimensions from %d ",
|
||||
d_out, d_in);
|
||||
|
||||
if (!random_rotation) {
|
||||
A = PCAMat;
|
||||
A.resize(d_out * d_in); // strip off useless dimensions
|
||||
|
||||
// first scale the components
|
||||
if (eigen_power != 0) {
|
||||
float *ai = A.data();
|
||||
for (int i = 0; i < d_out; i++) {
|
||||
float factor = pow(eigenvalues[i], eigen_power);
|
||||
for(int j = 0; j < d_in; j++)
|
||||
*ai++ *= factor;
|
||||
}
|
||||
}
|
||||
|
||||
if (balanced_bins != 0) {
|
||||
ABORT_UNLESS(d_out % balanced_bins == 0, "d_out % balanced_bins == 0");
|
||||
int dsub = d_out / balanced_bins;
|
||||
std::vector <float> Ain;
|
||||
std::swap(A, Ain);
|
||||
A.resize(d_out * d_in);
|
||||
|
||||
std::vector <float> accu(balanced_bins);
|
||||
std::vector <int> counter(balanced_bins);
|
||||
|
||||
// greedy assignment
|
||||
for (int i = 0; i < d_out; i++) {
|
||||
// find best bin
|
||||
int best_j = -1;
|
||||
float min_w = 1e30;
|
||||
for (int j = 0; j < balanced_bins; j++) {
|
||||
if (counter[j] < dsub && accu[j] < min_w) {
|
||||
min_w = accu[j];
|
||||
best_j = j;
|
||||
}
|
||||
}
|
||||
int row_dst = best_j * dsub + counter[best_j];
|
||||
accu[best_j] += eigenvalues[i];
|
||||
counter[best_j] ++;
|
||||
memcpy (&A[row_dst * d_in], &Ain[i * d_in],
|
||||
d_in * sizeof (A[0]));
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
printf(" bin accu=[");
|
||||
for (int i = 0; i < balanced_bins; i++)
|
||||
printf("%g ", accu[i]);
|
||||
printf("]\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
ABORT_UNLESS(balanced_bins == 0,
|
||||
"both balancing bins and applying a random rotation "
|
||||
"does not make sense");
|
||||
RandomRotationMatrix rr(d_out, d_out);
|
||||
|
||||
rr.init(5);
|
||||
|
||||
// apply scaling on the rotation matrix (right multiplication)
|
||||
if (eigen_power != 0) {
|
||||
for (int i = 0; i < d_out; i++) {
|
||||
float factor = pow(eigenvalues[i], eigen_power);
|
||||
for(int j = 0; j < d_out; j++)
|
||||
rr.A[j * d_out + i] *= factor;
|
||||
}
|
||||
}
|
||||
|
||||
A.resize(d_in * d_out);
|
||||
{
|
||||
FINTEGER dii = d_in, doo = d_out;
|
||||
float one = 1.0, zero = 0.0;
|
||||
|
||||
sgemm_ ("Not", "Not", &dii, &doo, &doo,
|
||||
&one, PCAMat.data(), &dii, rr.A.data(), &doo, &zero,
|
||||
A.data(), &dii);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
b.clear(); b.resize(d_out);
|
||||
|
||||
for (int i = 0; i < d_out; i++) {
|
||||
float accu = 0;
|
||||
for (int j = 0; j < d_in; j++)
|
||||
accu -= mean[j] * A[j + i * d_in];
|
||||
b[i] = accu;
|
||||
}
|
||||
|
||||
is_orthonormal = eigen_power == 0;
|
||||
|
||||
}
|
||||
|
||||
|
184
src/3rd_party/faiss/VectorTransform.h
vendored
Normal file
184
src/3rd_party/faiss/VectorTransform.h
vendored
Normal file
@ -0,0 +1,184 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#ifndef FAISS_VECTOR_TRANSFORM_H
|
||||
#define FAISS_VECTOR_TRANSFORM_H
|
||||
|
||||
/** Defines a few objects that apply transformations to a set of
|
||||
* vectors Often these are pre-processing steps.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <faiss/Index.h>
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
/** Any transformation applied on a set of vectors */
|
||||
struct VectorTransform {
|
||||
|
||||
typedef Index::idx_t idx_t;
|
||||
|
||||
int d_in; ///! input dimension
|
||||
int d_out; ///! output dimension
|
||||
|
||||
explicit VectorTransform (int d_in = 0, int d_out = 0):
|
||||
d_in(d_in), d_out(d_out), is_trained(true)
|
||||
{}
|
||||
|
||||
|
||||
/// set if the VectorTransform does not require training, or if
|
||||
/// training is done already
|
||||
bool is_trained;
|
||||
|
||||
|
||||
/** Perform training on a representative set of vectors. Does
|
||||
* nothing by default.
|
||||
*
|
||||
* @param n nb of training vectors
|
||||
* @param x training vecors, size n * d
|
||||
*/
|
||||
virtual void train (idx_t n, const float *x);
|
||||
|
||||
/** apply the random roation, return new allocated matrix
|
||||
* @param x size n * d_in
|
||||
* @return size n * d_out
|
||||
*/
|
||||
float *apply (idx_t n, const float * x) const;
|
||||
|
||||
/// same as apply, but result is pre-allocated
|
||||
virtual void apply_noalloc (idx_t n, const float * x,
|
||||
float *xt) const = 0;
|
||||
|
||||
/// reverse transformation. May not be implemented or may return
|
||||
/// approximate result
|
||||
virtual void reverse_transform (idx_t n, const float * xt,
|
||||
float *x) const;
|
||||
|
||||
virtual ~VectorTransform () {}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
/** Generic linear transformation, with bias term applied on output
|
||||
* y = A * x + b
|
||||
*/
|
||||
struct LinearTransform: VectorTransform {
|
||||
|
||||
bool have_bias; ///! whether to use the bias term
|
||||
|
||||
/// check if matrix A is orthonormal (enables reverse_transform)
|
||||
bool is_orthonormal;
|
||||
|
||||
/// Transformation matrix, size d_out * d_in
|
||||
std::vector<float> A;
|
||||
|
||||
/// bias vector, size d_out
|
||||
std::vector<float> b;
|
||||
|
||||
/// both d_in > d_out and d_out < d_in are supported
|
||||
explicit LinearTransform (int d_in = 0, int d_out = 0,
|
||||
bool have_bias = false);
|
||||
|
||||
/// same as apply, but result is pre-allocated
|
||||
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
|
||||
|
||||
/// compute x = A^T * (x - b)
|
||||
/// is reverse transform if A has orthonormal lines
|
||||
void transform_transpose (idx_t n, const float * y,
|
||||
float *x) const;
|
||||
|
||||
/// works only if is_orthonormal
|
||||
void reverse_transform (idx_t n, const float * xt,
|
||||
float *x) const override;
|
||||
|
||||
/// compute A^T * A to set the is_orthonormal flag
|
||||
void set_is_orthonormal ();
|
||||
|
||||
bool verbose;
|
||||
void print_if_verbose (const char*name, const std::vector<double> &mat,
|
||||
int n, int d) const;
|
||||
|
||||
~LinearTransform() override {}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Randomly rotate a set of vectors
|
||||
struct RandomRotationMatrix: LinearTransform {
|
||||
|
||||
/// both d_in > d_out and d_out < d_in are supported
|
||||
RandomRotationMatrix (int d_in, int d_out):
|
||||
LinearTransform(d_in, d_out, false) {}
|
||||
|
||||
/// must be called before the transform is used
|
||||
void init(int seed);
|
||||
|
||||
// intializes with an arbitrary seed
|
||||
void train(idx_t n, const float* x) override;
|
||||
|
||||
RandomRotationMatrix () {}
|
||||
};
|
||||
|
||||
|
||||
/** Applies a principal component analysis on a set of vectors,
|
||||
* with optionally whitening and random rotation. */
|
||||
struct PCAMatrix: LinearTransform {
|
||||
|
||||
/** after transformation the components are multiplied by
|
||||
* eigenvalues^eigen_power
|
||||
*
|
||||
* =0: no whitening
|
||||
* =-0.5: full whitening
|
||||
*/
|
||||
float eigen_power;
|
||||
|
||||
/// random rotation after PCA
|
||||
bool random_rotation;
|
||||
|
||||
/// ratio between # training vectors and dimension
|
||||
size_t max_points_per_d;
|
||||
|
||||
/// try to distribute output eigenvectors in this many bins
|
||||
int balanced_bins;
|
||||
|
||||
/// Mean, size d_in
|
||||
std::vector<float> mean;
|
||||
|
||||
/// eigenvalues of covariance matrix (= squared singular values)
|
||||
std::vector<float> eigenvalues;
|
||||
|
||||
/// PCA matrix, size d_in * d_in
|
||||
std::vector<float> PCAMat;
|
||||
|
||||
// the final matrix is computed after random rotation and/or whitening
|
||||
explicit PCAMatrix (int d_in = 0, int d_out = 0,
|
||||
float eigen_power = 0, bool random_rotation = false);
|
||||
|
||||
/// train on n vectors. If n < d_in then the eigenvector matrix
|
||||
/// will be completed with 0s
|
||||
void train(idx_t n, const float* x) override;
|
||||
|
||||
/// copy pre-trained PCA matrix
|
||||
void copy_from (const PCAMatrix & other);
|
||||
|
||||
/// called after mean, PCAMat and eigenvalues are computed
|
||||
void prepare_Ab();
|
||||
|
||||
};
|
||||
|
||||
|
||||
} // namespace faiss
|
||||
|
||||
|
||||
#endif
|
122
src/3rd_party/faiss/utils/Heap.cpp
vendored
Normal file
122
src/3rd_party/faiss/utils/Heap.cpp
vendored
Normal file
@ -0,0 +1,122 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/* Function for soft heap */
|
||||
|
||||
#include <faiss/utils/Heap.h>
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
template <typename C>
|
||||
void HeapArray<C>::heapify ()
|
||||
{
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nh; j++)
|
||||
heap_heapify<C> (k, val + j * k, ids + j * k);
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
void HeapArray<C>::reorder ()
|
||||
{
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nh; j++)
|
||||
heap_reorder<C> (k, val + j * k, ids + j * k);
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
void HeapArray<C>::addn (size_t nj, const T *vin, TI j0,
|
||||
size_t i0, int64_t ni)
|
||||
{
|
||||
if (ni == -1) ni = nh;
|
||||
assert (i0 >= 0 && i0 + ni <= nh);
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = i0; i < i0 + ni; i++) {
|
||||
T * __restrict simi = get_val(i);
|
||||
TI * __restrict idxi = get_ids (i);
|
||||
const T *ip_line = vin + (i - i0) * nj;
|
||||
|
||||
for (size_t j = 0; j < nj; j++) {
|
||||
T ip = ip_line [j];
|
||||
if (C::cmp(simi[0], ip)) {
|
||||
heap_pop<C> (k, simi, idxi);
|
||||
heap_push<C> (k, simi, idxi, ip, j + j0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
void HeapArray<C>::addn_with_ids (
|
||||
size_t nj, const T *vin, const TI *id_in,
|
||||
int64_t id_stride, size_t i0, int64_t ni)
|
||||
{
|
||||
if (id_in == nullptr) {
|
||||
addn (nj, vin, 0, i0, ni);
|
||||
return;
|
||||
}
|
||||
if (ni == -1) ni = nh;
|
||||
assert (i0 >= 0 && i0 + ni <= nh);
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = i0; i < i0 + ni; i++) {
|
||||
T * __restrict simi = get_val(i);
|
||||
TI * __restrict idxi = get_ids (i);
|
||||
const T *ip_line = vin + (i - i0) * nj;
|
||||
const TI *id_line = id_in + (i - i0) * id_stride;
|
||||
|
||||
for (size_t j = 0; j < nj; j++) {
|
||||
T ip = ip_line [j];
|
||||
if (C::cmp(simi[0], ip)) {
|
||||
heap_pop<C> (k, simi, idxi);
|
||||
heap_push<C> (k, simi, idxi, ip, id_line [j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
void HeapArray<C>::per_line_extrema (
|
||||
T * out_val,
|
||||
TI * out_ids) const
|
||||
{
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nh; j++) {
|
||||
int64_t imin = -1;
|
||||
typename C::T xval = C::Crev::neutral ();
|
||||
const typename C::T * x_ = val + j * k;
|
||||
for (size_t i = 0; i < k; i++)
|
||||
if (C::cmp (x_[i], xval)) {
|
||||
xval = x_[i];
|
||||
imin = i;
|
||||
}
|
||||
if (out_val)
|
||||
out_val[j] = xval;
|
||||
|
||||
if (out_ids) {
|
||||
if (ids && imin != -1)
|
||||
out_ids[j] = ids [j * k + imin];
|
||||
else
|
||||
out_ids[j] = imin;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// explicit instanciations
|
||||
|
||||
template struct HeapArray<CMin <float, int64_t> >;
|
||||
template struct HeapArray<CMax <float, int64_t> >;
|
||||
template struct HeapArray<CMin <int, int64_t> >;
|
||||
template struct HeapArray<CMax <int, int64_t> >;
|
||||
|
||||
|
||||
} // END namespace fasis
|
495
src/3rd_party/faiss/utils/Heap.h
vendored
Normal file
495
src/3rd_party/faiss/utils/Heap.h
vendored
Normal file
@ -0,0 +1,495 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/*
|
||||
* C++ support for heaps. The set of functions is tailored for
|
||||
* efficient similarity search.
|
||||
*
|
||||
* There is no specific object for a heap, and the functions that
|
||||
* operate on a signle heap are inlined, because heaps are often
|
||||
* small. More complex functions are implemented in Heaps.cpp
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef FAISS_Heap_h
|
||||
#define FAISS_Heap_h
|
||||
|
||||
#include <climits>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/*******************************************************************
|
||||
* C object: uniform handling of min and max heap
|
||||
*******************************************************************/
|
||||
|
||||
/** The C object gives the type T of the values in the heap, the type
|
||||
* of the keys, TI and the comparison that is done: > for the minheap
|
||||
* and < for the maxheap. The neutral value will always be dropped in
|
||||
* favor of any other value in the heap.
|
||||
*/
|
||||
|
||||
template <typename T_, typename TI_>
|
||||
struct CMax;
|
||||
|
||||
// traits of minheaps = heaps where the minimum value is stored on top
|
||||
// useful to find the *max* values of an array
|
||||
template <typename T_, typename TI_>
|
||||
struct CMin {
|
||||
typedef T_ T;
|
||||
typedef TI_ TI;
|
||||
typedef CMax<T_, TI_> Crev;
|
||||
inline static bool cmp (T a, T b) {
|
||||
return a < b;
|
||||
}
|
||||
// value that will be popped first -> must be smaller than all others
|
||||
// for int types this is not strictly the smallest val (-max - 1)
|
||||
inline static T neutral () {
|
||||
return -std::numeric_limits<T>::max();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T_, typename TI_>
|
||||
struct CMax {
|
||||
typedef T_ T;
|
||||
typedef TI_ TI;
|
||||
typedef CMin<T_, TI_> Crev;
|
||||
inline static bool cmp (T a, T b) {
|
||||
return a > b;
|
||||
}
|
||||
inline static T neutral () {
|
||||
return std::numeric_limits<T>::max();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Basic heap ops: push and pop
|
||||
*******************************************************************/
|
||||
|
||||
/** Pops the top element from the heap defined by bh_val[0..k-1] and
|
||||
* bh_ids[0..k-1]. on output the element at k-1 is undefined.
|
||||
*/
|
||||
template <class C> inline
|
||||
void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
|
||||
{
|
||||
bh_val--; /* Use 1-based indexing for easier node->child translation */
|
||||
bh_ids--;
|
||||
typename C::T val = bh_val[k];
|
||||
size_t i = 1, i1, i2;
|
||||
while (1) {
|
||||
i1 = i << 1;
|
||||
i2 = i1 + 1;
|
||||
if (i1 > k)
|
||||
break;
|
||||
if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
|
||||
if (C::cmp(val, bh_val[i1]))
|
||||
break;
|
||||
bh_val[i] = bh_val[i1];
|
||||
bh_ids[i] = bh_ids[i1];
|
||||
i = i1;
|
||||
}
|
||||
else {
|
||||
if (C::cmp(val, bh_val[i2]))
|
||||
break;
|
||||
bh_val[i] = bh_val[i2];
|
||||
bh_ids[i] = bh_ids[i2];
|
||||
i = i2;
|
||||
}
|
||||
}
|
||||
bh_val[i] = bh_val[k];
|
||||
bh_ids[i] = bh_ids[k];
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
|
||||
* bh_ids[0..k-2]. on output the element at k-1 is defined.
|
||||
*/
|
||||
template <class C> inline
|
||||
void heap_push (size_t k,
|
||||
typename C::T * bh_val, typename C::TI * bh_ids,
|
||||
typename C::T val, typename C::TI ids)
|
||||
{
|
||||
bh_val--; /* Use 1-based indexing for easier node->child translation */
|
||||
bh_ids--;
|
||||
size_t i = k, i_father;
|
||||
while (i > 1) {
|
||||
i_father = i >> 1;
|
||||
if (!C::cmp (val, bh_val[i_father])) /* the heap structure is ok */
|
||||
break;
|
||||
bh_val[i] = bh_val[i_father];
|
||||
bh_ids[i] = bh_ids[i_father];
|
||||
i = i_father;
|
||||
}
|
||||
bh_val[i] = val;
|
||||
bh_ids[i] = ids;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Partial instanciation for heaps with TI = int64_t */
|
||||
|
||||
template <typename T> inline
|
||||
void minheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
|
||||
{
|
||||
heap_pop<CMin<T, int64_t> > (k, bh_val, bh_ids);
|
||||
}
|
||||
|
||||
|
||||
template <typename T> inline
|
||||
void minheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
|
||||
{
|
||||
heap_push<CMin<T, int64_t> > (k, bh_val, bh_ids, val, ids);
|
||||
}
|
||||
|
||||
|
||||
template <typename T> inline
|
||||
void maxheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
|
||||
{
|
||||
heap_pop<CMax<T, int64_t> > (k, bh_val, bh_ids);
|
||||
}
|
||||
|
||||
|
||||
template <typename T> inline
|
||||
void maxheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
|
||||
{
|
||||
heap_push<CMax<T, int64_t> > (k, bh_val, bh_ids, val, ids);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Heap initialization
|
||||
*******************************************************************/
|
||||
|
||||
/* Initialization phase for the heap (with unconditionnal pushes).
|
||||
* Store k0 elements in a heap containing up to k values. Note that
|
||||
* (bh_val, bh_ids) can be the same as (x, ids) */
|
||||
template <class C> inline
|
||||
void heap_heapify (
|
||||
size_t k,
|
||||
typename C::T * bh_val,
|
||||
typename C::TI * bh_ids,
|
||||
const typename C::T * x = nullptr,
|
||||
const typename C::TI * ids = nullptr,
|
||||
size_t k0 = 0)
|
||||
{
|
||||
if (k0 > 0) assert (x);
|
||||
|
||||
if (ids) {
|
||||
for (size_t i = 0; i < k0; i++)
|
||||
heap_push<C> (i+1, bh_val, bh_ids, x[i], ids[i]);
|
||||
} else {
|
||||
for (size_t i = 0; i < k0; i++)
|
||||
heap_push<C> (i+1, bh_val, bh_ids, x[i], i);
|
||||
}
|
||||
|
||||
for (size_t i = k0; i < k; i++) {
|
||||
bh_val[i] = C::neutral();
|
||||
bh_ids[i] = -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename T> inline
|
||||
void minheap_heapify (
|
||||
size_t k, T * bh_val,
|
||||
int64_t * bh_ids,
|
||||
const T * x = nullptr,
|
||||
const int64_t * ids = nullptr,
|
||||
size_t k0 = 0)
|
||||
{
|
||||
heap_heapify< CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
|
||||
}
|
||||
|
||||
|
||||
template <typename T> inline
|
||||
void maxheap_heapify (
|
||||
size_t k,
|
||||
T * bh_val,
|
||||
int64_t * bh_ids,
|
||||
const T * x = nullptr,
|
||||
const int64_t * ids = nullptr,
|
||||
size_t k0 = 0)
|
||||
{
|
||||
heap_heapify< CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Add n elements to the heap
|
||||
*******************************************************************/
|
||||
|
||||
|
||||
/* Add some elements to the heap */
|
||||
template <class C> inline
|
||||
void heap_addn (size_t k,
|
||||
typename C::T * bh_val, typename C::TI * bh_ids,
|
||||
const typename C::T * x,
|
||||
const typename C::TI * ids,
|
||||
size_t n)
|
||||
{
|
||||
size_t i;
|
||||
if (ids)
|
||||
for (i = 0; i < n; i++) {
|
||||
if (C::cmp (bh_val[0], x[i])) {
|
||||
heap_pop<C> (k, bh_val, bh_ids);
|
||||
heap_push<C> (k, bh_val, bh_ids, x[i], ids[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
for (i = 0; i < n; i++) {
|
||||
if (C::cmp (bh_val[0], x[i])) {
|
||||
heap_pop<C> (k, bh_val, bh_ids);
|
||||
heap_push<C> (k, bh_val, bh_ids, x[i], i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Partial instanciation for heaps with TI = int64_t */
|
||||
|
||||
template <typename T> inline
|
||||
void minheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
|
||||
const T * x, const int64_t * ids, size_t n)
|
||||
{
|
||||
heap_addn<CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
|
||||
}
|
||||
|
||||
template <typename T> inline
|
||||
void maxheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
|
||||
const T * x, const int64_t * ids, size_t n)
|
||||
{
|
||||
heap_addn<CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Heap finalization (reorder elements)
|
||||
*******************************************************************/
|
||||
|
||||
|
||||
/* This function maps a binary heap into an sorted structure.
|
||||
It returns the number */
|
||||
template <typename C> inline
|
||||
size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
|
||||
{
|
||||
size_t i, ii;
|
||||
|
||||
for (i = 0, ii = 0; i < k; i++) {
|
||||
/* top element should be put at the end of the list */
|
||||
typename C::T val = bh_val[0];
|
||||
typename C::TI id = bh_ids[0];
|
||||
|
||||
/* boundary case: we will over-ride this value if not a true element */
|
||||
heap_pop<C> (k-i, bh_val, bh_ids);
|
||||
bh_val[k-ii-1] = val;
|
||||
bh_ids[k-ii-1] = id;
|
||||
if (id != -1) ii++;
|
||||
}
|
||||
/* Count the number of elements which are effectively returned */
|
||||
size_t nel = ii;
|
||||
|
||||
memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val));
|
||||
memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids));
|
||||
|
||||
for (; ii < k; ii++) {
|
||||
bh_val[ii] = C::neutral();
|
||||
bh_ids[ii] = -1;
|
||||
}
|
||||
return nel;
|
||||
}
|
||||
|
||||
template <typename T> inline
|
||||
size_t minheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
|
||||
{
|
||||
return heap_reorder< CMin<T, int64_t> > (k, bh_val, bh_ids);
|
||||
}
|
||||
|
||||
template <typename T> inline
|
||||
size_t maxheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
|
||||
{
|
||||
return heap_reorder< CMax<T, int64_t> > (k, bh_val, bh_ids);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Operations on heap arrays
|
||||
*******************************************************************/
|
||||
|
||||
/** a template structure for a set of [min|max]-heaps it is tailored
|
||||
* so that the actual data of the heaps can just live in compact
|
||||
* arrays.
|
||||
*/
|
||||
template <typename C>
|
||||
struct HeapArray {
|
||||
typedef typename C::TI TI;
|
||||
typedef typename C::T T;
|
||||
|
||||
size_t nh; ///< number of heaps
|
||||
size_t k; ///< allocated size per heap
|
||||
TI * ids; ///< identifiers (size nh * k)
|
||||
T * val; ///< values (distances or similarities), size nh * k
|
||||
|
||||
/// Return the list of values for a heap
|
||||
T * get_val (size_t key) { return val + key * k; }
|
||||
|
||||
/// Correspponding identifiers
|
||||
TI * get_ids (size_t key) { return ids + key * k; }
|
||||
|
||||
/// prepare all the heaps before adding
|
||||
void heapify ();
|
||||
|
||||
/** add nj elements to heaps i0:i0+ni, with sequential ids
|
||||
*
|
||||
* @param nj nb of elements to add to each heap
|
||||
* @param vin elements to add, size ni * nj
|
||||
* @param j0 add this to the ids that are added
|
||||
* @param i0 first heap to update
|
||||
* @param ni nb of elements to update (-1 = use nh)
|
||||
*/
|
||||
void addn (size_t nj, const T *vin, TI j0 = 0,
|
||||
size_t i0 = 0, int64_t ni = -1);
|
||||
|
||||
/** same as addn
|
||||
*
|
||||
* @param id_in ids of the elements to add, size ni * nj
|
||||
* @param id_stride stride for id_in
|
||||
*/
|
||||
void addn_with_ids (
|
||||
size_t nj, const T *vin, const TI *id_in = nullptr,
|
||||
int64_t id_stride = 0, size_t i0 = 0, int64_t ni = -1);
|
||||
|
||||
/// reorder all the heaps
|
||||
void reorder ();
|
||||
|
||||
/** this is not really a heap function. It just finds the per-line
|
||||
* extrema of each line of array D
|
||||
* @param vals_out extreme value of each line (size nh, or NULL)
|
||||
* @param idx_out index of extreme value (size nh or NULL)
|
||||
*/
|
||||
void per_line_extrema (T *vals_out, TI *idx_out) const;
|
||||
|
||||
};
|
||||
|
||||
|
||||
/* Define useful heaps */
|
||||
typedef HeapArray<CMin<float, int64_t> > float_minheap_array_t;
|
||||
typedef HeapArray<CMin<int, int64_t> > int_minheap_array_t;
|
||||
|
||||
typedef HeapArray<CMax<float, int64_t> > float_maxheap_array_t;
|
||||
typedef HeapArray<CMax<int, int64_t> > int_maxheap_array_t;
|
||||
|
||||
// The heap templates are instanciated explicitly in Heap.cpp
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*********************************************************************
|
||||
* Indirect heaps: instead of having
|
||||
*
|
||||
* node i = (bh_ids[i], bh_val[i]),
|
||||
*
|
||||
* in indirect heaps,
|
||||
*
|
||||
* node i = (bh_ids[i], bh_val[bh_ids[i]]),
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
|
||||
template <class C>
|
||||
inline
|
||||
void indirect_heap_pop (
|
||||
size_t k,
|
||||
const typename C::T * bh_val,
|
||||
typename C::TI * bh_ids)
|
||||
{
|
||||
bh_ids--; /* Use 1-based indexing for easier node->child translation */
|
||||
typename C::T val = bh_val[bh_ids[k]];
|
||||
size_t i = 1;
|
||||
while (1) {
|
||||
size_t i1 = i << 1;
|
||||
size_t i2 = i1 + 1;
|
||||
if (i1 > k)
|
||||
break;
|
||||
typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2];
|
||||
if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) {
|
||||
if (C::cmp(val, bh_val[id1]))
|
||||
break;
|
||||
bh_ids[i] = id1;
|
||||
i = i1;
|
||||
} else {
|
||||
if (C::cmp(val, bh_val[id2]))
|
||||
break;
|
||||
bh_ids[i] = id2;
|
||||
i = i2;
|
||||
}
|
||||
}
|
||||
bh_ids[i] = bh_ids[k];
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <class C>
|
||||
inline
|
||||
void indirect_heap_push (size_t k,
|
||||
const typename C::T * bh_val, typename C::TI * bh_ids,
|
||||
typename C::TI id)
|
||||
{
|
||||
bh_ids--; /* Use 1-based indexing for easier node->child translation */
|
||||
typename C::T val = bh_val[id];
|
||||
size_t i = k;
|
||||
while (i > 1) {
|
||||
size_t i_father = i >> 1;
|
||||
if (!C::cmp (val, bh_val[bh_ids[i_father]]))
|
||||
break;
|
||||
bh_ids[i] = bh_ids[i_father];
|
||||
i = i_father;
|
||||
}
|
||||
bh_ids[i] = id;
|
||||
}
|
||||
|
||||
|
||||
} // namespace faiss
|
||||
|
||||
#endif /* FAISS_Heap_h */
|
472
src/3rd_party/faiss/utils/hamming-inl.h
vendored
Normal file
472
src/3rd_party/faiss/utils/hamming-inl.h
vendored
Normal file
@ -0,0 +1,472 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
inline BitstringWriter::BitstringWriter(uint8_t *code, int code_size):
|
||||
code (code), code_size (code_size), i(0)
|
||||
{
|
||||
bzero (code, code_size);
|
||||
}
|
||||
|
||||
inline void BitstringWriter::write(uint64_t x, int nbit) {
|
||||
assert (code_size * 8 >= nbit + i);
|
||||
// nb of available bits in i / 8
|
||||
int na = 8 - (i & 7);
|
||||
|
||||
if (nbit <= na) {
|
||||
code[i >> 3] |= x << (i & 7);
|
||||
i += nbit;
|
||||
return;
|
||||
} else {
|
||||
int j = i >> 3;
|
||||
code[j++] |= x << (i & 7);
|
||||
i += nbit;
|
||||
x >>= na;
|
||||
while (x != 0) {
|
||||
code[j++] |= x;
|
||||
x >>= 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline BitstringReader::BitstringReader(const uint8_t *code, int code_size):
|
||||
code (code), code_size (code_size), i(0)
|
||||
{}
|
||||
|
||||
inline uint64_t BitstringReader::read(int nbit) {
|
||||
assert (code_size * 8 >= nbit + i);
|
||||
// nb of available bits in i / 8
|
||||
int na = 8 - (i & 7);
|
||||
// get available bits in current byte
|
||||
uint64_t res = code[i >> 3] >> (i & 7);
|
||||
if (nbit <= na) {
|
||||
res &= (1 << nbit) - 1;
|
||||
i += nbit;
|
||||
return res;
|
||||
} else {
|
||||
int ofs = na;
|
||||
int j = (i >> 3) + 1;
|
||||
i += nbit;
|
||||
nbit -= na;
|
||||
while (nbit > 8) {
|
||||
res |= ((uint64_t)code[j++]) << ofs;
|
||||
ofs += 8;
|
||||
nbit -= 8; // TODO remove nbit
|
||||
}
|
||||
uint64_t last_byte = code[j];
|
||||
last_byte &= (1 << nbit) - 1;
|
||||
res |= last_byte << ofs;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************
|
||||
* The HammingComputer series of classes compares a single code of
|
||||
* size 4 to 32 to incoming codes. They are intended for use as a
|
||||
* template class where it would be inefficient to switch on the code
|
||||
* size in the inner loop. Hopefully the compiler will inline the
|
||||
* hamming() functions and put the a0, a1, ... in registers.
|
||||
******************************************************************/
|
||||
|
||||
|
||||
struct HammingComputer4 {
|
||||
uint32_t a0;
|
||||
|
||||
HammingComputer4 () {}
|
||||
|
||||
HammingComputer4 (const uint8_t *a, int code_size) {
|
||||
set (a, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a, int code_size) {
|
||||
assert (code_size == 4);
|
||||
a0 = *(uint32_t *)a;
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b) const {
|
||||
return popcount64 (*(uint32_t *)b ^ a0);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct HammingComputer8 {
|
||||
uint64_t a0;
|
||||
|
||||
HammingComputer8 () {}
|
||||
|
||||
HammingComputer8 (const uint8_t *a, int code_size) {
|
||||
set (a, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a, int code_size) {
|
||||
assert (code_size == 8);
|
||||
a0 = *(uint64_t *)a;
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b) const {
|
||||
return popcount64 (*(uint64_t *)b ^ a0);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct HammingComputer16 {
|
||||
uint64_t a0, a1;
|
||||
|
||||
HammingComputer16 () {}
|
||||
|
||||
HammingComputer16 (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 16);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
|
||||
// This incurs a penalty of ~10% wrt. fully aligned accesses.
|
||||
struct HammingComputer20 {
|
||||
uint64_t a0, a1;
|
||||
uint32_t a2;
|
||||
|
||||
HammingComputer20 () {}
|
||||
|
||||
HammingComputer20 (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 20);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1]; a2 = a[2];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
|
||||
popcount64 (*(uint32_t*)(b + 2) ^ a2);
|
||||
}
|
||||
};
|
||||
|
||||
struct HammingComputer32 {
|
||||
uint64_t a0, a1, a2, a3;
|
||||
|
||||
HammingComputer32 () {}
|
||||
|
||||
HammingComputer32 (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 32);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
|
||||
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct HammingComputer64 {
|
||||
uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
|
||||
HammingComputer64 () {}
|
||||
|
||||
HammingComputer64 (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 64);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
|
||||
a4 = a[4]; a5 = a[5]; a6 = a[6]; a7 = a[7];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
|
||||
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3) +
|
||||
popcount64 (b[4] ^ a4) + popcount64 (b[5] ^ a5) +
|
||||
popcount64 (b[6] ^ a6) + popcount64 (b[7] ^ a7);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// very inefficient...
|
||||
struct HammingComputerDefault {
|
||||
const uint8_t *a;
|
||||
int n;
|
||||
|
||||
HammingComputerDefault () {}
|
||||
|
||||
HammingComputerDefault (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
a = a8;
|
||||
n = code_size;
|
||||
}
|
||||
|
||||
int hamming (const uint8_t *b8) const {
|
||||
int accu = 0;
|
||||
for (int i = 0; i < n; i++)
|
||||
accu += popcount64 (a[i] ^ b8[i]);
|
||||
return accu;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct HammingComputerM8 {
|
||||
const uint64_t *a;
|
||||
int n;
|
||||
|
||||
HammingComputerM8 () {}
|
||||
|
||||
HammingComputerM8 (const uint8_t *a8, int code_size) {
|
||||
set (a8, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a8, int code_size) {
|
||||
assert (code_size % 8 == 0);
|
||||
a = (uint64_t *)a8;
|
||||
n = code_size / 8;
|
||||
}
|
||||
|
||||
int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
int accu = 0;
|
||||
for (int i = 0; i < n; i++)
|
||||
accu += popcount64 (a[i] ^ b[i]);
|
||||
return accu;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// even more inefficient!
|
||||
struct HammingComputerM4 {
|
||||
const uint32_t *a;
|
||||
int n;
|
||||
|
||||
HammingComputerM4 () {}
|
||||
|
||||
HammingComputerM4 (const uint8_t *a4, int code_size) {
|
||||
set (a4, code_size);
|
||||
}
|
||||
|
||||
void set (const uint8_t *a4, int code_size) {
|
||||
assert (code_size % 4 == 0);
|
||||
a = (uint32_t *)a4;
|
||||
n = code_size / 4;
|
||||
}
|
||||
|
||||
int hamming (const uint8_t *b8) const {
|
||||
const uint32_t *b = (uint32_t *)b8;
|
||||
int accu = 0;
|
||||
for (int i = 0; i < n; i++)
|
||||
accu += popcount64 (a[i] ^ b[i]);
|
||||
return accu;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/***************************************************************************
|
||||
* Equivalence with a template class when code size is known at compile time
|
||||
**************************************************************************/
|
||||
|
||||
// default template
|
||||
template<int CODE_SIZE>
|
||||
struct HammingComputer: HammingComputerM8 {
|
||||
HammingComputer (const uint8_t *a, int code_size):
|
||||
HammingComputerM8(a, code_size) {}
|
||||
};
|
||||
|
||||
#define SPECIALIZED_HC(CODE_SIZE) \
|
||||
template<> struct HammingComputer<CODE_SIZE>: \
|
||||
HammingComputer ## CODE_SIZE { \
|
||||
HammingComputer (const uint8_t *a): \
|
||||
HammingComputer ## CODE_SIZE(a, CODE_SIZE) {} \
|
||||
}
|
||||
|
||||
SPECIALIZED_HC(4);
|
||||
SPECIALIZED_HC(8);
|
||||
SPECIALIZED_HC(16);
|
||||
SPECIALIZED_HC(20);
|
||||
SPECIALIZED_HC(32);
|
||||
SPECIALIZED_HC(64);
|
||||
|
||||
#undef SPECIALIZED_HC
|
||||
|
||||
|
||||
/***************************************************************************
|
||||
* generalized Hamming = number of bytes that are different between
|
||||
* two codes.
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
inline int generalized_hamming_64 (uint64_t a) {
|
||||
a |= a >> 1;
|
||||
a |= a >> 2;
|
||||
a |= a >> 4;
|
||||
a &= 0x0101010101010101UL;
|
||||
return popcount64 (a);
|
||||
}
|
||||
|
||||
|
||||
struct GenHammingComputer8 {
|
||||
uint64_t a0;
|
||||
|
||||
GenHammingComputer8 (const uint8_t *a, int code_size) {
|
||||
assert (code_size == 8);
|
||||
a0 = *(uint64_t *)a;
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b) const {
|
||||
return generalized_hamming_64 (*(uint64_t *)b ^ a0);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct GenHammingComputer16 {
|
||||
uint64_t a0, a1;
|
||||
GenHammingComputer16 (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 16);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return generalized_hamming_64 (b[0] ^ a0) +
|
||||
generalized_hamming_64 (b[1] ^ a1);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct GenHammingComputer32 {
|
||||
uint64_t a0, a1, a2, a3;
|
||||
|
||||
GenHammingComputer32 (const uint8_t *a8, int code_size) {
|
||||
assert (code_size == 32);
|
||||
const uint64_t *a = (uint64_t *)a8;
|
||||
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
|
||||
}
|
||||
|
||||
inline int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
return generalized_hamming_64 (b[0] ^ a0) +
|
||||
generalized_hamming_64 (b[1] ^ a1) +
|
||||
generalized_hamming_64 (b[2] ^ a2) +
|
||||
generalized_hamming_64 (b[3] ^ a3);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct GenHammingComputerM8 {
|
||||
const uint64_t *a;
|
||||
int n;
|
||||
|
||||
GenHammingComputerM8 (const uint8_t *a8, int code_size) {
|
||||
assert (code_size % 8 == 0);
|
||||
a = (uint64_t *)a8;
|
||||
n = code_size / 8;
|
||||
}
|
||||
|
||||
int hamming (const uint8_t *b8) const {
|
||||
const uint64_t *b = (uint64_t *)b8;
|
||||
int accu = 0;
|
||||
for (int i = 0; i < n; i++)
|
||||
accu += generalized_hamming_64 (a[i] ^ b[i]);
|
||||
return accu;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
/** generalized Hamming distances (= count number of code bytes that
|
||||
are the same) */
|
||||
void generalized_hammings_knn_hc (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t nb,
|
||||
size_t code_size,
|
||||
int ordered = true);
|
||||
|
||||
|
||||
|
||||
/** This class maintains a list of best distances seen so far.
|
||||
*
|
||||
* Since the distances are in a limited range (0 to nbit), the
|
||||
* object maintains one list per possible distance, and fills
|
||||
* in only the n-first lists, such that the sum of sizes of the
|
||||
* n lists is below k.
|
||||
*/
|
||||
template<class HammingComputer>
|
||||
struct HCounterState {
|
||||
int *counters;
|
||||
int64_t *ids_per_dis;
|
||||
|
||||
HammingComputer hc;
|
||||
int thres;
|
||||
int count_lt;
|
||||
int count_eq;
|
||||
int k;
|
||||
|
||||
HCounterState(int *counters, int64_t *ids_per_dis,
|
||||
const uint8_t *x, int d, int k)
|
||||
: counters(counters),
|
||||
ids_per_dis(ids_per_dis),
|
||||
hc(x, d / 8),
|
||||
thres(d + 1),
|
||||
count_lt(0),
|
||||
count_eq(0),
|
||||
k(k) {}
|
||||
|
||||
void update_counter(const uint8_t *y, size_t j) {
|
||||
int32_t dis = hc.hamming(y);
|
||||
|
||||
if (dis <= thres) {
|
||||
if (dis < thres) {
|
||||
ids_per_dis[dis * k + counters[dis]++] = j;
|
||||
++count_lt;
|
||||
while (count_lt == k && thres > 0) {
|
||||
--thres;
|
||||
count_eq = counters[thres];
|
||||
count_lt -= count_eq;
|
||||
}
|
||||
} else if (count_eq < k) {
|
||||
ids_per_dis[dis * k + count_eq++] = j;
|
||||
counters[dis] = count_eq;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // namespace faiss
|
879
src/3rd_party/faiss/utils/hamming.cpp
vendored
Normal file
879
src/3rd_party/faiss/utils/hamming.cpp
vendored
Normal file
@ -0,0 +1,879 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/*
|
||||
* Implementation of Hamming related functions (distances, smallest distance
|
||||
* selection with regular heap|radix and probabilistic heap|radix.
|
||||
*
|
||||
* IMPLEMENTATION NOTES
|
||||
* Bitvectors are generally assumed to be multiples of 64 bits.
|
||||
*
|
||||
* hamdis_t is used for distances because at this time
|
||||
* it is not clear how we will need to balance
|
||||
* - flexibility in vector size (unclear more than 2^16 or even 2^8 bitvectors)
|
||||
* - memory usage
|
||||
* - cache-misses when dealing with large volumes of data (lower bits is better)
|
||||
*
|
||||
* The hamdis_t should optimally be compatibe with one of the Torch Storage
|
||||
* (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes
|
||||
*/
|
||||
|
||||
#include <faiss/utils/hamming.h>
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <faiss/utils/Heap.h>
|
||||
#include "common/logging.h"
|
||||
#include "misc.h"
|
||||
|
||||
static const size_t BLOCKSIZE_QUERY = 8192;
|
||||
|
||||
|
||||
namespace faiss {
|
||||
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
|
||||
for (size_t i = 0; i < d; ++i) {
|
||||
x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
size_t hamming_batch_size = 65536;
|
||||
|
||||
static const uint8_t hamdis_tab_ham_bytes[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
|
||||
/* Elementary Hamming distance computation: unoptimized */
|
||||
template <size_t nbits, typename T>
|
||||
T hamming (const uint8_t *bs1,
|
||||
const uint8_t *bs2)
|
||||
{
|
||||
const size_t nbytes = nbits / 8;
|
||||
size_t i;
|
||||
T h = 0;
|
||||
for (i = 0; i < nbytes; i++)
|
||||
h += (T) hamdis_tab_ham_bytes[bs1[i]^bs2[i]];
|
||||
return h;
|
||||
}
|
||||
|
||||
|
||||
/* Hamming distances for multiples of 64 bits */
|
||||
template <size_t nbits>
|
||||
hamdis_t hamming (const uint64_t * bs1, const uint64_t * bs2)
|
||||
{
|
||||
const size_t nwords = nbits / 64;
|
||||
size_t i;
|
||||
hamdis_t h = 0;
|
||||
for (i = 0; i < nwords; i++)
|
||||
h += popcount64 (bs1[i] ^ bs2[i]);
|
||||
return h;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* specialized (optimized) functions */
|
||||
template <>
|
||||
hamdis_t hamming<64> (const uint64_t * pa, const uint64_t * pb)
|
||||
{
|
||||
return popcount64 (pa[0] ^ pb[0]);
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
hamdis_t hamming<128> (const uint64_t *pa, const uint64_t *pb)
|
||||
{
|
||||
return popcount64 (pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
hamdis_t hamming<256> (const uint64_t * pa, const uint64_t * pb)
|
||||
{
|
||||
return popcount64 (pa[0] ^ pb[0])
|
||||
+ popcount64 (pa[1] ^ pb[1])
|
||||
+ popcount64 (pa[2] ^ pb[2])
|
||||
+ popcount64 (pa[3] ^ pb[3]);
|
||||
}
|
||||
|
||||
|
||||
/* Hamming distances for multiple of 64 bits */
|
||||
hamdis_t hamming (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t nwords)
|
||||
{
|
||||
size_t i;
|
||||
hamdis_t h = 0;
|
||||
for (i = 0; i < nwords; i++)
|
||||
h += popcount64 (bs1[i] ^ bs2[i]);
|
||||
return h;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <size_t nbits>
|
||||
void hammings (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t n1, size_t n2,
|
||||
hamdis_t * dis)
|
||||
|
||||
{
|
||||
size_t i, j;
|
||||
const size_t nwords = nbits / 64;
|
||||
for (i = 0; i < n1; i++) {
|
||||
const uint64_t * __restrict bs1_ = bs1 + i * nwords;
|
||||
hamdis_t * __restrict dis_ = dis + i * n2;
|
||||
for (j = 0; j < n2; j++)
|
||||
dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void hammings (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
size_t nwords,
|
||||
hamdis_t * __restrict dis)
|
||||
{
|
||||
size_t i, j;
|
||||
n1 *= nwords;
|
||||
n2 *= nwords;
|
||||
for (i = 0; i < n1; i+=nwords) {
|
||||
const uint64_t * bs1_ = bs1+i;
|
||||
for (j = 0; j < n2; j+=nwords)
|
||||
dis[j] = hamming (bs1_, bs2+j, nwords);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Count number of matches given a max threshold */
|
||||
template <size_t nbits>
|
||||
void hamming_count_thres (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
hamdis_t ht,
|
||||
size_t * nptr)
|
||||
{
|
||||
const size_t nwords = nbits / 64;
|
||||
size_t i, j, posm = 0;
|
||||
const uint64_t * bs2_ = bs2;
|
||||
|
||||
for (i = 0; i < n1; i++) {
|
||||
bs2 = bs2_;
|
||||
for (j = 0; j < n2; j++) {
|
||||
/* collect the match only if this satisfies the threshold */
|
||||
if (hamming <nbits> (bs1, bs2) <= ht)
|
||||
posm++;
|
||||
bs2 += nwords;
|
||||
}
|
||||
bs1 += nwords; /* next signature */
|
||||
}
|
||||
*nptr = posm;
|
||||
}
|
||||
|
||||
|
||||
template <size_t nbits>
|
||||
void crosshamming_count_thres (
|
||||
const uint64_t * dbs,
|
||||
size_t n,
|
||||
int ht,
|
||||
size_t * nptr)
|
||||
{
|
||||
const size_t nwords = nbits / 64;
|
||||
size_t i, j, posm = 0;
|
||||
const uint64_t * bs1 = dbs;
|
||||
for (i = 0; i < n; i++) {
|
||||
const uint64_t * bs2 = bs1 + 2;
|
||||
for (j = i + 1; j < n; j++) {
|
||||
/* collect the match only if this satisfies the threshold */
|
||||
if (hamming <nbits> (bs1, bs2) <= ht)
|
||||
posm++;
|
||||
bs2 += nwords;
|
||||
}
|
||||
bs1 += nwords;
|
||||
}
|
||||
*nptr = posm;
|
||||
}
|
||||
|
||||
|
||||
template <size_t nbits>
|
||||
size_t match_hamming_thres (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
int ht,
|
||||
int64_t * idx,
|
||||
hamdis_t * hams)
|
||||
{
|
||||
const size_t nwords = nbits / 64;
|
||||
size_t i, j, posm = 0;
|
||||
hamdis_t h;
|
||||
const uint64_t * bs2_ = bs2;
|
||||
for (i = 0; i < n1; i++) {
|
||||
bs2 = bs2_;
|
||||
for (j = 0; j < n2; j++) {
|
||||
/* Here perform the real work of computing the distance */
|
||||
h = hamming <nbits> (bs1, bs2);
|
||||
|
||||
/* collect the match only if this satisfies the threshold */
|
||||
if (h <= ht) {
|
||||
/* Enough space to store another match ? */
|
||||
*idx = i; idx++;
|
||||
*idx = j; idx++;
|
||||
*hams = h;
|
||||
hams++;
|
||||
posm++;
|
||||
}
|
||||
bs2+=nwords; /* next signature */
|
||||
}
|
||||
bs1+=nwords;
|
||||
}
|
||||
return posm;
|
||||
}
|
||||
|
||||
|
||||
/* Return closest neighbors w.r.t Hamming distance, using a heap. */
|
||||
template <class HammingComputer>
|
||||
static
|
||||
void hammings_knn_hc (
|
||||
int bytes_per_code,
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * bs1,
|
||||
const uint8_t * bs2,
|
||||
size_t n2,
|
||||
bool order = true,
|
||||
bool init_heap = true)
|
||||
{
|
||||
size_t k = ha->k;
|
||||
if (init_heap) ha->heapify ();
|
||||
|
||||
const size_t block_size = hamming_batch_size;
|
||||
for (size_t j0 = 0; j0 < n2; j0 += block_size) {
|
||||
const size_t j1 = std::min(j0 + block_size, n2);
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = 0; i < ha->nh; i++) {
|
||||
HammingComputer hc (bs1 + i * bytes_per_code, bytes_per_code);
|
||||
|
||||
const uint8_t * bs2_ = bs2 + j0 * bytes_per_code;
|
||||
hamdis_t dis;
|
||||
hamdis_t * __restrict bh_val_ = ha->val + i * k;
|
||||
int64_t * __restrict bh_ids_ = ha->ids + i * k;
|
||||
size_t j;
|
||||
for (j = j0; j < j1; j++, bs2_+= bytes_per_code) {
|
||||
dis = hc.hamming (bs2_);
|
||||
if (dis < bh_val_[0]) {
|
||||
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
|
||||
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (order) ha->reorder ();
|
||||
}
|
||||
|
||||
/* Return closest neighbors w.r.t Hamming distance, using max count. */
|
||||
template <class HammingComputer>
|
||||
static
|
||||
void hammings_knn_mc (
|
||||
int bytes_per_code,
|
||||
const uint8_t *a,
|
||||
const uint8_t *b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
size_t k,
|
||||
int32_t *distances,
|
||||
int64_t *labels)
|
||||
{
|
||||
const int nBuckets = bytes_per_code * 8 + 1;
|
||||
std::vector<int> all_counters(na * nBuckets, 0);
|
||||
std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
|
||||
|
||||
std::vector<HCounterState<HammingComputer>> cs;
|
||||
for (size_t i = 0; i < na; ++i) {
|
||||
cs.push_back(HCounterState<HammingComputer>(
|
||||
all_counters.data() + i * nBuckets,
|
||||
all_ids_per_dis.get() + i * nBuckets * k,
|
||||
a + i * bytes_per_code,
|
||||
8 * bytes_per_code,
|
||||
k
|
||||
));
|
||||
}
|
||||
|
||||
const size_t block_size = hamming_batch_size;
|
||||
for (size_t j0 = 0; j0 < nb; j0 += block_size) {
|
||||
const size_t j1 = std::min(j0 + block_size, nb);
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = 0; i < na; ++i) {
|
||||
for (size_t j = j0; j < j1; ++j) {
|
||||
cs[i].update_counter(b + j * bytes_per_code, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < na; ++i) {
|
||||
HCounterState<HammingComputer>& csi = cs[i];
|
||||
|
||||
int nres = 0;
|
||||
for (int b = 0; b < nBuckets && nres < k; b++) {
|
||||
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
|
||||
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
|
||||
distances[i * k + nres] = b;
|
||||
nres++;
|
||||
}
|
||||
}
|
||||
while (nres < k) {
|
||||
labels[i * k + nres] = -1;
|
||||
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
|
||||
++nres;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// works faster than the template version
|
||||
static
|
||||
void hammings_knn_hc_1 (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t n2,
|
||||
bool order = true,
|
||||
bool init_heap = true)
|
||||
{
|
||||
const size_t nwords = 1;
|
||||
size_t k = ha->k;
|
||||
|
||||
|
||||
if (init_heap) {
|
||||
ha->heapify ();
|
||||
}
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t i = 0; i < ha->nh; i++) {
|
||||
const uint64_t bs1_ = bs1 [i];
|
||||
const uint64_t * bs2_ = bs2;
|
||||
hamdis_t dis;
|
||||
hamdis_t * bh_val_ = ha->val + i * k;
|
||||
hamdis_t bh_val_0 = bh_val_[0];
|
||||
int64_t * bh_ids_ = ha->ids + i * k;
|
||||
size_t j;
|
||||
for (j = 0; j < n2; j++, bs2_+= nwords) {
|
||||
dis = popcount64 (bs1_ ^ *bs2_);
|
||||
if (dis < bh_val_0) {
|
||||
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
|
||||
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
|
||||
bh_val_0 = bh_val_[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (order) {
|
||||
ha->reorder ();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Functions to maps vectors to bits. Assume proper allocation done beforehand,
|
||||
meaning that b should be be able to receive as many bits as x may produce. */
|
||||
|
||||
/*
|
||||
* dimension 0 corresponds to the least significant bit of b[0], or
|
||||
* equivalently to the lsb of the first byte that is stored.
|
||||
*/
|
||||
void fvec2bitvec (const float * x, uint8_t * b, size_t d)
|
||||
{
|
||||
for (int i = 0; i < d; i += 8) {
|
||||
uint8_t w = 0;
|
||||
uint8_t mask = 1;
|
||||
int nj = i + 8 <= d ? 8 : d - i;
|
||||
for (int j = 0; j < nj; j++) {
|
||||
if (x[i + j] >= 0)
|
||||
w |= mask;
|
||||
mask <<= 1;
|
||||
}
|
||||
*b = w;
|
||||
b++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Same but for n vectors.
|
||||
Ensure that the ouptut b is byte-aligned (pad with 0s). */
|
||||
void fvecs2bitvecs (const float * x, uint8_t * b, size_t d, size_t n)
|
||||
{
|
||||
const int64_t ncodes = ((d + 7) / 8);
|
||||
//#pragma omp parallel for if(n > 100000)
|
||||
for (size_t i = 0; i < n; i++)
|
||||
fvec2bitvec (x + i * d, b + i * ncodes, d);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bitvecs2fvecs (
|
||||
const uint8_t * b,
|
||||
float * x,
|
||||
size_t d,
|
||||
size_t n) {
|
||||
|
||||
const int64_t ncodes = ((d + 7) / 8);
|
||||
//#pragma omp parallel for if(n > 100000)
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
binary_to_real (d, b + i * ncodes, x + i * d);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Reverse bit (NOT a optimized function, only used for print purpose) */
|
||||
static uint64_t uint64_reverse_bits (uint64_t b)
|
||||
{
|
||||
int i;
|
||||
uint64_t revb = 0;
|
||||
for (i = 0; i < 64; i++) {
|
||||
revb <<= 1;
|
||||
revb |= b & 1;
|
||||
b >>= 1;
|
||||
}
|
||||
return revb;
|
||||
}
|
||||
|
||||
|
||||
/* print the bit vector */
|
||||
void bitvec_print (const uint8_t * b, size_t d)
|
||||
{
|
||||
size_t i, j;
|
||||
for (i = 0; i < d; ) {
|
||||
uint64_t brev = uint64_reverse_bits (* (uint64_t *) b);
|
||||
for (j = 0; j < 64 && i < d; j++, i++) {
|
||||
printf ("%d", (int) (brev & 1));
|
||||
brev >>= 1;
|
||||
}
|
||||
b += 8;
|
||||
printf (" ");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void bitvec_shuffle (size_t n, size_t da, size_t db,
|
||||
const int *order,
|
||||
const uint8_t *a,
|
||||
uint8_t *b)
|
||||
{
|
||||
for(size_t i = 0; i < db; i++) {
|
||||
ABORT_UNLESS (order[i] >= 0 && order[i] < da, "order[i] >= 0 && order[i] < da");
|
||||
}
|
||||
size_t lda = (da + 7) / 8;
|
||||
size_t ldb = (db + 7) / 8;
|
||||
|
||||
//#pragma omp parallel for if(n > 10000)
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
const uint8_t *ai = a + i * lda;
|
||||
uint8_t *bi = b + i * ldb;
|
||||
memset (bi, 0, ldb);
|
||||
for(size_t i = 0; i < db; i++) {
|
||||
int o = order[i];
|
||||
uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
|
||||
bi[i >> 3] |= the_bit << (i & 7);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*----------------------------------------*/
|
||||
/* Hamming distance computation and k-nn */
|
||||
|
||||
|
||||
#define C64(x) ((uint64_t *)x)
|
||||
|
||||
|
||||
/* Compute a set of Hamming distances */
|
||||
void hammings (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na, size_t nb,
|
||||
size_t ncodes,
|
||||
hamdis_t * __restrict dis)
|
||||
{
|
||||
ABORT_UNLESS (ncodes % 8 == 0, "ncodes % 8 == 0");
|
||||
switch (ncodes) {
|
||||
case 8:
|
||||
faiss::hammings <64> (C64(a), C64(b), na, nb, dis); return;
|
||||
case 16:
|
||||
faiss::hammings <128> (C64(a), C64(b), na, nb, dis); return;
|
||||
case 32:
|
||||
faiss::hammings <256> (C64(a), C64(b), na, nb, dis); return;
|
||||
case 64:
|
||||
faiss::hammings <512> (C64(a), C64(b), na, nb, dis); return;
|
||||
default:
|
||||
faiss::hammings (C64(a), C64(b), na, nb, ncodes * 8, dis); return;
|
||||
}
|
||||
}
|
||||
|
||||
void hammings_knn(
|
||||
int_maxheap_array_t *ha,
|
||||
const uint8_t *a,
|
||||
const uint8_t *b,
|
||||
size_t nb,
|
||||
size_t ncodes,
|
||||
int order)
|
||||
{
|
||||
hammings_knn_hc(ha, a, b, nb, ncodes, order);
|
||||
}
|
||||
|
||||
void hammings_knn_hc (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t nb,
|
||||
size_t ncodes,
|
||||
int order)
|
||||
{
|
||||
switch (ncodes) {
|
||||
case 4:
|
||||
hammings_knn_hc<faiss::HammingComputer4>
|
||||
(4, ha, a, b, nb, order, true);
|
||||
break;
|
||||
case 8:
|
||||
hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
|
||||
// hammings_knn_hc<faiss::HammingComputer8>
|
||||
// (8, ha, a, b, nb, order, true);
|
||||
break;
|
||||
case 16:
|
||||
hammings_knn_hc<faiss::HammingComputer16>
|
||||
(16, ha, a, b, nb, order, true);
|
||||
break;
|
||||
case 32:
|
||||
hammings_knn_hc<faiss::HammingComputer32>
|
||||
(32, ha, a, b, nb, order, true);
|
||||
break;
|
||||
default:
|
||||
if(ncodes % 8 == 0) {
|
||||
hammings_knn_hc<faiss::HammingComputerM8>
|
||||
(ncodes, ha, a, b, nb, order, true);
|
||||
} else {
|
||||
hammings_knn_hc<faiss::HammingComputerDefault>
|
||||
(ncodes, ha, a, b, nb, order, true);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void hammings_knn_mc(
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
size_t k,
|
||||
size_t ncodes,
|
||||
int32_t *distances,
|
||||
int64_t *labels)
|
||||
{
|
||||
switch (ncodes) {
|
||||
case 4:
|
||||
hammings_knn_mc<faiss::HammingComputer4>(
|
||||
4, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
break;
|
||||
case 8:
|
||||
// TODO(hoss): Write analog to hammings_knn_hc_1
|
||||
// hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
|
||||
hammings_knn_mc<faiss::HammingComputer8>(
|
||||
8, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
break;
|
||||
case 16:
|
||||
hammings_knn_mc<faiss::HammingComputer16>(
|
||||
16, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
break;
|
||||
case 32:
|
||||
hammings_knn_mc<faiss::HammingComputer32>(
|
||||
32, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
break;
|
||||
default:
|
||||
if(ncodes % 8 == 0) {
|
||||
hammings_knn_mc<faiss::HammingComputerM8>(
|
||||
ncodes, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
} else {
|
||||
hammings_knn_mc<faiss::HammingComputerDefault>(
|
||||
ncodes, a, b, na, nb, k, distances, labels
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
template <class HammingComputer>
|
||||
static
|
||||
void hamming_range_search_template (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
int radius,
|
||||
size_t code_size,
|
||||
RangeSearchResult *res)
|
||||
{
|
||||
|
||||
//#pragma omp parallel
|
||||
{
|
||||
RangeSearchPartialResult pres (res);
|
||||
|
||||
//#pragma omp for
|
||||
for (size_t i = 0; i < na; i++) {
|
||||
HammingComputer hc (a + i * code_size, code_size);
|
||||
const uint8_t * yi = b;
|
||||
RangeQueryResult & qres = pres.new_result (i);
|
||||
|
||||
for (size_t j = 0; j < nb; j++) {
|
||||
int dis = hc.hamming (yi);
|
||||
if (dis < radius) {
|
||||
qres.add(dis, j);
|
||||
}
|
||||
yi += code_size;
|
||||
}
|
||||
}
|
||||
pres.finalize ();
|
||||
}
|
||||
}
|
||||
|
||||
void hamming_range_search (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
int radius,
|
||||
size_t code_size,
|
||||
RangeSearchResult *result)
|
||||
{
|
||||
|
||||
#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)
|
||||
|
||||
switch(code_size) {
|
||||
case 4: HC(HammingComputer4); break;
|
||||
case 8: HC(HammingComputer8); break;
|
||||
case 16: HC(HammingComputer16); break;
|
||||
case 32: HC(HammingComputer32); break;
|
||||
default:
|
||||
if (code_size % 8 == 0) {
|
||||
HC(HammingComputerM8);
|
||||
} else {
|
||||
HC(HammingComputerDefault);
|
||||
}
|
||||
}
|
||||
#undef HC
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Count number of matches given a max threshold */
|
||||
void hamming_count_thres (
|
||||
const uint8_t * bs1,
|
||||
const uint8_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
size_t * nptr)
|
||||
{
|
||||
switch (ncodes) {
|
||||
case 8:
|
||||
faiss::hamming_count_thres <64> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, nptr);
|
||||
return;
|
||||
case 16:
|
||||
faiss::hamming_count_thres <128> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, nptr);
|
||||
return;
|
||||
case 32:
|
||||
faiss::hamming_count_thres <256> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, nptr);
|
||||
return;
|
||||
case 64:
|
||||
faiss::hamming_count_thres <512> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, nptr);
|
||||
return;
|
||||
default:
|
||||
ABORT ("not implemented for %zu bits", ncodes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Count number of cross-matches given a threshold */
|
||||
void crosshamming_count_thres (
|
||||
const uint8_t * dbs,
|
||||
size_t n,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
size_t * nptr)
|
||||
{
|
||||
switch (ncodes) {
|
||||
case 8:
|
||||
faiss::crosshamming_count_thres <64> (C64(dbs), n, ht, nptr);
|
||||
return;
|
||||
case 16:
|
||||
faiss::crosshamming_count_thres <128> (C64(dbs), n, ht, nptr);
|
||||
return;
|
||||
case 32:
|
||||
faiss::crosshamming_count_thres <256> (C64(dbs), n, ht, nptr);
|
||||
return;
|
||||
case 64:
|
||||
faiss::crosshamming_count_thres <512> (C64(dbs), n, ht, nptr);
|
||||
return;
|
||||
default:
|
||||
ABORT ("not implemented for %zu bits", ncodes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Returns all matches given a threshold */
|
||||
size_t match_hamming_thres (
|
||||
const uint8_t * bs1,
|
||||
const uint8_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
int64_t * idx,
|
||||
hamdis_t * dis)
|
||||
{
|
||||
switch (ncodes) {
|
||||
case 8:
|
||||
return faiss::match_hamming_thres <64> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, idx, dis);
|
||||
case 16:
|
||||
return faiss::match_hamming_thres <128> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, idx, dis);
|
||||
case 32:
|
||||
return faiss::match_hamming_thres <256> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, idx, dis);
|
||||
case 64:
|
||||
return faiss::match_hamming_thres <512> (C64(bs1), C64(bs2),
|
||||
n1, n2, ht, idx, dis);
|
||||
default:
|
||||
ABORT ("not implemented for %zu bits", ncodes);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef C64
|
||||
|
||||
|
||||
|
||||
/*************************************
|
||||
* generalized Hamming distances
|
||||
************************************/
|
||||
|
||||
|
||||
|
||||
template <class HammingComputer>
|
||||
static void hamming_dis_inner_loop (
|
||||
const uint8_t *ca,
|
||||
const uint8_t *cb,
|
||||
size_t nb,
|
||||
size_t code_size,
|
||||
int k,
|
||||
hamdis_t * bh_val_,
|
||||
int64_t * bh_ids_)
|
||||
{
|
||||
|
||||
HammingComputer hc (ca, code_size);
|
||||
|
||||
for (size_t j = 0; j < nb; j++) {
|
||||
int ndiff = hc.hamming (cb);
|
||||
cb += code_size;
|
||||
if (ndiff < bh_val_[0]) {
|
||||
maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
|
||||
maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, ndiff, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void generalized_hammings_knn_hc (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t nb,
|
||||
size_t code_size,
|
||||
int ordered)
|
||||
{
|
||||
int na = ha->nh;
|
||||
int k = ha->k;
|
||||
|
||||
if (ordered)
|
||||
ha->heapify ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (int i = 0; i < na; i++) {
|
||||
const uint8_t *ca = a + i * code_size;
|
||||
const uint8_t *cb = b;
|
||||
|
||||
hamdis_t * bh_val_ = ha->val + i * k;
|
||||
int64_t * bh_ids_ = ha->ids + i * k;
|
||||
|
||||
switch (code_size) {
|
||||
case 8:
|
||||
hamming_dis_inner_loop<GenHammingComputer8>
|
||||
(ca, cb, nb, 8, k, bh_val_, bh_ids_);
|
||||
break;
|
||||
case 16:
|
||||
hamming_dis_inner_loop<GenHammingComputer16>
|
||||
(ca, cb, nb, 16, k, bh_val_, bh_ids_);
|
||||
break;
|
||||
case 32:
|
||||
hamming_dis_inner_loop<GenHammingComputer32>
|
||||
(ca, cb, nb, 32, k, bh_val_, bh_ids_);
|
||||
break;
|
||||
default:
|
||||
hamming_dis_inner_loop<GenHammingComputerM8>
|
||||
(ca, cb, nb, code_size, k, bh_val_, bh_ids_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ordered)
|
||||
ha->reorder ();
|
||||
|
||||
}
|
||||
|
||||
|
||||
} // namespace faiss
|
240
src/3rd_party/faiss/utils/hamming.h
vendored
Normal file
240
src/3rd_party/faiss/utils/hamming.h
vendored
Normal file
@ -0,0 +1,240 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/*
|
||||
* Hamming distances. The binary vector dimensionality should be a
|
||||
* multiple of 8, as the elementary operations operate on bytes. If
|
||||
* you need other sizes, just pad with 0s (this is done by function
|
||||
* fvecs2bitvecs).
|
||||
*
|
||||
* User-defined type hamdis_t is used for distances because at this time
|
||||
* it is still uncler clear how we will need to balance
|
||||
* - flexibility in vector size (may need 16- or even 8-bit vectors)
|
||||
* - memory usage
|
||||
* - cache-misses when dealing with large volumes of data (fewer bits is better)
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef FAISS_hamming_h
|
||||
#define FAISS_hamming_h
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <faiss/utils/Heap.h>
|
||||
|
||||
|
||||
/* The Hamming distance type */
|
||||
typedef int32_t hamdis_t;
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/**************************************************
|
||||
* General bit vector functions
|
||||
**************************************************/
|
||||
|
||||
struct RangeSearchResult;
|
||||
|
||||
void bitvec_print (const uint8_t * b, size_t d);
|
||||
|
||||
|
||||
/* Functions for casting vectors of regular types to compact bits.
|
||||
They assume proper allocation done beforehand, meaning that b
|
||||
should be be able to receive as many bits as x may produce. */
|
||||
|
||||
/* Makes an array of bits from the signs of a float array. The length
|
||||
of the output array b is rounded up to byte size (allocate
|
||||
accordingly) */
|
||||
void fvecs2bitvecs (
|
||||
const float * x,
|
||||
uint8_t * b,
|
||||
size_t d,
|
||||
size_t n);
|
||||
|
||||
void bitvecs2fvecs (
|
||||
const uint8_t * b,
|
||||
float * x,
|
||||
size_t d,
|
||||
size_t n);
|
||||
|
||||
|
||||
void fvec2bitvec (const float * x, uint8_t * b, size_t d);
|
||||
|
||||
/** Shuffle the bits from b(i, j) := a(i, order[j])
|
||||
*/
|
||||
void bitvec_shuffle (size_t n, size_t da, size_t db,
|
||||
const int *order,
|
||||
const uint8_t *a,
|
||||
uint8_t *b);
|
||||
|
||||
|
||||
/***********************************************
|
||||
* Generic reader/writer for bit strings
|
||||
***********************************************/
|
||||
|
||||
|
||||
struct BitstringWriter {
|
||||
uint8_t *code;
|
||||
size_t code_size;
|
||||
size_t i; // current bit offset
|
||||
|
||||
// code_size in bytes
|
||||
BitstringWriter(uint8_t *code, int code_size);
|
||||
|
||||
// write the nbit low bits of x
|
||||
void write(uint64_t x, int nbit);
|
||||
};
|
||||
|
||||
struct BitstringReader {
|
||||
const uint8_t *code;
|
||||
size_t code_size;
|
||||
size_t i;
|
||||
|
||||
// code_size in bytes
|
||||
BitstringReader(const uint8_t *code, int code_size);
|
||||
|
||||
// read nbit bits from the code
|
||||
uint64_t read(int nbit);
|
||||
};
|
||||
|
||||
/**************************************************
|
||||
* Hamming distance computation functions
|
||||
**************************************************/
|
||||
|
||||
|
||||
|
||||
extern size_t hamming_batch_size;
|
||||
|
||||
inline int popcount64(uint64_t x) {
|
||||
return __builtin_popcountl(x);
|
||||
}
|
||||
|
||||
|
||||
/** Compute a set of Hamming distances between na and nb binary vectors
|
||||
*
|
||||
* @param a size na * nbytespercode
|
||||
* @param b size nb * nbytespercode
|
||||
* @param nbytespercode should be multiple of 8
|
||||
* @param dis output distances, size na * nb
|
||||
*/
|
||||
void hammings (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na, size_t nb,
|
||||
size_t nbytespercode,
|
||||
hamdis_t * dis);
|
||||
|
||||
|
||||
|
||||
|
||||
/** Return the k smallest Hamming distances for a set of binary query vectors,
|
||||
* using a max heap.
|
||||
* @param a queries, size ha->nh * ncodes
|
||||
* @param b database, size nb * ncodes
|
||||
* @param nb number of database vectors
|
||||
* @param ncodes size of the binary codes (bytes)
|
||||
* @param ordered if != 0: order the results by decreasing distance
|
||||
* (may be bottleneck for k/n > 0.01) */
|
||||
void hammings_knn_hc (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t nb,
|
||||
size_t ncodes,
|
||||
int ordered);
|
||||
|
||||
/* Legacy alias to hammings_knn_hc. */
|
||||
void hammings_knn (
|
||||
int_maxheap_array_t * ha,
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t nb,
|
||||
size_t ncodes,
|
||||
int ordered);
|
||||
|
||||
/** Return the k smallest Hamming distances for a set of binary query vectors,
|
||||
* using counting max.
|
||||
* @param a queries, size na * ncodes
|
||||
* @param b database, size nb * ncodes
|
||||
* @param na number of query vectors
|
||||
* @param nb number of database vectors
|
||||
* @param k number of vectors/distances to return
|
||||
* @param ncodes size of the binary codes (bytes)
|
||||
* @param distances output distances from each query vector to its k nearest
|
||||
* neighbors
|
||||
* @param labels output ids of the k nearest neighbors to each query vector
|
||||
*/
|
||||
void hammings_knn_mc (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
size_t k,
|
||||
size_t ncodes,
|
||||
int32_t *distances,
|
||||
int64_t *labels);
|
||||
|
||||
/** same as hammings_knn except we are doing a range search with radius */
|
||||
void hamming_range_search (
|
||||
const uint8_t * a,
|
||||
const uint8_t * b,
|
||||
size_t na,
|
||||
size_t nb,
|
||||
int radius,
|
||||
size_t ncodes,
|
||||
RangeSearchResult *result);
|
||||
|
||||
|
||||
/* Counting the number of matches or of cross-matches (without returning them)
|
||||
For use with function that assume pre-allocated memory */
|
||||
void hamming_count_thres (
|
||||
const uint8_t * bs1,
|
||||
const uint8_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
size_t * nptr);
|
||||
|
||||
/* Return all Hamming distances/index passing a thres. Pre-allocation of output
|
||||
is required. Use hamming_count_thres to determine the proper size. */
|
||||
size_t match_hamming_thres (
|
||||
const uint8_t * bs1,
|
||||
const uint8_t * bs2,
|
||||
size_t n1,
|
||||
size_t n2,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
int64_t * idx,
|
||||
hamdis_t * dis);
|
||||
|
||||
/* Cross-matching in a set of vectors */
|
||||
void crosshamming_count_thres (
|
||||
const uint8_t * dbs,
|
||||
size_t n,
|
||||
hamdis_t ht,
|
||||
size_t ncodes,
|
||||
size_t * nptr);
|
||||
|
||||
|
||||
/* compute the Hamming distances between two codewords of nwords*64 bits */
|
||||
hamdis_t hamming (
|
||||
const uint64_t * bs1,
|
||||
const uint64_t * bs2,
|
||||
size_t nwords);
|
||||
|
||||
|
||||
|
||||
} // namespace faiss
|
||||
|
||||
// inlined definitions of HammingComputerXX and GenHammingComputerXX
|
||||
|
||||
#include <faiss/utils/hamming-inl.h>
|
||||
|
||||
#endif /* FAISS_hamming_h */
|
162
src/3rd_party/faiss/utils/misc.cpp
vendored
Normal file
162
src/3rd_party/faiss/utils/misc.cpp
vendored
Normal file
@ -0,0 +1,162 @@
|
||||
#include "misc.h"
|
||||
#include <cstring>
|
||||
|
||||
namespace faiss {
|
||||
/***********************************************************************
|
||||
* BufferList
|
||||
***********************************************************************/
|
||||
|
||||
|
||||
BufferList::BufferList(size_t buffer_size) :
|
||||
buffer_size(buffer_size)
|
||||
{
|
||||
wp = buffer_size;
|
||||
}
|
||||
|
||||
BufferList::~BufferList()
|
||||
{
|
||||
for (int i = 0; i < buffers.size(); i++) {
|
||||
delete[] buffers[i].ids;
|
||||
delete[] buffers[i].dis;
|
||||
}
|
||||
}
|
||||
|
||||
void BufferList::add(idx_t id, float dis) {
|
||||
if (wp == buffer_size) { // need new buffer
|
||||
append_buffer();
|
||||
}
|
||||
Buffer & buf = buffers.back();
|
||||
buf.ids[wp] = id;
|
||||
buf.dis[wp] = dis;
|
||||
wp++;
|
||||
}
|
||||
|
||||
|
||||
void BufferList::append_buffer()
|
||||
{
|
||||
Buffer buf = { new idx_t[buffer_size], new float[buffer_size] };
|
||||
buffers.push_back(buf);
|
||||
wp = 0;
|
||||
}
|
||||
|
||||
/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
|
||||
/// tables dest_ids, dest_dis
|
||||
void BufferList::copy_range(size_t ofs, size_t n,
|
||||
idx_t * dest_ids, float *dest_dis)
|
||||
{
|
||||
size_t bno = ofs / buffer_size;
|
||||
ofs -= bno * buffer_size;
|
||||
while (n > 0) {
|
||||
size_t ncopy = ofs + n < buffer_size ? n : buffer_size - ofs;
|
||||
Buffer buf = buffers[bno];
|
||||
memcpy(dest_ids, buf.ids + ofs, ncopy * sizeof(*dest_ids));
|
||||
memcpy(dest_dis, buf.dis + ofs, ncopy * sizeof(*dest_dis));
|
||||
dest_ids += ncopy;
|
||||
dest_dis += ncopy;
|
||||
ofs = 0;
|
||||
bno++;
|
||||
n -= ncopy;
|
||||
}
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* RangeSearchPartialResult
|
||||
***********************************************************************/
|
||||
|
||||
void RangeQueryResult::add(float dis, idx_t id) {
|
||||
nres++;
|
||||
pres->add(id, dis);
|
||||
}
|
||||
|
||||
|
||||
|
||||
RangeSearchPartialResult::RangeSearchPartialResult(RangeSearchResult * res_in) :
|
||||
BufferList(res_in->buffer_size),
|
||||
res(res_in)
|
||||
{}
|
||||
|
||||
|
||||
/// begin a new result
|
||||
RangeQueryResult &
|
||||
RangeSearchPartialResult::new_result(idx_t qno)
|
||||
{
|
||||
RangeQueryResult qres = { qno, 0, this };
|
||||
queries.push_back(qres);
|
||||
return queries.back();
|
||||
}
|
||||
|
||||
|
||||
void RangeSearchPartialResult::finalize()
|
||||
{
|
||||
set_lims();
|
||||
//#pragma omp barrier
|
||||
|
||||
//#pragma omp single
|
||||
res->do_allocation();
|
||||
|
||||
//#pragma omp barrier
|
||||
copy_result();
|
||||
}
|
||||
|
||||
|
||||
/// called by range_search before do_allocation
|
||||
void RangeSearchPartialResult::set_lims()
|
||||
{
|
||||
for (int i = 0; i < queries.size(); i++) {
|
||||
RangeQueryResult & qres = queries[i];
|
||||
res->lims[qres.qno] = qres.nres;
|
||||
}
|
||||
}
|
||||
|
||||
/// called by range_search after do_allocation
|
||||
void RangeSearchPartialResult::copy_result(bool incremental)
|
||||
{
|
||||
size_t ofs = 0;
|
||||
for (int i = 0; i < queries.size(); i++) {
|
||||
RangeQueryResult & qres = queries[i];
|
||||
|
||||
copy_range(ofs, qres.nres,
|
||||
res->labels + res->lims[qres.qno],
|
||||
res->distances + res->lims[qres.qno]);
|
||||
if (incremental) {
|
||||
res->lims[qres.qno] += qres.nres;
|
||||
}
|
||||
ofs += qres.nres;
|
||||
}
|
||||
}
|
||||
|
||||
void RangeSearchPartialResult::merge(std::vector <RangeSearchPartialResult *> &
|
||||
partial_results, bool do_delete)
|
||||
{
|
||||
|
||||
int npres = partial_results.size();
|
||||
if (npres == 0) return;
|
||||
RangeSearchResult *result = partial_results[0]->res;
|
||||
size_t nx = result->nq;
|
||||
|
||||
// count
|
||||
for (const RangeSearchPartialResult * pres : partial_results) {
|
||||
if (!pres) continue;
|
||||
for (const RangeQueryResult &qres : pres->queries) {
|
||||
result->lims[qres.qno] += qres.nres;
|
||||
}
|
||||
}
|
||||
result->do_allocation();
|
||||
for (int j = 0; j < npres; j++) {
|
||||
if (!partial_results[j]) continue;
|
||||
partial_results[j]->copy_result(true);
|
||||
if (do_delete) {
|
||||
delete partial_results[j];
|
||||
partial_results[j] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// reset the limits
|
||||
for (size_t i = nx; i > 0; i--) {
|
||||
result->lims[i] = result->lims[i - 1];
|
||||
}
|
||||
result->lims[0] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
144
src/3rd_party/faiss/utils/misc.h
vendored
Normal file
144
src/3rd_party/faiss/utils/misc.h
vendored
Normal file
@ -0,0 +1,144 @@
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
namespace faiss {
|
||||
/// The metric space for vector comparison for Faiss indices and algorithms.
|
||||
///
|
||||
/// Most algorithms support both inner product and L2, with the flat
|
||||
/// (brute-force) indices supporting additional metric types for vector
|
||||
/// comparison.
|
||||
enum MetricType {
|
||||
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
|
||||
METRIC_L2 = 1, ///< squared L2 search
|
||||
METRIC_L1, ///< L1 (aka cityblock)
|
||||
METRIC_Linf, ///< infinity distance
|
||||
METRIC_Lp, ///< L_p distance, p is given by a faiss::Index
|
||||
/// metric_arg
|
||||
|
||||
/// some additional metrics defined in scipy.spatial.distance
|
||||
METRIC_Canberra = 20,
|
||||
METRIC_BrayCurtis,
|
||||
METRIC_JensenShannon,
|
||||
};
|
||||
|
||||
template<class T>
|
||||
struct ScopeDeleter {
|
||||
const T * ptr;
|
||||
explicit ScopeDeleter(const T* ptr = nullptr) : ptr(ptr) {}
|
||||
void release() { ptr = nullptr; }
|
||||
void set(const T * ptr_in) { ptr = ptr_in; }
|
||||
void swap(ScopeDeleter<T> &other) { std::swap(ptr, other.ptr); }
|
||||
~ScopeDeleter() {
|
||||
delete[] ptr;
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////
|
||||
using idx_t = int64_t;
|
||||
|
||||
/** List of temporary buffers used to store results before they are
|
||||
* copied to the RangeSearchResult object. */
|
||||
struct BufferList {
|
||||
typedef faiss::idx_t idx_t;
|
||||
|
||||
// buffer sizes in # entries
|
||||
size_t buffer_size;
|
||||
|
||||
struct Buffer {
|
||||
idx_t *ids;
|
||||
float *dis;
|
||||
};
|
||||
|
||||
std::vector<Buffer> buffers;
|
||||
size_t wp; ///< write pointer in the last buffer.
|
||||
|
||||
explicit BufferList(size_t buffer_size);
|
||||
|
||||
~BufferList();
|
||||
|
||||
/// create a new buffer
|
||||
void append_buffer();
|
||||
|
||||
/// add one result, possibly appending a new buffer if needed
|
||||
void add(idx_t id, float dis);
|
||||
|
||||
/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
|
||||
/// tables dest_ids, dest_dis
|
||||
void copy_range(size_t ofs, size_t n,
|
||||
idx_t * dest_ids, float *dest_dis);
|
||||
|
||||
};
|
||||
|
||||
/** The objective is to have a simple result structure while
|
||||
* minimizing the number of mem copies in the result. The method
|
||||
* do_allocation can be overloaded to allocate the result tables in
|
||||
* the matrix type of a scripting language like Lua or Python. */
|
||||
struct RangeSearchResult {
|
||||
size_t nq; ///< nb of queries
|
||||
size_t *lims; ///< size (nq + 1)
|
||||
|
||||
typedef faiss::idx_t idx_t;
|
||||
|
||||
idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]]
|
||||
float *distances; ///< corresponding distances (not sorted)
|
||||
|
||||
size_t buffer_size; ///< size of the result buffers used
|
||||
|
||||
/// lims must be allocated on input to range_search.
|
||||
explicit RangeSearchResult(idx_t nq, bool alloc_lims = true);
|
||||
|
||||
/// called when lims contains the nb of elements result entries
|
||||
/// for each query
|
||||
|
||||
virtual void do_allocation();
|
||||
|
||||
virtual ~RangeSearchResult();
|
||||
};
|
||||
|
||||
struct RangeSearchPartialResult;
|
||||
|
||||
/// result structure for a single query
|
||||
struct RangeQueryResult {
|
||||
using idx_t = faiss::idx_t;
|
||||
idx_t qno; //< id of the query
|
||||
size_t nres; //< nb of results for this query
|
||||
RangeSearchPartialResult * pres;
|
||||
|
||||
/// called by search function to report a new result
|
||||
void add(float dis, idx_t id);
|
||||
};
|
||||
|
||||
/// the entries in the buffers are split per query
|
||||
struct RangeSearchPartialResult : BufferList {
|
||||
RangeSearchResult * res;
|
||||
|
||||
/// eventually the result will be stored in res_in
|
||||
explicit RangeSearchPartialResult(RangeSearchResult * res_in);
|
||||
|
||||
/// query ids + nb of results per query.
|
||||
std::vector<RangeQueryResult> queries;
|
||||
|
||||
/// begin a new result
|
||||
RangeQueryResult & new_result(idx_t qno);
|
||||
|
||||
/*****************************************
|
||||
* functions used at the end of the search to merge the result
|
||||
* lists */
|
||||
void finalize();
|
||||
|
||||
/// called by range_search before do_allocation
|
||||
void set_lims();
|
||||
|
||||
/// called by range_search after do_allocation
|
||||
void copy_result(bool incremental = false);
|
||||
|
||||
/// merge a set of PartialResult's into one RangeSearchResult
|
||||
/// on ouptut the partialresults are empty!
|
||||
static void merge(std::vector <RangeSearchPartialResult *> &
|
||||
partial_results, bool do_delete = true);
|
||||
|
||||
};
|
||||
|
||||
|
||||
} // namespace
|
192
src/3rd_party/faiss/utils/random.cpp
vendored
Normal file
192
src/3rd_party/faiss/utils/random.cpp
vendored
Normal file
@ -0,0 +1,192 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/utils/random.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/**************************************************
|
||||
* Random data generation functions
|
||||
**************************************************/
|
||||
|
||||
RandomGenerator::RandomGenerator (int64_t seed)
|
||||
: mt((unsigned int)seed) {}
|
||||
|
||||
int RandomGenerator::rand_int ()
|
||||
{
|
||||
return mt() & 0x7fffffff;
|
||||
}
|
||||
|
||||
int64_t RandomGenerator::rand_int64 ()
|
||||
{
|
||||
return int64_t(rand_int()) | int64_t(rand_int()) << 31;
|
||||
}
|
||||
|
||||
int RandomGenerator::rand_int (int max)
|
||||
{
|
||||
return mt() % max;
|
||||
}
|
||||
|
||||
float RandomGenerator::rand_float ()
|
||||
{
|
||||
return mt() / float(mt.max());
|
||||
}
|
||||
|
||||
double RandomGenerator::rand_double ()
|
||||
{
|
||||
return mt() / double(mt.max());
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* Random functions in this C file only exist because Torch
|
||||
* counterparts are slow and not multi-threaded. Typical use is for
|
||||
* more than 1-100 billion values. */
|
||||
|
||||
|
||||
/* Generate a set of random floating point values such that x[i] in [0,1]
|
||||
multi-threading. For this reason, we rely on re-entreant functions. */
|
||||
void float_rand (float * x, size_t n, int64_t seed)
|
||||
{
|
||||
// only try to parallelize on large enough arrays
|
||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||
|
||||
RandomGenerator rng0 (seed);
|
||||
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nblock; j++) {
|
||||
|
||||
RandomGenerator rng (a0 + j * b0);
|
||||
|
||||
const size_t istart = j * n / nblock;
|
||||
const size_t iend = (j + 1) * n / nblock;
|
||||
|
||||
for (size_t i = istart; i < iend; i++)
|
||||
x[i] = rng.rand_float ();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void float_randn (float * x, size_t n, int64_t seed)
|
||||
{
|
||||
// only try to parallelize on large enough arrays
|
||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||
|
||||
RandomGenerator rng0 (seed);
|
||||
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nblock; j++) {
|
||||
RandomGenerator rng (a0 + j * b0);
|
||||
|
||||
double a = 0, b = 0, s = 0;
|
||||
int state = 0; /* generate two number per "do-while" loop */
|
||||
|
||||
const size_t istart = j * n / nblock;
|
||||
const size_t iend = (j + 1) * n / nblock;
|
||||
|
||||
for (size_t i = istart; i < iend; i++) {
|
||||
/* Marsaglia's method (see Knuth) */
|
||||
if (state == 0) {
|
||||
do {
|
||||
a = 2.0 * rng.rand_double () - 1;
|
||||
b = 2.0 * rng.rand_double () - 1;
|
||||
s = a * a + b * b;
|
||||
} while (s >= 1.0);
|
||||
x[i] = a * sqrt(-2.0 * log(s) / s);
|
||||
}
|
||||
else
|
||||
x[i] = b * sqrt(-2.0 * log(s) / s);
|
||||
state = 1 - state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Integer versions */
|
||||
void int64_rand (int64_t * x, size_t n, int64_t seed)
|
||||
{
|
||||
// only try to parallelize on large enough arrays
|
||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||
|
||||
RandomGenerator rng0 (seed);
|
||||
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nblock; j++) {
|
||||
|
||||
RandomGenerator rng (a0 + j * b0);
|
||||
|
||||
const size_t istart = j * n / nblock;
|
||||
const size_t iend = (j + 1) * n / nblock;
|
||||
for (size_t i = istart; i < iend; i++)
|
||||
x[i] = rng.rand_int64 ();
|
||||
}
|
||||
}
|
||||
|
||||
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed)
|
||||
{
|
||||
// only try to parallelize on large enough arrays
|
||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||
|
||||
RandomGenerator rng0 (seed);
|
||||
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nblock; j++) {
|
||||
|
||||
RandomGenerator rng (a0 + j * b0);
|
||||
|
||||
const size_t istart = j * n / nblock;
|
||||
const size_t iend = (j + 1) * n / nblock;
|
||||
for (size_t i = istart; i < iend; i++)
|
||||
x[i] = rng.rand_int64 () % max;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void rand_perm (int *perm, size_t n, int64_t seed)
|
||||
{
|
||||
for (size_t i = 0; i < n; i++) perm[i] = i;
|
||||
|
||||
RandomGenerator rng (seed);
|
||||
|
||||
for (size_t i = 0; i + 1 < n; i++) {
|
||||
int i2 = i + rng.rand_int (n - i);
|
||||
std::swap(perm[i], perm[i2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void byte_rand (uint8_t * x, size_t n, int64_t seed)
|
||||
{
|
||||
// only try to parallelize on large enough arrays
|
||||
const size_t nblock = n < 1024 ? 1 : 1024;
|
||||
|
||||
RandomGenerator rng0 (seed);
|
||||
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
|
||||
|
||||
//#pragma omp parallel for
|
||||
for (size_t j = 0; j < nblock; j++) {
|
||||
|
||||
RandomGenerator rng (a0 + j * b0);
|
||||
|
||||
const size_t istart = j * n / nblock;
|
||||
const size_t iend = (j + 1) * n / nblock;
|
||||
|
||||
size_t i;
|
||||
for (i = istart; i < iend; i++)
|
||||
x[i] = rng.rand_int64 ();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace faiss
|
60
src/3rd_party/faiss/utils/random.h
vendored
Normal file
60
src/3rd_party/faiss/utils/random.h
vendored
Normal file
@ -0,0 +1,60 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/* Random generators. Implemented here for speed and to make
|
||||
* sequences reproducible.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/**************************************************
|
||||
* Random data generation functions
|
||||
**************************************************/
|
||||
|
||||
/// random generator that can be used in multithreaded contexts
|
||||
struct RandomGenerator {
|
||||
|
||||
std::mt19937 mt;
|
||||
|
||||
/// random positive integer
|
||||
int rand_int ();
|
||||
|
||||
/// random int64_t
|
||||
int64_t rand_int64 ();
|
||||
|
||||
/// generate random integer between 0 and max-1
|
||||
int rand_int (int max);
|
||||
|
||||
/// between 0 and 1
|
||||
float rand_float ();
|
||||
|
||||
double rand_double ();
|
||||
|
||||
explicit RandomGenerator (int64_t seed = 1234);
|
||||
};
|
||||
|
||||
/* Generate an array of uniform random floats / multi-threaded implementation */
|
||||
void float_rand (float * x, size_t n, int64_t seed);
|
||||
void float_randn (float * x, size_t n, int64_t seed);
|
||||
void int64_rand (int64_t * x, size_t n, int64_t seed);
|
||||
void byte_rand (uint8_t * x, size_t n, int64_t seed);
|
||||
// max is actually the maximum value + 1
|
||||
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed);
|
||||
|
||||
/* random permutation */
|
||||
void rand_perm (int * perm, size_t n, int64_t seed);
|
||||
|
||||
|
||||
} // namespace faiss
|
@ -70,6 +70,7 @@ add_library(marian STATIC
|
||||
layers/generic.cpp
|
||||
layers/loss.cpp
|
||||
layers/weight.cpp
|
||||
layers/lsh.cpp
|
||||
|
||||
rnn/cells.cpp
|
||||
rnn/attention.cpp
|
||||
|
@ -646,6 +646,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
|
||||
cli.add<bool>("--output-sampling",
|
||||
"Noise output layer with gumbel noise",
|
||||
false);
|
||||
cli.add<std::vector<int>>("--output-approx-knn",
|
||||
"Use approximate knn search in output layer (currently only in transformer)")
|
||||
->implicit_val("100 1024");
|
||||
|
||||
#if 0 // @TODO: Ask Hany if there are any decoding-time options
|
||||
// add ULR settings
|
||||
|
@ -120,6 +120,13 @@ namespace marian {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ABORT_UNLESS(condition, ...) \
|
||||
do { \
|
||||
if(!(bool)(condition)) { \
|
||||
ABORT(__VA_ARGS__); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
typedef std::shared_ptr<spdlog::logger> Logger;
|
||||
Logger createStderrLogger(const std::string&,
|
||||
const std::string&,
|
||||
|
@ -26,6 +26,16 @@ Expr checkpoint(Expr a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
|
||||
LambdaNodeFunctor fwd) {
|
||||
return Expression<LambdaNodeOp>(nodes, shape, type, fwd);
|
||||
}
|
||||
|
||||
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
|
||||
LambdaNodeFunctor fwd, LambdaNodeFunctor bwd) {
|
||||
return Expression<LambdaNodeOp>(nodes, shape, type, fwd, bwd);
|
||||
}
|
||||
|
||||
// logistic function. Note: scipy name is expit()
|
||||
Expr sigmoid(Expr a) {
|
||||
return Expression<SigmoidNodeOp>(a);
|
||||
|
@ -10,6 +10,10 @@ Expr checkpoint(Expr a);
|
||||
|
||||
typedef Expr(ActivationFunction)(Expr);
|
||||
|
||||
typedef std::function<void(Expr, const std::vector<Expr>&)> LambdaNodeFunctor;
|
||||
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor);
|
||||
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor, LambdaNodeFunctor);
|
||||
|
||||
Expr plus(const std::vector<Expr>&);
|
||||
|
||||
// TODO: should be logistic(), not sigmoid()
|
||||
@ -188,6 +192,30 @@ Expr stopGradient(Expr a);
|
||||
|
||||
Expr gather(Expr a, int axis, Expr indices);
|
||||
|
||||
#if 0
|
||||
// reverse operation to gather. a is expression into with values from b are inserted and positions indices along axis.
|
||||
// with broadcasting
|
||||
|
||||
auto knn = get<0>(KNN->apply(query, k)); // [beam, time, batch, k]
|
||||
|
||||
auto W = reshape(gather(Wt_, -2, flatten(knn)), {beam * time * batch, k, dim});
|
||||
auto b = reshape(gather(b_, -1, flatten(knn)), {beam * time * batch, 1, k });
|
||||
query = reshape(query, {beam * time * batch, 1, dim});
|
||||
auto logits = bdot(query, W, false, true); // [beam * time * batch, 1, k]
|
||||
logits = reshape(logits + b, {beam, time, batch, k}); // @TODO: add baffine node
|
||||
|
||||
auto shape = indices.shape();
|
||||
shape.set(-1, 32000);
|
||||
auto output = grep->constant(shape, inits::lowest(), logits->value_type());
|
||||
output = scatter(output, -1, indices, logits);
|
||||
|
||||
// auto a = graph->constant({2,2,5,32000}, inits::fromValue(minimal))
|
||||
// scatter(a, -1, indices, values)
|
||||
// PyTorch does for out-of-place scatter: out = a.scatter(-1, indices, values)
|
||||
Expr scatter(Expr a, int axis, Expr indices, Expr b);
|
||||
|
||||
#endif
|
||||
|
||||
// Warning: Don't try to pass a scalar literal 0 as indices; it will compile but pass nullptr...
|
||||
Expr index_select(Expr a, int axis, Expr indices);
|
||||
|
||||
|
@ -251,5 +251,24 @@ template Ptr<NodeInitializer> range<float> (float begin, float end, f
|
||||
template Ptr<NodeInitializer> range<IndexType>(IndexType begin, IndexType end, IndexType step);
|
||||
|
||||
} // namespace inits
|
||||
|
||||
} // namespace marian
|
||||
|
||||
|
||||
#include "faiss/VectorTransform.h"
|
||||
|
||||
namespace marian {
|
||||
namespace inits {
|
||||
|
||||
Ptr<NodeInitializer> randomRotation(size_t seed) {
|
||||
auto rot = [=](Tensor t) {
|
||||
int rows = t->shape()[-2];
|
||||
int cols = t->shape()[-1];
|
||||
faiss::RandomRotationMatrix rrot(cols, rows); // transposed in faiss
|
||||
rrot.init(seed);
|
||||
t->set(rrot.A);
|
||||
};
|
||||
return fromLambda(rot, Type::float32);
|
||||
}
|
||||
|
||||
} // namespace inits
|
||||
} // namespace marian
|
||||
|
@ -176,6 +176,13 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
|
||||
*/
|
||||
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
|
||||
|
||||
/**
|
||||
* Computes a random rotation matrix for LSH hashing. This is part
|
||||
* of a hash function. The values are orthonormal and computed via
|
||||
* QR decomposition. Same seed results in same random rotation.
|
||||
*/
|
||||
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
|
||||
|
||||
/**
|
||||
* Computes a range from begin to end-1, like Python's range().
|
||||
* The constant being initialized must have one dimension that matches
|
||||
|
@ -13,6 +13,62 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
class LambdaNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
typedef const std::vector<Expr>& Inputs;
|
||||
typedef std::function<void(Expr, Inputs)> LambdaNodeFunctor;
|
||||
|
||||
std::unique_ptr<LambdaNodeFunctor> forward_;
|
||||
std::unique_ptr<LambdaNodeFunctor> backward_;
|
||||
|
||||
public:
|
||||
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
|
||||
LambdaNodeFunctor forward)
|
||||
: NaryNodeOp(inputs, shape, type),
|
||||
forward_(new LambdaNodeFunctor(forward)) {
|
||||
Node::trainable_ = !!backward_;
|
||||
}
|
||||
|
||||
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
|
||||
LambdaNodeFunctor forward,
|
||||
LambdaNodeFunctor backward)
|
||||
: NaryNodeOp(inputs, shape, type),
|
||||
forward_(new LambdaNodeFunctor(forward)),
|
||||
backward_(new LambdaNodeFunctor(backward)) {
|
||||
}
|
||||
|
||||
void forward() override {
|
||||
(*forward_)(this, children_);
|
||||
}
|
||||
|
||||
void backward() override {
|
||||
ABORT_IF(!backward_, "No backward lambda given?");
|
||||
(*backward_)(this, children_);
|
||||
}
|
||||
|
||||
const std::string type() override { return "lambda"; }
|
||||
|
||||
virtual size_t hash() override {
|
||||
size_t seed = NaryNodeOp::hash();
|
||||
util::hash_combine(seed, forward_.get());
|
||||
util::hash_combine(seed, backward_.get());
|
||||
return seed;
|
||||
}
|
||||
|
||||
virtual bool equal(Expr node) override {
|
||||
if(!NaryNodeOp::equal(node))
|
||||
return false;
|
||||
auto cnode = std::dynamic_pointer_cast<LambdaNodeOp>(node);
|
||||
if(!cnode)
|
||||
return false;
|
||||
if(forward_ != cnode->forward_) // pointer compare on purpose
|
||||
return false;
|
||||
if(backward_ != cnode->backward_) // pointer compare on purpose
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class DotNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
friend class SerializationHelpers;
|
||||
|
@ -6,8 +6,7 @@
|
||||
#include "data/factored_vocab.h"
|
||||
#include "rnn/types.h" // for State::select()
|
||||
#include "models/states.h" // for EncoderState
|
||||
|
||||
//using std::size_t; // not sure why this is needed
|
||||
#include "layers/lsh.h"
|
||||
|
||||
namespace marian {
|
||||
Logits::Logits(Expr logits) : Logits(New<RationalLoss>(logits, nullptr)) {} // single-output constructor from Expr only (RationalLoss has no count)
|
||||
@ -218,6 +217,13 @@ namespace marian {
|
||||
if (Wt_)
|
||||
return;
|
||||
|
||||
// this option is only set in the decoder
|
||||
if(!lsh_ && options_->hasAndNotEmpty("output-approx-knn")) {
|
||||
auto k = opt<std::vector<int>>("output-approx-knn")[0];
|
||||
auto nbits = opt<std::vector<int>>("output-approx-knn")[1];
|
||||
lsh_ = New<LSH>(k, nbits);
|
||||
}
|
||||
|
||||
auto name = options_->get<std::string>("prefix");
|
||||
auto numOutputClasses = options_->get<int>("dim");
|
||||
|
||||
@ -257,6 +263,16 @@ namespace marian {
|
||||
Logits Output::applyAsLogits(Expr input) /*override final*/ {
|
||||
lazyConstruct(input->shape()[-1]);
|
||||
|
||||
auto affineOrLSH = [this](Expr x, Expr W, Expr b, bool transA, bool transB) {
|
||||
if(lsh_) {
|
||||
ABORT_IF( transA, "Transposed query not supported for LSH");
|
||||
ABORT_IF(!transB, "Untransposed indexed matrix not supported for LSH");
|
||||
return lsh_->apply(x, W, b);
|
||||
} else {
|
||||
return affine(x, W, b, transA, transB);
|
||||
}
|
||||
};
|
||||
|
||||
if (shortlist_ && !cachedShortWt_) { // shortlisted versions of parameters are cached within one batch, then clear()ed
|
||||
cachedShortWt_ = index_select(Wt_, isLegacyUntransposedW ? -1 : 0, shortlist_->indices());
|
||||
cachedShortb_ = index_select(b_ , -1, shortlist_->indices());
|
||||
@ -333,7 +349,12 @@ namespace marian {
|
||||
input1 = layerNorm(input1, name + "_ffn");
|
||||
}
|
||||
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
|
||||
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
|
||||
Expr factorLogits;
|
||||
if(g == 0)
|
||||
factorLogits = affineOrLSH(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
|
||||
else
|
||||
factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
|
||||
|
||||
// optionally add lemma-dependent bias
|
||||
if (Plemma) { // [B... x U0]
|
||||
int lemmaVocabDim = Plemma->shape()[-1];
|
||||
@ -396,11 +417,11 @@ namespace marian {
|
||||
}
|
||||
}
|
||||
return Logits(std::move(allLogits), factoredVocab_);
|
||||
} else if (shortlist_) {
|
||||
return Logits(affineOrLSH(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
|
||||
} else {
|
||||
return Logits(affineOrLSH(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
|
||||
}
|
||||
else if (shortlist_)
|
||||
return Logits(affine(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
|
||||
else
|
||||
return Logits(affine(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -228,6 +228,12 @@ public:
|
||||
Expr apply(Expr input) override { return apply(std::vector<Expr>({input})); }
|
||||
};
|
||||
|
||||
} // namespace mlp
|
||||
|
||||
class LSH;
|
||||
|
||||
namespace mlp {
|
||||
|
||||
class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList {
|
||||
private:
|
||||
// parameters held by this layer
|
||||
@ -239,10 +245,11 @@ private:
|
||||
Expr cachedShortb_; // these match the current value of shortlist_
|
||||
Expr cachedShortLemmaEt_;
|
||||
Ptr<FactoredVocab> factoredVocab_;
|
||||
|
||||
|
||||
// optional parameters set/updated after construction
|
||||
Expr tiedParam_;
|
||||
Ptr<data::Shortlist> shortlist_;
|
||||
Ptr<LSH> lsh_;
|
||||
|
||||
void lazyConstruct(int inputDim);
|
||||
public:
|
||||
|
118
src/layers/lsh.cpp
Normal file
118
src/layers/lsh.cpp
Normal file
@ -0,0 +1,118 @@
|
||||
#include "layers/lsh.h"
|
||||
#include "3rd_party/faiss/IndexLSH.h"
|
||||
#include "graph/expression_operators.h"
|
||||
#include "tensors/cpu/prod_blas.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
Expr LSH::apply(Expr input, Expr W, Expr b) {
|
||||
auto idx = search(input, W);
|
||||
return affine(idx, input, W, b);
|
||||
}
|
||||
|
||||
Expr LSH::search(Expr query, Expr values) {
|
||||
ABORT_IF(query->graph()->getDeviceId().type == DeviceType::gpu,
|
||||
"LSH index (--output-approx-knn) currently not implemented for GPU");
|
||||
|
||||
auto kShape = query->shape();
|
||||
kShape.set(-1, k_);
|
||||
|
||||
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
|
||||
auto query = inputs[0];
|
||||
auto values = inputs[1];
|
||||
|
||||
int dim = values->shape()[-1];
|
||||
|
||||
if(!index_ || indexHash_ != values->hash()) {
|
||||
LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_);
|
||||
index_.reset(new faiss::IndexLSH(dim, nbits_,
|
||||
/*rotate=*/dim != nbits_,
|
||||
/*train_thesholds*/false));
|
||||
int vRows = values->shape().elements() / dim;
|
||||
index_->train(vRows, values->val()->data<float>());
|
||||
index_->add( vRows, values->val()->data<float>());
|
||||
indexHash_ = values->hash();
|
||||
}
|
||||
|
||||
int qRows = query->shape().elements() / dim;
|
||||
std::vector<float> distances(qRows * k_);
|
||||
std::vector<faiss::Index::idx_t> ids(qRows * k_);
|
||||
|
||||
index_->search(qRows, query->val()->data<float>(), k_,
|
||||
distances.data(), ids.data());
|
||||
|
||||
std::vector<IndexType> vOut;
|
||||
vOut.reserve(ids.size());
|
||||
for(auto id : ids)
|
||||
vOut.push_back((IndexType)id);
|
||||
|
||||
out->val()->set(vOut);
|
||||
};
|
||||
|
||||
return lambda({query, values}, kShape, Type::uint32, forward);
|
||||
}
|
||||
|
||||
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
|
||||
auto outShape = input->shape();
|
||||
int dimVoc = W->shape()[-2];
|
||||
outShape.set(-1, dimVoc);
|
||||
|
||||
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
|
||||
auto lowest = NumericLimits<float>(out->value_type()).lowest;
|
||||
out->val()->set(lowest);
|
||||
|
||||
int dimIn = inputs[1]->shape()[-1];
|
||||
int dimOut = out->shape()[-1];
|
||||
int dimRows = out->shape().elements() / dimOut;
|
||||
|
||||
auto outPtr = out->val()->data<float>();
|
||||
auto idxPtr = inputs[0]->val()->data<uint32_t>();
|
||||
auto queryPtr = inputs[1]->val()->data<float>();
|
||||
auto WPtr = inputs[2]->val()->data<float>();
|
||||
auto bPtr = inputs[3]->val()->data<float>();
|
||||
|
||||
for(int row = 0; row < dimRows; ++row) {
|
||||
auto currIdxPtr = idxPtr + row * k_; // move to next batch of k entries
|
||||
auto currQueryPtr = queryPtr + row * dimIn; // move to next input query vector
|
||||
auto currOutPtr = outPtr + row * dimOut; // move to next output position vector (of vocabulary size)
|
||||
for(int k = 0; k < k_; k++) {
|
||||
int relPos = currIdxPtr[k]; // k-th best vocabulay item
|
||||
auto currWPtr = WPtr + relPos * dimIn; // offset for k-th best embedding
|
||||
currOutPtr[relPos] = bPtr[relPos]; // write bias value to position
|
||||
|
||||
// proceed one vector product at a time writing to the correct position
|
||||
sgemm(false, true, 1, 1, dimIn, 1.0f, currQueryPtr, dimIn, currWPtr, dimIn, 1.0f, &currOutPtr[relPos], 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return lambda({idx, input, W, b},
|
||||
outShape,
|
||||
input->value_type(),
|
||||
forward);
|
||||
}
|
||||
|
||||
// @TODO: alternative version which does the same as above with Marian operators, currently missing "scatter".
|
||||
// this uses more memory and likely to be slower. Would make sense to have a scatter node that actually creates
|
||||
// the node instead of relying on an existing node, e.g. scatter(shape, defaultValue, axis, indices, values);
|
||||
#if 0
|
||||
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
|
||||
int dim = input->shape()[-1];
|
||||
int bch = idx->shape().elements() / k;
|
||||
|
||||
auto W = reshape(rows(Wt_, flatten(idx)), {bch, k, dim}); // [rows, k, dim]
|
||||
auto b = reshape(cols(b_, flatten(idx)), {bch, 1, k}); // [rows, 1, k]
|
||||
|
||||
auto aff = reshape(bdot(reshape(input, {bch, 1, dim}), W, false, true) + b, idx->shape()); // [beam, time, batch, k]
|
||||
|
||||
int dimVoc = Wt_->shape()[-2];
|
||||
auto oShape = input->shape();
|
||||
oShape.set(-1, dimVoc);
|
||||
auto lowest = graph_->constant(oShape,
|
||||
inits::fromValue(NumericLimits<float>(input->value_type()).lowest),
|
||||
input->value_type());
|
||||
return scatter(lowest, -1, idx, aff);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace marian
|
26
src/layers/lsh.h
Normal file
26
src/layers/lsh.h
Normal file
@ -0,0 +1,26 @@
|
||||
#include "graph/expression_graph.h"
|
||||
#include <memory>
|
||||
|
||||
namespace faiss {
|
||||
class IndexLSH;
|
||||
}
|
||||
|
||||
namespace marian {
|
||||
|
||||
class LSH {
|
||||
public:
|
||||
LSH(int k, int nbits) : k_{k}, nbits_{nbits} {}
|
||||
Expr apply(Expr query, Expr values, Expr bias);
|
||||
|
||||
private:
|
||||
Ptr<faiss::IndexLSH> index_;
|
||||
size_t indexHash_{0};
|
||||
|
||||
int k_{100};
|
||||
int nbits_{1024};
|
||||
|
||||
Expr search(Expr query, Expr values);
|
||||
Expr affine(Expr idx, Expr query, Expr values, Expr bias);
|
||||
};
|
||||
|
||||
}
|
@ -615,6 +615,7 @@ private:
|
||||
"prefix", prefix_ + "_ff_logit_out",
|
||||
"dim", dimTrgVoc,
|
||||
"vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_], // for factored outputs
|
||||
"output-approx-knn", opt<std::vector<int>>("output-approx-knn"),
|
||||
"lemma-dim-emb", opt<int>("lemma-dim-emb", 0)); // for factored outputs
|
||||
|
||||
if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all"))
|
||||
|
@ -7,51 +7,13 @@
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
|
||||
#if MKL_FOUND
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#if BLAS_FOUND
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "prod_blas.h"
|
||||
#include "sharp/int_gemm.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
namespace cpu {
|
||||
|
||||
#if BLAS_FOUND
|
||||
inline void sgemm(bool transA,
|
||||
bool transB,
|
||||
int rows_a,
|
||||
int rows_b,
|
||||
int width,
|
||||
float alpha,
|
||||
float* a,
|
||||
int lda,
|
||||
float* b,
|
||||
int ldb,
|
||||
float beta,
|
||||
float* c,
|
||||
int ldc) {
|
||||
cblas_sgemm(CblasRowMajor,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
rows_a,
|
||||
rows_b,
|
||||
width,
|
||||
alpha,
|
||||
a,
|
||||
lda,
|
||||
b,
|
||||
ldb,
|
||||
beta,
|
||||
c,
|
||||
ldc);
|
||||
}
|
||||
#endif
|
||||
|
||||
void Prod(marian::Tensor C,
|
||||
const marian::Tensor& A,
|
||||
const marian::Tensor& B,
|
||||
|
38
src/tensors/cpu/prod_blas.h
Normal file
38
src/tensors/cpu/prod_blas.h
Normal file
@ -0,0 +1,38 @@
|
||||
#if MKL_FOUND
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#if BLAS_FOUND
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if BLAS_FOUND
|
||||
inline void sgemm(bool transA,
|
||||
bool transB,
|
||||
int rows_a,
|
||||
int rows_b,
|
||||
int width,
|
||||
float alpha,
|
||||
float* a,
|
||||
int lda,
|
||||
float* b,
|
||||
int ldb,
|
||||
float beta,
|
||||
float* c,
|
||||
int ldc) {
|
||||
cblas_sgemm(CblasRowMajor,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
rows_a,
|
||||
rows_b,
|
||||
width,
|
||||
alpha,
|
||||
a,
|
||||
lda,
|
||||
b,
|
||||
ldb,
|
||||
beta,
|
||||
c,
|
||||
ldc);
|
||||
}
|
||||
#endif
|
@ -36,12 +36,24 @@ class TensorBase {
|
||||
Ptr<Backend> backend)
|
||||
: memory_(memory), shape_(shape), type_(type), backend_(backend) {}
|
||||
|
||||
TensorBase(MemoryPiece::PtrType memory, Shape shape, Ptr<Backend> backend)
|
||||
TensorBase(MemoryPiece::PtrType memory,
|
||||
Shape shape,
|
||||
Ptr<Backend> backend)
|
||||
: memory_(memory),
|
||||
shape_(shape),
|
||||
type_(Type::float32),
|
||||
backend_(backend) {}
|
||||
|
||||
// Wraps existing memory
|
||||
template <typename T>
|
||||
TensorBase(T* rawMemory,
|
||||
size_t rawMemoryNum,
|
||||
Shape shape,
|
||||
Type type,
|
||||
Ptr<Backend> backend)
|
||||
: memory_(MemoryPiece::New((uint8_t*)rawMemory, rawMemoryNum * sizeof(T))),
|
||||
shape_(shape), type_(type), backend_(backend) {}
|
||||
|
||||
public:
|
||||
// Use this whenever pointing to MemoryPiece
|
||||
typedef IPtr<TensorBase> PtrType;
|
||||
|
Loading…
Reference in New Issue
Block a user