Merged PR 12923: LSH indexing to replace short list

This PR adds the FAISS LSH index to Marian as a CPU-side optimization that approximates the output layer of transformer models via k-nearest-neighbor (KNN) search.

* It allows replacing the potentially harmful short list with an ML-free approximation of the final matrix multiply. Increasing the number of neighbors k and the size of the chosen hash in bits makes the approximation more accurate, but also slower. For a model dimension of 512, the sweet spot seems to be around k=100-150 and nbits=1024-1536 (see the sketch after this list).
* Enable it during CPU-side decoding via `--output-approx-knn k nbits`, e.g. `--output-approx-knn 100 1024`.
* Adds a `lambda` node that allows creating custom nodes; right now this is most useful for CPU-side code.
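The underlying idea, as a rough sketch (this is not the actual Marian integration; the function, names and sizes below are invented for illustration): index the rows of the output embedding matrix with `faiss::IndexLSH`, query it with the decoder state, and restrict the final matrix multiply / softmax to the returned k vocabulary entries.

```cpp
// Illustrative sketch only: LSH over the rows of the output embedding matrix.
#include <faiss/IndexLSH.h>
#include <vector>

std::vector<faiss::Index::idx_t>
approxOutputCandidates(const float* Wout,   // output embeddings, vocabSize x dim, row-major
                       size_t vocabSize, int dim,
                       const float* state,  // one decoder state vector, size dim
                       int k = 100, int nbits = 1024) {
  faiss::IndexLSH index(dim, nbits, /*rotate_data=*/true);
  index.add((faiss::Index::idx_t)vocabSize, Wout);  // in practice built once per model

  std::vector<float> dist(k);                       // Hamming distances between codes
  std::vector<faiss::Index::idx_t> labels(k);
  index.search(1, state, k, dist.data(), labels.data());
  return labels;  // vocabulary ids to keep for the final matrix multiply / softmax
}
```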
Martin Junczys-Dowmunt 2020-05-21 23:20:52 +00:00
parent 77a420740c
commit 5a439245b1
36 changed files with 4745 additions and 49 deletions


@ -138,6 +138,9 @@ else(MSVC)
set(INTRINSICS "-msse4.1")
endif()
add_definitions(-DFINTEGER=long)
set(EXT_LIBS ${EXT_LIBS} faiss)
if(USE_FBGEMM)
set(EXT_LIBS ${EXT_LIBS} fbgemm dl)
add_definitions(-DUSE_FBGEMM=1)


@ -5,6 +5,8 @@ add_subdirectory(./yaml-cpp)
add_subdirectory(./SQLiteCpp)
add_subdirectory(./pathie-cpp)
add_subdirectory(./zlib)
add_subdirectory(./faiss)
include_directories(./faiss)
if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS

10
src/3rd_party/faiss/CMakeLists.txt vendored Normal file

@ -0,0 +1,10 @@
# adding a new file requires explicitly modifying this CMakeLists.txt
include_directories("impl")
FILE(GLOB FaissCppSources *.cpp impl/*.cpp utils/*.cpp)
if (NOT TARGET faiss)
add_library(faiss STATIC ${FaissCppSources})
endif()

120
src/3rd_party/faiss/Index.cpp vendored Normal file

@ -0,0 +1,120 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "Index.h"
#include "common/logging.h"
#include <cstring>
namespace faiss {
Index::~Index ()
{
}
void Index::train(idx_t /*n*/, const float* /*x*/) {
// does nothing by default
}
void Index::range_search (idx_t , const float *, float,
RangeSearchResult *) const
{
ABORT ("range search not implemented");
}
void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k)
{
float * distances = new float[n * k];
ScopeDeleter<float> del(distances);
search (n, x, k, distances, labels);
}
void Index::add_with_ids(
idx_t /*n*/,
const float* /*x*/,
const idx_t* /*xids*/) {
ABORT ("add_with_ids not implemented for this type of index");
}
size_t Index::remove_ids(const IDSelector& /*sel*/) {
ABORT ("remove_ids not implemented for this type of index");
return -1;
}
void Index::reconstruct (idx_t, float * ) const {
ABORT ("reconstruct not implemented for this type of index");
}
void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct (i0 + i, recons + i * d);
}
}
void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const {
search (n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct (key, reconstructed);
}
}
}
}
void Index::compute_residual (const float * x,
float * residual, idx_t key) const {
reconstruct (key, residual);
for (size_t i = 0; i < d; i++) {
residual[i] = x[i] - residual[i];
}
}
void Index::compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const {
//#pragma omp parallel for
for (idx_t i = 0; i < n; ++i) {
compute_residual(&xs[i * d], &residuals[i * d], keys[i]);
}
}
size_t Index::sa_code_size () const
{
ABORT ("standalone codec not implemented for this type of index");
}
void Index::sa_encode (idx_t, const float *,
uint8_t *) const
{
ABORT ("standalone codec not implemented for this type of index");
}
void Index::sa_decode (idx_t, const uint8_t *,
float *) const
{
ABORT ("standalone codec not implemented for this type of index");
}
}

233
src/3rd_party/faiss/Index.h vendored Normal file

@ -0,0 +1,233 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_H
#define FAISS_INDEX_H
#include "utils/misc.h"
#include <cstdio>
#include <typeinfo>
#include <string>
#include <sstream>
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 6
#define FAISS_VERSION_PATCH 3
/**
* @namespace faiss
*
* Throughout the library, vectors are provided as float * pointers.
* Most algorithms can be optimized when several vectors are processed
* (added/searched) together in a batch. In this case, they are passed
* in as a matrix. When n vectors of size d are provided as float * x,
* component j of vector i is
*
* x[ i * d + j ]
*
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
* always compact. When specifying the size of the matrix, we call it
* an n*d matrix, which implies a row-major storage.
*/
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index, supports adding vectors and searching them.
*
* All vectors provided at add or search time are 32-bit float arrays,
* although the internal representation may vary.
*/
struct Index {
using idx_t = int64_t; ///< all indices are this type
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is
/// done already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
float metric_arg; ///< argument of the metric type
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
d(d),
ntotal(0),
verbose(false),
is_trained(true),
metric_type (metric),
metric_arg(0) {}
virtual ~Index ();
/** Perform training on a representative set of vectors
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* This function slices the input vectors in chunks smaller than
* blocksize_add and calls add_core.
* @param x input matrix, size n * d
*/
virtual void add (idx_t n, const float *x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const = 0;
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const;
/** return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns the labels of the neighbors.
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
/// removes all elements from the database.
virtual void reset() = 0;
/** removes IDs from the index. Not supported by all
* indexes. Returns the number of elements removed.
*/
virtual size_t remove_ids (const IDSelector & sel);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
virtual void reconstruct (idx_t key, float * recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
* @param recons reconstructed vectors (size ni * d)
*/
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting arrays
* is padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const;
/** Computes a residual vector after indexing encoding.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual (const float * x,
float * residual, idx_t key) const;
/** Computes a residual vector after indexing encoding (batch form).
* Equivalent to calling compute_residual for each vector.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded index, as returned by search and assign
*/
virtual void compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const;
/* The standalone codec interface */
/** size of the produced codes in bytes */
virtual size_t sa_code_size () const;
/** encode a set of vectors
*
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const;
/** encode a set of vectors
*
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const;
};
}
#endif
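To make the conventions above concrete, here is a minimal sketch of the flat row-major layout (`x[i * d + j]`) and the `search` calling convention, using the `IndexLSH` subclass added by this PR; the data is arbitrary.

```cpp
// Minimal usage sketch of the Index API; values are arbitrary.
#include <faiss/IndexLSH.h>
#include <cstdio>
#include <vector>

int main() {
  const int d = 8;                  // vector dimension
  const faiss::Index::idx_t n = 4;  // number of database vectors
  std::vector<float> xb(n * d);     // row-major: component j of vector i is xb[i*d + j]
  for (size_t i = 0; i < xb.size(); i++)
    xb[i] = 0.1f * float(i % 7) - 0.3f;

  faiss::IndexLSH index(d, /*nbits=*/32);
  index.add(n, xb.data());          // vectors get labels 0 .. n-1

  const faiss::Index::idx_t k = 2;
  std::vector<float> distances(k);
  std::vector<faiss::Index::idx_t> labels(k);  // padded with -1 if fewer than k results
  index.search(1, xb.data(), k, distances.data(), labels.data());
  std::printf("nearest to vector 0: id=%lld, distance=%g\n",
              (long long)labels[0], distances[0]);
  return 0;
}
```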

224
src/3rd_party/faiss/IndexLSH.cpp vendored Normal file

@ -0,0 +1,224 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexLSH.h>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <faiss/utils/hamming.h>
#include "common/logging.h"
namespace faiss {
/***************************************************************
* IndexLSH
***************************************************************/
IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds):
Index(d), nbits(nbits), rotate_data(rotate_data),
train_thresholds (train_thresholds), rrot(d, nbits)
{
is_trained = !train_thresholds;
bytes_per_vec = (nbits + 7) / 8;
if (rotate_data) {
rrot.init(5);
} else {
ABORT_UNLESS(d >= nbits, "d >= nbits");
}
}
IndexLSH::IndexLSH ():
nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false)
{
}
const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const
{
float *xt = nullptr;
if (rotate_data) {
// also applies bias if exists
xt = rrot.apply (n, x);
} else if (d != nbits) {
assert (nbits < d);
xt = new float [nbits * n];
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
const float *xl = x + i * d;
for (int j = 0; j < nbits; j++)
*xp++ = xl [j];
}
}
if (train_thresholds) {
if (xt == NULL) {
xt = new float [nbits * n];
memcpy (xt, x, sizeof(*x) * n * nbits);
}
float *xp = xt;
for (idx_t i = 0; i < n; i++)
for (int j = 0; j < nbits; j++)
*xp++ -= thresholds [j];
}
return xt ? xt : x;
}
void IndexLSH::train (idx_t n, const float *x)
{
if (train_thresholds) {
thresholds.resize (nbits);
train_thresholds = false;
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
train_thresholds = true;
float * transposed_x = new float [n * nbits];
ScopeDeleter<float> del2 (transposed_x);
for (idx_t i = 0; i < n; i++)
for (idx_t j = 0; j < nbits; j++)
transposed_x [j * n + i] = xt [i * nbits + j];
for (idx_t i = 0; i < nbits; i++) {
float *xi = transposed_x + i * n;
// std::nth_element
std::sort (xi, xi + n);
if (n % 2 == 1)
thresholds [i] = xi [n / 2];
else
thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2;
}
}
is_trained = true;
}
void IndexLSH::add (idx_t n, const float *x)
{
ABORT_UNLESS (is_trained, "is_trained");
codes.resize ((ntotal + n) * bytes_per_vec);
sa_encode (n, x, &codes[ntotal * bytes_per_vec]);
ntotal += n;
}
void IndexLSH::search (
idx_t n,
const float *x,
idx_t k,
float *distances,
idx_t *labels) const
{
ABORT_UNLESS (is_trained, "is_trained");
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
uint8_t * qcodes = new uint8_t [n * bytes_per_vec];
ScopeDeleter<uint8_t> del2 (qcodes);
fvecs2bitvecs (xt, qcodes, nbits, n);
int * idistances = new int [n * k];
ScopeDeleter<int> del3 (idistances);
int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances};
hammings_knn_hc (&res, qcodes, codes.data(),
ntotal, bytes_per_vec, true);
// convert distances to floats
for (int i = 0; i < k * n; i++)
distances[i] = idistances[i];
}
void IndexLSH::transfer_thresholds (LinearTransform *vt) {
if (!train_thresholds) return;
ABORT_UNLESS (nbits == vt->d_out, "nbits == vt->d_out");
if (!vt->have_bias) {
vt->b.resize (nbits, 0);
vt->have_bias = true;
}
for (int i = 0; i < nbits; i++)
vt->b[i] -= thresholds[i];
train_thresholds = false;
thresholds.clear();
}
void IndexLSH::reset() {
codes.clear();
ntotal = 0;
}
size_t IndexLSH::sa_code_size () const
{
return bytes_per_vec;
}
void IndexLSH::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
ABORT_UNLESS (is_trained, "is_trained");
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
fvecs2bitvecs (xt, bytes, nbits, n);
}
void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
float *xt = x;
ScopeDeleter<float> del;
if (rotate_data || nbits != d) {
xt = new float [n * nbits];
del.set(xt);
}
bitvecs2fvecs (bytes, xt, nbits, n);
if (train_thresholds) {
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < nbits; j++) {
*xp++ += thresholds [j];
}
}
}
if (rotate_data) {
rrot.reverse_transform (n, xt, x);
} else if (nbits != d) {
for (idx_t i = 0; i < n; i++) {
memcpy (x + i * d, xt + i * nbits,
nbits * sizeof(xt[0]));
}
}
}
} // namespace faiss
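A small hedged sketch of the standalone codec path (`sa_encode`/`sa_decode`) implemented above: each vector is reduced to `(nbits + 7) / 8` bytes of sign bits in the (optionally rotated) space, and decoding maps those bits back onto +/-1 components; sizes and values are arbitrary.

```cpp
// Hedged sketch of IndexLSH's standalone codec; sizes and values are arbitrary.
#include <faiss/IndexLSH.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int d = 64, nbits = 128;
  faiss::IndexLSH index(d, nbits);               // rotate_data defaults to true
  std::printf("code size: %zu bytes per vector\n", index.sa_code_size());  // (nbits+7)/8

  std::vector<float> x(d);
  for (int j = 0; j < d; j++) x[j] = (j % 2 ? 0.5f : -0.5f);

  std::vector<uint8_t> code(index.sa_code_size());
  index.sa_encode(1, x.data(), code.data());     // sign bit of each rotated component
  std::vector<float> x_rec(d);
  index.sa_decode(1, code.data(), x_rec.data()); // reconstruct from the +/-1 signs
  std::printf("x[0]=%g, reconstructed=%g\n", x[0], x_rec[0]);
  return 0;
}
```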

90
src/3rd_party/faiss/IndexLSH.h vendored Normal file

@ -0,0 +1,90 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_LSH_H
#define INDEX_LSH_H
#include <vector>
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** The sign of each vector component is put in a binary signature */
struct IndexLSH:Index {
typedef unsigned char uint8_t;
int nbits; ///< nb of bits per vector
int bytes_per_vec; ///< nb of 8-bits per encoded vector
bool rotate_data; ///< whether to apply a random rotation to input
bool train_thresholds; ///< whether we train thresholds or use 0
RandomRotationMatrix rrot; ///< optional random rotation
std::vector <float> thresholds; ///< thresholds to compare with
/// encoded dataset
std::vector<uint8_t> codes;
IndexLSH (
idx_t d, int nbits,
bool rotate_data = true,
bool train_thresholds = false);
/** Preprocesses and resizes the input to the size required to
* binarize the data
*
* @param x input vectors, size n * d
* @return output vectors, size n * bits. May be the same pointer
* as x, otherwise it should be deleted by the caller
*/
const float *apply_preprocess (idx_t n, const float *x) const;
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
/// transfer the thresholds to a pre-processing stage (and unset
/// train_thresholds)
void transfer_thresholds (LinearTransform * vt);
~IndexLSH() override {}
IndexLSH ();
/* standalone codec interface.
*
* The vectors are decoded to +/- 1 (not 0, 1) */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
}
#endif

21
src/3rd_party/faiss/LICENSE vendored Normal file

@ -0,0 +1,21 @@
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
src/3rd_party/faiss/README vendored Normal file

@ -0,0 +1 @@
This is code extracted from the original FAISS repository: https://github.com/facebookresearch/faiss

726
src/3rd_party/faiss/VectorTransform.cpp vendored Normal file

@ -0,0 +1,726 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/VectorTransform.h>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <memory>
#include <faiss/utils/random.h>
#include "common/logging.h"
using namespace faiss;
extern "C" {
// this is to keep the clang syntax checker happy
#ifndef FINTEGER
#define FINTEGER int
#endif
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
int sgemm_ (
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
n, FINTEGER *k, const float *alpha, const float *a,
FINTEGER *lda, const float *b,
FINTEGER *ldb, float *beta,
float *c, FINTEGER *ldc);
int dgemm_ (
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
n, FINTEGER *k, const double *alpha, const double *a,
FINTEGER *lda, const double *b,
FINTEGER *ldb, double *beta,
double *c, FINTEGER *ldc);
int ssyrk_ (
const char *uplo, const char *trans, FINTEGER *n, FINTEGER *k,
float *alpha, float *a, FINTEGER *lda,
float *beta, float *c, FINTEGER *ldc);
/* Lapack functions from http://www.netlib.org/clapack/old/single/ */
int ssyev_ (
const char *jobz, const char *uplo, FINTEGER *n, float *a,
FINTEGER *lda, float *w, float *work, FINTEGER *lwork,
FINTEGER *info);
int dsyev_ (
const char *jobz, const char *uplo, FINTEGER *n, double *a,
FINTEGER *lda, double *w, double *work, FINTEGER *lwork,
FINTEGER *info);
int sgesvd_(
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
float *a, FINTEGER *lda, float *s, float *u, FINTEGER *ldu, float *vt,
FINTEGER *ldvt, float *work, FINTEGER *lwork, FINTEGER *info);
int dgesvd_(
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
double *a, FINTEGER *lda, double *s, double *u, FINTEGER *ldu, double *vt,
FINTEGER *ldvt, double *work, FINTEGER *lwork, FINTEGER *info);
}
///////////////////////////////////////////////
extern "C" {
/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
int sgeqrf_(FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
FINTEGER *lda, float *tau, float *work,
FINTEGER *lwork, FINTEGER *info);
}
void matrix_qr(int m, int n, float *a)
{
ABORT_UNLESS(m >= n, "m >= n");
FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
std::vector<float> tau(ki);
FINTEGER lwork = -1, info;
float work_size;
sgeqrf_(&mi, &ni, a, &mi, tau.data(),
&work_size, &lwork, &info);
lwork = size_t(work_size);
std::vector<float> work(lwork);
sgeqrf_(&mi, &ni, a, &mi,
tau.data(), work.data(), &lwork, &info);
sorgqr_(&mi, &ni, &ki, a, &mi, tau.data(),
work.data(), &lwork, &info);
}
///////////////////////////////////////////////
const float *fvecs_maybe_subsample(
size_t d, size_t *n, size_t nmax, const float *x,
bool verbose = false, int64_t seed = 1234)
{
if (*n <= nmax) return x; // nothing to do
size_t n2 = nmax;
if (verbose) {
printf(" Input training set too big (max size is %ld), sampling "
"%ld / %ld vectors\n", nmax, n2, *n);
}
std::vector<int> subset(*n);
rand_perm(subset.data(), *n, seed);
float *x_subset = new float[n2 * d];
for (int64_t i = 0; i < n2; i++)
memcpy(&x_subset[i * d],
&x[subset[i] * size_t(d)],
sizeof(x[0]) * d);
*n = n2;
return x_subset;
}
#ifdef __SSE__
// reads 0 <= d < 4 floats as __m128
static inline __m128 masked_read(int d, const float *x)
{
assert(0 <= d && d < 4);
__attribute__((__aligned__(16))) float buf[4] = { 0, 0, 0, 0 };
switch (d) {
case 3:
buf[2] = x[2];
case 2:
buf[1] = x[1];
case 1:
buf[0] = x[0];
}
return _mm_load_ps(buf);
// cannot use AVX2 _mm_mask_set1_epi32
}
float fvec_norm_L2sqr(const float * x,
size_t d)
{
__m128 mx;
__m128 msum1 = _mm_setzero_ps();
while (d >= 4) {
mx = _mm_loadu_ps(x); x += 4;
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
d -= 4;
}
mx = masked_read(d, x);
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
msum1 = _mm_hadd_ps(msum1, msum1);
msum1 = _mm_hadd_ps(msum1, msum1);
return _mm_cvtss_f32(msum1);
}
#else
// scalar implementation
float fvec_norm_L2sqr(const float *x, size_t d)
{
return fvec_norm_L2sqr_ref(x, d);
}
#endif
void fvec_renorm_L2(size_t d, size_t nx, float * __restrict x)
{
//#pragma omp parallel for
for (size_t i = 0; i < nx; i++) {
float * __restrict xi = x + i * d;
float nr = fvec_norm_L2sqr(xi, d);
if (nr > 0) {
size_t j;
const float inv_nr = 1.0 / sqrtf(nr);
for (j = 0; j < d; j++)
xi[j] *= inv_nr;
}
}
}
/*********************************************
* VectorTransform
*********************************************/
float * VectorTransform::apply (Index::idx_t n, const float * x) const
{
float * xt = new float[n * d_out];
apply_noalloc (n, x, xt);
return xt;
}
void VectorTransform::train (idx_t, const float *) {
// does nothing by default
}
void VectorTransform::reverse_transform (
idx_t , const float *,
float *) const
{
ABORT ("reverse transform not implemented");
}
/*********************************************
* LinearTransform
*********************************************/
/// both d_in > d_out and d_out < d_in are supported
LinearTransform::LinearTransform (int d_in, int d_out,
bool have_bias):
VectorTransform (d_in, d_out), have_bias (have_bias),
is_orthonormal (false), verbose (false)
{
is_trained = false; // will be trained when A and b are initialized
}
void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
float * xt) const
{
ABORT_UNLESS(is_trained, "Transformation not trained yet");
float c_factor;
if (have_bias) {
ABORT_UNLESS(b.size() == d_out, "Bias not initialized");
float * xi = xt;
for (int i = 0; i < n; i++)
for(int j = 0; j < d_out; j++)
*xi++ = b[j];
c_factor = 1.0;
} else {
c_factor = 0.0;
}
ABORT_UNLESS(A.size() == d_out * d_in,
"Transformation matrix not initialized");
float one = 1;
FINTEGER nbiti = d_out, ni = n, di = d_in;
sgemm_ ("Transposed", "Not transposed",
&nbiti, &ni, &di,
&one, A.data(), &di, x, &di, &c_factor, xt, &nbiti);
}
void LinearTransform::transform_transpose (idx_t n, const float * y,
float *x) const
{
if (have_bias) { // allocate buffer to store bias-corrected data
float *y_new = new float [n * d_out];
const float *yr = y;
float *yw = y_new;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < d_out; j++) {
*yw++ = *yr++ - b [j];
}
}
y = y_new;
}
{
FINTEGER dii = d_in, doi = d_out, ni = n;
float one = 1.0, zero = 0.0;
sgemm_ ("Not", "Not", &dii, &ni, &doi,
&one, A.data (), &dii, y, &doi, &zero, x, &dii);
}
if (have_bias) delete [] y;
}
void LinearTransform::set_is_orthonormal ()
{
if (d_out > d_in) {
// not clear what we should do in this case
is_orthonormal = false;
return;
}
if (d_out == 0) { // borderline case, unnormalized matrix
is_orthonormal = true;
return;
}
double eps = 4e-5;
ABORT_UNLESS(A.size() >= d_out * d_in, "A.size() >= d_out * d_in");
{
std::vector<float> ATA(d_out * d_out);
FINTEGER dii = d_in, doi = d_out;
float one = 1.0, zero = 0.0;
sgemm_ ("Transposed", "Not", &doi, &doi, &dii,
&one, A.data (), &dii,
A.data(), &dii,
&zero, ATA.data(), &doi);
is_orthonormal = true;
for (long i = 0; i < d_out; i++) {
for (long j = 0; j < d_out; j++) {
float v = ATA[i + j * d_out];
if (i == j) v-= 1;
if (fabs(v) > eps) {
is_orthonormal = false;
}
}
}
}
}
void LinearTransform::reverse_transform (idx_t n, const float * xt,
float *x) const
{
if (is_orthonormal) {
transform_transpose (n, xt, x);
} else {
ABORT("reverse transform not implemented for non-orthonormal matrices");
}
}
void LinearTransform::print_if_verbose (
const char*name, const std::vector<double> &mat,
int n, int d) const
{
if (!verbose) return;
printf("matrix %s: %d*%d [\n", name, n, d);
ABORT_UNLESS(mat.size() >= n * d, "mat.size() >= n * d");
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
printf("%10.5g ", mat[i * d + j]);
}
printf("\n");
}
printf("]\n");
}
/*********************************************
* RandomRotationMatrix
*********************************************/
void RandomRotationMatrix::init (int seed)
{
if(d_out <= d_in) {
A.resize (d_out * d_in);
float *q = A.data();
float_randn(q, d_out * d_in, seed);
matrix_qr(d_in, d_out, q);
} else {
// use tight-frame transformation
A.resize (d_out * d_out);
float *q = A.data();
float_randn(q, d_out * d_out, seed);
matrix_qr(d_out, d_out, q);
// remove columns
int i, j;
for (i = 0; i < d_out; i++) {
for(j = 0; j < d_in; j++) {
q[i * d_in + j] = q[i * d_out + j];
}
}
A.resize(d_in * d_out);
}
is_orthonormal = true;
is_trained = true;
}
void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/)
{
// initialize with some arbitrary seed
init (12345);
}
/*********************************************
* PCAMatrix
*********************************************/
PCAMatrix::PCAMatrix (int d_in, int d_out,
float eigen_power, bool random_rotation):
LinearTransform(d_in, d_out, true),
eigen_power(eigen_power), random_rotation(random_rotation)
{
is_trained = false;
max_points_per_d = 1000;
balanced_bins = 0;
}
namespace {
/// Compute the eigenvalue decomposition of symmetric matrix cov,
/// dimensions d_in-by-d_in. Output eigenvectors in cov.
void eig(size_t d_in, double *cov, double *eigenvalues, int verbose)
{
{ // compute eigenvalues and vectors
FINTEGER info = 0, lwork = -1, di = d_in;
double workq;
dsyev_ ("Vectors as well", "Upper",
&di, cov, &di, eigenvalues, &workq, &lwork, &info);
lwork = FINTEGER(workq);
double *work = new double[lwork];
dsyev_ ("Vectors as well", "Upper",
&di, cov, &di, eigenvalues, work, &lwork, &info);
delete [] work;
if (info != 0) {
fprintf (stderr, "WARN ssyev info returns %d, "
"a very bad PCA matrix is learnt\n",
int(info));
// do not throw exception, as the matrix could still be useful
}
if(verbose && d_in <= 10) {
printf("info=%ld new eigvals=[", long(info));
for(int j = 0; j < d_in; j++) printf("%g ", eigenvalues[j]);
printf("]\n");
double *ci = cov;
printf("eigenvecs=\n");
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
printf("%10.4g ", *ci++);
printf("\n");
}
}
}
// revert order of eigenvectors & values
for(int i = 0; i < d_in / 2; i++) {
std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]);
double *v1 = cov + i * d_in;
double *v2 = cov + (d_in - 1 - i) * d_in;
for(int j = 0; j < d_in; j++)
std::swap(v1[j], v2[j]);
}
}
}
void PCAMatrix::train (Index::idx_t n, const float *x)
{
const float * x_in = x;
x = fvecs_maybe_subsample (d_in, (size_t*)&n,
max_points_per_d * d_in, x, verbose);
ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
// compute mean
mean.clear(); mean.resize(d_in, 0.0);
if (have_bias) { // we may want to skip the bias
const float *xi = x;
for (int i = 0; i < n; i++) {
for(int j = 0; j < d_in; j++)
mean[j] += *xi++;
}
for(int j = 0; j < d_in; j++)
mean[j] /= n;
}
if(verbose) {
printf("mean=[");
for(int j = 0; j < d_in; j++) printf("%g ", mean[j]);
printf("]\n");
}
if(n >= d_in) {
// compute covariance matrix, store it in PCA matrix
PCAMat.resize(d_in * d_in);
float * cov = PCAMat.data();
{ // initialize with mean * mean^T term
float *ci = cov;
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
*ci++ = - n * mean[i] * mean[j];
}
}
{
FINTEGER di = d_in, ni = n;
float one = 1.0;
ssyrk_ ("Up", "Non transposed",
&di, &ni, &one, (float*)x, &di, &one, cov, &di);
}
if(verbose && d_in <= 10) {
float *ci = cov;
printf("cov=\n");
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> covd (d_in * d_in);
for (size_t i = 0; i < d_in * d_in; i++) covd [i] = cov [i];
std::vector<double> eigenvaluesd (d_in);
eig (d_in, covd.data (), eigenvaluesd.data (), verbose);
for (size_t i = 0; i < d_in * d_in; i++) PCAMat [i] = covd [i];
eigenvalues.resize (d_in);
for (size_t i = 0; i < d_in; i++)
eigenvalues [i] = eigenvaluesd [i];
} else {
std::vector<float> xc (n * d_in);
for (size_t i = 0; i < n; i++)
for(size_t j = 0; j < d_in; j++)
xc [i * d_in + j] = x [i * d_in + j] - mean[j];
// compute Gram matrix
std::vector<float> gram (n * n);
{
FINTEGER di = d_in, ni = n;
float one = 1.0, zero = 0.0;
ssyrk_ ("Up", "Transposed",
&ni, &di, &one, xc.data(), &di, &zero, gram.data(), &ni);
}
if(verbose && d_in <= 10) {
float *ci = gram.data();
printf("gram=\n");
for(int i = 0; i < n; i++) {
for(int j = 0; j < n; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> gramd (n * n);
for (size_t i = 0; i < n * n; i++)
gramd [i] = gram [i];
std::vector<double> eigenvaluesd (n);
// eig will fill in only the n first eigenvals
eig (n, gramd.data (), eigenvaluesd.data (), verbose);
PCAMat.resize(d_in * n);
for (size_t i = 0; i < n * n; i++)
gram [i] = gramd [i];
eigenvalues.resize (d_in);
// fill in only the n first ones
for (size_t i = 0; i < n; i++)
eigenvalues [i] = eigenvaluesd [i];
{ // compute PCAMat = x' * v
FINTEGER di = d_in, ni = n;
float one = 1.0;
sgemm_ ("Non", "Non Trans",
&di, &ni, &ni,
&one, xc.data(), &di, gram.data(), &ni,
&one, PCAMat.data(), &di);
}
if(verbose && d_in <= 10) {
float *ci = PCAMat.data();
printf("PCAMat=\n");
for(int i = 0; i < n; i++) {
for(int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
fvec_renorm_L2 (d_in, n, PCAMat.data());
}
prepare_Ab();
is_trained = true;
}
void PCAMatrix::copy_from (const PCAMatrix & other)
{
ABORT_UNLESS(other.is_trained, "other.is_trained");
mean = other.mean;
eigenvalues = other.eigenvalues;
PCAMat = other.PCAMat;
prepare_Ab ();
is_trained = true;
}
void PCAMatrix::prepare_Ab ()
{
ABORT_UNLESS(
d_out * d_in <= PCAMat.size(),
"PCA matrix cannot output %d dimensions from %d ",
d_out, d_in);
if (!random_rotation) {
A = PCAMat;
A.resize(d_out * d_in); // strip off useless dimensions
// first scale the components
if (eigen_power != 0) {
float *ai = A.data();
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i], eigen_power);
for(int j = 0; j < d_in; j++)
*ai++ *= factor;
}
}
if (balanced_bins != 0) {
ABORT_UNLESS(d_out % balanced_bins == 0, "d_out % balanced_bins == 0");
int dsub = d_out / balanced_bins;
std::vector <float> Ain;
std::swap(A, Ain);
A.resize(d_out * d_in);
std::vector <float> accu(balanced_bins);
std::vector <int> counter(balanced_bins);
// greedy assignment
for (int i = 0; i < d_out; i++) {
// find best bin
int best_j = -1;
float min_w = 1e30;
for (int j = 0; j < balanced_bins; j++) {
if (counter[j] < dsub && accu[j] < min_w) {
min_w = accu[j];
best_j = j;
}
}
int row_dst = best_j * dsub + counter[best_j];
accu[best_j] += eigenvalues[i];
counter[best_j] ++;
memcpy (&A[row_dst * d_in], &Ain[i * d_in],
d_in * sizeof (A[0]));
}
if (verbose) {
printf(" bin accu=[");
for (int i = 0; i < balanced_bins; i++)
printf("%g ", accu[i]);
printf("]\n");
}
}
} else {
ABORT_UNLESS(balanced_bins == 0,
"both balancing bins and applying a random rotation "
"does not make sense");
RandomRotationMatrix rr(d_out, d_out);
rr.init(5);
// apply scaling on the rotation matrix (right multiplication)
if (eigen_power != 0) {
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i], eigen_power);
for(int j = 0; j < d_out; j++)
rr.A[j * d_out + i] *= factor;
}
}
A.resize(d_in * d_out);
{
FINTEGER dii = d_in, doo = d_out;
float one = 1.0, zero = 0.0;
sgemm_ ("Not", "Not", &dii, &doo, &doo,
&one, PCAMat.data(), &dii, rr.A.data(), &doo, &zero,
A.data(), &dii);
}
}
b.clear(); b.resize(d_out);
for (int i = 0; i < d_out; i++) {
float accu = 0;
for (int j = 0; j < d_in; j++)
accu -= mean[j] * A[j + i * d_in];
b[i] = accu;
}
is_orthonormal = eigen_power == 0;
}
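As a hedged illustration of the `RandomRotationMatrix` that `IndexLSH` uses when `rotate_data` is set: `init` builds an orthonormal matrix from a QR decomposition of Gaussian noise, so `reverse_transform` recovers the input exactly; sizes here are arbitrary.

```cpp
// Hedged sketch of RandomRotationMatrix; sizes are arbitrary.
#include <faiss/VectorTransform.h>
#include <cstdio>
#include <vector>

int main() {
  const int d_in = 16, d_out = 64;
  faiss::RandomRotationMatrix rrot(d_in, d_out);
  rrot.init(5);                                  // the seed IndexLSH uses

  std::vector<float> x(d_in, 1.0f);              // one input vector
  float* xt = rrot.apply(1, x.data());           // newly allocated, size d_out
  std::vector<float> back(d_in);
  rrot.reverse_transform(1, xt, back.data());    // exact, since the matrix is orthonormal
  std::printf("x[0]=%g, round trip=%g\n", x[0], back[0]);
  delete[] xt;                                   // caller owns the buffer returned by apply()
  return 0;
}
```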

184
src/3rd_party/faiss/VectorTransform.h vendored Normal file

@ -0,0 +1,184 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_VECTOR_TRANSFORM_H
#define FAISS_VECTOR_TRANSFORM_H
/** Defines a few objects that apply transformations to a set of
* vectors. Often these are pre-processing steps.
*/
#include <vector>
#include <stdint.h>
#include <faiss/Index.h>
namespace faiss {
/** Any transformation applied on a set of vectors */
struct VectorTransform {
typedef Index::idx_t idx_t;
int d_in; ///! input dimension
int d_out; ///! output dimension
explicit VectorTransform (int d_in = 0, int d_out = 0):
d_in(d_in), d_out(d_out), is_trained(true)
{}
/// set if the VectorTransform does not require training, or if
/// training is done already
bool is_trained;
/** Perform training on a representative set of vectors. Does
* nothing by default.
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train (idx_t n, const float *x);
/** apply the random rotation, return a newly allocated matrix
* @param x size n * d_in
* @return size n * d_out
*/
float *apply (idx_t n, const float * x) const;
/// same as apply, but result is pre-allocated
virtual void apply_noalloc (idx_t n, const float * x,
float *xt) const = 0;
/// reverse transformation. May not be implemented or may return
/// approximate result
virtual void reverse_transform (idx_t n, const float * xt,
float *x) const;
virtual ~VectorTransform () {}
};
/** Generic linear transformation, with bias term applied on output
* y = A * x + b
*/
struct LinearTransform: VectorTransform {
bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
/// Transformation matrix, size d_out * d_in
std::vector<float> A;
/// bias vector, size d_out
std::vector<float> b;
/// both d_in > d_out and d_out < d_in are supported
explicit LinearTransform (int d_in = 0, int d_out = 0,
bool have_bias = false);
/// same as apply, but result is pre-allocated
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// compute x = A^T * (x - b)
/// is reverse transform if A has orthonormal lines
void transform_transpose (idx_t n, const float * y,
float *x) const;
/// works only if is_orthonormal
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal ();
bool verbose;
void print_if_verbose (const char*name, const std::vector<double> &mat,
int n, int d) const;
~LinearTransform() override {}
};
/// Randomly rotate a set of vectors
struct RandomRotationMatrix: LinearTransform {
/// both d_in > d_out and d_out < d_in are supported
RandomRotationMatrix (int d_in, int d_out):
LinearTransform(d_in, d_out, false) {}
/// must be called before the transform is used
void init(int seed);
// initializes with an arbitrary seed
void train(idx_t n, const float* x) override;
RandomRotationMatrix () {}
};
/** Applies a principal component analysis on a set of vectors,
* with optional whitening and random rotation. */
struct PCAMatrix: LinearTransform {
/** after transformation the components are multiplied by
* eigenvalues^eigen_power
*
* =0: no whitening
* =-0.5: full whitening
*/
float eigen_power;
/// random rotation after PCA
bool random_rotation;
/// ratio between # training vectors and dimension
size_t max_points_per_d;
/// try to distribute output eigenvectors in this many bins
int balanced_bins;
/// Mean, size d_in
std::vector<float> mean;
/// eigenvalues of covariance matrix (= squared singular values)
std::vector<float> eigenvalues;
/// PCA matrix, size d_in * d_in
std::vector<float> PCAMat;
// the final matrix is computed after random rotation and/or whitening
explicit PCAMatrix (int d_in = 0, int d_out = 0,
float eigen_power = 0, bool random_rotation = false);
/// train on n vectors. If n < d_in then the eigenvector matrix
/// will be completed with 0s
void train(idx_t n, const float* x) override;
/// copy pre-trained PCA matrix
void copy_from (const PCAMatrix & other);
/// called after mean, PCAMat and eigenvalues are computed
void prepare_Ab();
};
} // namespace faiss
#endif
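A brief hedged sketch of `PCAMatrix` as documented above (synthetic data, arbitrary sizes): train on `n` vectors, then `apply` projects them to `d_out` dimensions; with `eigen_power = 0` no whitening is applied.

```cpp
// Hedged sketch of PCAMatrix training and projection; data is synthetic.
#include <faiss/VectorTransform.h>
#include <faiss/utils/random.h>
#include <vector>

int main() {
  const int d_in = 32, d_out = 8;
  const faiss::Index::idx_t n = 1000;
  std::vector<float> x(n * d_in);
  faiss::float_randn(x.data(), x.size(), 123);   // synthetic training vectors

  faiss::PCAMatrix pca(d_in, d_out);             // eigen_power = 0: no whitening
  pca.train(n, x.data());
  float* xt = pca.apply(n, x.data());            // n x d_out projected vectors
  delete[] xt;                                   // caller owns the buffer returned by apply()
  return 0;
}
```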

122
src/3rd_party/faiss/utils/Heap.cpp vendored Normal file

@ -0,0 +1,122 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/* Function for soft heap */
#include <faiss/utils/Heap.h>
namespace faiss {
template <typename C>
void HeapArray<C>::heapify ()
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++)
heap_heapify<C> (k, val + j * k, ids + j * k);
}
template <typename C>
void HeapArray<C>::reorder ()
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++)
heap_reorder<C> (k, val + j * k, ids + j * k);
}
template <typename C>
void HeapArray<C>::addn (size_t nj, const T *vin, TI j0,
size_t i0, int64_t ni)
{
if (ni == -1) ni = nh;
assert (i0 >= 0 && i0 + ni <= nh);
//#pragma omp parallel for
for (size_t i = i0; i < i0 + ni; i++) {
T * __restrict simi = get_val(i);
TI * __restrict idxi = get_ids (i);
const T *ip_line = vin + (i - i0) * nj;
for (size_t j = 0; j < nj; j++) {
T ip = ip_line [j];
if (C::cmp(simi[0], ip)) {
heap_pop<C> (k, simi, idxi);
heap_push<C> (k, simi, idxi, ip, j + j0);
}
}
}
}
template <typename C>
void HeapArray<C>::addn_with_ids (
size_t nj, const T *vin, const TI *id_in,
int64_t id_stride, size_t i0, int64_t ni)
{
if (id_in == nullptr) {
addn (nj, vin, 0, i0, ni);
return;
}
if (ni == -1) ni = nh;
assert (i0 >= 0 && i0 + ni <= nh);
//#pragma omp parallel for
for (size_t i = i0; i < i0 + ni; i++) {
T * __restrict simi = get_val(i);
TI * __restrict idxi = get_ids (i);
const T *ip_line = vin + (i - i0) * nj;
const TI *id_line = id_in + (i - i0) * id_stride;
for (size_t j = 0; j < nj; j++) {
T ip = ip_line [j];
if (C::cmp(simi[0], ip)) {
heap_pop<C> (k, simi, idxi);
heap_push<C> (k, simi, idxi, ip, id_line [j]);
}
}
}
}
template <typename C>
void HeapArray<C>::per_line_extrema (
T * out_val,
TI * out_ids) const
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++) {
int64_t imin = -1;
typename C::T xval = C::Crev::neutral ();
const typename C::T * x_ = val + j * k;
for (size_t i = 0; i < k; i++)
if (C::cmp (x_[i], xval)) {
xval = x_[i];
imin = i;
}
if (out_val)
out_val[j] = xval;
if (out_ids) {
if (ids && imin != -1)
out_ids[j] = ids [j * k + imin];
else
out_ids[j] = imin;
}
}
}
// explicit instantiations
template struct HeapArray<CMin <float, int64_t> >;
template struct HeapArray<CMax <float, int64_t> >;
template struct HeapArray<CMin <int, int64_t> >;
template struct HeapArray<CMax <int, int64_t> >;
} // END namespace faiss
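A hedged sketch of how `HeapArray` is used, mirroring `IndexLSH::search` above: one max-heap per query row keeps the k smallest entries of a distance table; sizes and values are arbitrary.

```cpp
// Hedged sketch of HeapArray: per-row top-k over a small distance table.
#include <faiss/utils/Heap.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t nq = 2, nb = 5, k = 3;
  std::vector<int> D(nq * nb);                   // e.g. Hamming distances, one row per query
  for (size_t i = 0; i < D.size(); i++) D[i] = int((i * 7) % 11);

  std::vector<int> val(nq * k);
  std::vector<int64_t> ids(nq * k);
  faiss::int_maxheap_array_t res = { nq, k, ids.data(), val.data() };
  res.heapify();                                 // fill heaps with neutral values
  res.addn(nb, D.data());                        // push the nb candidates of each row
  res.reorder();                                 // sort each heap, smallest distance first
  std::printf("query 0: best id=%lld, dist=%d\n", (long long)ids[0], val[0]);
  return 0;
}
```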

495
src/3rd_party/faiss/utils/Heap.h vendored Normal file

@ -0,0 +1,495 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* C++ support for heaps. The set of functions is tailored for
* efficient similarity search.
*
* There is no specific object for a heap, and the functions that
* operate on a single heap are inlined, because heaps are often
* small. More complex functions are implemented in Heap.cpp
*
*/
#ifndef FAISS_Heap_h
#define FAISS_Heap_h
#include <climits>
#include <cstring>
#include <cmath>
#include <cassert>
#include <cstdio>
#include <stdint.h>
#include <limits>
namespace faiss {
/*******************************************************************
* C object: uniform handling of min and max heap
*******************************************************************/
/** The C object gives the type T of the values in the heap, the type
* of the keys, TI and the comparison that is done: > for the minheap
* and < for the maxheap. The neutral value will always be dropped in
* favor of any other value in the heap.
*/
template <typename T_, typename TI_>
struct CMax;
// traits of minheaps = heaps where the minimum value is stored on top
// useful to find the *max* values of an array
template <typename T_, typename TI_>
struct CMin {
typedef T_ T;
typedef TI_ TI;
typedef CMax<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a < b;
}
// value that will be popped first -> must be smaller than all others
// for int types this is not strictly the smallest val (-max - 1)
inline static T neutral () {
return -std::numeric_limits<T>::max();
}
};
template <typename T_, typename TI_>
struct CMax {
typedef T_ T;
typedef TI_ TI;
typedef CMin<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a > b;
}
inline static T neutral () {
return std::numeric_limits<T>::max();
}
};
/*******************************************************************
* Basic heap ops: push and pop
*******************************************************************/
/** Pops the top element from the heap defined by bh_val[0..k-1] and
* bh_ids[0..k-1]. on output the element at k-1 is undefined.
*/
template <class C> inline
void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
typename C::T val = bh_val[k];
size_t i = 1, i1, i2;
while (1) {
i1 = i << 1;
i2 = i1 + 1;
if (i1 > k)
break;
if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
if (C::cmp(val, bh_val[i1]))
break;
bh_val[i] = bh_val[i1];
bh_ids[i] = bh_ids[i1];
i = i1;
}
else {
if (C::cmp(val, bh_val[i2]))
break;
bh_val[i] = bh_val[i2];
bh_ids[i] = bh_ids[i2];
i = i2;
}
}
bh_val[i] = bh_val[k];
bh_ids[i] = bh_ids[k];
}
/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
* bh_ids[0..k-2]. on output the element at k-1 is defined.
*/
template <class C> inline
void heap_push (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
typename C::T val, typename C::TI ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
size_t i = k, i_father;
while (i > 1) {
i_father = i >> 1;
if (!C::cmp (val, bh_val[i_father])) /* the heap structure is ok */
break;
bh_val[i] = bh_val[i_father];
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_val[i] = val;
bh_ids[i] = ids;
}
/* Partial instantiation for heaps with TI = int64_t */
template <typename T> inline
void minheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
{
heap_pop<CMin<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
void minheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
{
heap_push<CMin<T, int64_t> > (k, bh_val, bh_ids, val, ids);
}
template <typename T> inline
void maxheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
{
heap_pop<CMax<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
void maxheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
{
heap_push<CMax<T, int64_t> > (k, bh_val, bh_ids, val, ids);
}
/*******************************************************************
* Heap initialization
*******************************************************************/
/* Initialization phase for the heap (with unconditional pushes).
* Store k0 elements in a heap containing up to k values. Note that
* (bh_val, bh_ids) can be the same as (x, ids) */
template <class C> inline
void heap_heapify (
size_t k,
typename C::T * bh_val,
typename C::TI * bh_ids,
const typename C::T * x = nullptr,
const typename C::TI * ids = nullptr,
size_t k0 = 0)
{
if (k0 > 0) assert (x);
if (ids) {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], ids[i]);
} else {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], i);
}
for (size_t i = k0; i < k; i++) {
bh_val[i] = C::neutral();
bh_ids[i] = -1;
}
}
template <typename T> inline
void minheap_heapify (
size_t k, T * bh_val,
int64_t * bh_ids,
const T * x = nullptr,
const int64_t * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
}
template <typename T> inline
void maxheap_heapify (
size_t k,
T * bh_val,
int64_t * bh_ids,
const T * x = nullptr,
const int64_t * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
}
/*******************************************************************
* Add n elements to the heap
*******************************************************************/
/* Add some elements to the heap */
template <class C> inline
void heap_addn (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
const typename C::T * x,
const typename C::TI * ids,
size_t n)
{
size_t i;
if (ids)
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], ids[i]);
}
}
else
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], i);
}
}
}
/* Partial instantiation for heaps with TI = int64_t */
template <typename T> inline
void minheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
const T * x, const int64_t * ids, size_t n)
{
heap_addn<CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
}
template <typename T> inline
void maxheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
const T * x, const int64_t * ids, size_t n)
{
heap_addn<CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
}
/*******************************************************************
* Heap finalization (reorder elements)
*******************************************************************/
/* This function maps a binary heap into a sorted structure.
It returns the number of valid (non-neutral) elements. */
template <typename C> inline
size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
size_t i, ii;
for (i = 0, ii = 0; i < k; i++) {
/* top element should be put at the end of the list */
typename C::T val = bh_val[0];
typename C::TI id = bh_ids[0];
/* boundary case: we will over-ride this value if not a true element */
heap_pop<C> (k-i, bh_val, bh_ids);
bh_val[k-ii-1] = val;
bh_ids[k-ii-1] = id;
if (id != -1) ii++;
}
/* Count the number of elements which are effectively returned */
size_t nel = ii;
memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val));
memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids));
for (; ii < k; ii++) {
bh_val[ii] = C::neutral();
bh_ids[ii] = -1;
}
return nel;
}
template <typename T> inline
size_t minheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
{
return heap_reorder< CMin<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
size_t maxheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
{
return heap_reorder< CMax<T, int64_t> > (k, bh_val, bh_ids);
}
/*******************************************************************
* Operations on heap arrays
*******************************************************************/
/** a template structure for a set of [min|max]-heaps; it is tailored
* so that the actual data of the heaps can just live in compact
* arrays.
*/
template <typename C>
struct HeapArray {
typedef typename C::TI TI;
typedef typename C::T T;
size_t nh; ///< number of heaps
size_t k; ///< allocated size per heap
TI * ids; ///< identifiers (size nh * k)
T * val; ///< values (distances or similarities), size nh * k
/// Return the list of values for a heap
T * get_val (size_t key) { return val + key * k; }
/// Corresponding identifiers
TI * get_ids (size_t key) { return ids + key * k; }
/// prepare all the heaps before adding
void heapify ();
/** add nj elements to heaps i0:i0+ni, with sequential ids
*
* @param nj nb of elements to add to each heap
* @param vin elements to add, size ni * nj
* @param j0 add this to the ids that are added
* @param i0 first heap to update
* @param ni nb of elements to update (-1 = use nh)
*/
void addn (size_t nj, const T *vin, TI j0 = 0,
size_t i0 = 0, int64_t ni = -1);
/** same as addn
*
* @param id_in ids of the elements to add, size ni * nj
* @param id_stride stride for id_in
*/
void addn_with_ids (
size_t nj, const T *vin, const TI *id_in = nullptr,
int64_t id_stride = 0, size_t i0 = 0, int64_t ni = -1);
/// reorder all the heaps
void reorder ();
/** this is not really a heap function. It just finds the per-line
* extrema of each line of array D
* @param vals_out extreme value of each line (size nh, or NULL)
* @param idx_out index of extreme value (size nh or NULL)
*/
void per_line_extrema (T *vals_out, TI *idx_out) const;
};
/* Define useful heaps */
typedef HeapArray<CMin<float, int64_t> > float_minheap_array_t;
typedef HeapArray<CMin<int, int64_t> > int_minheap_array_t;
typedef HeapArray<CMax<float, int64_t> > float_maxheap_array_t;
typedef HeapArray<CMax<int, int64_t> > int_maxheap_array_t;
// The heap templates are instantiated explicitly in Heap.cpp
/*********************************************************************
* Indirect heaps: instead of having
*
* node i = (bh_ids[i], bh_val[i]),
*
* in indirect heaps,
*
* node i = (bh_ids[i], bh_val[bh_ids[i]]),
*
*********************************************************************/
template <class C>
inline
void indirect_heap_pop (
size_t k,
const typename C::T * bh_val,
typename C::TI * bh_ids)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[bh_ids[k]];
size_t i = 1;
while (1) {
size_t i1 = i << 1;
size_t i2 = i1 + 1;
if (i1 > k)
break;
typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2];
if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) {
if (C::cmp(val, bh_val[id1]))
break;
bh_ids[i] = id1;
i = i1;
} else {
if (C::cmp(val, bh_val[id2]))
break;
bh_ids[i] = id2;
i = i2;
}
}
bh_ids[i] = bh_ids[k];
}
template <class C>
inline
void indirect_heap_push (size_t k,
const typename C::T * bh_val, typename C::TI * bh_ids,
typename C::TI id)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[id];
size_t i = k;
while (i > 1) {
size_t i_father = i >> 1;
if (!C::cmp (val, bh_val[bh_ids[i_father]]))
break;
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_ids[i] = id;
}
} // namespace faiss
#endif /* FAISS_Heap_h */
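And a hedged sketch of the single-heap helpers declared above: a max-heap (`CMax`) keeps the k smallest values seen so far, which is how nearest neighbors under a distance are collected, and `maxheap_reorder` then sorts them in ascending order; values are arbitrary.

```cpp
// Hedged sketch: collect the k smallest values of an array with the maxheap helpers.
#include <faiss/utils/Heap.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t k = 3;
  std::vector<float> vals = {5.f, 1.f, 4.f, 2.f, 8.f, 0.5f};

  std::vector<float> heap_val(k);
  std::vector<int64_t> heap_ids(k);
  faiss::maxheap_heapify(k, heap_val.data(), heap_ids.data());  // fill with neutral values
  faiss::maxheap_addn(k, heap_val.data(), heap_ids.data(),
                      vals.data(), nullptr, vals.size());       // ids default to positions in vals
  faiss::maxheap_reorder(k, heap_val.data(), heap_ids.data());  // ascending: 0.5, 1, 2

  for (size_t i = 0; i < k; i++)
    std::printf("%zu: val=%g, id=%lld\n", i, heap_val[i], (long long)heap_ids[i]);
  return 0;
}
```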

472
src/3rd_party/faiss/utils/hamming-inl.h vendored Normal file

@ -0,0 +1,472 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
namespace faiss {
inline BitstringWriter::BitstringWriter(uint8_t *code, int code_size):
code (code), code_size (code_size), i(0)
{
bzero (code, code_size);
}
inline void BitstringWriter::write(uint64_t x, int nbit) {
assert (code_size * 8 >= nbit + i);
// nb of available bits in i / 8
int na = 8 - (i & 7);
if (nbit <= na) {
code[i >> 3] |= x << (i & 7);
i += nbit;
return;
} else {
int j = i >> 3;
code[j++] |= x << (i & 7);
i += nbit;
x >>= na;
while (x != 0) {
code[j++] |= x;
x >>= 8;
}
}
}
inline BitstringReader::BitstringReader(const uint8_t *code, int code_size):
code (code), code_size (code_size), i(0)
{}
inline uint64_t BitstringReader::read(int nbit) {
assert (code_size * 8 >= nbit + i);
// nb of available bits in i / 8
int na = 8 - (i & 7);
// get available bits in current byte
uint64_t res = code[i >> 3] >> (i & 7);
if (nbit <= na) {
res &= (1 << nbit) - 1;
i += nbit;
return res;
} else {
int ofs = na;
int j = (i >> 3) + 1;
i += nbit;
nbit -= na;
while (nbit > 8) {
res |= ((uint64_t)code[j++]) << ofs;
ofs += 8;
nbit -= 8; // TODO remove nbit
}
uint64_t last_byte = code[j];
last_byte &= (1 << nbit) - 1;
res |= last_byte << ofs;
return res;
}
}
/******************************************************************
* The HammingComputer series of classes compares a single code of
* size 4 to 32 to incoming codes. They are intended for use as a
* template class where it would be inefficient to switch on the code
* size in the inner loop. Hopefully the compiler will inline the
* hamming() functions and put the a0, a1, ... in registers.
******************************************************************/
struct HammingComputer4 {
uint32_t a0;
HammingComputer4 () {}
HammingComputer4 (const uint8_t *a, int code_size) {
set (a, code_size);
}
void set (const uint8_t *a, int code_size) {
assert (code_size == 4);
a0 = *(uint32_t *)a;
}
inline int hamming (const uint8_t *b) const {
return popcount64 (*(uint32_t *)b ^ a0);
}
};
struct HammingComputer8 {
uint64_t a0;
HammingComputer8 () {}
HammingComputer8 (const uint8_t *a, int code_size) {
set (a, code_size);
}
void set (const uint8_t *a, int code_size) {
assert (code_size == 8);
a0 = *(uint64_t *)a;
}
inline int hamming (const uint8_t *b) const {
return popcount64 (*(uint64_t *)b ^ a0);
}
};
struct HammingComputer16 {
uint64_t a0, a1;
HammingComputer16 () {}
HammingComputer16 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 16);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1);
}
};
// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
// This incurs a penalty of ~10% wrt. fully aligned accesses.
struct HammingComputer20 {
uint64_t a0, a1;
uint32_t a2;
HammingComputer20 () {}
HammingComputer20 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 20);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (*(uint32_t*)(b + 2) ^ a2);
}
};
struct HammingComputer32 {
uint64_t a0, a1, a2, a3;
HammingComputer32 () {}
HammingComputer32 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 32);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3);
}
};
struct HammingComputer64 {
uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
HammingComputer64 () {}
HammingComputer64 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 64);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
a4 = a[4]; a5 = a[5]; a6 = a[6]; a7 = a[7];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3) +
popcount64 (b[4] ^ a4) + popcount64 (b[5] ^ a5) +
popcount64 (b[6] ^ a6) + popcount64 (b[7] ^ a7);
}
};
// very inefficient...
struct HammingComputerDefault {
const uint8_t *a;
int n;
HammingComputerDefault () {}
HammingComputerDefault (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
a = a8;
n = code_size;
}
int hamming (const uint8_t *b8) const {
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b8[i]);
return accu;
}
};
struct HammingComputerM8 {
const uint64_t *a;
int n;
HammingComputerM8 () {}
HammingComputerM8 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size % 8 == 0);
a = (uint64_t *)a8;
n = code_size / 8;
}
int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b[i]);
return accu;
}
};
// even more inefficient!
struct HammingComputerM4 {
const uint32_t *a;
int n;
HammingComputerM4 () {}
HammingComputerM4 (const uint8_t *a4, int code_size) {
set (a4, code_size);
}
void set (const uint8_t *a4, int code_size) {
assert (code_size % 4 == 0);
a = (uint32_t *)a4;
n = code_size / 4;
}
int hamming (const uint8_t *b8) const {
const uint32_t *b = (uint32_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b[i]);
return accu;
}
};
/***************************************************************************
* Equivalence with a template class when code size is known at compile time
**************************************************************************/
// default template
template<int CODE_SIZE>
struct HammingComputer: HammingComputerM8 {
HammingComputer (const uint8_t *a, int code_size):
HammingComputerM8(a, code_size) {}
};
#define SPECIALIZED_HC(CODE_SIZE) \
template<> struct HammingComputer<CODE_SIZE>: \
HammingComputer ## CODE_SIZE { \
HammingComputer (const uint8_t *a): \
HammingComputer ## CODE_SIZE(a, CODE_SIZE) {} \
}
SPECIALIZED_HC(4);
SPECIALIZED_HC(8);
SPECIALIZED_HC(16);
SPECIALIZED_HC(20);
SPECIALIZED_HC(32);
SPECIALIZED_HC(64);
#undef SPECIALIZED_HC
/***************************************************************************
* generalized Hamming = number of bytes that are different between
* two codes.
***************************************************************************/
inline int generalized_hamming_64 (uint64_t a) {
a |= a >> 1;
a |= a >> 2;
a |= a >> 4;
a &= 0x0101010101010101UL;
return popcount64 (a);
}
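
The shift-OR cascade folds every bit of a byte into that byte's lowest bit, the mask keeps only those low bits, and the popcount therefore returns the number of non-zero bytes, i.e. the number of byte positions at which two codes differ after XOR. A small check (illustration only):

#include <faiss/utils/hamming.h>
#include <cassert>
#include <cstdint>

void generalized_hamming_example() {
  // bytes 6 and 2 are non-zero, all other bytes are zero
  uint64_t x = 0x00FF000000120000ULL;
  assert(faiss::generalized_hamming_64(x) == 2);
}
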
struct GenHammingComputer8 {
uint64_t a0;
GenHammingComputer8 (const uint8_t *a, int code_size) {
assert (code_size == 8);
a0 = *(uint64_t *)a;
}
inline int hamming (const uint8_t *b) const {
return generalized_hamming_64 (*(uint64_t *)b ^ a0);
}
};
struct GenHammingComputer16 {
uint64_t a0, a1;
GenHammingComputer16 (const uint8_t *a8, int code_size) {
assert (code_size == 16);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return generalized_hamming_64 (b[0] ^ a0) +
generalized_hamming_64 (b[1] ^ a1);
}
};
struct GenHammingComputer32 {
uint64_t a0, a1, a2, a3;
GenHammingComputer32 (const uint8_t *a8, int code_size) {
assert (code_size == 32);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return generalized_hamming_64 (b[0] ^ a0) +
generalized_hamming_64 (b[1] ^ a1) +
generalized_hamming_64 (b[2] ^ a2) +
generalized_hamming_64 (b[3] ^ a3);
}
};
struct GenHammingComputerM8 {
const uint64_t *a;
int n;
GenHammingComputerM8 (const uint8_t *a8, int code_size) {
assert (code_size % 8 == 0);
a = (uint64_t *)a8;
n = code_size / 8;
}
int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += generalized_hamming_64 (a[i] ^ b[i]);
return accu;
}
};
/** generalized Hamming distances (= count number of code bytes that
 differ) */
void generalized_hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t code_size,
int ordered = true);
/** This class maintains a list of best distances seen so far.
*
* Since the distances are in a limited range (0 to nbit), the
* object maintains one list per possible distance, and fills
* in only the n-first lists, such that the sum of sizes of the
* n lists is below k.
*/
template<class HammingComputer>
struct HCounterState {
int *counters;
int64_t *ids_per_dis;
HammingComputer hc;
int thres;
int count_lt;
int count_eq;
int k;
HCounterState(int *counters, int64_t *ids_per_dis,
const uint8_t *x, int d, int k)
: counters(counters),
ids_per_dis(ids_per_dis),
hc(x, d / 8),
thres(d + 1),
count_lt(0),
count_eq(0),
k(k) {}
void update_counter(const uint8_t *y, size_t j) {
int32_t dis = hc.hamming(y);
if (dis <= thres) {
if (dis < thres) {
ids_per_dis[dis * k + counters[dis]++] = j;
++count_lt;
while (count_lt == k && thres > 0) {
--thres;
count_eq = counters[thres];
count_lt -= count_eq;
}
} else if (count_eq < k) {
ids_per_dis[dis * k + count_eq++] = j;
counters[dis] = count_eq;
}
}
}
};
} // namespace faiss
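
As the comment above the HammingComputer classes says, they are meant to be instantiated once per query and passed as a template parameter, so the code size is fixed outside the inner loop. A minimal sketch of that pattern (illustration only, mirroring the inner loops in hamming.cpp below):

#include <faiss/utils/hamming.h>
#include <cstddef>
#include <cstdint>

template <class HC>
size_t count_within_radius(const uint8_t* query,
                           const uint8_t* db, size_t n, size_t code_size,
                           int radius) {
  HC hc(query, (int)code_size);   // code size is checked/fixed by the computer class
  size_t hits = 0;
  for (size_t j = 0; j < n; j++)
    if (hc.hamming(db + j * code_size) <= radius)
      hits++;
  return hits;
}
// e.g. count_within_radius<faiss::HammingComputer16>(q, db, n, /*code_size=*/16, /*radius=*/5);
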

879
src/3rd_party/faiss/utils/hamming.cpp vendored Normal file
View File

@ -0,0 +1,879 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* Implementation of Hamming related functions (distances, smallest distance
* selection with regular heap|radix and probabilistic heap|radix.
*
* IMPLEMENTATION NOTES
* Bitvectors are generally assumed to be multiples of 64 bits.
*
* hamdis_t is used for distances because at this time
* it is not clear how we will need to balance
 * - flexibility in vector size (unclear whether vectors of more than 2^16 or even 2^8 bits will be needed)
* - memory usage
* - cache-misses when dealing with large volumes of data (lower bits is better)
*
 * The hamdis_t should optimally be compatible with one of the Torch Storage
* (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes
*/
#include <faiss/utils/hamming.h>
#include <vector>
#include <memory>
#include <stdio.h>
#include <math.h>
#include <faiss/utils/Heap.h>
#include "common/logging.h"
#include "misc.h"
static const size_t BLOCKSIZE_QUERY = 8192;
namespace faiss {
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
for (size_t i = 0; i < d; ++i) {
x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
}
}
//////////////////////////////////////////
size_t hamming_batch_size = 65536;
static const uint8_t hamdis_tab_ham_bytes[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
/* Elementary Hamming distance computation: unoptimized */
template <size_t nbits, typename T>
T hamming (const uint8_t *bs1,
const uint8_t *bs2)
{
const size_t nbytes = nbits / 8;
size_t i;
T h = 0;
for (i = 0; i < nbytes; i++)
h += (T) hamdis_tab_ham_bytes[bs1[i]^bs2[i]];
return h;
}
/* Hamming distances for multiples of 64 bits */
template <size_t nbits>
hamdis_t hamming (const uint64_t * bs1, const uint64_t * bs2)
{
const size_t nwords = nbits / 64;
size_t i;
hamdis_t h = 0;
for (i = 0; i < nwords; i++)
h += popcount64 (bs1[i] ^ bs2[i]);
return h;
}
/* specialized (optimized) functions */
template <>
hamdis_t hamming<64> (const uint64_t * pa, const uint64_t * pb)
{
return popcount64 (pa[0] ^ pb[0]);
}
template <>
hamdis_t hamming<128> (const uint64_t *pa, const uint64_t *pb)
{
return popcount64 (pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
}
template <>
hamdis_t hamming<256> (const uint64_t * pa, const uint64_t * pb)
{
return popcount64 (pa[0] ^ pb[0])
+ popcount64 (pa[1] ^ pb[1])
+ popcount64 (pa[2] ^ pb[2])
+ popcount64 (pa[3] ^ pb[3]);
}
/* Hamming distances for multiple of 64 bits */
hamdis_t hamming (
const uint64_t * bs1,
const uint64_t * bs2,
size_t nwords)
{
size_t i;
hamdis_t h = 0;
for (i = 0; i < nwords; i++)
h += popcount64 (bs1[i] ^ bs2[i]);
return h;
}
template <size_t nbits>
void hammings (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1, size_t n2,
hamdis_t * dis)
{
size_t i, j;
const size_t nwords = nbits / 64;
for (i = 0; i < n1; i++) {
const uint64_t * __restrict bs1_ = bs1 + i * nwords;
hamdis_t * __restrict dis_ = dis + i * n2;
for (j = 0; j < n2; j++)
dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
}
}
void hammings (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
size_t nwords,
hamdis_t * __restrict dis)
{
size_t i, j;
n1 *= nwords;
n2 *= nwords;
for (i = 0; i < n1; i+=nwords) {
const uint64_t * bs1_ = bs1+i;
for (j = 0; j < n2; j+=nwords)
dis[j] = hamming (bs1_, bs2+j, nwords);
}
}
/* Count number of matches given a max threshold */
template <size_t nbits>
void hamming_count_thres (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t * nptr)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
const uint64_t * bs2_ = bs2;
for (i = 0; i < n1; i++) {
bs2 = bs2_;
for (j = 0; j < n2; j++) {
/* collect the match only if this satisfies the threshold */
if (hamming <nbits> (bs1, bs2) <= ht)
posm++;
bs2 += nwords;
}
bs1 += nwords; /* next signature */
}
*nptr = posm;
}
template <size_t nbits>
void crosshamming_count_thres (
const uint64_t * dbs,
size_t n,
int ht,
size_t * nptr)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
const uint64_t * bs1 = dbs;
for (i = 0; i < n; i++) {
const uint64_t * bs2 = bs1 + 2;
for (j = i + 1; j < n; j++) {
/* collect the match only if this satisfies the threshold */
if (hamming <nbits> (bs1, bs2) <= ht)
posm++;
bs2 += nwords;
}
bs1 += nwords;
}
*nptr = posm;
}
template <size_t nbits>
size_t match_hamming_thres (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
int ht,
int64_t * idx,
hamdis_t * hams)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
hamdis_t h;
const uint64_t * bs2_ = bs2;
for (i = 0; i < n1; i++) {
bs2 = bs2_;
for (j = 0; j < n2; j++) {
/* Here perform the real work of computing the distance */
h = hamming <nbits> (bs1, bs2);
/* collect the match only if this satisfies the threshold */
if (h <= ht) {
/* Enough space to store another match ? */
*idx = i; idx++;
*idx = j; idx++;
*hams = h;
hams++;
posm++;
}
bs2+=nwords; /* next signature */
}
bs1+=nwords;
}
return posm;
}
/* Return closest neighbors w.r.t Hamming distance, using a heap. */
template <class HammingComputer>
static
void hammings_knn_hc (
int bytes_per_code,
int_maxheap_array_t * ha,
const uint8_t * bs1,
const uint8_t * bs2,
size_t n2,
bool order = true,
bool init_heap = true)
{
size_t k = ha->k;
if (init_heap) ha->heapify ();
const size_t block_size = hamming_batch_size;
for (size_t j0 = 0; j0 < n2; j0 += block_size) {
const size_t j1 = std::min(j0 + block_size, n2);
//#pragma omp parallel for
for (size_t i = 0; i < ha->nh; i++) {
HammingComputer hc (bs1 + i * bytes_per_code, bytes_per_code);
const uint8_t * bs2_ = bs2 + j0 * bytes_per_code;
hamdis_t dis;
hamdis_t * __restrict bh_val_ = ha->val + i * k;
int64_t * __restrict bh_ids_ = ha->ids + i * k;
size_t j;
for (j = j0; j < j1; j++, bs2_+= bytes_per_code) {
dis = hc.hamming (bs2_);
if (dis < bh_val_[0]) {
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
}
}
}
}
if (order) ha->reorder ();
}
/* Return closest neighbors w.r.t Hamming distance, using max count. */
template <class HammingComputer>
static
void hammings_knn_mc (
int bytes_per_code,
const uint8_t *a,
const uint8_t *b,
size_t na,
size_t nb,
size_t k,
int32_t *distances,
int64_t *labels)
{
const int nBuckets = bytes_per_code * 8 + 1;
std::vector<int> all_counters(na * nBuckets, 0);
std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
std::vector<HCounterState<HammingComputer>> cs;
for (size_t i = 0; i < na; ++i) {
cs.push_back(HCounterState<HammingComputer>(
all_counters.data() + i * nBuckets,
all_ids_per_dis.get() + i * nBuckets * k,
a + i * bytes_per_code,
8 * bytes_per_code,
k
));
}
const size_t block_size = hamming_batch_size;
for (size_t j0 = 0; j0 < nb; j0 += block_size) {
const size_t j1 = std::min(j0 + block_size, nb);
//#pragma omp parallel for
for (size_t i = 0; i < na; ++i) {
for (size_t j = j0; j < j1; ++j) {
cs[i].update_counter(b + j * bytes_per_code, j);
}
}
}
for (size_t i = 0; i < na; ++i) {
HCounterState<HammingComputer>& csi = cs[i];
int nres = 0;
for (int b = 0; b < nBuckets && nres < k; b++) {
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
distances[i * k + nres] = b;
nres++;
}
}
while (nres < k) {
labels[i * k + nres] = -1;
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
++nres;
}
}
}
// works faster than the template version
static
void hammings_knn_hc_1 (
int_maxheap_array_t * ha,
const uint64_t * bs1,
const uint64_t * bs2,
size_t n2,
bool order = true,
bool init_heap = true)
{
const size_t nwords = 1;
size_t k = ha->k;
if (init_heap) {
ha->heapify ();
}
//#pragma omp parallel for
for (size_t i = 0; i < ha->nh; i++) {
const uint64_t bs1_ = bs1 [i];
const uint64_t * bs2_ = bs2;
hamdis_t dis;
hamdis_t * bh_val_ = ha->val + i * k;
hamdis_t bh_val_0 = bh_val_[0];
int64_t * bh_ids_ = ha->ids + i * k;
size_t j;
for (j = 0; j < n2; j++, bs2_+= nwords) {
dis = popcount64 (bs1_ ^ *bs2_);
if (dis < bh_val_0) {
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
bh_val_0 = bh_val_[0];
}
}
}
if (order) {
ha->reorder ();
}
}
/* Functions to map vectors to bits. Assume proper allocation is done beforehand,
meaning that b should be able to receive as many bits as x may produce. */
/*
* dimension 0 corresponds to the least significant bit of b[0], or
* equivalently to the lsb of the first byte that is stored.
*/
void fvec2bitvec (const float * x, uint8_t * b, size_t d)
{
for (int i = 0; i < d; i += 8) {
uint8_t w = 0;
uint8_t mask = 1;
int nj = i + 8 <= d ? 8 : d - i;
for (int j = 0; j < nj; j++) {
if (x[i + j] >= 0)
w |= mask;
mask <<= 1;
}
*b = w;
b++;
}
}
/* Same but for n vectors.
Ensure that the output b is byte-aligned (pad with 0s). */
void fvecs2bitvecs (const float * x, uint8_t * b, size_t d, size_t n)
{
const int64_t ncodes = ((d + 7) / 8);
//#pragma omp parallel for if(n > 100000)
for (size_t i = 0; i < n; i++)
fvec2bitvec (x + i * d, b + i * ncodes, d);
}
void bitvecs2fvecs (
const uint8_t * b,
float * x,
size_t d,
size_t n) {
const int64_t ncodes = ((d + 7) / 8);
//#pragma omp parallel for if(n > 100000)
for (size_t i = 0; i < n; i++) {
binary_to_real (d, b + i * ncodes, x + i * d);
}
}
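
A short round-trip sketch for the two conversions above (illustration only): each float is binarized to its sign bit, and mapping back yields +1/-1 values.

#include <faiss/utils/hamming.h>
#include <vector>

void binarize_example() {
  const size_t d = 16, n = 2;
  std::vector<float> x(n * d, 0.5f);
  x[3] = -1.0f;                                    // negative entries become 0-bits
  std::vector<uint8_t> codes(n * ((d + 7) / 8));   // one byte per 8 dimensions
  faiss::fvecs2bitvecs(x.data(), codes.data(), d, n);
  std::vector<float> back(n * d);
  faiss::bitvecs2fvecs(codes.data(), back.data(), d, n);  // back[3] == -1, all others +1
}
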
/* Reverse bits (NOT an optimized function, only used for printing purposes) */
static uint64_t uint64_reverse_bits (uint64_t b)
{
int i;
uint64_t revb = 0;
for (i = 0; i < 64; i++) {
revb <<= 1;
revb |= b & 1;
b >>= 1;
}
return revb;
}
/* print the bit vector */
void bitvec_print (const uint8_t * b, size_t d)
{
size_t i, j;
for (i = 0; i < d; ) {
uint64_t brev = uint64_reverse_bits (* (uint64_t *) b);
for (j = 0; j < 64 && i < d; j++, i++) {
printf ("%d", (int) (brev & 1));
brev >>= 1;
}
b += 8;
printf (" ");
}
}
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b)
{
for(size_t i = 0; i < db; i++) {
ABORT_UNLESS (order[i] >= 0 && order[i] < da, "order[i] >= 0 && order[i] < da");
}
size_t lda = (da + 7) / 8;
size_t ldb = (db + 7) / 8;
//#pragma omp parallel for if(n > 10000)
for (size_t i = 0; i < n; i++) {
const uint8_t *ai = a + i * lda;
uint8_t *bi = b + i * ldb;
memset (bi, 0, ldb);
for(size_t i = 0; i < db; i++) {
int o = order[i];
uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
bi[i >> 3] |= the_bit << (i & 7);
}
}
}
/*----------------------------------------*/
/* Hamming distance computation and k-nn */
#define C64(x) ((uint64_t *)x)
/* Compute a set of Hamming distances */
void hammings (
const uint8_t * a,
const uint8_t * b,
size_t na, size_t nb,
size_t ncodes,
hamdis_t * __restrict dis)
{
ABORT_UNLESS (ncodes % 8 == 0, "ncodes % 8 == 0");
switch (ncodes) {
case 8:
faiss::hammings <64> (C64(a), C64(b), na, nb, dis); return;
case 16:
faiss::hammings <128> (C64(a), C64(b), na, nb, dis); return;
case 32:
faiss::hammings <256> (C64(a), C64(b), na, nb, dis); return;
case 64:
faiss::hammings <512> (C64(a), C64(b), na, nb, dis); return;
default:
faiss::hammings (C64(a), C64(b), na, nb, ncodes * 8, dis); return;
}
}
void hammings_knn(
int_maxheap_array_t *ha,
const uint8_t *a,
const uint8_t *b,
size_t nb,
size_t ncodes,
int order)
{
hammings_knn_hc(ha, a, b, nb, ncodes, order);
}
void hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int order)
{
switch (ncodes) {
case 4:
hammings_knn_hc<faiss::HammingComputer4>
(4, ha, a, b, nb, order, true);
break;
case 8:
hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
// hammings_knn_hc<faiss::HammingComputer8>
// (8, ha, a, b, nb, order, true);
break;
case 16:
hammings_knn_hc<faiss::HammingComputer16>
(16, ha, a, b, nb, order, true);
break;
case 32:
hammings_knn_hc<faiss::HammingComputer32>
(32, ha, a, b, nb, order, true);
break;
default:
if(ncodes % 8 == 0) {
hammings_knn_hc<faiss::HammingComputerM8>
(ncodes, ha, a, b, nb, order, true);
} else {
hammings_knn_hc<faiss::HammingComputerDefault>
(ncodes, ha, a, b, nb, order, true);
}
}
}
void hammings_knn_mc(
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
size_t k,
size_t ncodes,
int32_t *distances,
int64_t *labels)
{
switch (ncodes) {
case 4:
hammings_knn_mc<faiss::HammingComputer4>(
4, a, b, na, nb, k, distances, labels
);
break;
case 8:
// TODO(hoss): Write analog to hammings_knn_hc_1
// hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
hammings_knn_mc<faiss::HammingComputer8>(
8, a, b, na, nb, k, distances, labels
);
break;
case 16:
hammings_knn_mc<faiss::HammingComputer16>(
16, a, b, na, nb, k, distances, labels
);
break;
case 32:
hammings_knn_mc<faiss::HammingComputer32>(
32, a, b, na, nb, k, distances, labels
);
break;
default:
if(ncodes % 8 == 0) {
hammings_knn_mc<faiss::HammingComputerM8>(
ncodes, a, b, na, nb, k, distances, labels
);
} else {
hammings_knn_mc<faiss::HammingComputerDefault>(
ncodes, a, b, na, nb, k, distances, labels
);
}
}
}
template <class HammingComputer>
static
void hamming_range_search_template (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *res)
{
//#pragma omp parallel
{
RangeSearchPartialResult pres (res);
//#pragma omp for
for (size_t i = 0; i < na; i++) {
HammingComputer hc (a + i * code_size, code_size);
const uint8_t * yi = b;
RangeQueryResult & qres = pres.new_result (i);
for (size_t j = 0; j < nb; j++) {
int dis = hc.hamming (yi);
if (dis < radius) {
qres.add(dis, j);
}
yi += code_size;
}
}
pres.finalize ();
}
}
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *result)
{
#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)
switch(code_size) {
case 4: HC(HammingComputer4); break;
case 8: HC(HammingComputer8); break;
case 16: HC(HammingComputer16); break;
case 32: HC(HammingComputer32); break;
default:
if (code_size % 8 == 0) {
HC(HammingComputerM8);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}
/* Count number of matches given a max threshold */
void hamming_count_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
size_t * nptr)
{
switch (ncodes) {
case 8:
faiss::hamming_count_thres <64> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 16:
faiss::hamming_count_thres <128> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 32:
faiss::hamming_count_thres <256> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 64:
faiss::hamming_count_thres <512> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
default:
ABORT ("not implemented for %zu bits", ncodes);
}
}
/* Count number of cross-matches given a threshold */
void crosshamming_count_thres (
const uint8_t * dbs,
size_t n,
hamdis_t ht,
size_t ncodes,
size_t * nptr)
{
switch (ncodes) {
case 8:
faiss::crosshamming_count_thres <64> (C64(dbs), n, ht, nptr);
return;
case 16:
faiss::crosshamming_count_thres <128> (C64(dbs), n, ht, nptr);
return;
case 32:
faiss::crosshamming_count_thres <256> (C64(dbs), n, ht, nptr);
return;
case 64:
faiss::crosshamming_count_thres <512> (C64(dbs), n, ht, nptr);
return;
default:
ABORT ("not implemented for %zu bits", ncodes);
}
}
/* Returns all matches given a threshold */
size_t match_hamming_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
int64_t * idx,
hamdis_t * dis)
{
switch (ncodes) {
case 8:
return faiss::match_hamming_thres <64> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 16:
return faiss::match_hamming_thres <128> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 32:
return faiss::match_hamming_thres <256> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 64:
return faiss::match_hamming_thres <512> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
default:
ABORT ("not implemented for %zu bits", ncodes);
return 0;
}
}
#undef C64
/*************************************
* generalized Hamming distances
************************************/
template <class HammingComputer>
static void hamming_dis_inner_loop (
const uint8_t *ca,
const uint8_t *cb,
size_t nb,
size_t code_size,
int k,
hamdis_t * bh_val_,
int64_t * bh_ids_)
{
HammingComputer hc (ca, code_size);
for (size_t j = 0; j < nb; j++) {
int ndiff = hc.hamming (cb);
cb += code_size;
if (ndiff < bh_val_[0]) {
maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, ndiff, j);
}
}
}
void generalized_hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t code_size,
int ordered)
{
int na = ha->nh;
int k = ha->k;
if (ordered)
ha->heapify ();
//#pragma omp parallel for
for (int i = 0; i < na; i++) {
const uint8_t *ca = a + i * code_size;
const uint8_t *cb = b;
hamdis_t * bh_val_ = ha->val + i * k;
int64_t * bh_ids_ = ha->ids + i * k;
switch (code_size) {
case 8:
hamming_dis_inner_loop<GenHammingComputer8>
(ca, cb, nb, 8, k, bh_val_, bh_ids_);
break;
case 16:
hamming_dis_inner_loop<GenHammingComputer16>
(ca, cb, nb, 16, k, bh_val_, bh_ids_);
break;
case 32:
hamming_dis_inner_loop<GenHammingComputer32>
(ca, cb, nb, 32, k, bh_val_, bh_ids_);
break;
default:
hamming_dis_inner_loop<GenHammingComputerM8>
(ca, cb, nb, code_size, k, bh_val_, bh_ids_);
break;
}
}
if (ordered)
ha->reorder ();
}
} // namespace faiss

240
src/3rd_party/faiss/utils/hamming.h vendored Normal file
View File

@ -0,0 +1,240 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* Hamming distances. The binary vector dimensionality should be a
* multiple of 8, as the elementary operations operate on bytes. If
* you need other sizes, just pad with 0s (this is done by function
* fvecs2bitvecs).
*
* User-defined type hamdis_t is used for distances because at this time
 * it is still unclear how we will need to balance
* - flexibility in vector size (may need 16- or even 8-bit vectors)
* - memory usage
* - cache-misses when dealing with large volumes of data (fewer bits is better)
*
*/
#ifndef FAISS_hamming_h
#define FAISS_hamming_h
#include <stdint.h>
#include <faiss/utils/Heap.h>
/* The Hamming distance type */
typedef int32_t hamdis_t;
namespace faiss {
/**************************************************
* General bit vector functions
**************************************************/
struct RangeSearchResult;
void bitvec_print (const uint8_t * b, size_t d);
/* Functions for casting vectors of regular types to compact bits.
They assume proper allocation is done beforehand, meaning that b
should be able to receive as many bits as x may produce. */
/* Makes an array of bits from the signs of a float array. The length
of the output array b is rounded up to byte size (allocate
accordingly) */
void fvecs2bitvecs (
const float * x,
uint8_t * b,
size_t d,
size_t n);
void bitvecs2fvecs (
const uint8_t * b,
float * x,
size_t d,
size_t n);
void fvec2bitvec (const float * x, uint8_t * b, size_t d);
/** Shuffle the bits from b(i, j) := a(i, order[j])
*/
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b);
/***********************************************
* Generic reader/writer for bit strings
***********************************************/
struct BitstringWriter {
uint8_t *code;
size_t code_size;
size_t i; // current bit offset
// code_size in bytes
BitstringWriter(uint8_t *code, int code_size);
// write the nbit low bits of x
void write(uint64_t x, int nbit);
};
struct BitstringReader {
const uint8_t *code;
size_t code_size;
size_t i;
// code_size in bytes
BitstringReader(const uint8_t *code, int code_size);
// read nbit bits from the code
uint64_t read(int nbit);
};
/**************************************************
* Hamming distance computation functions
**************************************************/
extern size_t hamming_batch_size;
inline int popcount64(uint64_t x) {
return __builtin_popcountl(x);
}
/** Compute a set of Hamming distances between na and nb binary vectors
*
* @param a size na * nbytespercode
* @param b size nb * nbytespercode
* @param nbytespercode should be multiple of 8
* @param dis output distances, size na * nb
*/
void hammings (
const uint8_t * a,
const uint8_t * b,
size_t na, size_t nb,
size_t nbytespercode,
hamdis_t * dis);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using a max heap.
* @param a queries, size ha->nh * ncodes
* @param b database, size nb * ncodes
* @param nb number of database vectors
* @param ncodes size of the binary codes (bytes)
* @param ordered if != 0: order the results by decreasing distance
* (may be bottleneck for k/n > 0.01) */
void hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int ordered);
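
A usage sketch for the heap-based k-NN entry point (illustration only; it assumes the heap-array descriptor from faiss/utils/Heap.h with the fields nh, k, val and ids that the implementation above relies on):

#include <faiss/utils/hamming.h>
#include <vector>

void knn_example(const uint8_t* queries, size_t nq,
                 const uint8_t* db, size_t nb,
                 size_t code_size, size_t k) {
  std::vector<int>     distances(nq * k);   // hamdis_t distances, k per query
  std::vector<int64_t> labels(nq * k);      // matching database ids
  faiss::int_maxheap_array_t ha;
  ha.nh  = nq;                              // one heap per query
  ha.k   = k;                               // neighbors per query
  ha.val = distances.data();
  ha.ids = labels.data();
  faiss::hammings_knn_hc(&ha, queries, db, nb, code_size, /*ordered=*/1);
}
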
/* Legacy alias to hammings_knn_hc. */
void hammings_knn (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int ordered);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using counting max.
* @param a queries, size na * ncodes
* @param b database, size nb * ncodes
* @param na number of query vectors
* @param nb number of database vectors
* @param k number of vectors/distances to return
* @param ncodes size of the binary codes (bytes)
* @param distances output distances from each query vector to its k nearest
* neighbors
* @param labels output ids of the k nearest neighbors to each query vector
*/
void hammings_knn_mc (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
size_t k,
size_t ncodes,
int32_t *distances,
int64_t *labels);
/** same as hammings_knn except we are doing a range search with radius */
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t ncodes,
RangeSearchResult *result);
/* Counting the number of matches or of cross-matches (without returning them)
For use with functions that assume pre-allocated memory */
void hamming_count_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
size_t * nptr);
/* Return all Hamming distances/indices passing a threshold. Pre-allocation of output
is required. Use hamming_count_thres to determine the proper size. */
size_t match_hamming_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
int64_t * idx,
hamdis_t * dis);
/* Cross-matching in a set of vectors */
void crosshamming_count_thres (
const uint8_t * dbs,
size_t n,
hamdis_t ht,
size_t ncodes,
size_t * nptr);
/* compute the Hamming distances between two codewords of nwords*64 bits */
hamdis_t hamming (
const uint64_t * bs1,
const uint64_t * bs2,
size_t nwords);
} // namespace faiss
// inlined definitions of HammingComputerXX and GenHammingComputerXX
#include <faiss/utils/hamming-inl.h>
#endif /* FAISS_hamming_h */

162
src/3rd_party/faiss/utils/misc.cpp vendored Normal file
View File

@ -0,0 +1,162 @@
#include "misc.h"
#include <cstring>
namespace faiss {
/***********************************************************************
* BufferList
***********************************************************************/
BufferList::BufferList(size_t buffer_size) :
buffer_size(buffer_size)
{
wp = buffer_size;
}
BufferList::~BufferList()
{
for (int i = 0; i < buffers.size(); i++) {
delete[] buffers[i].ids;
delete[] buffers[i].dis;
}
}
void BufferList::add(idx_t id, float dis) {
if (wp == buffer_size) { // need new buffer
append_buffer();
}
Buffer & buf = buffers.back();
buf.ids[wp] = id;
buf.dis[wp] = dis;
wp++;
}
void BufferList::append_buffer()
{
Buffer buf = { new idx_t[buffer_size], new float[buffer_size] };
buffers.push_back(buf);
wp = 0;
}
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
void BufferList::copy_range(size_t ofs, size_t n,
idx_t * dest_ids, float *dest_dis)
{
size_t bno = ofs / buffer_size;
ofs -= bno * buffer_size;
while (n > 0) {
size_t ncopy = ofs + n < buffer_size ? n : buffer_size - ofs;
Buffer buf = buffers[bno];
memcpy(dest_ids, buf.ids + ofs, ncopy * sizeof(*dest_ids));
memcpy(dest_dis, buf.dis + ofs, ncopy * sizeof(*dest_dis));
dest_ids += ncopy;
dest_dis += ncopy;
ofs = 0;
bno++;
n -= ncopy;
}
}
/***********************************************************************
* RangeSearchPartialResult
***********************************************************************/
void RangeQueryResult::add(float dis, idx_t id) {
nres++;
pres->add(id, dis);
}
RangeSearchPartialResult::RangeSearchPartialResult(RangeSearchResult * res_in) :
BufferList(res_in->buffer_size),
res(res_in)
{}
/// begin a new result
RangeQueryResult &
RangeSearchPartialResult::new_result(idx_t qno)
{
RangeQueryResult qres = { qno, 0, this };
queries.push_back(qres);
return queries.back();
}
void RangeSearchPartialResult::finalize()
{
set_lims();
//#pragma omp barrier
//#pragma omp single
res->do_allocation();
//#pragma omp barrier
copy_result();
}
/// called by range_search before do_allocation
void RangeSearchPartialResult::set_lims()
{
for (int i = 0; i < queries.size(); i++) {
RangeQueryResult & qres = queries[i];
res->lims[qres.qno] = qres.nres;
}
}
/// called by range_search after do_allocation
void RangeSearchPartialResult::copy_result(bool incremental)
{
size_t ofs = 0;
for (int i = 0; i < queries.size(); i++) {
RangeQueryResult & qres = queries[i];
copy_range(ofs, qres.nres,
res->labels + res->lims[qres.qno],
res->distances + res->lims[qres.qno]);
if (incremental) {
res->lims[qres.qno] += qres.nres;
}
ofs += qres.nres;
}
}
void RangeSearchPartialResult::merge(std::vector <RangeSearchPartialResult *> &
partial_results, bool do_delete)
{
int npres = partial_results.size();
if (npres == 0) return;
RangeSearchResult *result = partial_results[0]->res;
size_t nx = result->nq;
// count
for (const RangeSearchPartialResult * pres : partial_results) {
if (!pres) continue;
for (const RangeQueryResult &qres : pres->queries) {
result->lims[qres.qno] += qres.nres;
}
}
result->do_allocation();
for (int j = 0; j < npres; j++) {
if (!partial_results[j]) continue;
partial_results[j]->copy_result(true);
if (do_delete) {
delete partial_results[j];
partial_results[j] = nullptr;
}
}
// reset the limits
for (size_t i = nx; i > 0; i--) {
result->lims[i] = result->lims[i - 1];
}
result->lims[0] = 0;
}
}

144
src/3rd_party/faiss/utils/misc.h vendored Normal file
View File

@ -0,0 +1,144 @@
#pragma once
#include <algorithm>
#include <vector>
namespace faiss {
/// The metric space for vector comparison for Faiss indices and algorithms.
///
/// Most algorithms support both inner product and L2, with the flat
/// (brute-force) indices supporting additional metric types for vector
/// comparison.
enum MetricType {
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
METRIC_L2 = 1, ///< squared L2 search
METRIC_L1, ///< L1 (aka cityblock)
METRIC_Linf, ///< infinity distance
METRIC_Lp, ///< L_p distance, p is given by a faiss::Index
/// metric_arg
/// some additional metrics defined in scipy.spatial.distance
METRIC_Canberra = 20,
METRIC_BrayCurtis,
METRIC_JensenShannon,
};
template<class T>
struct ScopeDeleter {
const T * ptr;
explicit ScopeDeleter(const T* ptr = nullptr) : ptr(ptr) {}
void release() { ptr = nullptr; }
void set(const T * ptr_in) { ptr = ptr_in; }
void swap(ScopeDeleter<T> &other) { std::swap(ptr, other.ptr); }
~ScopeDeleter() {
delete[] ptr;
}
};
//////////////////////////////////////////
using idx_t = int64_t;
/** List of temporary buffers used to store results before they are
* copied to the RangeSearchResult object. */
struct BufferList {
typedef faiss::idx_t idx_t;
// buffer sizes in # entries
size_t buffer_size;
struct Buffer {
idx_t *ids;
float *dis;
};
std::vector<Buffer> buffers;
size_t wp; ///< write pointer in the last buffer.
explicit BufferList(size_t buffer_size);
~BufferList();
/// create a new buffer
void append_buffer();
/// add one result, possibly appending a new buffer if needed
void add(idx_t id, float dis);
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
void copy_range(size_t ofs, size_t n,
idx_t * dest_ids, float *dest_dis);
};
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
typedef faiss::idx_t idx_t;
idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]]
float *distances; ///< corresponding distances (not sorted)
size_t buffer_size; ///< size of the result buffers used
/// lims must be allocated on input to range_search.
explicit RangeSearchResult(idx_t nq, bool alloc_lims = true);
/// called when lims contains the nb of elements result entries
/// for each query
virtual void do_allocation();
virtual ~RangeSearchResult();
};
struct RangeSearchPartialResult;
/// result structure for a single query
struct RangeQueryResult {
using idx_t = faiss::idx_t;
idx_t qno; //< id of the query
size_t nres; //< nb of results for this query
RangeSearchPartialResult * pres;
/// called by search function to report a new result
void add(float dis, idx_t id);
};
/// the entries in the buffers are split per query
struct RangeSearchPartialResult : BufferList {
RangeSearchResult * res;
/// eventually the result will be stored in res_in
explicit RangeSearchPartialResult(RangeSearchResult * res_in);
/// query ids + nb of results per query.
std::vector<RangeQueryResult> queries;
/// begin a new result
RangeQueryResult & new_result(idx_t qno);
/*****************************************
* functions used at the end of the search to merge the result
* lists */
void finalize();
/// called by range_search before do_allocation
void set_lims();
/// called by range_search after do_allocation
void copy_result(bool incremental = false);
/// merge a set of PartialResult's into one RangeSearchResult
/// on output the partial results are empty!
static void merge(std::vector <RangeSearchPartialResult *> &
partial_results, bool do_delete = true);
};
} // namespace
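
A usage sketch tying RangeSearchResult to the hamming_range_search entry point declared earlier (illustration only; RangeSearchResult comes from this vendored misc.h):

#include <faiss/utils/hamming.h>
#include "misc.h"   // vendored: src/3rd_party/faiss/utils/misc.h

void range_search_example(const uint8_t* queries, size_t nq,
                          const uint8_t* db, size_t nb,
                          size_t code_size, int radius) {
  faiss::RangeSearchResult res(nq);   // allocates res.lims (size nq + 1)
  faiss::hamming_range_search(queries, db, nq, nb, radius, code_size, &res);
  // results for query i: ids in res.labels[res.lims[i] .. res.lims[i+1]-1],
  // with distances strictly below radius in res.distances at the same offsets.
}
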

192
src/3rd_party/faiss/utils/random.cpp vendored Normal file
View File

@ -0,0 +1,192 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/utils/random.h>
namespace faiss {
/**************************************************
* Random data generation functions
**************************************************/
RandomGenerator::RandomGenerator (int64_t seed)
: mt((unsigned int)seed) {}
int RandomGenerator::rand_int ()
{
return mt() & 0x7fffffff;
}
int64_t RandomGenerator::rand_int64 ()
{
return int64_t(rand_int()) | int64_t(rand_int()) << 31;
}
int RandomGenerator::rand_int (int max)
{
return mt() % max;
}
float RandomGenerator::rand_float ()
{
return mt() / float(mt.max());
}
double RandomGenerator::rand_double ()
{
return mt() / double(mt.max());
}
/***********************************************************************
* Random functions in this C file only exist because Torch
* counterparts are slow and not multi-threaded. Typical use is for
* more than 1-100 billion values. */
/* Generate a set of random floating point values such that x[i] is in [0,1].
   The generation is split into blocks so it can be multi-threaded; for this reason, we rely on re-entrant functions. */
void float_rand (float * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_float ();
}
}
void float_randn (float * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
double a = 0, b = 0, s = 0;
int state = 0; /* generate two number per "do-while" loop */
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++) {
/* Marsaglia's method (see Knuth) */
if (state == 0) {
do {
a = 2.0 * rng.rand_double () - 1;
b = 2.0 * rng.rand_double () - 1;
s = a * a + b * b;
} while (s >= 1.0);
x[i] = a * sqrt(-2.0 * log(s) / s);
}
else
x[i] = b * sqrt(-2.0 * log(s) / s);
state = 1 - state;
}
}
}
/* Integer versions */
void int64_rand (int64_t * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_int64 ();
}
}
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_int64 () % max;
}
}
void rand_perm (int *perm, size_t n, int64_t seed)
{
for (size_t i = 0; i < n; i++) perm[i] = i;
RandomGenerator rng (seed);
for (size_t i = 0; i + 1 < n; i++) {
int i2 = i + rng.rand_int (n - i);
std::swap(perm[i], perm[i2]);
}
}
void byte_rand (uint8_t * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
size_t i;
for (i = istart; i < iend; i++)
x[i] = rng.rand_int64 ();
}
}
} // namespace faiss

60
src/3rd_party/faiss/utils/random.h vendored Normal file
View File

@ -0,0 +1,60 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/* Random generators. Implemented here for speed and to make
* sequences reproducible.
*/
#pragma once
#include <random>
#include <stdint.h>
namespace faiss {
/**************************************************
* Random data generation functions
**************************************************/
/// random generator that can be used in multithreaded contexts
struct RandomGenerator {
std::mt19937 mt;
/// random positive integer
int rand_int ();
/// random int64_t
int64_t rand_int64 ();
/// generate random integer between 0 and max-1
int rand_int (int max);
/// between 0 and 1
float rand_float ();
double rand_double ();
explicit RandomGenerator (int64_t seed = 1234);
};
/* Generate an array of uniform random floats / multi-threaded implementation */
void float_rand (float * x, size_t n, int64_t seed);
void float_randn (float * x, size_t n, int64_t seed);
void int64_rand (int64_t * x, size_t n, int64_t seed);
void byte_rand (uint8_t * x, size_t n, int64_t seed);
// max is actually the maximum value + 1
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed);
/* random permutation */
void rand_perm (int * perm, size_t n, int64_t seed);
} // namespace faiss

View File

@ -70,6 +70,7 @@ add_library(marian STATIC
layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
layers/lsh.cpp
rnn/cells.cpp
rnn/attention.cpp

View File

@ -646,6 +646,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
cli.add<bool>("--output-sampling",
"Noise output layer with gumbel noise",
false);
cli.add<std::vector<int>>("--output-approx-knn",
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");
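The two integers are consumed further down in layers/generic.cpp as k (number of neighbors) and nbits (hash size in bits); the implicit value means that passing the flag without arguments behaves like --output-approx-knn 100 1024.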
#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings

View File

@ -120,6 +120,13 @@ namespace marian {
} \
} while(0)
#define ABORT_UNLESS(condition, ...) \
do { \
if(!(bool)(condition)) { \
ABORT(__VA_ARGS__); \
} \
} while(0)
typedef std::shared_ptr<spdlog::logger> Logger;
Logger createStderrLogger(const std::string&,
const std::string&,

View File

@ -26,6 +26,16 @@ Expr checkpoint(Expr a) {
return a;
}
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
LambdaNodeFunctor fwd) {
return Expression<LambdaNodeOp>(nodes, shape, type, fwd);
}
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
LambdaNodeFunctor fwd, LambdaNodeFunctor bwd) {
return Expression<LambdaNodeOp>(nodes, shape, type, fwd, bwd);
}
// logistic function. Note: scipy name is expit()
Expr sigmoid(Expr a) {
return Expression<SigmoidNodeOp>(a);

View File

@ -10,6 +10,10 @@ Expr checkpoint(Expr a);
typedef Expr(ActivationFunction)(Expr);
typedef std::function<void(Expr, const std::vector<Expr>&)> LambdaNodeFunctor;
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor);
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor, LambdaNodeFunctor);
Expr plus(const std::vector<Expr>&);
// TODO: should be logistic(), not sigmoid()
@ -188,6 +192,30 @@ Expr stopGradient(Expr a);
Expr gather(Expr a, int axis, Expr indices);
#if 0
// reverse operation to gather. a is the expression into which values from b are inserted at the positions given by indices along axis.
// with broadcasting
auto knn = get<0>(KNN->apply(query, k)); // [beam, time, batch, k]
auto W = reshape(gather(Wt_, -2, flatten(knn)), {beam * time * batch, k, dim});
auto b = reshape(gather(b_, -1, flatten(knn)), {beam * time * batch, 1, k });
query = reshape(query, {beam * time * batch, 1, dim});
auto logits = bdot(query, W, false, true); // [beam * time * batch, 1, k]
logits = reshape(logits + b, {beam, time, batch, k}); // @TODO: add baffine node
auto shape = indices.shape();
shape.set(-1, 32000);
auto output = grep->constant(shape, inits::lowest(), logits->value_type());
output = scatter(output, -1, indices, logits);
// auto a = graph->constant({2,2,5,32000}, inits::fromValue(minimal))
// scatter(a, -1, indices, values)
// PyTorch does for out-of-place scatter: out = a.scatter(-1, indices, values)
Expr scatter(Expr a, int axis, Expr indices, Expr b);
#endif
// Warning: Don't try to pass a scalar literal 0 as indices; it will compile but pass nullptr...
Expr index_select(Expr a, int axis, Expr indices);

View File

@ -251,5 +251,24 @@ template Ptr<NodeInitializer> range<float> (float begin, float end, f
template Ptr<NodeInitializer> range<IndexType>(IndexType begin, IndexType end, IndexType step);
} // namespace inits
} // namespace marian
#include "faiss/VectorTransform.h"
namespace marian {
namespace inits {
Ptr<NodeInitializer> randomRotation(size_t seed) {
auto rot = [=](Tensor t) {
int rows = t->shape()[-2];
int cols = t->shape()[-1];
faiss::RandomRotationMatrix rrot(cols, rows); // transposed in faiss
rrot.init(seed);
t->set(rrot.A);
};
return fromLambda(rot, Type::float32);
}
} // namespace inits
} // namespace marian
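
For background (illustration only, not the code path IndexLSH actually takes): a random orthonormal rotation/projection R makes sign-based hashing a locality-sensitive hash for angular similarity. A hypothetical helper, assuming row-major R of size d x nbits:

#include <cstdint>
#include <vector>

std::vector<uint8_t> lsh_code(const float* x, const float* R, int d, int nbits) {
  std::vector<uint8_t> code((nbits + 7) / 8, 0);
  for (int b = 0; b < nbits; b++) {
    float dot = 0.f;
    for (int i = 0; i < d; i++)
      dot += x[i] * R[i * nbits + b];            // project onto the b-th direction
    if (dot >= 0)
      code[b >> 3] |= (uint8_t)(1u << (b & 7));  // keep only the sign
  }
  return code;
}
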

View File

@ -176,6 +176,13 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
/**
* Computes a random rotation matrix for LSH hashing. This is part
* of a hash function. The matrix is orthonormal and computed via
* QR decomposition. The same seed results in the same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches

View File

@ -13,6 +13,62 @@
namespace marian {
class LambdaNodeOp : public NaryNodeOp {
private:
typedef const std::vector<Expr>& Inputs;
typedef std::function<void(Expr, Inputs)> LambdaNodeFunctor;
std::unique_ptr<LambdaNodeFunctor> forward_;
std::unique_ptr<LambdaNodeFunctor> backward_;
public:
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
LambdaNodeFunctor forward)
: NaryNodeOp(inputs, shape, type),
forward_(new LambdaNodeFunctor(forward)) {
Node::trainable_ = !!backward_;
}
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
LambdaNodeFunctor forward,
LambdaNodeFunctor backward)
: NaryNodeOp(inputs, shape, type),
forward_(new LambdaNodeFunctor(forward)),
backward_(new LambdaNodeFunctor(backward)) {
}
void forward() override {
(*forward_)(this, children_);
}
void backward() override {
ABORT_IF(!backward_, "No backward lambda given?");
(*backward_)(this, children_);
}
const std::string type() override { return "lambda"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, forward_.get());
util::hash_combine(seed, backward_.get());
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<LambdaNodeOp>(node);
if(!cnode)
return false;
if(forward_ != cnode->forward_) // pointer compare on purpose
return false;
if(backward_ != cnode->backward_) // pointer compare on purpose
return false;
return true;
}
};
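
The functor receives the output node plus its children and writes into the output tensor directly, which is what makes this useful for CPU-only custom operations such as the LSH nodes below. A minimal forward-only sketch (illustration only):

#include "graph/expression_operators.h"

namespace marian {
static Expr timesTwo(Expr a) {
  auto fwd = [](Expr out, const std::vector<Expr>& inputs) {
    const float* in = inputs[0]->val()->data<float>();
    float* o = out->val()->data<float>();
    int n = out->shape().elements();
    for(int i = 0; i < n; ++i)
      o[i] = 2.f * in[i];
  };
  // no backward functor given, so the node is marked non-trainable
  return lambda({a}, a->shape(), a->value_type(), fwd);
}
} // namespace marian
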
class DotNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;

View File

@ -6,8 +6,7 @@
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
#include "layers/lsh.h"
namespace marian {
Logits::Logits(Expr logits) : Logits(New<RationalLoss>(logits, nullptr)) {} // single-output constructor from Expr only (RationalLoss has no count)
@ -218,6 +217,13 @@ namespace marian {
if (Wt_)
return;
// this option is only set in the decoder
if(!lsh_ && options_->hasAndNotEmpty("output-approx-knn")) {
auto k = opt<std::vector<int>>("output-approx-knn")[0];
auto nbits = opt<std::vector<int>>("output-approx-knn")[1];
lsh_ = New<LSH>(k, nbits);
}
auto name = options_->get<std::string>("prefix");
auto numOutputClasses = options_->get<int>("dim");
@ -257,6 +263,16 @@ namespace marian {
Logits Output::applyAsLogits(Expr input) /*override final*/ {
lazyConstruct(input->shape()[-1]);
auto affineOrLSH = [this](Expr x, Expr W, Expr b, bool transA, bool transB) {
if(lsh_) {
ABORT_IF( transA, "Transposed query not supported for LSH");
ABORT_IF(!transB, "Untransposed indexed matrix not supported for LSH");
return lsh_->apply(x, W, b);
} else {
return affine(x, W, b, transA, transB);
}
};
if (shortlist_ && !cachedShortWt_) { // shortlisted versions of parameters are cached within one batch, then clear()ed
cachedShortWt_ = index_select(Wt_, isLegacyUntransposedW ? -1 : 0, shortlist_->indices());
cachedShortb_ = index_select(b_ , -1, shortlist_->indices());
@ -333,7 +349,12 @@ namespace marian {
input1 = layerNorm(input1, name + "_ffn");
}
// @TODO: b_ should be a vector, not a matrix; but shortlists use cols() on it, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
Expr factorLogits;
if(g == 0)
factorLogits = affineOrLSH(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
else
factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
// optionally add lemma-dependent bias
if (Plemma) { // [B... x U0]
int lemmaVocabDim = Plemma->shape()[-1];
@ -396,11 +417,11 @@ namespace marian {
}
}
return Logits(std::move(allLogits), factoredVocab_);
} else if (shortlist_) {
return Logits(affineOrLSH(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
} else {
return Logits(affineOrLSH(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
}
else if (shortlist_)
return Logits(affine(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
else
return Logits(affine(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
}
}

View File

@ -228,6 +228,12 @@ public:
Expr apply(Expr input) override { return apply(std::vector<Expr>({input})); }
};
} // namespace mlp
class LSH;
namespace mlp {
class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList {
private:
// parameters held by this layer
@ -239,10 +245,11 @@ private:
Expr cachedShortb_; // these match the current value of shortlist_
Expr cachedShortLemmaEt_;
Ptr<FactoredVocab> factoredVocab_;
// optional parameters set/updated after construction
Expr tiedParam_;
Ptr<data::Shortlist> shortlist_;
Ptr<LSH> lsh_;
void lazyConstruct(int inputDim);
public:

118
src/layers/lsh.cpp Normal file
View File

@ -0,0 +1,118 @@
#include "layers/lsh.h"
#include "3rd_party/faiss/IndexLSH.h"
#include "graph/expression_operators.h"
#include "tensors/cpu/prod_blas.h"
namespace marian {
Expr LSH::apply(Expr input, Expr W, Expr b) {
auto idx = search(input, W);
return affine(idx, input, W, b);
}
Expr LSH::search(Expr query, Expr values) {
ABORT_IF(query->graph()->getDeviceId().type == DeviceType::gpu,
"LSH index (--output-approx-knn) currently not implemented for GPU");
auto kShape = query->shape();
kShape.set(-1, k_);
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
auto query = inputs[0];
auto values = inputs[1];
int dim = values->shape()[-1];
if(!index_ || indexHash_ != values->hash()) {
LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_);
index_.reset(new faiss::IndexLSH(dim, nbits_,
/*rotate=*/dim != nbits_,
/*train_thresholds=*/false));
int vRows = values->shape().elements() / dim;
index_->train(vRows, values->val()->data<float>());
index_->add( vRows, values->val()->data<float>());
indexHash_ = values->hash();
}
int qRows = query->shape().elements() / dim;
std::vector<float> distances(qRows * k_);
std::vector<faiss::Index::idx_t> ids(qRows * k_);
index_->search(qRows, query->val()->data<float>(), k_,
distances.data(), ids.data());
std::vector<IndexType> vOut;
vOut.reserve(ids.size());
for(auto id : ids)
vOut.push_back((IndexType)id);
out->val()->set(vOut);
};
return lambda({query, values}, kShape, Type::uint32, forward);
}
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
auto outShape = input->shape();
int dimVoc = W->shape()[-2];
outShape.set(-1, dimVoc);
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
auto lowest = NumericLimits<float>(out->value_type()).lowest;
out->val()->set(lowest);
int dimIn = inputs[1]->shape()[-1];
int dimOut = out->shape()[-1];
int dimRows = out->shape().elements() / dimOut;
auto outPtr = out->val()->data<float>();
auto idxPtr = inputs[0]->val()->data<uint32_t>();
auto queryPtr = inputs[1]->val()->data<float>();
auto WPtr = inputs[2]->val()->data<float>();
auto bPtr = inputs[3]->val()->data<float>();
for(int row = 0; row < dimRows; ++row) {
auto currIdxPtr = idxPtr + row * k_; // move to next batch of k entries
auto currQueryPtr = queryPtr + row * dimIn; // move to next input query vector
auto currOutPtr = outPtr + row * dimOut; // move to next output position vector (of vocabulary size)
for(int k = 0; k < k_; k++) {
int relPos = currIdxPtr[k]; // k-th best vocabulary item
auto currWPtr = WPtr + relPos * dimIn; // offset for k-th best embedding
currOutPtr[relPos] = bPtr[relPos]; // write bias value to position
// proceed one vector product at a time writing to the correct position
sgemm(false, true, 1, 1, dimIn, 1.0f, currQueryPtr, dimIn, currWPtr, dimIn, 1.0f, &currOutPtr[relPos], 1);
}
}
};
return lambda({idx, input, W, b},
outShape,
input->value_type(),
forward);
}
// @TODO: alternative version which does the same as above with Marian operators, currently missing "scatter".
// this uses more memory and is likely to be slower. Would make sense to have a scatter node that actually creates
// the node instead of relying on an existing node, e.g. scatter(shape, defaultValue, axis, indices, values);
#if 0
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
int dim = input->shape()[-1];
int bch = idx->shape().elements() / k;
auto W = reshape(rows(Wt_, flatten(idx)), {bch, k, dim}); // [rows, k, dim]
auto b = reshape(cols(b_, flatten(idx)), {bch, 1, k}); // [rows, 1, k]
auto aff = reshape(bdot(reshape(input, {bch, 1, dim}), W, false, true) + b, idx->shape()); // [beam, time, batch, k]
int dimVoc = Wt_->shape()[-2];
auto oShape = input->shape();
oShape.set(-1, dimVoc);
auto lowest = graph_->constant(oShape,
inits::fromValue(NumericLimits<float>(input->value_type()).lowest),
input->value_type());
return scatter(lowest, -1, idx, aff);
}
#endif
} // namespace marian
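
The build-once / search-per-step cycle above can be summarized on plain float buffers; a sketch of the IndexLSH lifecycle as LSH::search uses it (illustration only, sizes are placeholders):

#include "3rd_party/faiss/IndexLSH.h"
#include <vector>

void index_lsh_sketch(const float* W, int vocabSize, int dim,
                      const float* queries, int nQueries, int k, int nbits) {
  faiss::IndexLSH index(dim, nbits, /*rotate=*/dim != nbits, /*train_thresholds=*/false);
  index.train(vocabSize, W);   // prepares the hashing transform
  index.add(vocabSize, W);     // hashes all output-embedding rows once
  std::vector<float> distances(nQueries * k);
  std::vector<faiss::Index::idx_t> ids(nQueries * k);
  index.search(nQueries, queries, k, distances.data(), ids.data());
  // ids now lists, per query row, the k vocabulary entries whose logits
  // LSH::affine actually computes; all other logits stay at the lowest float.
}
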

26
src/layers/lsh.h Normal file
View File

@ -0,0 +1,26 @@
#include "graph/expression_graph.h"
#include <memory>
namespace faiss {
class IndexLSH;
}
namespace marian {
class LSH {
public:
LSH(int k, int nbits) : k_{k}, nbits_{nbits} {}
Expr apply(Expr query, Expr values, Expr bias);
private:
Ptr<faiss::IndexLSH> index_;
size_t indexHash_{0};
int k_{100};
int nbits_{1024};
Expr search(Expr query, Expr values);
Expr affine(Expr idx, Expr query, Expr values, Expr bias);
};
}

View File

@ -615,6 +615,7 @@ private:
"prefix", prefix_ + "_ff_logit_out",
"dim", dimTrgVoc,
"vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_], // for factored outputs
"output-approx-knn", opt<std::vector<int>>("output-approx-knn"),
"lemma-dim-emb", opt<int>("lemma-dim-emb", 0)); // for factored outputs
if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all"))

View File

@ -7,51 +7,13 @@
#include "tensors/tensor.h"
#include "tensors/tensor_allocator.h"
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
#include "prod_blas.h"
#include "sharp/int_gemm.h"
namespace marian {
namespace cpu {
#if BLAS_FOUND
inline void sgemm(bool transA,
bool transB,
int rows_a,
int rows_b,
int width,
float alpha,
float* a,
int lda,
float* b,
int ldb,
float beta,
float* c,
int ldc) {
cblas_sgemm(CblasRowMajor,
transA ? CblasTrans : CblasNoTrans,
transB ? CblasTrans : CblasNoTrans,
rows_a,
rows_b,
width,
alpha,
a,
lda,
b,
ldb,
beta,
c,
ldc);
}
#endif
void Prod(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,

View File

@ -0,0 +1,38 @@
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
#if BLAS_FOUND
inline void sgemm(bool transA,
bool transB,
int rows_a,
int rows_b,
int width,
float alpha,
float* a,
int lda,
float* b,
int ldb,
float beta,
float* c,
int ldc) {
cblas_sgemm(CblasRowMajor,
transA ? CblasTrans : CblasNoTrans,
transB ? CblasTrans : CblasNoTrans,
rows_a,
rows_b,
width,
alpha,
a,
lda,
b,
ldb,
beta,
c,
ldc);
}
#endif
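
LSH::affine above calls this wrapper with rows_a = rows_b = 1 and transB = true, i.e. one dot product per selected vocabulary entry, accumulated onto the pre-written bias (beta = 1). An equivalent stand-alone call (illustration only):

#if BLAS_FOUND
inline float dot_via_sgemm(float* a, float* b, int d, float bias) {
  float c = bias;                     // beta = 1.0f keeps the preloaded value
  sgemm(/*transA=*/false, /*transB=*/true,
        /*rows_a=*/1, /*rows_b=*/1, /*width=*/d,
        /*alpha=*/1.0f, a, /*lda=*/d,
        b, /*ldb=*/d,
        /*beta=*/1.0f, &c, /*ldc=*/1);
  return c;                           // == bias + dot(a, b)
}
#endif
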

View File

@ -36,12 +36,24 @@ class TensorBase {
Ptr<Backend> backend)
: memory_(memory), shape_(shape), type_(type), backend_(backend) {}
TensorBase(MemoryPiece::PtrType memory, Shape shape, Ptr<Backend> backend)
TensorBase(MemoryPiece::PtrType memory,
Shape shape,
Ptr<Backend> backend)
: memory_(memory),
shape_(shape),
type_(Type::float32),
backend_(backend) {}
// Wraps existing memory
template <typename T>
TensorBase(T* rawMemory,
size_t rawMemoryNum,
Shape shape,
Type type,
Ptr<Backend> backend)
: memory_(MemoryPiece::New((uint8_t*)rawMemory, rawMemoryNum * sizeof(T))),
shape_(shape), type_(type), backend_(backend) {}
public:
// Use this whenever pointing to MemoryPiece
typedef IPtr<TensorBase> PtrType;