Merged PR 12923: LSH indexing to replace short list

This PR adds the FAISS LSH index to Marian as a CPU-side optimization that approximates the output layer of transformer models via k-nearest-neighbor (KNN) search.

* It allows replacing the potentially harmful short list with an ML-free approximation of the final matrix multiply. Increasing the number of neighbors k and the size of the chosen hash in bits makes the approximation more accurate, but also slower. For a model dimension of 512, the sweet spot seems to be around k=100-150 and nbits=1024-1536 (see the sketch after this list).
* Enable it during CPU-side decoding via `--output-approx-knn k nbits`, e.g. `--output-approx-knn 100 1024`.
* Adds a `lambda` node that allows creating custom nodes; right now this is most useful for CPU-side code.
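The underlying idea, as a rough sketch (this is not the actual Marian integration; the function, names and sizes below are invented for illustration): index the rows of the output embedding matrix with `faiss::IndexLSH`, query it with the decoder state, and restrict the final matrix multiply / softmax to the returned k vocabulary entries.

```cpp
// Illustrative sketch only: LSH over the rows of the output embedding matrix.
#include <faiss/IndexLSH.h>
#include <vector>

std::vector<faiss::Index::idx_t>
approxOutputCandidates(const float* Wout,   // output embeddings, vocabSize x dim, row-major
                       size_t vocabSize, int dim,
                       const float* state,  // one decoder state vector, size dim
                       int k = 100, int nbits = 1024) {
  faiss::IndexLSH index(dim, nbits, /*rotate_data=*/true);
  index.add((faiss::Index::idx_t)vocabSize, Wout);  // in practice built once per model

  std::vector<float> dist(k);                       // Hamming distances between codes
  std::vector<faiss::Index::idx_t> labels(k);
  index.search(1, state, k, dist.data(), labels.data());
  return labels;  // vocabulary ids to keep for the final matrix multiply / softmax
}
```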
Martin Junczys-Dowmunt 2020-05-21 23:20:52 +00:00
parent 77a420740c
commit 5a439245b1
36 changed files with 4745 additions and 49 deletions


@ -138,6 +138,9 @@ else(MSVC)
set(INTRINSICS "-msse4.1")
endif()
add_definitions(-DFINTEGER=long)
set(EXT_LIBS ${EXT_LIBS} faiss)
if(USE_FBGEMM)
set(EXT_LIBS ${EXT_LIBS} fbgemm dl)
add_definitions(-DUSE_FBGEMM=1)


@ -5,6 +5,8 @@ add_subdirectory(./yaml-cpp)
add_subdirectory(./SQLiteCpp)
add_subdirectory(./pathie-cpp)
add_subdirectory(./zlib)
add_subdirectory(./faiss)
include_directories(./faiss)
if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS

10
src/3rd_party/faiss/CMakeLists.txt vendored Normal file

@ -0,0 +1,10 @@
# adding a new file requires explicitly modifying this CMakeLists.txt
include_directories("impl")
FILE(GLOB FaissCppSources *.cpp impl/*.cpp utils/*.cpp)
if (NOT TARGET faiss)
add_library(faiss STATIC ${FaissCppSources})
endif()

120
src/3rd_party/faiss/Index.cpp vendored Normal file

@ -0,0 +1,120 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "Index.h"
#include "common/logging.h"
#include <cstring>
namespace faiss {
Index::~Index ()
{
}
void Index::train(idx_t /*n*/, const float* /*x*/) {
// does nothing by default
}
void Index::range_search (idx_t , const float *, float,
RangeSearchResult *) const
{
ABORT ("range search not implemented");
}
void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k)
{
float * distances = new float[n * k];
ScopeDeleter<float> del(distances);
search (n, x, k, distances, labels);
}
void Index::add_with_ids(
idx_t /*n*/,
const float* /*x*/,
const idx_t* /*xids*/) {
ABORT ("add_with_ids not implemented for this type of index");
}
size_t Index::remove_ids(const IDSelector& /*sel*/) {
ABORT ("remove_ids not implemented for this type of index");
return -1;
}
void Index::reconstruct (idx_t, float * ) const {
ABORT ("reconstruct not implemented for this type of index");
}
void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct (i0 + i, recons + i * d);
}
}
void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const {
search (n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct (key, reconstructed);
}
}
}
}
void Index::compute_residual (const float * x,
float * residual, idx_t key) const {
reconstruct (key, residual);
for (size_t i = 0; i < d; i++) {
residual[i] = x[i] - residual[i];
}
}
void Index::compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const {
//#pragma omp parallel for
for (idx_t i = 0; i < n; ++i) {
compute_residual(&xs[i * d], &residuals[i * d], keys[i]);
}
}
size_t Index::sa_code_size () const
{
ABORT ("standalone codec not implemented for this type of index");
}
void Index::sa_encode (idx_t, const float *,
uint8_t *) const
{
ABORT ("standalone codec not implemented for this type of index");
}
void Index::sa_decode (idx_t, const uint8_t *,
float *) const
{
ABORT ("standalone codec not implemented for this type of index");
}
}

233
src/3rd_party/faiss/Index.h vendored Normal file

@ -0,0 +1,233 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_H
#define FAISS_INDEX_H
#include "utils/misc.h"
#include <cstdio>
#include <typeinfo>
#include <string>
#include <sstream>
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 6
#define FAISS_VERSION_PATCH 3
/**
* @namespace faiss
*
* Throughout the library, vectors are provided as float * pointers.
* Most algorithms can be optimized when several vectors are processed
* (added/searched) together in a batch. In this case, they are passed
* in as a matrix. When n vectors of size d are provided as float * x,
* component j of vector i is
*
* x[ i * d + j ]
*
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
* always compact. When specifying the size of the matrix, we call it
* an n*d matrix, which implies a row-major storage.
*/
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index, supports adding vectors and searching them.
*
* All vectors provided at add or search time are 32-bit float arrays,
* although the internal representation may vary.
*/
struct Index {
using idx_t = int64_t; ///< all indices are this type
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is
/// done already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
float metric_arg; ///< argument of the metric type
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
d(d),
ntotal(0),
verbose(false),
is_trained(true),
metric_type (metric),
metric_arg(0) {}
virtual ~Index ();
/** Perform training on a representative set of vectors
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* This function slices the input vectors in chunks smaller than
* blocksize_add and calls add_core.
* @param x input matrix, size n * d
*/
virtual void add (idx_t n, const float *x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const = 0;
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const;
/** return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns the labels of the neighbors.
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
/// removes all elements from the database.
virtual void reset() = 0;
/** removes IDs from the index. Not supported by all
* indexes. Returns the number of elements removed.
*/
virtual size_t remove_ids (const IDSelector & sel);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
virtual void reconstruct (idx_t key, float * recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
* @param recons reconstructed vectors (size ni * d)
*/
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting arrays
* is padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const;
/** Computes a residual vector after indexing encoding.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual (const float * x,
float * residual, idx_t key) const;
/** Computes a residual vector after indexing encoding (batch form).
* Equivalent to calling compute_residual for each vector.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded index, as returned by search and assign
*/
virtual void compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const;
/* The standalone codec interface */
/** size of the produced codes in bytes */
virtual size_t sa_code_size () const;
/** encode a set of vectors
*
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const;
/** encode a set of vectors
*
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const;
};
}
#endif
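To make the conventions above concrete, here is a minimal sketch of the flat row-major layout (`x[i * d + j]`) and the `search` calling convention, using the `IndexLSH` subclass added by this PR; the data is arbitrary.

```cpp
// Minimal usage sketch of the Index API; values are arbitrary.
#include <faiss/IndexLSH.h>
#include <cstdio>
#include <vector>

int main() {
  const int d = 8;                  // vector dimension
  const faiss::Index::idx_t n = 4;  // number of database vectors
  std::vector<float> xb(n * d);     // row-major: component j of vector i is xb[i*d + j]
  for (size_t i = 0; i < xb.size(); i++)
    xb[i] = 0.1f * float(i % 7) - 0.3f;

  faiss::IndexLSH index(d, /*nbits=*/32);
  index.add(n, xb.data());          // vectors get labels 0 .. n-1

  const faiss::Index::idx_t k = 2;
  std::vector<float> distances(k);
  std::vector<faiss::Index::idx_t> labels(k);  // padded with -1 if fewer than k results
  index.search(1, xb.data(), k, distances.data(), labels.data());
  std::printf("nearest to vector 0: id=%lld, distance=%g\n",
              (long long)labels[0], distances[0]);
  return 0;
}
```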

224
src/3rd_party/faiss/IndexLSH.cpp vendored Normal file

@ -0,0 +1,224 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexLSH.h>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <faiss/utils/hamming.h>
#include "common/logging.h"
namespace faiss {
/***************************************************************
* IndexLSH
***************************************************************/
IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds):
Index(d), nbits(nbits), rotate_data(rotate_data),
train_thresholds (train_thresholds), rrot(d, nbits)
{
is_trained = !train_thresholds;
bytes_per_vec = (nbits + 7) / 8;
if (rotate_data) {
rrot.init(5);
} else {
ABORT_UNLESS(d >= nbits, "d >= nbits");
}
}
IndexLSH::IndexLSH ():
nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false)
{
}
const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const
{
float *xt = nullptr;
if (rotate_data) {
// also applies bias if exists
xt = rrot.apply (n, x);
} else if (d != nbits) {
assert (nbits < d);
xt = new float [nbits * n];
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
const float *xl = x + i * d;
for (int j = 0; j < nbits; j++)
*xp++ = xl [j];
}
}
if (train_thresholds) {
if (xt == NULL) {
xt = new float [nbits * n];
memcpy (xt, x, sizeof(*x) * n * nbits);
}
float *xp = xt;
for (idx_t i = 0; i < n; i++)
for (int j = 0; j < nbits; j++)
*xp++ -= thresholds [j];
}
return xt ? xt : x;
}
void IndexLSH::train (idx_t n, const float *x)
{
if (train_thresholds) {
thresholds.resize (nbits);
train_thresholds = false;
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
train_thresholds = true;
float * transposed_x = new float [n * nbits];
ScopeDeleter<float> del2 (transposed_x);
for (idx_t i = 0; i < n; i++)
for (idx_t j = 0; j < nbits; j++)
transposed_x [j * n + i] = xt [i * nbits + j];
for (idx_t i = 0; i < nbits; i++) {
float *xi = transposed_x + i * n;
// std::nth_element
std::sort (xi, xi + n);
if (n % 2 == 1)
thresholds [i] = xi [n / 2];
else
thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2;
}
}
is_trained = true;
}
void IndexLSH::add (idx_t n, const float *x)
{
ABORT_UNLESS (is_trained, "is_trained");
codes.resize ((ntotal + n) * bytes_per_vec);
sa_encode (n, x, &codes[ntotal * bytes_per_vec]);
ntotal += n;
}
void IndexLSH::search (
idx_t n,
const float *x,
idx_t k,
float *distances,
idx_t *labels) const
{
ABORT_UNLESS (is_trained, "is_trained");
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
uint8_t * qcodes = new uint8_t [n * bytes_per_vec];
ScopeDeleter<uint8_t> del2 (qcodes);
fvecs2bitvecs (xt, qcodes, nbits, n);
int * idistances = new int [n * k];
ScopeDeleter<int> del3 (idistances);
int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances};
hammings_knn_hc (&res, qcodes, codes.data(),
ntotal, bytes_per_vec, true);
// convert distances to floats
for (int i = 0; i < k * n; i++)
distances[i] = idistances[i];
}
void IndexLSH::transfer_thresholds (LinearTransform *vt) {
if (!train_thresholds) return;
ABORT_UNLESS (nbits == vt->d_out, "nbits == vt->d_out");
if (!vt->have_bias) {
vt->b.resize (nbits, 0);
vt->have_bias = true;
}
for (int i = 0; i < nbits; i++)
vt->b[i] -= thresholds[i];
train_thresholds = false;
thresholds.clear();
}
void IndexLSH::reset() {
codes.clear();
ntotal = 0;
}
size_t IndexLSH::sa_code_size () const
{
return bytes_per_vec;
}
void IndexLSH::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
ABORT_UNLESS (is_trained, "is_trained");
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
fvecs2bitvecs (xt, bytes, nbits, n);
}
void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
float *xt = x;
ScopeDeleter<float> del;
if (rotate_data || nbits != d) {
xt = new float [n * nbits];
del.set(xt);
}
bitvecs2fvecs (bytes, xt, nbits, n);
if (train_thresholds) {
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < nbits; j++) {
*xp++ += thresholds [j];
}
}
}
if (rotate_data) {
rrot.reverse_transform (n, xt, x);
} else if (nbits != d) {
for (idx_t i = 0; i < n; i++) {
memcpy (x + i * d, xt + i * nbits,
nbits * sizeof(xt[0]));
}
}
}
} // namespace faiss
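A small hedged sketch of the standalone codec path (`sa_encode`/`sa_decode`) implemented above: each vector is reduced to `(nbits + 7) / 8` bytes of sign bits in the (optionally rotated) space, and decoding maps those bits back onto +/-1 components; sizes and values are arbitrary.

```cpp
// Hedged sketch of IndexLSH's standalone codec; sizes and values are arbitrary.
#include <faiss/IndexLSH.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int d = 64, nbits = 128;
  faiss::IndexLSH index(d, nbits);               // rotate_data defaults to true
  std::printf("code size: %zu bytes per vector\n", index.sa_code_size());  // (nbits+7)/8

  std::vector<float> x(d);
  for (int j = 0; j < d; j++) x[j] = (j % 2 ? 0.5f : -0.5f);

  std::vector<uint8_t> code(index.sa_code_size());
  index.sa_encode(1, x.data(), code.data());     // sign bit of each rotated component
  std::vector<float> x_rec(d);
  index.sa_decode(1, code.data(), x_rec.data()); // reconstruct from the +/-1 signs
  std::printf("x[0]=%g, reconstructed=%g\n", x[0], x_rec[0]);
  return 0;
}
```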

90
src/3rd_party/faiss/IndexLSH.h vendored Normal file

@ -0,0 +1,90 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_LSH_H
#define INDEX_LSH_H
#include <vector>
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** The sign of each vector component is put in a binary signature */
struct IndexLSH:Index {
typedef unsigned char uint8_t;
int nbits; ///< nb of bits per vector
int bytes_per_vec; ///< nb of 8-bits per encoded vector
bool rotate_data; ///< whether to apply a random rotation to input
bool train_thresholds; ///< whether we train thresholds or use 0
RandomRotationMatrix rrot; ///< optional random rotation
std::vector <float> thresholds; ///< thresholds to compare with
/// encoded dataset
std::vector<uint8_t> codes;
IndexLSH (
idx_t d, int nbits,
bool rotate_data = true,
bool train_thresholds = false);
/** Preprocesses and resizes the input to the size required to
* binarize the data
*
* @param x input vectors, size n * d
* @return output vectors, size n * bits. May be the same pointer
* as x, otherwise it should be deleted by the caller
*/
const float *apply_preprocess (idx_t n, const float *x) const;
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
/// transfer the thresholds to a pre-processing stage (and unset
/// train_thresholds)
void transfer_thresholds (LinearTransform * vt);
~IndexLSH() override {}
IndexLSH ();
/* standalone codec interface.
*
* The vectors are decoded to +/- 1 (not 0, 1) */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
}
#endif

21
src/3rd_party/faiss/LICENSE vendored Normal file

@ -0,0 +1,21 @@
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
src/3rd_party/faiss/README vendored Normal file

@ -0,0 +1 @@
This is code extracted from the original FAISS repository: https://github.com/facebookresearch/faiss

726
src/3rd_party/faiss/VectorTransform.cpp vendored Normal file

@ -0,0 +1,726 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/VectorTransform.h>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <memory>
#include <faiss/utils/random.h>
#include "common/logging.h"
using namespace faiss;
extern "C" {
// this is to keep the clang syntax checker happy
#ifndef FINTEGER
#define FINTEGER int
#endif
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
int sgemm_ (
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
n, FINTEGER *k, const float *alpha, const float *a,
FINTEGER *lda, const float *b,
FINTEGER *ldb, float *beta,
float *c, FINTEGER *ldc);
int dgemm_ (
const char *transa, const char *transb, FINTEGER *m, FINTEGER *
n, FINTEGER *k, const double *alpha, const double *a,
FINTEGER *lda, const double *b,
FINTEGER *ldb, double *beta,
double *c, FINTEGER *ldc);
int ssyrk_ (
const char *uplo, const char *trans, FINTEGER *n, FINTEGER *k,
float *alpha, float *a, FINTEGER *lda,
float *beta, float *c, FINTEGER *ldc);
/* Lapack functions from http://www.netlib.org/clapack/old/single/ */
int ssyev_ (
const char *jobz, const char *uplo, FINTEGER *n, float *a,
FINTEGER *lda, float *w, float *work, FINTEGER *lwork,
FINTEGER *info);
int dsyev_ (
const char *jobz, const char *uplo, FINTEGER *n, double *a,
FINTEGER *lda, double *w, double *work, FINTEGER *lwork,
FINTEGER *info);
int sgesvd_(
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
float *a, FINTEGER *lda, float *s, float *u, FINTEGER *ldu, float *vt,
FINTEGER *ldvt, float *work, FINTEGER *lwork, FINTEGER *info);
int dgesvd_(
const char *jobu, const char *jobvt, FINTEGER *m, FINTEGER *n,
double *a, FINTEGER *lda, double *s, double *u, FINTEGER *ldu, double *vt,
FINTEGER *ldvt, double *work, FINTEGER *lwork, FINTEGER *info);
}
///////////////////////////////////////////////
extern "C" {
/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
int sgeqrf_(FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
FINTEGER *lda, float *tau, float *work,
FINTEGER *lwork, FINTEGER *info);
}
void matrix_qr(int m, int n, float *a)
{
ABORT_UNLESS(m >= n, "m >= n");
FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
std::vector<float> tau(ki);
FINTEGER lwork = -1, info;
float work_size;
sgeqrf_(&mi, &ni, a, &mi, tau.data(),
&work_size, &lwork, &info);
lwork = size_t(work_size);
std::vector<float> work(lwork);
sgeqrf_(&mi, &ni, a, &mi,
tau.data(), work.data(), &lwork, &info);
sorgqr_(&mi, &ni, &ki, a, &mi, tau.data(),
work.data(), &lwork, &info);
}
///////////////////////////////////////////////
const float *fvecs_maybe_subsample(
size_t d, size_t *n, size_t nmax, const float *x,
bool verbose = false, int64_t seed = 1234)
{
if (*n <= nmax) return x; // nothing to do
size_t n2 = nmax;
if (verbose) {
printf(" Input training set too big (max size is %ld), sampling "
"%ld / %ld vectors\n", nmax, n2, *n);
}
std::vector<int> subset(*n);
rand_perm(subset.data(), *n, seed);
float *x_subset = new float[n2 * d];
for (int64_t i = 0; i < n2; i++)
memcpy(&x_subset[i * d],
&x[subset[i] * size_t(d)],
sizeof(x[0]) * d);
*n = n2;
return x_subset;
}
#ifdef __SSE__
// reads 0 <= d < 4 floats as __m128
static inline __m128 masked_read(int d, const float *x)
{
assert(0 <= d && d < 4);
__attribute__((__aligned__(16))) float buf[4] = { 0, 0, 0, 0 };
switch (d) {
case 3:
buf[2] = x[2];
case 2:
buf[1] = x[1];
case 1:
buf[0] = x[0];
}
return _mm_load_ps(buf);
// cannot use AVX2 _mm_mask_set1_epi32
}
float fvec_norm_L2sqr(const float * x,
size_t d)
{
__m128 mx;
__m128 msum1 = _mm_setzero_ps();
while (d >= 4) {
mx = _mm_loadu_ps(x); x += 4;
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
d -= 4;
}
mx = masked_read(d, x);
msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx));
msum1 = _mm_hadd_ps(msum1, msum1);
msum1 = _mm_hadd_ps(msum1, msum1);
return _mm_cvtss_f32(msum1);
}
#else
// scalar implementation
float fvec_norm_L2sqr(const float *x, size_t d)
{
return fvec_norm_L2sqr_ref(x, d);
}
#endif
void fvec_renorm_L2(size_t d, size_t nx, float * __restrict x)
{
//#pragma omp parallel for
for (size_t i = 0; i < nx; i++) {
float * __restrict xi = x + i * d;
float nr = fvec_norm_L2sqr(xi, d);
if (nr > 0) {
size_t j;
const float inv_nr = 1.0 / sqrtf(nr);
for (j = 0; j < d; j++)
xi[j] *= inv_nr;
}
}
}
/*********************************************
* VectorTransform
*********************************************/
float * VectorTransform::apply (Index::idx_t n, const float * x) const
{
float * xt = new float[n * d_out];
apply_noalloc (n, x, xt);
return xt;
}
void VectorTransform::train (idx_t, const float *) {
// does nothing by default
}
void VectorTransform::reverse_transform (
idx_t , const float *,
float *) const
{
ABORT ("reverse transform not implemented");
}
/*********************************************
* LinearTransform
*********************************************/
/// both d_in > d_out and d_out < d_in are supported
LinearTransform::LinearTransform (int d_in, int d_out,
bool have_bias):
VectorTransform (d_in, d_out), have_bias (have_bias),
is_orthonormal (false), verbose (false)
{
is_trained = false; // will be trained when A and b are initialized
}
void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
float * xt) const
{
ABORT_UNLESS(is_trained, "Transformation not trained yet");
float c_factor;
if (have_bias) {
ABORT_UNLESS(b.size() == d_out, "Bias not initialized");
float * xi = xt;
for (int i = 0; i < n; i++)
for(int j = 0; j < d_out; j++)
*xi++ = b[j];
c_factor = 1.0;
} else {
c_factor = 0.0;
}
ABORT_UNLESS(A.size() == d_out * d_in,
"Transformation matrix not initialized");
float one = 1;
FINTEGER nbiti = d_out, ni = n, di = d_in;
sgemm_ ("Transposed", "Not transposed",
&nbiti, &ni, &di,
&one, A.data(), &di, x, &di, &c_factor, xt, &nbiti);
}
void LinearTransform::transform_transpose (idx_t n, const float * y,
float *x) const
{
if (have_bias) { // allocate buffer to store bias-corrected data
float *y_new = new float [n * d_out];
const float *yr = y;
float *yw = y_new;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < d_out; j++) {
*yw++ = *yr++ - b [j];
}
}
y = y_new;
}
{
FINTEGER dii = d_in, doi = d_out, ni = n;
float one = 1.0, zero = 0.0;
sgemm_ ("Not", "Not", &dii, &ni, &doi,
&one, A.data (), &dii, y, &doi, &zero, x, &dii);
}
if (have_bias) delete [] y;
}
void LinearTransform::set_is_orthonormal ()
{
if (d_out > d_in) {
// not clear what we should do in this case
is_orthonormal = false;
return;
}
if (d_out == 0) { // borderline case, unnormalized matrix
is_orthonormal = true;
return;
}
double eps = 4e-5;
ABORT_UNLESS(A.size() >= d_out * d_in, "A.size() >= d_out * d_in");
{
std::vector<float> ATA(d_out * d_out);
FINTEGER dii = d_in, doi = d_out;
float one = 1.0, zero = 0.0;
sgemm_ ("Transposed", "Not", &doi, &doi, &dii,
&one, A.data (), &dii,
A.data(), &dii,
&zero, ATA.data(), &doi);
is_orthonormal = true;
for (long i = 0; i < d_out; i++) {
for (long j = 0; j < d_out; j++) {
float v = ATA[i + j * d_out];
if (i == j) v-= 1;
if (fabs(v) > eps) {
is_orthonormal = false;
}
}
}
}
}
void LinearTransform::reverse_transform (idx_t n, const float * xt,
float *x) const
{
if (is_orthonormal) {
transform_transpose (n, xt, x);
} else {
ABORT("reverse transform not implemented for non-orthonormal matrices");
}
}
void LinearTransform::print_if_verbose (
const char*name, const std::vector<double> &mat,
int n, int d) const
{
if (!verbose) return;
printf("matrix %s: %d*%d [\n", name, n, d);
ABORT_UNLESS(mat.size() >= n * d, "mat.size() >= n * d");
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
printf("%10.5g ", mat[i * d + j]);
}
printf("\n");
}
printf("]\n");
}
/*********************************************
* RandomRotationMatrix
*********************************************/
void RandomRotationMatrix::init (int seed)
{
if(d_out <= d_in) {
A.resize (d_out * d_in);
float *q = A.data();
float_randn(q, d_out * d_in, seed);
matrix_qr(d_in, d_out, q);
} else {
// use tight-frame transformation
A.resize (d_out * d_out);
float *q = A.data();
float_randn(q, d_out * d_out, seed);
matrix_qr(d_out, d_out, q);
// remove columns
int i, j;
for (i = 0; i < d_out; i++) {
for(j = 0; j < d_in; j++) {
q[i * d_in + j] = q[i * d_out + j];
}
}
A.resize(d_in * d_out);
}
is_orthonormal = true;
is_trained = true;
}
void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/)
{
// initialize with some arbitrary seed
init (12345);
}
/*********************************************
* PCAMatrix
*********************************************/
PCAMatrix::PCAMatrix (int d_in, int d_out,
float eigen_power, bool random_rotation):
LinearTransform(d_in, d_out, true),
eigen_power(eigen_power), random_rotation(random_rotation)
{
is_trained = false;
max_points_per_d = 1000;
balanced_bins = 0;
}
namespace {
/// Compute the eigenvalue decomposition of symmetric matrix cov,
/// dimensions d_in-by-d_in. Output eigenvectors in cov.
void eig(size_t d_in, double *cov, double *eigenvalues, int verbose)
{
{ // compute eigenvalues and vectors
FINTEGER info = 0, lwork = -1, di = d_in;
double workq;
dsyev_ ("Vectors as well", "Upper",
&di, cov, &di, eigenvalues, &workq, &lwork, &info);
lwork = FINTEGER(workq);
double *work = new double[lwork];
dsyev_ ("Vectors as well", "Upper",
&di, cov, &di, eigenvalues, work, &lwork, &info);
delete [] work;
if (info != 0) {
fprintf (stderr, "WARN ssyev info returns %d, "
"a very bad PCA matrix is learnt\n",
int(info));
// do not throw exception, as the matrix could still be useful
}
if(verbose && d_in <= 10) {
printf("info=%ld new eigvals=[", long(info));
for(int j = 0; j < d_in; j++) printf("%g ", eigenvalues[j]);
printf("]\n");
double *ci = cov;
printf("eigenvecs=\n");
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
printf("%10.4g ", *ci++);
printf("\n");
}
}
}
// revert order of eigenvectors & values
for(int i = 0; i < d_in / 2; i++) {
std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]);
double *v1 = cov + i * d_in;
double *v2 = cov + (d_in - 1 - i) * d_in;
for(int j = 0; j < d_in; j++)
std::swap(v1[j], v2[j]);
}
}
}
void PCAMatrix::train (Index::idx_t n, const float *x)
{
const float * x_in = x;
x = fvecs_maybe_subsample (d_in, (size_t*)&n,
max_points_per_d * d_in, x, verbose);
ScopeDeleter<float> del_x (x != x_in ? x : nullptr);
// compute mean
mean.clear(); mean.resize(d_in, 0.0);
if (have_bias) { // we may want to skip the bias
const float *xi = x;
for (int i = 0; i < n; i++) {
for(int j = 0; j < d_in; j++)
mean[j] += *xi++;
}
for(int j = 0; j < d_in; j++)
mean[j] /= n;
}
if(verbose) {
printf("mean=[");
for(int j = 0; j < d_in; j++) printf("%g ", mean[j]);
printf("]\n");
}
if(n >= d_in) {
// compute covariance matrix, store it in PCA matrix
PCAMat.resize(d_in * d_in);
float * cov = PCAMat.data();
{ // initialize with mean * mean^T term
float *ci = cov;
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
*ci++ = - n * mean[i] * mean[j];
}
}
{
FINTEGER di = d_in, ni = n;
float one = 1.0;
ssyrk_ ("Up", "Non transposed",
&di, &ni, &one, (float*)x, &di, &one, cov, &di);
}
if(verbose && d_in <= 10) {
float *ci = cov;
printf("cov=\n");
for(int i = 0; i < d_in; i++) {
for(int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> covd (d_in * d_in);
for (size_t i = 0; i < d_in * d_in; i++) covd [i] = cov [i];
std::vector<double> eigenvaluesd (d_in);
eig (d_in, covd.data (), eigenvaluesd.data (), verbose);
for (size_t i = 0; i < d_in * d_in; i++) PCAMat [i] = covd [i];
eigenvalues.resize (d_in);
for (size_t i = 0; i < d_in; i++)
eigenvalues [i] = eigenvaluesd [i];
} else {
std::vector<float> xc (n * d_in);
for (size_t i = 0; i < n; i++)
for(size_t j = 0; j < d_in; j++)
xc [i * d_in + j] = x [i * d_in + j] - mean[j];
// compute Gram matrix
std::vector<float> gram (n * n);
{
FINTEGER di = d_in, ni = n;
float one = 1.0, zero = 0.0;
ssyrk_ ("Up", "Transposed",
&ni, &di, &one, xc.data(), &di, &zero, gram.data(), &ni);
}
if(verbose && d_in <= 10) {
float *ci = gram.data();
printf("gram=\n");
for(int i = 0; i < n; i++) {
for(int j = 0; j < n; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> gramd (n * n);
for (size_t i = 0; i < n * n; i++)
gramd [i] = gram [i];
std::vector<double> eigenvaluesd (n);
// eig will fill in only the n first eigenvals
eig (n, gramd.data (), eigenvaluesd.data (), verbose);
PCAMat.resize(d_in * n);
for (size_t i = 0; i < n * n; i++)
gram [i] = gramd [i];
eigenvalues.resize (d_in);
// fill in only the n first ones
for (size_t i = 0; i < n; i++)
eigenvalues [i] = eigenvaluesd [i];
{ // compute PCAMat = x' * v
FINTEGER di = d_in, ni = n;
float one = 1.0;
sgemm_ ("Non", "Non Trans",
&di, &ni, &ni,
&one, xc.data(), &di, gram.data(), &ni,
&one, PCAMat.data(), &di);
}
if(verbose && d_in <= 10) {
float *ci = PCAMat.data();
printf("PCAMat=\n");
for(int i = 0; i < n; i++) {
for(int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
fvec_renorm_L2 (d_in, n, PCAMat.data());
}
prepare_Ab();
is_trained = true;
}
void PCAMatrix::copy_from (const PCAMatrix & other)
{
ABORT_UNLESS(other.is_trained, "other.is_trained");
mean = other.mean;
eigenvalues = other.eigenvalues;
PCAMat = other.PCAMat;
prepare_Ab ();
is_trained = true;
}
void PCAMatrix::prepare_Ab ()
{
ABORT_UNLESS(
d_out * d_in <= PCAMat.size(),
"PCA matrix cannot output %d dimensions from %d ",
d_out, d_in);
if (!random_rotation) {
A = PCAMat;
A.resize(d_out * d_in); // strip off useless dimensions
// first scale the components
if (eigen_power != 0) {
float *ai = A.data();
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i], eigen_power);
for(int j = 0; j < d_in; j++)
*ai++ *= factor;
}
}
if (balanced_bins != 0) {
ABORT_UNLESS(d_out % balanced_bins == 0, "d_out % balanced_bins == 0");
int dsub = d_out / balanced_bins;
std::vector <float> Ain;
std::swap(A, Ain);
A.resize(d_out * d_in);
std::vector <float> accu(balanced_bins);
std::vector <int> counter(balanced_bins);
// greedy assignment
for (int i = 0; i < d_out; i++) {
// find best bin
int best_j = -1;
float min_w = 1e30;
for (int j = 0; j < balanced_bins; j++) {
if (counter[j] < dsub && accu[j] < min_w) {
min_w = accu[j];
best_j = j;
}
}
int row_dst = best_j * dsub + counter[best_j];
accu[best_j] += eigenvalues[i];
counter[best_j] ++;
memcpy (&A[row_dst * d_in], &Ain[i * d_in],
d_in * sizeof (A[0]));
}
if (verbose) {
printf(" bin accu=[");
for (int i = 0; i < balanced_bins; i++)
printf("%g ", accu[i]);
printf("]\n");
}
}
} else {
ABORT_UNLESS(balanced_bins == 0,
"both balancing bins and applying a random rotation "
"does not make sense");
RandomRotationMatrix rr(d_out, d_out);
rr.init(5);
// apply scaling on the rotation matrix (right multiplication)
if (eigen_power != 0) {
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i], eigen_power);
for(int j = 0; j < d_out; j++)
rr.A[j * d_out + i] *= factor;
}
}
A.resize(d_in * d_out);
{
FINTEGER dii = d_in, doo = d_out;
float one = 1.0, zero = 0.0;
sgemm_ ("Not", "Not", &dii, &doo, &doo,
&one, PCAMat.data(), &dii, rr.A.data(), &doo, &zero,
A.data(), &dii);
}
}
b.clear(); b.resize(d_out);
for (int i = 0; i < d_out; i++) {
float accu = 0;
for (int j = 0; j < d_in; j++)
accu -= mean[j] * A[j + i * d_in];
b[i] = accu;
}
is_orthonormal = eigen_power == 0;
}
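As a hedged illustration of the `RandomRotationMatrix` that `IndexLSH` uses when `rotate_data` is set: `init` builds an orthonormal matrix from a QR decomposition of Gaussian noise, so `reverse_transform` recovers the input exactly; sizes here are arbitrary.

```cpp
// Hedged sketch of RandomRotationMatrix; sizes are arbitrary.
#include <faiss/VectorTransform.h>
#include <cstdio>
#include <vector>

int main() {
  const int d_in = 16, d_out = 64;
  faiss::RandomRotationMatrix rrot(d_in, d_out);
  rrot.init(5);                                  // the seed IndexLSH uses

  std::vector<float> x(d_in, 1.0f);              // one input vector
  float* xt = rrot.apply(1, x.data());           // newly allocated, size d_out
  std::vector<float> back(d_in);
  rrot.reverse_transform(1, xt, back.data());    // exact, since the matrix is orthonormal
  std::printf("x[0]=%g, round trip=%g\n", x[0], back[0]);
  delete[] xt;                                   // caller owns the buffer returned by apply()
  return 0;
}
```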

184
src/3rd_party/faiss/VectorTransform.h vendored Normal file

@ -0,0 +1,184 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_VECTOR_TRANSFORM_H
#define FAISS_VECTOR_TRANSFORM_H
/** Defines a few objects that apply transformations to a set of
* vectors. Often these are pre-processing steps.
*/
#include <vector>
#include <stdint.h>
#include <faiss/Index.h>
namespace faiss {
/** Any transformation applied on a set of vectors */
struct VectorTransform {
typedef Index::idx_t idx_t;
int d_in; ///! input dimension
int d_out; ///! output dimension
explicit VectorTransform (int d_in = 0, int d_out = 0):
d_in(d_in), d_out(d_out), is_trained(true)
{}
/// set if the VectorTransform does not require training, or if
/// training is done already
bool is_trained;
/** Perform training on a representative set of vectors. Does
* nothing by default.
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train (idx_t n, const float *x);
/** apply the random rotation, return a newly allocated matrix
* @param x size n * d_in
* @return size n * d_out
*/
float *apply (idx_t n, const float * x) const;
/// same as apply, but result is pre-allocated
virtual void apply_noalloc (idx_t n, const float * x,
float *xt) const = 0;
/// reverse transformation. May not be implemented or may return
/// approximate result
virtual void reverse_transform (idx_t n, const float * xt,
float *x) const;
virtual ~VectorTransform () {}
};
/** Generic linear transformation, with bias term applied on output
* y = A * x + b
*/
struct LinearTransform: VectorTransform {
bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
/// Transformation matrix, size d_out * d_in
std::vector<float> A;
/// bias vector, size d_out
std::vector<float> b;
/// both d_in > d_out and d_out < d_in are supported
explicit LinearTransform (int d_in = 0, int d_out = 0,
bool have_bias = false);
/// same as apply, but result is pre-allocated
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// compute x = A^T * (x - b)
/// is reverse transform if A has orthonormal lines
void transform_transpose (idx_t n, const float * y,
float *x) const;
/// works only if is_orthonormal
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal ();
bool verbose;
void print_if_verbose (const char*name, const std::vector<double> &mat,
int n, int d) const;
~LinearTransform() override {}
};
/// Randomly rotate a set of vectors
struct RandomRotationMatrix: LinearTransform {
/// both d_in > d_out and d_out < d_in are supported
RandomRotationMatrix (int d_in, int d_out):
LinearTransform(d_in, d_out, false) {}
/// must be called before the transform is used
void init(int seed);
// initializes with an arbitrary seed
void train(idx_t n, const float* x) override;
RandomRotationMatrix () {}
};
/** Applies a principal component analysis on a set of vectors,
* with optional whitening and random rotation. */
struct PCAMatrix: LinearTransform {
/** after transformation the components are multiplied by
* eigenvalues^eigen_power
*
* =0: no whitening
* =-0.5: full whitening
*/
float eigen_power;
/// random rotation after PCA
bool random_rotation;
/// ratio between # training vectors and dimension
size_t max_points_per_d;
/// try to distribute output eigenvectors in this many bins
int balanced_bins;
/// Mean, size d_in
std::vector<float> mean;
/// eigenvalues of covariance matrix (= squared singular values)
std::vector<float> eigenvalues;
/// PCA matrix, size d_in * d_in
std::vector<float> PCAMat;
// the final matrix is computed after random rotation and/or whitening
explicit PCAMatrix (int d_in = 0, int d_out = 0,
float eigen_power = 0, bool random_rotation = false);
/// train on n vectors. If n < d_in then the eigenvector matrix
/// will be completed with 0s
void train(idx_t n, const float* x) override;
/// copy pre-trained PCA matrix
void copy_from (const PCAMatrix & other);
/// called after mean, PCAMat and eigenvalues are computed
void prepare_Ab();
};
} // namespace faiss
#endif
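A brief hedged sketch of `PCAMatrix` as documented above (synthetic data, arbitrary sizes): train on `n` vectors, then `apply` projects them to `d_out` dimensions; with `eigen_power = 0` no whitening is applied.

```cpp
// Hedged sketch of PCAMatrix training and projection; data is synthetic.
#include <faiss/VectorTransform.h>
#include <faiss/utils/random.h>
#include <vector>

int main() {
  const int d_in = 32, d_out = 8;
  const faiss::Index::idx_t n = 1000;
  std::vector<float> x(n * d_in);
  faiss::float_randn(x.data(), x.size(), 123);   // synthetic training vectors

  faiss::PCAMatrix pca(d_in, d_out);             // eigen_power = 0: no whitening
  pca.train(n, x.data());
  float* xt = pca.apply(n, x.data());            // n x d_out projected vectors
  delete[] xt;                                   // caller owns the buffer returned by apply()
  return 0;
}
```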

122
src/3rd_party/faiss/utils/Heap.cpp vendored Normal file

@ -0,0 +1,122 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/* Function for soft heap */
#include <faiss/utils/Heap.h>
namespace faiss {
template <typename C>
void HeapArray<C>::heapify ()
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++)
heap_heapify<C> (k, val + j * k, ids + j * k);
}
template <typename C>
void HeapArray<C>::reorder ()
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++)
heap_reorder<C> (k, val + j * k, ids + j * k);
}
template <typename C>
void HeapArray<C>::addn (size_t nj, const T *vin, TI j0,
size_t i0, int64_t ni)
{
if (ni == -1) ni = nh;
assert (i0 >= 0 && i0 + ni <= nh);
//#pragma omp parallel for
for (size_t i = i0; i < i0 + ni; i++) {
T * __restrict simi = get_val(i);
TI * __restrict idxi = get_ids (i);
const T *ip_line = vin + (i - i0) * nj;
for (size_t j = 0; j < nj; j++) {
T ip = ip_line [j];
if (C::cmp(simi[0], ip)) {
heap_pop<C> (k, simi, idxi);
heap_push<C> (k, simi, idxi, ip, j + j0);
}
}
}
}
template <typename C>
void HeapArray<C>::addn_with_ids (
size_t nj, const T *vin, const TI *id_in,
int64_t id_stride, size_t i0, int64_t ni)
{
if (id_in == nullptr) {
addn (nj, vin, 0, i0, ni);
return;
}
if (ni == -1) ni = nh;
assert (i0 >= 0 && i0 + ni <= nh);
//#pragma omp parallel for
for (size_t i = i0; i < i0 + ni; i++) {
T * __restrict simi = get_val(i);
TI * __restrict idxi = get_ids (i);
const T *ip_line = vin + (i - i0) * nj;
const TI *id_line = id_in + (i - i0) * id_stride;
for (size_t j = 0; j < nj; j++) {
T ip = ip_line [j];
if (C::cmp(simi[0], ip)) {
heap_pop<C> (k, simi, idxi);
heap_push<C> (k, simi, idxi, ip, id_line [j]);
}
}
}
}
template <typename C>
void HeapArray<C>::per_line_extrema (
T * out_val,
TI * out_ids) const
{
//#pragma omp parallel for
for (size_t j = 0; j < nh; j++) {
int64_t imin = -1;
typename C::T xval = C::Crev::neutral ();
const typename C::T * x_ = val + j * k;
for (size_t i = 0; i < k; i++)
if (C::cmp (x_[i], xval)) {
xval = x_[i];
imin = i;
}
if (out_val)
out_val[j] = xval;
if (out_ids) {
if (ids && imin != -1)
out_ids[j] = ids [j * k + imin];
else
out_ids[j] = imin;
}
}
}
// explicit instantiations
template struct HeapArray<CMin <float, int64_t> >;
template struct HeapArray<CMax <float, int64_t> >;
template struct HeapArray<CMin <int, int64_t> >;
template struct HeapArray<CMax <int, int64_t> >;
} // END namespace faiss
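A hedged sketch of how `HeapArray` is used, mirroring `IndexLSH::search` above: one max-heap per query row keeps the k smallest entries of a distance table; sizes and values are arbitrary.

```cpp
// Hedged sketch of HeapArray: per-row top-k over a small distance table.
#include <faiss/utils/Heap.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t nq = 2, nb = 5, k = 3;
  std::vector<int> D(nq * nb);                   // e.g. Hamming distances, one row per query
  for (size_t i = 0; i < D.size(); i++) D[i] = int((i * 7) % 11);

  std::vector<int> val(nq * k);
  std::vector<int64_t> ids(nq * k);
  faiss::int_maxheap_array_t res = { nq, k, ids.data(), val.data() };
  res.heapify();                                 // fill heaps with neutral values
  res.addn(nb, D.data());                        // push the nb candidates of each row
  res.reorder();                                 // sort each heap, smallest distance first
  std::printf("query 0: best id=%lld, dist=%d\n", (long long)ids[0], val[0]);
  return 0;
}
```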

495
src/3rd_party/faiss/utils/Heap.h vendored Normal file

@ -0,0 +1,495 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* C++ support for heaps. The set of functions is tailored for
* efficient similarity search.
*
* There is no specific object for a heap, and the functions that
* operate on a single heap are inlined, because heaps are often
* small. More complex functions are implemented in Heap.cpp
*
*/
#ifndef FAISS_Heap_h
#define FAISS_Heap_h
#include <climits>
#include <cstring>
#include <cmath>
#include <cassert>
#include <cstdio>
#include <stdint.h>
#include <limits>
namespace faiss {
/*******************************************************************
* C object: uniform handling of min and max heap
*******************************************************************/
/** The C object gives the type T of the values in the heap, the type
* of the keys, TI and the comparison that is done: > for the minheap
* and < for the maxheap. The neutral value will always be dropped in
* favor of any other value in the heap.
*/
template <typename T_, typename TI_>
struct CMax;
// traits of minheaps = heaps where the minimum value is stored on top
// useful to find the *max* values of an array
template <typename T_, typename TI_>
struct CMin {
typedef T_ T;
typedef TI_ TI;
typedef CMax<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a < b;
}
// value that will be popped first -> must be smaller than all others
// for int types this is not strictly the smallest val (-max - 1)
inline static T neutral () {
return -std::numeric_limits<T>::max();
}
};
template <typename T_, typename TI_>
struct CMax {
typedef T_ T;
typedef TI_ TI;
typedef CMin<T_, TI_> Crev;
inline static bool cmp (T a, T b) {
return a > b;
}
inline static T neutral () {
return std::numeric_limits<T>::max();
}
};
/*******************************************************************
* Basic heap ops: push and pop
*******************************************************************/
/** Pops the top element from the heap defined by bh_val[0..k-1] and
* bh_ids[0..k-1]. on output the element at k-1 is undefined.
*/
template <class C> inline
void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
typename C::T val = bh_val[k];
size_t i = 1, i1, i2;
while (1) {
i1 = i << 1;
i2 = i1 + 1;
if (i1 > k)
break;
if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
if (C::cmp(val, bh_val[i1]))
break;
bh_val[i] = bh_val[i1];
bh_ids[i] = bh_ids[i1];
i = i1;
}
else {
if (C::cmp(val, bh_val[i2]))
break;
bh_val[i] = bh_val[i2];
bh_ids[i] = bh_ids[i2];
i = i2;
}
}
bh_val[i] = bh_val[k];
bh_ids[i] = bh_ids[k];
}
/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
* bh_ids[0..k-2]. on output the element at k-1 is defined.
*/
template <class C> inline
void heap_push (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
typename C::T val, typename C::TI ids)
{
bh_val--; /* Use 1-based indexing for easier node->child translation */
bh_ids--;
size_t i = k, i_father;
while (i > 1) {
i_father = i >> 1;
if (!C::cmp (val, bh_val[i_father])) /* the heap structure is ok */
break;
bh_val[i] = bh_val[i_father];
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_val[i] = val;
bh_ids[i] = ids;
}
/* Partial instantiation for heaps with TI = int64_t */
template <typename T> inline
void minheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
{
heap_pop<CMin<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
void minheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
{
heap_push<CMin<T, int64_t> > (k, bh_val, bh_ids, val, ids);
}
template <typename T> inline
void maxheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
{
heap_pop<CMax<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
void maxheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
{
heap_push<CMax<T, int64_t> > (k, bh_val, bh_ids, val, ids);
}
/*******************************************************************
* Heap initialization
*******************************************************************/
/* Initialization phase for the heap (with unconditional pushes).
* Store k0 elements in a heap containing up to k values. Note that
* (bh_val, bh_ids) can be the same as (x, ids) */
template <class C> inline
void heap_heapify (
size_t k,
typename C::T * bh_val,
typename C::TI * bh_ids,
const typename C::T * x = nullptr,
const typename C::TI * ids = nullptr,
size_t k0 = 0)
{
if (k0 > 0) assert (x);
if (ids) {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], ids[i]);
} else {
for (size_t i = 0; i < k0; i++)
heap_push<C> (i+1, bh_val, bh_ids, x[i], i);
}
for (size_t i = k0; i < k; i++) {
bh_val[i] = C::neutral();
bh_ids[i] = -1;
}
}
template <typename T> inline
void minheap_heapify (
size_t k, T * bh_val,
int64_t * bh_ids,
const T * x = nullptr,
const int64_t * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
}
template <typename T> inline
void maxheap_heapify (
size_t k,
T * bh_val,
int64_t * bh_ids,
const T * x = nullptr,
const int64_t * ids = nullptr,
size_t k0 = 0)
{
heap_heapify< CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
}
/*******************************************************************
* Add n elements to the heap
*******************************************************************/
/* Add some elements to the heap */
template <class C> inline
void heap_addn (size_t k,
typename C::T * bh_val, typename C::TI * bh_ids,
const typename C::T * x,
const typename C::TI * ids,
size_t n)
{
size_t i;
if (ids)
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], ids[i]);
}
}
else
for (i = 0; i < n; i++) {
if (C::cmp (bh_val[0], x[i])) {
heap_pop<C> (k, bh_val, bh_ids);
heap_push<C> (k, bh_val, bh_ids, x[i], i);
}
}
}
/* Partial instantiation for heaps with TI = int64_t */
template <typename T> inline
void minheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
const T * x, const int64_t * ids, size_t n)
{
heap_addn<CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
}
template <typename T> inline
void maxheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
const T * x, const int64_t * ids, size_t n)
{
heap_addn<CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
}
/*******************************************************************
* Heap finalization (reorder elements)
*******************************************************************/
/* This function maps a binary heap into a sorted structure.
It returns the number of valid (non-neutral) elements. */
template <typename C> inline
size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
{
size_t i, ii;
for (i = 0, ii = 0; i < k; i++) {
/* top element should be put at the end of the list */
typename C::T val = bh_val[0];
typename C::TI id = bh_ids[0];
/* boundary case: we will over-ride this value if not a true element */
heap_pop<C> (k-i, bh_val, bh_ids);
bh_val[k-ii-1] = val;
bh_ids[k-ii-1] = id;
if (id != -1) ii++;
}
/* Count the number of elements which are effectively returned */
size_t nel = ii;
memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val));
memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids));
for (; ii < k; ii++) {
bh_val[ii] = C::neutral();
bh_ids[ii] = -1;
}
return nel;
}
template <typename T> inline
size_t minheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
{
return heap_reorder< CMin<T, int64_t> > (k, bh_val, bh_ids);
}
template <typename T> inline
size_t maxheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
{
return heap_reorder< CMax<T, int64_t> > (k, bh_val, bh_ids);
}
/*******************************************************************
* Operations on heap arrays
*******************************************************************/
/** a template structure for a set of [min|max]-heaps; it is tailored
* so that the actual data of the heaps can just live in compact
* arrays.
*/
template <typename C>
struct HeapArray {
typedef typename C::TI TI;
typedef typename C::T T;
size_t nh; ///< number of heaps
size_t k; ///< allocated size per heap
TI * ids; ///< identifiers (size nh * k)
T * val; ///< values (distances or similarities), size nh * k
/// Return the list of values for a heap
T * get_val (size_t key) { return val + key * k; }
/// Corresponding identifiers
TI * get_ids (size_t key) { return ids + key * k; }
/// prepare all the heaps before adding
void heapify ();
/** add nj elements to heaps i0:i0+ni, with sequential ids
*
* @param nj nb of elements to add to each heap
* @param vin elements to add, size ni * nj
* @param j0 add this to the ids that are added
* @param i0 first heap to update
* @param ni nb of elements to update (-1 = use nh)
*/
void addn (size_t nj, const T *vin, TI j0 = 0,
size_t i0 = 0, int64_t ni = -1);
/** same as addn
*
* @param id_in ids of the elements to add, size ni * nj
* @param id_stride stride for id_in
*/
void addn_with_ids (
size_t nj, const T *vin, const TI *id_in = nullptr,
int64_t id_stride = 0, size_t i0 = 0, int64_t ni = -1);
/// reorder all the heaps
void reorder ();
/** this is not really a heap function. It just finds the per-line
* extrema of each line of array D
* @param vals_out extreme value of each line (size nh, or NULL)
* @param idx_out index of extreme value (size nh or NULL)
*/
void per_line_extrema (T *vals_out, TI *idx_out) const;
};
/* Define useful heaps */
typedef HeapArray<CMin<float, int64_t> > float_minheap_array_t;
typedef HeapArray<CMin<int, int64_t> > int_minheap_array_t;
typedef HeapArray<CMax<float, int64_t> > float_maxheap_array_t;
typedef HeapArray<CMax<int, int64_t> > int_maxheap_array_t;
// The heap templates are instantiated explicitly in Heap.cpp
/*********************************************************************
* Indirect heaps: instead of having
*
* node i = (bh_ids[i], bh_val[i]),
*
* in indirect heaps,
*
* node i = (bh_ids[i], bh_val[bh_ids[i]]),
*
*********************************************************************/
template <class C>
inline
void indirect_heap_pop (
size_t k,
const typename C::T * bh_val,
typename C::TI * bh_ids)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[bh_ids[k]];
size_t i = 1;
while (1) {
size_t i1 = i << 1;
size_t i2 = i1 + 1;
if (i1 > k)
break;
typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2];
if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) {
if (C::cmp(val, bh_val[id1]))
break;
bh_ids[i] = id1;
i = i1;
} else {
if (C::cmp(val, bh_val[id2]))
break;
bh_ids[i] = id2;
i = i2;
}
}
bh_ids[i] = bh_ids[k];
}
template <class C>
inline
void indirect_heap_push (size_t k,
const typename C::T * bh_val, typename C::TI * bh_ids,
typename C::TI id)
{
bh_ids--; /* Use 1-based indexing for easier node->child translation */
typename C::T val = bh_val[id];
size_t i = k;
while (i > 1) {
size_t i_father = i >> 1;
if (!C::cmp (val, bh_val[bh_ids[i_father]]))
break;
bh_ids[i] = bh_ids[i_father];
i = i_father;
}
bh_ids[i] = id;
}
} // namespace faiss
#endif /* FAISS_Heap_h */
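And a hedged sketch of the single-heap helpers declared above: a max-heap (`CMax`) keeps the k smallest values seen so far, which is how nearest neighbors under a distance are collected, and `maxheap_reorder` then sorts them in ascending order; values are arbitrary.

```cpp
// Hedged sketch: collect the k smallest values of an array with the maxheap helpers.
#include <faiss/utils/Heap.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t k = 3;
  std::vector<float> vals = {5.f, 1.f, 4.f, 2.f, 8.f, 0.5f};

  std::vector<float> heap_val(k);
  std::vector<int64_t> heap_ids(k);
  faiss::maxheap_heapify(k, heap_val.data(), heap_ids.data());  // fill with neutral values
  faiss::maxheap_addn(k, heap_val.data(), heap_ids.data(),
                      vals.data(), nullptr, vals.size());       // ids default to positions in vals
  faiss::maxheap_reorder(k, heap_val.data(), heap_ids.data());  // ascending: 0.5, 1, 2

  for (size_t i = 0; i < k; i++)
    std::printf("%zu: val=%g, id=%lld\n", i, heap_val[i], (long long)heap_ids[i]);
  return 0;
}
```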

472
src/3rd_party/faiss/utils/hamming-inl.h vendored Normal file

@ -0,0 +1,472 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
namespace faiss {
inline BitstringWriter::BitstringWriter(uint8_t *code, int code_size):
code (code), code_size (code_size), i(0)
{
bzero (code, code_size);
}
inline void BitstringWriter::write(uint64_t x, int nbit) {
assert (code_size * 8 >= nbit + i);
// nb of available bits in i / 8
int na = 8 - (i & 7);
if (nbit <= na) {
code[i >> 3] |= x << (i & 7);
i += nbit;
return;
} else {
int j = i >> 3;
code[j++] |= x << (i & 7);
i += nbit;
x >>= na;
while (x != 0) {
code[j++] |= x;
x >>= 8;
}
}
}
inline BitstringReader::BitstringReader(const uint8_t *code, int code_size):
code (code), code_size (code_size), i(0)
{}
inline uint64_t BitstringReader::read(int nbit) {
assert (code_size * 8 >= nbit + i);
// nb of available bits in i / 8
int na = 8 - (i & 7);
// get available bits in current byte
uint64_t res = code[i >> 3] >> (i & 7);
if (nbit <= na) {
res &= (1 << nbit) - 1;
i += nbit;
return res;
} else {
int ofs = na;
int j = (i >> 3) + 1;
i += nbit;
nbit -= na;
while (nbit > 8) {
res |= ((uint64_t)code[j++]) << ofs;
ofs += 8;
nbit -= 8; // TODO remove nbit
}
uint64_t last_byte = code[j];
last_byte &= (1 << nbit) - 1;
res |= last_byte << ofs;
return res;
}
}
/******************************************************************
* The HammingComputer series of classes compares a single code of
* size 4 to 32 to incoming codes. They are intended for use as a
* template class where it would be inefficient to switch on the code
* size in the inner loop. Hopefully the compiler will inline the
* hamming() functions and put the a0, a1, ... in registers.
******************************************************************/
struct HammingComputer4 {
uint32_t a0;
HammingComputer4 () {}
HammingComputer4 (const uint8_t *a, int code_size) {
set (a, code_size);
}
void set (const uint8_t *a, int code_size) {
assert (code_size == 4);
a0 = *(uint32_t *)a;
}
inline int hamming (const uint8_t *b) const {
return popcount64 (*(uint32_t *)b ^ a0);
}
};
struct HammingComputer8 {
uint64_t a0;
HammingComputer8 () {}
HammingComputer8 (const uint8_t *a, int code_size) {
set (a, code_size);
}
void set (const uint8_t *a, int code_size) {
assert (code_size == 8);
a0 = *(uint64_t *)a;
}
inline int hamming (const uint8_t *b) const {
return popcount64 (*(uint64_t *)b ^ a0);
}
};
struct HammingComputer16 {
uint64_t a0, a1;
HammingComputer16 () {}
HammingComputer16 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 16);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1);
}
};
// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
// This incurs a penalty of ~10% wrt. fully aligned accesses.
struct HammingComputer20 {
uint64_t a0, a1;
uint32_t a2;
HammingComputer20 () {}
HammingComputer20 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 20);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (*(uint32_t*)(b + 2) ^ a2);
}
};
struct HammingComputer32 {
uint64_t a0, a1, a2, a3;
HammingComputer32 () {}
HammingComputer32 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 32);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3);
}
};
struct HammingComputer64 {
uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
HammingComputer64 () {}
HammingComputer64 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size == 64);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
a4 = a[4]; a5 = a[5]; a6 = a[6]; a7 = a[7];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3) +
popcount64 (b[4] ^ a4) + popcount64 (b[5] ^ a5) +
popcount64 (b[6] ^ a6) + popcount64 (b[7] ^ a7);
}
};
// very inefficient...
struct HammingComputerDefault {
const uint8_t *a;
int n;
HammingComputerDefault () {}
HammingComputerDefault (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
a = a8;
n = code_size;
}
int hamming (const uint8_t *b8) const {
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b8[i]);
return accu;
}
};
struct HammingComputerM8 {
const uint64_t *a;
int n;
HammingComputerM8 () {}
HammingComputerM8 (const uint8_t *a8, int code_size) {
set (a8, code_size);
}
void set (const uint8_t *a8, int code_size) {
assert (code_size % 8 == 0);
a = (uint64_t *)a8;
n = code_size / 8;
}
int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b[i]);
return accu;
}
};
// even more inefficient!
struct HammingComputerM4 {
const uint32_t *a;
int n;
HammingComputerM4 () {}
HammingComputerM4 (const uint8_t *a4, int code_size) {
set (a4, code_size);
}
void set (const uint8_t *a4, int code_size) {
assert (code_size % 4 == 0);
a = (uint32_t *)a4;
n = code_size / 4;
}
int hamming (const uint8_t *b8) const {
const uint32_t *b = (uint32_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64 (a[i] ^ b[i]);
return accu;
}
};
/***************************************************************************
* Equivalence with a template class when code size is known at compile time
**************************************************************************/
// default template
template<int CODE_SIZE>
struct HammingComputer: HammingComputerM8 {
HammingComputer (const uint8_t *a, int code_size):
HammingComputerM8(a, code_size) {}
};
#define SPECIALIZED_HC(CODE_SIZE) \
template<> struct HammingComputer<CODE_SIZE>: \
HammingComputer ## CODE_SIZE { \
HammingComputer (const uint8_t *a): \
HammingComputer ## CODE_SIZE(a, CODE_SIZE) {} \
}
SPECIALIZED_HC(4);
SPECIALIZED_HC(8);
SPECIALIZED_HC(16);
SPECIALIZED_HC(20);
SPECIALIZED_HC(32);
SPECIALIZED_HC(64);
#undef SPECIALIZED_HC
/***************************************************************************
* generalized Hamming = number of bytes that are different between
* two codes.
***************************************************************************/
inline int generalized_hamming_64 (uint64_t a) {
a |= a >> 1;
a |= a >> 2;
a |= a >> 4;
a &= 0x0101010101010101UL;
return popcount64 (a);
}
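
The shift-OR cascade folds every bit of a byte into that byte's lowest bit, the mask keeps only those low bits, and the popcount therefore returns the number of non-zero bytes, i.e. the number of byte positions at which two codes differ after XOR. A small check (illustration only):

#include <faiss/utils/hamming.h>
#include <cassert>
#include <cstdint>

void generalized_hamming_example() {
  // bytes 6 and 2 are non-zero, all other bytes are zero
  uint64_t x = 0x00FF000000120000ULL;
  assert(faiss::generalized_hamming_64(x) == 2);
}
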
struct GenHammingComputer8 {
uint64_t a0;
GenHammingComputer8 (const uint8_t *a, int code_size) {
assert (code_size == 8);
a0 = *(uint64_t *)a;
}
inline int hamming (const uint8_t *b) const {
return generalized_hamming_64 (*(uint64_t *)b ^ a0);
}
};
struct GenHammingComputer16 {
uint64_t a0, a1;
GenHammingComputer16 (const uint8_t *a8, int code_size) {
assert (code_size == 16);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return generalized_hamming_64 (b[0] ^ a0) +
generalized_hamming_64 (b[1] ^ a1);
}
};
struct GenHammingComputer32 {
uint64_t a0, a1, a2, a3;
GenHammingComputer32 (const uint8_t *a8, int code_size) {
assert (code_size == 32);
const uint64_t *a = (uint64_t *)a8;
a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
}
inline int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
return generalized_hamming_64 (b[0] ^ a0) +
generalized_hamming_64 (b[1] ^ a1) +
generalized_hamming_64 (b[2] ^ a2) +
generalized_hamming_64 (b[3] ^ a3);
}
};
struct GenHammingComputerM8 {
const uint64_t *a;
int n;
GenHammingComputerM8 (const uint8_t *a8, int code_size) {
assert (code_size % 8 == 0);
a = (uint64_t *)a8;
n = code_size / 8;
}
int hamming (const uint8_t *b8) const {
const uint64_t *b = (uint64_t *)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += generalized_hamming_64 (a[i] ^ b[i]);
return accu;
}
};
/** generalized Hamming distances (= count number of code bytes that
 differ) */
void generalized_hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t code_size,
int ordered = true);
/** This class maintains a list of best distances seen so far.
*
* Since the distances are in a limited range (0 to nbit), the
* object maintains one list per possible distance, and fills
* in only the n-first lists, such that the sum of sizes of the
* n lists is below k.
*/
template<class HammingComputer>
struct HCounterState {
int *counters;
int64_t *ids_per_dis;
HammingComputer hc;
int thres;
int count_lt;
int count_eq;
int k;
HCounterState(int *counters, int64_t *ids_per_dis,
const uint8_t *x, int d, int k)
: counters(counters),
ids_per_dis(ids_per_dis),
hc(x, d / 8),
thres(d + 1),
count_lt(0),
count_eq(0),
k(k) {}
void update_counter(const uint8_t *y, size_t j) {
int32_t dis = hc.hamming(y);
if (dis <= thres) {
if (dis < thres) {
ids_per_dis[dis * k + counters[dis]++] = j;
++count_lt;
while (count_lt == k && thres > 0) {
--thres;
count_eq = counters[thres];
count_lt -= count_eq;
}
} else if (count_eq < k) {
ids_per_dis[dis * k + count_eq++] = j;
counters[dis] = count_eq;
}
}
}
};
} // namespace faiss
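
As the comment above the HammingComputer classes says, they are meant to be instantiated once per query and passed as a template parameter, so the code size is fixed outside the inner loop. A minimal sketch of that pattern (illustration only, mirroring the inner loops in hamming.cpp below):

#include <faiss/utils/hamming.h>
#include <cstddef>
#include <cstdint>

template <class HC>
size_t count_within_radius(const uint8_t* query,
                           const uint8_t* db, size_t n, size_t code_size,
                           int radius) {
  HC hc(query, (int)code_size);   // code size is checked/fixed by the computer class
  size_t hits = 0;
  for (size_t j = 0; j < n; j++)
    if (hc.hamming(db + j * code_size) <= radius)
      hits++;
  return hits;
}
// e.g. count_within_radius<faiss::HammingComputer16>(q, db, n, /*code_size=*/16, /*radius=*/5);
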

879
src/3rd_party/faiss/utils/hamming.cpp vendored Normal file
View File

@ -0,0 +1,879 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* Implementation of Hamming related functions (distances, smallest distance
* selection with regular heap|radix and probabilistic heap|radix.
*
* IMPLEMENTATION NOTES
* Bitvectors are generally assumed to be multiples of 64 bits.
*
* hamdis_t is used for distances because at this time
* it is not clear how we will need to balance
 * - flexibility in vector size (unclear whether vectors of more than 2^16 or even 2^8 bits will be needed)
* - memory usage
* - cache-misses when dealing with large volumes of data (lower bits is better)
*
 * The hamdis_t should optimally be compatible with one of the Torch Storage
* (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes
*/
#include <faiss/utils/hamming.h>
#include <vector>
#include <memory>
#include <stdio.h>
#include <math.h>
#include <faiss/utils/Heap.h>
#include "common/logging.h"
#include "misc.h"
static const size_t BLOCKSIZE_QUERY = 8192;
namespace faiss {
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
for (size_t i = 0; i < d; ++i) {
x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
}
}
//////////////////////////////////////////
size_t hamming_batch_size = 65536;
static const uint8_t hamdis_tab_ham_bytes[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
/* Elementary Hamming distance computation: unoptimized */
template <size_t nbits, typename T>
T hamming (const uint8_t *bs1,
const uint8_t *bs2)
{
const size_t nbytes = nbits / 8;
size_t i;
T h = 0;
for (i = 0; i < nbytes; i++)
h += (T) hamdis_tab_ham_bytes[bs1[i]^bs2[i]];
return h;
}
/* Hamming distances for multiples of 64 bits */
template <size_t nbits>
hamdis_t hamming (const uint64_t * bs1, const uint64_t * bs2)
{
const size_t nwords = nbits / 64;
size_t i;
hamdis_t h = 0;
for (i = 0; i < nwords; i++)
h += popcount64 (bs1[i] ^ bs2[i]);
return h;
}
/* specialized (optimized) functions */
template <>
hamdis_t hamming<64> (const uint64_t * pa, const uint64_t * pb)
{
return popcount64 (pa[0] ^ pb[0]);
}
template <>
hamdis_t hamming<128> (const uint64_t *pa, const uint64_t *pb)
{
return popcount64 (pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
}
template <>
hamdis_t hamming<256> (const uint64_t * pa, const uint64_t * pb)
{
return popcount64 (pa[0] ^ pb[0])
+ popcount64 (pa[1] ^ pb[1])
+ popcount64 (pa[2] ^ pb[2])
+ popcount64 (pa[3] ^ pb[3]);
}
/* Hamming distances for multiple of 64 bits */
hamdis_t hamming (
const uint64_t * bs1,
const uint64_t * bs2,
size_t nwords)
{
size_t i;
hamdis_t h = 0;
for (i = 0; i < nwords; i++)
h += popcount64 (bs1[i] ^ bs2[i]);
return h;
}
template <size_t nbits>
void hammings (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1, size_t n2,
hamdis_t * dis)
{
size_t i, j;
const size_t nwords = nbits / 64;
for (i = 0; i < n1; i++) {
const uint64_t * __restrict bs1_ = bs1 + i * nwords;
hamdis_t * __restrict dis_ = dis + i * n2;
for (j = 0; j < n2; j++)
dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
}
}
void hammings (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
size_t nwords,
hamdis_t * __restrict dis)
{
size_t i, j;
n1 *= nwords;
n2 *= nwords;
for (i = 0; i < n1; i+=nwords) {
const uint64_t * bs1_ = bs1+i;
for (j = 0; j < n2; j+=nwords)
dis[j] = hamming (bs1_, bs2+j, nwords);
}
}
/* Count number of matches given a max threshold */
template <size_t nbits>
void hamming_count_thres (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t * nptr)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
const uint64_t * bs2_ = bs2;
for (i = 0; i < n1; i++) {
bs2 = bs2_;
for (j = 0; j < n2; j++) {
/* collect the match only if this satisfies the threshold */
if (hamming <nbits> (bs1, bs2) <= ht)
posm++;
bs2 += nwords;
}
bs1 += nwords; /* next signature */
}
*nptr = posm;
}
template <size_t nbits>
void crosshamming_count_thres (
const uint64_t * dbs,
size_t n,
int ht,
size_t * nptr)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
const uint64_t * bs1 = dbs;
for (i = 0; i < n; i++) {
const uint64_t * bs2 = bs1 + 2;
for (j = i + 1; j < n; j++) {
/* collect the match only if this satisfies the threshold */
if (hamming <nbits> (bs1, bs2) <= ht)
posm++;
bs2 += nwords;
}
bs1 += nwords;
}
*nptr = posm;
}
template <size_t nbits>
size_t match_hamming_thres (
const uint64_t * bs1,
const uint64_t * bs2,
size_t n1,
size_t n2,
int ht,
int64_t * idx,
hamdis_t * hams)
{
const size_t nwords = nbits / 64;
size_t i, j, posm = 0;
hamdis_t h;
const uint64_t * bs2_ = bs2;
for (i = 0; i < n1; i++) {
bs2 = bs2_;
for (j = 0; j < n2; j++) {
/* Here perform the real work of computing the distance */
h = hamming <nbits> (bs1, bs2);
/* collect the match only if this satisfies the threshold */
if (h <= ht) {
/* Enough space to store another match ? */
*idx = i; idx++;
*idx = j; idx++;
*hams = h;
hams++;
posm++;
}
bs2+=nwords; /* next signature */
}
bs1+=nwords;
}
return posm;
}
/* Return closest neighbors w.r.t Hamming distance, using a heap. */
template <class HammingComputer>
static
void hammings_knn_hc (
int bytes_per_code,
int_maxheap_array_t * ha,
const uint8_t * bs1,
const uint8_t * bs2,
size_t n2,
bool order = true,
bool init_heap = true)
{
size_t k = ha->k;
if (init_heap) ha->heapify ();
const size_t block_size = hamming_batch_size;
for (size_t j0 = 0; j0 < n2; j0 += block_size) {
const size_t j1 = std::min(j0 + block_size, n2);
//#pragma omp parallel for
for (size_t i = 0; i < ha->nh; i++) {
HammingComputer hc (bs1 + i * bytes_per_code, bytes_per_code);
const uint8_t * bs2_ = bs2 + j0 * bytes_per_code;
hamdis_t dis;
hamdis_t * __restrict bh_val_ = ha->val + i * k;
int64_t * __restrict bh_ids_ = ha->ids + i * k;
size_t j;
for (j = j0; j < j1; j++, bs2_+= bytes_per_code) {
dis = hc.hamming (bs2_);
if (dis < bh_val_[0]) {
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
}
}
}
}
if (order) ha->reorder ();
}
/* Return closest neighbors w.r.t Hamming distance, using max count. */
template <class HammingComputer>
static
void hammings_knn_mc (
int bytes_per_code,
const uint8_t *a,
const uint8_t *b,
size_t na,
size_t nb,
size_t k,
int32_t *distances,
int64_t *labels)
{
const int nBuckets = bytes_per_code * 8 + 1;
std::vector<int> all_counters(na * nBuckets, 0);
std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
std::vector<HCounterState<HammingComputer>> cs;
for (size_t i = 0; i < na; ++i) {
cs.push_back(HCounterState<HammingComputer>(
all_counters.data() + i * nBuckets,
all_ids_per_dis.get() + i * nBuckets * k,
a + i * bytes_per_code,
8 * bytes_per_code,
k
));
}
const size_t block_size = hamming_batch_size;
for (size_t j0 = 0; j0 < nb; j0 += block_size) {
const size_t j1 = std::min(j0 + block_size, nb);
//#pragma omp parallel for
for (size_t i = 0; i < na; ++i) {
for (size_t j = j0; j < j1; ++j) {
cs[i].update_counter(b + j * bytes_per_code, j);
}
}
}
for (size_t i = 0; i < na; ++i) {
HCounterState<HammingComputer>& csi = cs[i];
int nres = 0;
for (int b = 0; b < nBuckets && nres < k; b++) {
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
distances[i * k + nres] = b;
nres++;
}
}
while (nres < k) {
labels[i * k + nres] = -1;
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
++nres;
}
}
}
// works faster than the template version
static
void hammings_knn_hc_1 (
int_maxheap_array_t * ha,
const uint64_t * bs1,
const uint64_t * bs2,
size_t n2,
bool order = true,
bool init_heap = true)
{
const size_t nwords = 1;
size_t k = ha->k;
if (init_heap) {
ha->heapify ();
}
//#pragma omp parallel for
for (size_t i = 0; i < ha->nh; i++) {
const uint64_t bs1_ = bs1 [i];
const uint64_t * bs2_ = bs2;
hamdis_t dis;
hamdis_t * bh_val_ = ha->val + i * k;
hamdis_t bh_val_0 = bh_val_[0];
int64_t * bh_ids_ = ha->ids + i * k;
size_t j;
for (j = 0; j < n2; j++, bs2_+= nwords) {
dis = popcount64 (bs1_ ^ *bs2_);
if (dis < bh_val_0) {
faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
bh_val_0 = bh_val_[0];
}
}
}
if (order) {
ha->reorder ();
}
}
/* Functions to map vectors to bits. Assume proper allocation is done beforehand,
meaning that b should be able to receive as many bits as x may produce. */
/*
* dimension 0 corresponds to the least significant bit of b[0], or
* equivalently to the lsb of the first byte that is stored.
*/
void fvec2bitvec (const float * x, uint8_t * b, size_t d)
{
for (int i = 0; i < d; i += 8) {
uint8_t w = 0;
uint8_t mask = 1;
int nj = i + 8 <= d ? 8 : d - i;
for (int j = 0; j < nj; j++) {
if (x[i + j] >= 0)
w |= mask;
mask <<= 1;
}
*b = w;
b++;
}
}
/* Same but for n vectors.
Ensure that the output b is byte-aligned (pad with 0s). */
void fvecs2bitvecs (const float * x, uint8_t * b, size_t d, size_t n)
{
const int64_t ncodes = ((d + 7) / 8);
//#pragma omp parallel for if(n > 100000)
for (size_t i = 0; i < n; i++)
fvec2bitvec (x + i * d, b + i * ncodes, d);
}
void bitvecs2fvecs (
const uint8_t * b,
float * x,
size_t d,
size_t n) {
const int64_t ncodes = ((d + 7) / 8);
//#pragma omp parallel for if(n > 100000)
for (size_t i = 0; i < n; i++) {
binary_to_real (d, b + i * ncodes, x + i * d);
}
}
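
A short round-trip sketch for the two conversions above (illustration only): each float is binarized to its sign bit, and mapping back yields +1/-1 values.

#include <faiss/utils/hamming.h>
#include <vector>

void binarize_example() {
  const size_t d = 16, n = 2;
  std::vector<float> x(n * d, 0.5f);
  x[3] = -1.0f;                                    // negative entries become 0-bits
  std::vector<uint8_t> codes(n * ((d + 7) / 8));   // one byte per 8 dimensions
  faiss::fvecs2bitvecs(x.data(), codes.data(), d, n);
  std::vector<float> back(n * d);
  faiss::bitvecs2fvecs(codes.data(), back.data(), d, n);  // back[3] == -1, all others +1
}
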
/* Reverse bits (NOT an optimized function, only used for printing purposes) */
static uint64_t uint64_reverse_bits (uint64_t b)
{
int i;
uint64_t revb = 0;
for (i = 0; i < 64; i++) {
revb <<= 1;
revb |= b & 1;
b >>= 1;
}
return revb;
}
/* print the bit vector */
void bitvec_print (const uint8_t * b, size_t d)
{
size_t i, j;
for (i = 0; i < d; ) {
uint64_t brev = uint64_reverse_bits (* (uint64_t *) b);
for (j = 0; j < 64 && i < d; j++, i++) {
printf ("%d", (int) (brev & 1));
brev >>= 1;
}
b += 8;
printf (" ");
}
}
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b)
{
for(size_t i = 0; i < db; i++) {
ABORT_UNLESS (order[i] >= 0 && order[i] < da, "order[i] >= 0 && order[i] < da");
}
size_t lda = (da + 7) / 8;
size_t ldb = (db + 7) / 8;
//#pragma omp parallel for if(n > 10000)
for (size_t i = 0; i < n; i++) {
const uint8_t *ai = a + i * lda;
uint8_t *bi = b + i * ldb;
memset (bi, 0, ldb);
for(size_t i = 0; i < db; i++) {
int o = order[i];
uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
bi[i >> 3] |= the_bit << (i & 7);
}
}
}
/*----------------------------------------*/
/* Hamming distance computation and k-nn */
#define C64(x) ((uint64_t *)x)
/* Compute a set of Hamming distances */
void hammings (
const uint8_t * a,
const uint8_t * b,
size_t na, size_t nb,
size_t ncodes,
hamdis_t * __restrict dis)
{
ABORT_UNLESS (ncodes % 8 == 0, "ncodes % 8 == 0");
switch (ncodes) {
case 8:
faiss::hammings <64> (C64(a), C64(b), na, nb, dis); return;
case 16:
faiss::hammings <128> (C64(a), C64(b), na, nb, dis); return;
case 32:
faiss::hammings <256> (C64(a), C64(b), na, nb, dis); return;
case 64:
faiss::hammings <512> (C64(a), C64(b), na, nb, dis); return;
default:
faiss::hammings (C64(a), C64(b), na, nb, ncodes * 8, dis); return;
}
}
void hammings_knn(
int_maxheap_array_t *ha,
const uint8_t *a,
const uint8_t *b,
size_t nb,
size_t ncodes,
int order)
{
hammings_knn_hc(ha, a, b, nb, ncodes, order);
}
void hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int order)
{
switch (ncodes) {
case 4:
hammings_knn_hc<faiss::HammingComputer4>
(4, ha, a, b, nb, order, true);
break;
case 8:
hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
// hammings_knn_hc<faiss::HammingComputer8>
// (8, ha, a, b, nb, order, true);
break;
case 16:
hammings_knn_hc<faiss::HammingComputer16>
(16, ha, a, b, nb, order, true);
break;
case 32:
hammings_knn_hc<faiss::HammingComputer32>
(32, ha, a, b, nb, order, true);
break;
default:
if(ncodes % 8 == 0) {
hammings_knn_hc<faiss::HammingComputerM8>
(ncodes, ha, a, b, nb, order, true);
} else {
hammings_knn_hc<faiss::HammingComputerDefault>
(ncodes, ha, a, b, nb, order, true);
}
}
}
void hammings_knn_mc(
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
size_t k,
size_t ncodes,
int32_t *distances,
int64_t *labels)
{
switch (ncodes) {
case 4:
hammings_knn_mc<faiss::HammingComputer4>(
4, a, b, na, nb, k, distances, labels
);
break;
case 8:
// TODO(hoss): Write analog to hammings_knn_hc_1
// hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
hammings_knn_mc<faiss::HammingComputer8>(
8, a, b, na, nb, k, distances, labels
);
break;
case 16:
hammings_knn_mc<faiss::HammingComputer16>(
16, a, b, na, nb, k, distances, labels
);
break;
case 32:
hammings_knn_mc<faiss::HammingComputer32>(
32, a, b, na, nb, k, distances, labels
);
break;
default:
if(ncodes % 8 == 0) {
hammings_knn_mc<faiss::HammingComputerM8>(
ncodes, a, b, na, nb, k, distances, labels
);
} else {
hammings_knn_mc<faiss::HammingComputerDefault>(
ncodes, a, b, na, nb, k, distances, labels
);
}
}
}
template <class HammingComputer>
static
void hamming_range_search_template (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *res)
{
//#pragma omp parallel
{
RangeSearchPartialResult pres (res);
//#pragma omp for
for (size_t i = 0; i < na; i++) {
HammingComputer hc (a + i * code_size, code_size);
const uint8_t * yi = b;
RangeQueryResult & qres = pres.new_result (i);
for (size_t j = 0; j < nb; j++) {
int dis = hc.hamming (yi);
if (dis < radius) {
qres.add(dis, j);
}
yi += code_size;
}
}
pres.finalize ();
}
}
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *result)
{
#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)
switch(code_size) {
case 4: HC(HammingComputer4); break;
case 8: HC(HammingComputer8); break;
case 16: HC(HammingComputer16); break;
case 32: HC(HammingComputer32); break;
default:
if (code_size % 8 == 0) {
HC(HammingComputerM8);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}
/* Count number of matches given a max threshold */
void hamming_count_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
size_t * nptr)
{
switch (ncodes) {
case 8:
faiss::hamming_count_thres <64> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 16:
faiss::hamming_count_thres <128> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 32:
faiss::hamming_count_thres <256> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
case 64:
faiss::hamming_count_thres <512> (C64(bs1), C64(bs2),
n1, n2, ht, nptr);
return;
default:
ABORT ("not implemented for %zu bits", ncodes);
}
}
/* Count number of cross-matches given a threshold */
void crosshamming_count_thres (
const uint8_t * dbs,
size_t n,
hamdis_t ht,
size_t ncodes,
size_t * nptr)
{
switch (ncodes) {
case 8:
faiss::crosshamming_count_thres <64> (C64(dbs), n, ht, nptr);
return;
case 16:
faiss::crosshamming_count_thres <128> (C64(dbs), n, ht, nptr);
return;
case 32:
faiss::crosshamming_count_thres <256> (C64(dbs), n, ht, nptr);
return;
case 64:
faiss::crosshamming_count_thres <512> (C64(dbs), n, ht, nptr);
return;
default:
ABORT ("not implemented for %zu bits", ncodes);
}
}
/* Returns all matches given a threshold */
size_t match_hamming_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
int64_t * idx,
hamdis_t * dis)
{
switch (ncodes) {
case 8:
return faiss::match_hamming_thres <64> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 16:
return faiss::match_hamming_thres <128> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 32:
return faiss::match_hamming_thres <256> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
case 64:
return faiss::match_hamming_thres <512> (C64(bs1), C64(bs2),
n1, n2, ht, idx, dis);
default:
ABORT ("not implemented for %zu bits", ncodes);
return 0;
}
}
#undef C64
/*************************************
* generalized Hamming distances
************************************/
template <class HammingComputer>
static void hamming_dis_inner_loop (
const uint8_t *ca,
const uint8_t *cb,
size_t nb,
size_t code_size,
int k,
hamdis_t * bh_val_,
int64_t * bh_ids_)
{
HammingComputer hc (ca, code_size);
for (size_t j = 0; j < nb; j++) {
int ndiff = hc.hamming (cb);
cb += code_size;
if (ndiff < bh_val_[0]) {
maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, ndiff, j);
}
}
}
void generalized_hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t code_size,
int ordered)
{
int na = ha->nh;
int k = ha->k;
if (ordered)
ha->heapify ();
//#pragma omp parallel for
for (int i = 0; i < na; i++) {
const uint8_t *ca = a + i * code_size;
const uint8_t *cb = b;
hamdis_t * bh_val_ = ha->val + i * k;
int64_t * bh_ids_ = ha->ids + i * k;
switch (code_size) {
case 8:
hamming_dis_inner_loop<GenHammingComputer8>
(ca, cb, nb, 8, k, bh_val_, bh_ids_);
break;
case 16:
hamming_dis_inner_loop<GenHammingComputer16>
(ca, cb, nb, 16, k, bh_val_, bh_ids_);
break;
case 32:
hamming_dis_inner_loop<GenHammingComputer32>
(ca, cb, nb, 32, k, bh_val_, bh_ids_);
break;
default:
hamming_dis_inner_loop<GenHammingComputerM8>
(ca, cb, nb, code_size, k, bh_val_, bh_ids_);
break;
}
}
if (ordered)
ha->reorder ();
}
} // namespace faiss

240
src/3rd_party/faiss/utils/hamming.h vendored Normal file
View File

@ -0,0 +1,240 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* Hamming distances. The binary vector dimensionality should be a
* multiple of 8, as the elementary operations operate on bytes. If
* you need other sizes, just pad with 0s (this is done by function
* fvecs2bitvecs).
*
* User-defined type hamdis_t is used for distances because at this time
 * it is still unclear how we will need to balance
* - flexibility in vector size (may need 16- or even 8-bit vectors)
* - memory usage
* - cache-misses when dealing with large volumes of data (fewer bits is better)
*
*/
#ifndef FAISS_hamming_h
#define FAISS_hamming_h
#include <stdint.h>
#include <faiss/utils/Heap.h>
/* The Hamming distance type */
typedef int32_t hamdis_t;
namespace faiss {
/**************************************************
* General bit vector functions
**************************************************/
struct RangeSearchResult;
void bitvec_print (const uint8_t * b, size_t d);
/* Functions for casting vectors of regular types to compact bits.
They assume proper allocation is done beforehand, meaning that b
should be able to receive as many bits as x may produce. */
/* Makes an array of bits from the signs of a float array. The length
of the output array b is rounded up to byte size (allocate
accordingly) */
void fvecs2bitvecs (
const float * x,
uint8_t * b,
size_t d,
size_t n);
void bitvecs2fvecs (
const uint8_t * b,
float * x,
size_t d,
size_t n);
void fvec2bitvec (const float * x, uint8_t * b, size_t d);
/** Shuffle the bits from b(i, j) := a(i, order[j])
*/
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b);
/***********************************************
* Generic reader/writer for bit strings
***********************************************/
struct BitstringWriter {
uint8_t *code;
size_t code_size;
size_t i; // current bit offset
// code_size in bytes
BitstringWriter(uint8_t *code, int code_size);
// write the nbit low bits of x
void write(uint64_t x, int nbit);
};
struct BitstringReader {
const uint8_t *code;
size_t code_size;
size_t i;
// code_size in bytes
BitstringReader(const uint8_t *code, int code_size);
// read nbit bits from the code
uint64_t read(int nbit);
};
/**************************************************
* Hamming distance computation functions
**************************************************/
extern size_t hamming_batch_size;
inline int popcount64(uint64_t x) {
return __builtin_popcountl(x);
}
/** Compute a set of Hamming distances between na and nb binary vectors
*
* @param a size na * nbytespercode
* @param b size nb * nbytespercode
* @param nbytespercode should be multiple of 8
* @param dis output distances, size na * nb
*/
void hammings (
const uint8_t * a,
const uint8_t * b,
size_t na, size_t nb,
size_t nbytespercode,
hamdis_t * dis);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using a max heap.
* @param a queries, size ha->nh * ncodes
* @param b database, size nb * ncodes
* @param nb number of database vectors
* @param ncodes size of the binary codes (bytes)
* @param ordered if != 0: order the results by decreasing distance
* (may be bottleneck for k/n > 0.01) */
void hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int ordered);
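
A usage sketch for the heap-based k-NN entry point (illustration only; it assumes the heap-array descriptor from faiss/utils/Heap.h with the fields nh, k, val and ids that the implementation above relies on):

#include <faiss/utils/hamming.h>
#include <vector>

void knn_example(const uint8_t* queries, size_t nq,
                 const uint8_t* db, size_t nb,
                 size_t code_size, size_t k) {
  std::vector<int>     distances(nq * k);   // hamdis_t distances, k per query
  std::vector<int64_t> labels(nq * k);      // matching database ids
  faiss::int_maxheap_array_t ha;
  ha.nh  = nq;                              // one heap per query
  ha.k   = k;                               // neighbors per query
  ha.val = distances.data();
  ha.ids = labels.data();
  faiss::hammings_knn_hc(&ha, queries, db, nb, code_size, /*ordered=*/1);
}
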
/* Legacy alias to hammings_knn_hc. */
void hammings_knn (
int_maxheap_array_t * ha,
const uint8_t * a,
const uint8_t * b,
size_t nb,
size_t ncodes,
int ordered);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using counting max.
* @param a queries, size na * ncodes
* @param b database, size nb * ncodes
* @param na number of query vectors
* @param nb number of database vectors
* @param k number of vectors/distances to return
* @param ncodes size of the binary codes (bytes)
* @param distances output distances from each query vector to its k nearest
* neighbors
* @param labels output ids of the k nearest neighbors to each query vector
*/
void hammings_knn_mc (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
size_t k,
size_t ncodes,
int32_t *distances,
int64_t *labels);
/** same as hammings_knn except we are doing a range search with radius */
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t ncodes,
RangeSearchResult *result);
/* Counting the number of matches or of cross-matches (without returning them)
For use with functions that assume pre-allocated memory */
void hamming_count_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
size_t * nptr);
/* Return all Hamming distances/indices passing a threshold. Pre-allocation of output
is required. Use hamming_count_thres to determine the proper size. */
size_t match_hamming_thres (
const uint8_t * bs1,
const uint8_t * bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
int64_t * idx,
hamdis_t * dis);
/* Cross-matching in a set of vectors */
void crosshamming_count_thres (
const uint8_t * dbs,
size_t n,
hamdis_t ht,
size_t ncodes,
size_t * nptr);
/* compute the Hamming distances between two codewords of nwords*64 bits */
hamdis_t hamming (
const uint64_t * bs1,
const uint64_t * bs2,
size_t nwords);
} // namespace faiss
// inlined definitions of HammingComputerXX and GenHammingComputerXX
#include <faiss/utils/hamming-inl.h>
#endif /* FAISS_hamming_h */

162
src/3rd_party/faiss/utils/misc.cpp vendored Normal file
View File

@ -0,0 +1,162 @@
#include "misc.h"
#include <cstring>
namespace faiss {
/***********************************************************************
* BufferList
***********************************************************************/
BufferList::BufferList(size_t buffer_size) :
buffer_size(buffer_size)
{
wp = buffer_size;
}
BufferList::~BufferList()
{
for (int i = 0; i < buffers.size(); i++) {
delete[] buffers[i].ids;
delete[] buffers[i].dis;
}
}
void BufferList::add(idx_t id, float dis) {
if (wp == buffer_size) { // need new buffer
append_buffer();
}
Buffer & buf = buffers.back();
buf.ids[wp] = id;
buf.dis[wp] = dis;
wp++;
}
void BufferList::append_buffer()
{
Buffer buf = { new idx_t[buffer_size], new float[buffer_size] };
buffers.push_back(buf);
wp = 0;
}
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
void BufferList::copy_range(size_t ofs, size_t n,
idx_t * dest_ids, float *dest_dis)
{
size_t bno = ofs / buffer_size;
ofs -= bno * buffer_size;
while (n > 0) {
size_t ncopy = ofs + n < buffer_size ? n : buffer_size - ofs;
Buffer buf = buffers[bno];
memcpy(dest_ids, buf.ids + ofs, ncopy * sizeof(*dest_ids));
memcpy(dest_dis, buf.dis + ofs, ncopy * sizeof(*dest_dis));
dest_ids += ncopy;
dest_dis += ncopy;
ofs = 0;
bno++;
n -= ncopy;
}
}
/***********************************************************************
* RangeSearchPartialResult
***********************************************************************/
void RangeQueryResult::add(float dis, idx_t id) {
nres++;
pres->add(id, dis);
}
RangeSearchPartialResult::RangeSearchPartialResult(RangeSearchResult * res_in) :
BufferList(res_in->buffer_size),
res(res_in)
{}
/// begin a new result
RangeQueryResult &
RangeSearchPartialResult::new_result(idx_t qno)
{
RangeQueryResult qres = { qno, 0, this };
queries.push_back(qres);
return queries.back();
}
void RangeSearchPartialResult::finalize()
{
set_lims();
//#pragma omp barrier
//#pragma omp single
res->do_allocation();
//#pragma omp barrier
copy_result();
}
/// called by range_search before do_allocation
void RangeSearchPartialResult::set_lims()
{
for (int i = 0; i < queries.size(); i++) {
RangeQueryResult & qres = queries[i];
res->lims[qres.qno] = qres.nres;
}
}
/// called by range_search after do_allocation
void RangeSearchPartialResult::copy_result(bool incremental)
{
size_t ofs = 0;
for (int i = 0; i < queries.size(); i++) {
RangeQueryResult & qres = queries[i];
copy_range(ofs, qres.nres,
res->labels + res->lims[qres.qno],
res->distances + res->lims[qres.qno]);
if (incremental) {
res->lims[qres.qno] += qres.nres;
}
ofs += qres.nres;
}
}
void RangeSearchPartialResult::merge(std::vector <RangeSearchPartialResult *> &
partial_results, bool do_delete)
{
int npres = partial_results.size();
if (npres == 0) return;
RangeSearchResult *result = partial_results[0]->res;
size_t nx = result->nq;
// count
for (const RangeSearchPartialResult * pres : partial_results) {
if (!pres) continue;
for (const RangeQueryResult &qres : pres->queries) {
result->lims[qres.qno] += qres.nres;
}
}
result->do_allocation();
for (int j = 0; j < npres; j++) {
if (!partial_results[j]) continue;
partial_results[j]->copy_result(true);
if (do_delete) {
delete partial_results[j];
partial_results[j] = nullptr;
}
}
// reset the limits
for (size_t i = nx; i > 0; i--) {
result->lims[i] = result->lims[i - 1];
}
result->lims[0] = 0;
}
}

144
src/3rd_party/faiss/utils/misc.h vendored Normal file
View File

@ -0,0 +1,144 @@
#pragma once
#include <algorithm>
#include <vector>
namespace faiss {
/// The metric space for vector comparison for Faiss indices and algorithms.
///
/// Most algorithms support both inner product and L2, with the flat
/// (brute-force) indices supporting additional metric types for vector
/// comparison.
enum MetricType {
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
METRIC_L2 = 1, ///< squared L2 search
METRIC_L1, ///< L1 (aka cityblock)
METRIC_Linf, ///< infinity distance
METRIC_Lp, ///< L_p distance, p is given by a faiss::Index
/// metric_arg
/// some additional metrics defined in scipy.spatial.distance
METRIC_Canberra = 20,
METRIC_BrayCurtis,
METRIC_JensenShannon,
};
template<class T>
struct ScopeDeleter {
const T * ptr;
explicit ScopeDeleter(const T* ptr = nullptr) : ptr(ptr) {}
void release() { ptr = nullptr; }
void set(const T * ptr_in) { ptr = ptr_in; }
void swap(ScopeDeleter<T> &other) { std::swap(ptr, other.ptr); }
~ScopeDeleter() {
delete[] ptr;
}
};
//////////////////////////////////////////
using idx_t = int64_t;
/** List of temporary buffers used to store results before they are
* copied to the RangeSearchResult object. */
struct BufferList {
typedef faiss::idx_t idx_t;
// buffer sizes in # entries
size_t buffer_size;
struct Buffer {
idx_t *ids;
float *dis;
};
std::vector<Buffer> buffers;
size_t wp; ///< write pointer in the last buffer.
explicit BufferList(size_t buffer_size);
~BufferList();
/// create a new buffer
void append_buffer();
/// add one result, possibly appending a new buffer if needed
void add(idx_t id, float dis);
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
void copy_range(size_t ofs, size_t n,
idx_t * dest_ids, float *dest_dis);
};
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
typedef faiss::idx_t idx_t;
idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]]
float *distances; ///< corresponding distances (not sorted)
size_t buffer_size; ///< size of the result buffers used
/// lims must be allocated on input to range_search.
explicit RangeSearchResult(idx_t nq, bool alloc_lims = true);
/// called when lims contains the nb of elements result entries
/// for each query
virtual void do_allocation();
virtual ~RangeSearchResult();
};
struct RangeSearchPartialResult;
/// result structure for a single query
struct RangeQueryResult {
using idx_t = faiss::idx_t;
idx_t qno; //< id of the query
size_t nres; //< nb of results for this query
RangeSearchPartialResult * pres;
/// called by search function to report a new result
void add(float dis, idx_t id);
};
/// the entries in the buffers are split per query
struct RangeSearchPartialResult : BufferList {
RangeSearchResult * res;
/// eventually the result will be stored in res_in
explicit RangeSearchPartialResult(RangeSearchResult * res_in);
/// query ids + nb of results per query.
std::vector<RangeQueryResult> queries;
/// begin a new result
RangeQueryResult & new_result(idx_t qno);
/*****************************************
* functions used at the end of the search to merge the result
* lists */
void finalize();
/// called by range_search before do_allocation
void set_lims();
/// called by range_search after do_allocation
void copy_result(bool incremental = false);
/// merge a set of PartialResult's into one RangeSearchResult
/// on output the partial results are empty!
static void merge(std::vector <RangeSearchPartialResult *> &
partial_results, bool do_delete = true);
};
} // namespace
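
A usage sketch tying RangeSearchResult to the hamming_range_search entry point declared earlier (illustration only; RangeSearchResult comes from this vendored misc.h):

#include <faiss/utils/hamming.h>
#include "misc.h"   // vendored: src/3rd_party/faiss/utils/misc.h

void range_search_example(const uint8_t* queries, size_t nq,
                          const uint8_t* db, size_t nb,
                          size_t code_size, int radius) {
  faiss::RangeSearchResult res(nq);   // allocates res.lims (size nq + 1)
  faiss::hamming_range_search(queries, db, nq, nb, radius, code_size, &res);
  // results for query i: ids in res.labels[res.lims[i] .. res.lims[i+1]-1],
  // with distances strictly below radius in res.distances at the same offsets.
}
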

192
src/3rd_party/faiss/utils/random.cpp vendored Normal file
View File

@ -0,0 +1,192 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/utils/random.h>
namespace faiss {
/**************************************************
* Random data generation functions
**************************************************/
RandomGenerator::RandomGenerator (int64_t seed)
: mt((unsigned int)seed) {}
int RandomGenerator::rand_int ()
{
return mt() & 0x7fffffff;
}
int64_t RandomGenerator::rand_int64 ()
{
return int64_t(rand_int()) | int64_t(rand_int()) << 31;
}
int RandomGenerator::rand_int (int max)
{
return mt() % max;
}
float RandomGenerator::rand_float ()
{
return mt() / float(mt.max());
}
double RandomGenerator::rand_double ()
{
return mt() / double(mt.max());
}
/***********************************************************************
* Random functions in this C file only exist because Torch
* counterparts are slow and not multi-threaded. Typical use is for
* more than 1-100 billion values. */
/* Generate a set of random floating point values such that x[i] is in [0,1].
   The generation is split into blocks so it can be multi-threaded; for this reason, we rely on re-entrant functions. */
void float_rand (float * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_float ();
}
}
void float_randn (float * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
double a = 0, b = 0, s = 0;
int state = 0; /* generate two number per "do-while" loop */
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++) {
/* Marsaglia's method (see Knuth) */
if (state == 0) {
do {
a = 2.0 * rng.rand_double () - 1;
b = 2.0 * rng.rand_double () - 1;
s = a * a + b * b;
} while (s >= 1.0);
x[i] = a * sqrt(-2.0 * log(s) / s);
}
else
x[i] = b * sqrt(-2.0 * log(s) / s);
state = 1 - state;
}
}
}
/* Integer versions */
void int64_rand (int64_t * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_int64 ();
}
}
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
for (size_t i = istart; i < iend; i++)
x[i] = rng.rand_int64 () % max;
}
}
void rand_perm (int *perm, size_t n, int64_t seed)
{
for (size_t i = 0; i < n; i++) perm[i] = i;
RandomGenerator rng (seed);
for (size_t i = 0; i + 1 < n; i++) {
int i2 = i + rng.rand_int (n - i);
std::swap(perm[i], perm[i2]);
}
}
void byte_rand (uint8_t * x, size_t n, int64_t seed)
{
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
//#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
const size_t istart = j * n / nblock;
const size_t iend = (j + 1) * n / nblock;
size_t i;
for (i = istart; i < iend; i++)
x[i] = rng.rand_int64 ();
}
}
} // namespace faiss

60
src/3rd_party/faiss/utils/random.h vendored Normal file
View File

@ -0,0 +1,60 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/* Random generators. Implemented here for speed and to make
* sequences reproducible.
*/
#pragma once
#include <random>
#include <stdint.h>
namespace faiss {
/**************************************************
* Random data generation functions
**************************************************/
/// random generator that can be used in multithreaded contexts
struct RandomGenerator {
std::mt19937 mt;
/// random positive integer
int rand_int ();
/// random int64_t
int64_t rand_int64 ();
/// generate random integer between 0 and max-1
int rand_int (int max);
/// between 0 and 1
float rand_float ();
double rand_double ();
explicit RandomGenerator (int64_t seed = 1234);
};
/* Generate an array of uniform random floats / multi-threaded implementation */
void float_rand (float * x, size_t n, int64_t seed);
void float_randn (float * x, size_t n, int64_t seed);
void int64_rand (int64_t * x, size_t n, int64_t seed);
void byte_rand (uint8_t * x, size_t n, int64_t seed);
// max is actually the maximum value + 1
void int64_rand_max (int64_t * x, size_t n, uint64_t max, int64_t seed);
/* random permutation */
void rand_perm (int * perm, size_t n, int64_t seed);
} // namespace faiss

View File

@ -70,6 +70,7 @@ add_library(marian STATIC
layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
layers/lsh.cpp
rnn/cells.cpp
rnn/attention.cpp

View File

@ -646,6 +646,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
cli.add<bool>("--output-sampling",
"Noise output layer with gumbel noise",
false);
cli.add<std::vector<int>>("--output-approx-knn",
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");
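The two integers are consumed further down in layers/generic.cpp as k (number of neighbors) and nbits (hash size in bits); the implicit value means that passing the flag without arguments behaves like --output-approx-knn 100 1024.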
#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings

View File

@ -120,6 +120,13 @@ namespace marian {
} \
} while(0)
#define ABORT_UNLESS(condition, ...) \
do { \
if(!(bool)(condition)) { \
ABORT(__VA_ARGS__); \
} \
} while(0)
typedef std::shared_ptr<spdlog::logger> Logger;
Logger createStderrLogger(const std::string&,
const std::string&,

View File

@ -26,6 +26,16 @@ Expr checkpoint(Expr a) {
return a;
}
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
LambdaNodeFunctor fwd) {
return Expression<LambdaNodeOp>(nodes, shape, type, fwd);
}
Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
LambdaNodeFunctor fwd, LambdaNodeFunctor bwd) {
return Expression<LambdaNodeOp>(nodes, shape, type, fwd, bwd);
}
// logistic function. Note: scipy name is expit()
Expr sigmoid(Expr a) {
return Expression<SigmoidNodeOp>(a);

View File

@ -10,6 +10,10 @@ Expr checkpoint(Expr a);
typedef Expr(ActivationFunction)(Expr);
typedef std::function<void(Expr, const std::vector<Expr>&)> LambdaNodeFunctor;
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor);
Expr lambda(const std::vector<Expr>&, Shape, Type, LambdaNodeFunctor, LambdaNodeFunctor);
Expr plus(const std::vector<Expr>&);
// TODO: should be logistic(), not sigmoid()
@ -188,6 +192,30 @@ Expr stopGradient(Expr a);
Expr gather(Expr a, int axis, Expr indices);
#if 0
// reverse operation to gather. a is the expression into which values from b are inserted at the positions given by indices along axis.
// with broadcasting
auto knn = get<0>(KNN->apply(query, k)); // [beam, time, batch, k]
auto W = reshape(gather(Wt_, -2, flatten(knn)), {beam * time * batch, k, dim});
auto b = reshape(gather(b_, -1, flatten(knn)), {beam * time * batch, 1, k });
query = reshape(query, {beam * time * batch, 1, dim});
auto logits = bdot(query, W, false, true); // [beam * time * batch, 1, k]
logits = reshape(logits + b, {beam, time, batch, k}); // @TODO: add baffine node
auto shape = indices.shape();
shape.set(-1, 32000);
auto output = grep->constant(shape, inits::lowest(), logits->value_type());
output = scatter(output, -1, indices, logits);
// auto a = graph->constant({2,2,5,32000}, inits::fromValue(minimal))
// scatter(a, -1, indices, values)
// PyTorch does for out-of-place scatter: out = a.scatter(-1, indices, values)
Expr scatter(Expr a, int axis, Expr indices, Expr b);
#endif
// Warning: Don't try to pass a scalar literal 0 as indices; it will compile but pass nullptr...
Expr index_select(Expr a, int axis, Expr indices);

View File

@ -251,5 +251,24 @@ template Ptr<NodeInitializer> range<float> (float begin, float end, f
template Ptr<NodeInitializer> range<IndexType>(IndexType begin, IndexType end, IndexType step);
} // namespace inits
} // namespace marian
#include "faiss/VectorTransform.h"
namespace marian {
namespace inits {
Ptr<NodeInitializer> randomRotation(size_t seed) {
auto rot = [=](Tensor t) {
int rows = t->shape()[-2];
int cols = t->shape()[-1];
faiss::RandomRotationMatrix rrot(cols, rows); // transposed in faiss
rrot.init(seed);
t->set(rrot.A);
};
return fromLambda(rot, Type::float32);
}
} // namespace inits
} // namespace marian
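
For background (illustration only, not the code path IndexLSH actually takes): a random orthonormal rotation/projection R makes sign-based hashing a locality-sensitive hash for angular similarity. A hypothetical helper, assuming row-major R of size d x nbits:

#include <cstdint>
#include <vector>

std::vector<uint8_t> lsh_code(const float* x, const float* R, int d, int nbits) {
  std::vector<uint8_t> code((nbits + 7) / 8, 0);
  for (int b = 0; b < nbits; b++) {
    float dot = 0.f;
    for (int i = 0; i < d; i++)
      dot += x[i] * R[i * nbits + b];            // project onto the b-th direction
    if (dot >= 0)
      code[b >> 3] |= (uint8_t)(1u << (b & 7));  // keep only the sign
  }
  return code;
}
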

View File

@ -176,6 +176,13 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
/**
* Computes a random rotation matrix for LSH hashing. This is part
* of a hash function. The matrix is orthonormal and computed via
* QR decomposition. The same seed results in the same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches

View File

@ -13,6 +13,62 @@
namespace marian {
class LambdaNodeOp : public NaryNodeOp {
private:
typedef const std::vector<Expr>& Inputs;
typedef std::function<void(Expr, Inputs)> LambdaNodeFunctor;
std::unique_ptr<LambdaNodeFunctor> forward_;
std::unique_ptr<LambdaNodeFunctor> backward_;
public:
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
LambdaNodeFunctor forward)
: NaryNodeOp(inputs, shape, type),
forward_(new LambdaNodeFunctor(forward)) {
Node::trainable_ = !!backward_;
}
LambdaNodeOp(Inputs inputs, Shape shape, Type type,
LambdaNodeFunctor forward,
LambdaNodeFunctor backward)
: NaryNodeOp(inputs, shape, type),
forward_(new LambdaNodeFunctor(forward)),
backward_(new LambdaNodeFunctor(backward)) {
}
void forward() override {
(*forward_)(this, children_);
}
void backward() override {
ABORT_IF(!backward_, "No backward lambda given?");
(*backward_)(this, children_);
}
const std::string type() override { return "lambda"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, forward_.get());
util::hash_combine(seed, backward_.get());
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<LambdaNodeOp>(node);
if(!cnode)
return false;
if(forward_ != cnode->forward_) // pointer compare on purpose
return false;
if(backward_ != cnode->backward_) // pointer compare on purpose
return false;
return true;
}
};
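
The functor receives the output node plus its children and writes into the output tensor directly, which is what makes this useful for CPU-only custom operations such as the LSH nodes below. A minimal forward-only sketch (illustration only):

#include "graph/expression_operators.h"

namespace marian {
static Expr timesTwo(Expr a) {
  auto fwd = [](Expr out, const std::vector<Expr>& inputs) {
    const float* in = inputs[0]->val()->data<float>();
    float* o = out->val()->data<float>();
    int n = out->shape().elements();
    for(int i = 0; i < n; ++i)
      o[i] = 2.f * in[i];
  };
  // no backward functor given, so the node is marked non-trainable
  return lambda({a}, a->shape(), a->value_type(), fwd);
}
} // namespace marian
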
class DotNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;

View File

@ -6,8 +6,7 @@
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
#include "layers/lsh.h"
namespace marian {
Logits::Logits(Expr logits) : Logits(New<RationalLoss>(logits, nullptr)) {} // single-output constructor from Expr only (RationalLoss has no count)
@ -218,6 +217,13 @@ namespace marian {
if (Wt_)
return;
// this option is only set in the decoder
if(!lsh_ && options_->hasAndNotEmpty("output-approx-knn")) {
auto k = opt<std::vector<int>>("output-approx-knn")[0];
auto nbits = opt<std::vector<int>>("output-approx-knn")[1];
lsh_ = New<LSH>(k, nbits);
}
auto name = options_->get<std::string>("prefix");
auto numOutputClasses = options_->get<int>("dim");
@ -257,6 +263,16 @@ namespace marian {
Logits Output::applyAsLogits(Expr input) /*override final*/ {
lazyConstruct(input->shape()[-1]);
auto affineOrLSH = [this](Expr x, Expr W, Expr b, bool transA, bool transB) {
if(lsh_) {
ABORT_IF( transA, "Transposed query not supported for LSH");
ABORT_IF(!transB, "Untransposed indexed matrix not supported for LSH");
return lsh_->apply(x, W, b);
} else {
return affine(x, W, b, transA, transB);
}
};
if (shortlist_ && !cachedShortWt_) { // shortlisted versions of parameters are cached within one batch, then clear()ed
cachedShortWt_ = index_select(Wt_, isLegacyUntransposedW ? -1 : 0, shortlist_->indices());
cachedShortb_ = index_select(b_ , -1, shortlist_->indices());
@ -333,7 +349,12 @@ namespace marian {
input1 = layerNorm(input1, name + "_ffn");
}
// @TODO: b_ should be a vector, not a matrix; but shortlists use cols() on it, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
Expr factorLogits;
if(g == 0)
factorLogits = affineOrLSH(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
else
factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
// optionally add lemma-dependent bias
if (Plemma) { // [B... x U0]
int lemmaVocabDim = Plemma->shape()[-1];
@ -396,11 +417,11 @@ namespace marian {
}
}
return Logits(std::move(allLogits), factoredVocab_);
} else if (shortlist_) {
return Logits(affineOrLSH(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
} else {
return Logits(affineOrLSH(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
}
else if (shortlist_)
return Logits(affine(input, cachedShortWt_, cachedShortb_, false, /*transB=*/isLegacyUntransposedW ? false : true));
else
return Logits(affine(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
}
}

View File

@ -228,6 +228,12 @@ public:
Expr apply(Expr input) override { return apply(std::vector<Expr>({input})); }
};
} // namespace mlp
class LSH;
namespace mlp {
class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList {
private:
// parameters held by this layer
@ -239,10 +245,11 @@ private:
Expr cachedShortb_; // these match the current value of shortlist_
Expr cachedShortLemmaEt_;
Ptr<FactoredVocab> factoredVocab_;
// optional parameters set/updated after construction
Expr tiedParam_;
Ptr<data::Shortlist> shortlist_;
Ptr<LSH> lsh_;
void lazyConstruct(int inputDim);
public:

118
src/layers/lsh.cpp Normal file
View File

@ -0,0 +1,118 @@
#include "layers/lsh.h"
#include "3rd_party/faiss/IndexLSH.h"
#include "graph/expression_operators.h"
#include "tensors/cpu/prod_blas.h"
namespace marian {
Expr LSH::apply(Expr input, Expr W, Expr b) {
auto idx = search(input, W);
return affine(idx, input, W, b);
}
Expr LSH::search(Expr query, Expr values) {
ABORT_IF(query->graph()->getDeviceId().type == DeviceType::gpu,
"LSH index (--output-approx-knn) currently not implemented for GPU");
auto kShape = query->shape();
kShape.set(-1, k_);
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
auto query = inputs[0];
auto values = inputs[1];
int dim = values->shape()[-1];
if(!index_ || indexHash_ != values->hash()) {
LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_);
index_.reset(new faiss::IndexLSH(dim, nbits_,
/*rotate=*/dim != nbits_,
/*train_thresholds=*/false));
int vRows = values->shape().elements() / dim;
index_->train(vRows, values->val()->data<float>());
index_->add( vRows, values->val()->data<float>());
indexHash_ = values->hash();
}
int qRows = query->shape().elements() / dim;
std::vector<float> distances(qRows * k_);
std::vector<faiss::Index::idx_t> ids(qRows * k_);
index_->search(qRows, query->val()->data<float>(), k_,
distances.data(), ids.data());
std::vector<IndexType> vOut;
vOut.reserve(ids.size());
for(auto id : ids)
vOut.push_back((IndexType)id);
out->val()->set(vOut);
};
return lambda({query, values}, kShape, Type::uint32, forward);
}
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
auto outShape = input->shape();
int dimVoc = W->shape()[-2];
outShape.set(-1, dimVoc);
auto forward = [this](Expr out, const std::vector<Expr>& inputs) {
auto lowest = NumericLimits<float>(out->value_type()).lowest;
out->val()->set(lowest);
int dimIn = inputs[1]->shape()[-1];
int dimOut = out->shape()[-1];
int dimRows = out->shape().elements() / dimOut;
auto outPtr = out->val()->data<float>();
auto idxPtr = inputs[0]->val()->data<uint32_t>();
auto queryPtr = inputs[1]->val()->data<float>();
auto WPtr = inputs[2]->val()->data<float>();
auto bPtr = inputs[3]->val()->data<float>();
for(int row = 0; row < dimRows; ++row) {
auto currIdxPtr = idxPtr + row * k_; // move to next batch of k entries
auto currQueryPtr = queryPtr + row * dimIn; // move to next input query vector
auto currOutPtr = outPtr + row * dimOut; // move to next output position vector (of vocabulary size)
for(int k = 0; k < k_; k++) {
int relPos = currIdxPtr[k]; // k-th best vocabulary item
auto currWPtr = WPtr + relPos * dimIn; // offset for k-th best embedding
currOutPtr[relPos] = bPtr[relPos]; // write bias value to position
// proceed one vector product at a time writing to the correct position
sgemm(false, true, 1, 1, dimIn, 1.0f, currQueryPtr, dimIn, currWPtr, dimIn, 1.0f, &currOutPtr[relPos], 1);
}
}
};
return lambda({idx, input, W, b},
outShape,
input->value_type(),
forward);
}
// @TODO: alternative version which does the same as above with Marian operators, currently missing "scatter".
// this uses more memory and is likely to be slower. Would make sense to have a scatter node that actually creates
// the node instead of relying on an existing node, e.g. scatter(shape, defaultValue, axis, indices, values);
#if 0
Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
int dim = input->shape()[-1];
int bch = idx->shape().elements() / k;
auto W = reshape(rows(Wt_, flatten(idx)), {bch, k, dim}); // [rows, k, dim]
auto b = reshape(cols(b_, flatten(idx)), {bch, 1, k}); // [rows, 1, k]
auto aff = reshape(bdot(reshape(input, {bch, 1, dim}), W, false, true) + b, idx->shape()); // [beam, time, batch, k]
int dimVoc = Wt_->shape()[-2];
auto oShape = input->shape();
oShape.set(-1, dimVoc);
auto lowest = graph_->constant(oShape,
inits::fromValue(NumericLimits<float>(input->value_type()).lowest),
input->value_type());
return scatter(lowest, -1, idx, aff);
}
#endif
} // namespace marian
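
The build-once / search-per-step cycle above can be summarized on plain float buffers; a sketch of the IndexLSH lifecycle as LSH::search uses it (illustration only, sizes are placeholders):

#include "3rd_party/faiss/IndexLSH.h"
#include <vector>

void index_lsh_sketch(const float* W, int vocabSize, int dim,
                      const float* queries, int nQueries, int k, int nbits) {
  faiss::IndexLSH index(dim, nbits, /*rotate=*/dim != nbits, /*train_thresholds=*/false);
  index.train(vocabSize, W);   // prepares the hashing transform
  index.add(vocabSize, W);     // hashes all output-embedding rows once
  std::vector<float> distances(nQueries * k);
  std::vector<faiss::Index::idx_t> ids(nQueries * k);
  index.search(nQueries, queries, k, distances.data(), ids.data());
  // ids now lists, per query row, the k vocabulary entries whose logits
  // LSH::affine actually computes; all other logits stay at the lowest float.
}
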

26
src/layers/lsh.h Normal file
View File

@ -0,0 +1,26 @@
#include "graph/expression_graph.h"
#include <memory>
namespace faiss {
class IndexLSH;
}
namespace marian {
class LSH {
public:
LSH(int k, int nbits) : k_{k}, nbits_{nbits} {}
Expr apply(Expr query, Expr values, Expr bias);
private:
Ptr<faiss::IndexLSH> index_;
size_t indexHash_{0};
int k_{100};
int nbits_{1024};
Expr search(Expr query, Expr values);
Expr affine(Expr idx, Expr query, Expr values, Expr bias);
};
}

View File

@ -615,6 +615,7 @@ private:
"prefix", prefix_ + "_ff_logit_out",
"dim", dimTrgVoc,
"vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_], // for factored outputs
"output-approx-knn", opt<std::vector<int>>("output-approx-knn"),
"lemma-dim-emb", opt<int>("lemma-dim-emb", 0)); // for factored outputs
if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all"))

View File

@ -7,51 +7,13 @@
#include "tensors/tensor.h"
#include "tensors/tensor_allocator.h"
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
#include "prod_blas.h"
#include "sharp/int_gemm.h"
namespace marian {
namespace cpu {
#if BLAS_FOUND
inline void sgemm(bool transA,
bool transB,
int rows_a,
int rows_b,
int width,
float alpha,
float* a,
int lda,
float* b,
int ldb,
float beta,
float* c,
int ldc) {
cblas_sgemm(CblasRowMajor,
transA ? CblasTrans : CblasNoTrans,
transB ? CblasTrans : CblasNoTrans,
rows_a,
rows_b,
width,
alpha,
a,
lda,
b,
ldb,
beta,
c,
ldc);
}
#endif
void Prod(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,

View File

@ -0,0 +1,38 @@
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
#if BLAS_FOUND
inline void sgemm(bool transA,
bool transB,
int rows_a,
int rows_b,
int width,
float alpha,
float* a,
int lda,
float* b,
int ldb,
float beta,
float* c,
int ldc) {
cblas_sgemm(CblasRowMajor,
transA ? CblasTrans : CblasNoTrans,
transB ? CblasTrans : CblasNoTrans,
rows_a,
rows_b,
width,
alpha,
a,
lda,
b,
ldb,
beta,
c,
ldc);
}
#endif
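
LSH::affine above calls this wrapper with rows_a = rows_b = 1 and transB = true, i.e. one dot product per selected vocabulary entry, accumulated onto the pre-written bias (beta = 1). An equivalent stand-alone call (illustration only):

#if BLAS_FOUND
inline float dot_via_sgemm(float* a, float* b, int d, float bias) {
  float c = bias;                     // beta = 1.0f keeps the preloaded value
  sgemm(/*transA=*/false, /*transB=*/true,
        /*rows_a=*/1, /*rows_b=*/1, /*width=*/d,
        /*alpha=*/1.0f, a, /*lda=*/d,
        b, /*ldb=*/d,
        /*beta=*/1.0f, &c, /*ldc=*/1);
  return c;                           // == bias + dot(a, b)
}
#endif
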

View File

@ -36,12 +36,24 @@ class TensorBase {
Ptr<Backend> backend)
: memory_(memory), shape_(shape), type_(type), backend_(backend) {}
TensorBase(MemoryPiece::PtrType memory, Shape shape, Ptr<Backend> backend)
TensorBase(MemoryPiece::PtrType memory,
Shape shape,
Ptr<Backend> backend)
: memory_(memory),
shape_(shape),
type_(Type::float32),
backend_(backend) {}
// Wraps existing memory
template <typename T>
TensorBase(T* rawMemory,
size_t rawMemoryNum,
Shape shape,
Type type,
Ptr<Backend> backend)
: memory_(MemoryPiece::New((uint8_t*)rawMemory, rawMemoryNum * sizeof(T))),
shape_(shape), type_(type), backend_(backend) {}
public:
// Use this whenever pointing to MemoryPiece
typedef IPtr<TensorBase> PtrType;