merge with internal master

Marcin Junczys-Dowmunt 2020-08-18 16:10:14 -07:00
commit 089fb48e6c
57 changed files with 3630 additions and 1406 deletions

View File

@ -192,6 +192,13 @@ else(MSVC)
set(CMAKE_C_FLAGS_PROFUSE "${CMAKE_C_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
endif(MSVC)
# with gcc 7.0 and above we need to mark fallthrough in switch case statements
# that can be done in comments for backcompat, but CCACHE removes comments.
# -C makes gcc keep comments.
if(USE_CCACHE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -C")
endif()
###############################################################################
# Downloading SentencePiece if requested and set to compile with it.
# Requires all the dependencies imposed by SentencePiece
@ -205,6 +212,7 @@ if(USE_ONNX)
message(STATUS "Enabling experimental ONNX support")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ONNX")
set(EXT_LIBS ${EXT_LIBS} protobuf)
include_directories(${Protobuf_INCLUDE_DIRS})
endif()
# Find packages
@ -400,6 +408,13 @@ if(COMPILE_SERVER)
message(STATUS "Found OpenSSL")
include_directories(${OPENSSL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${OPENSSL_CRYPTO_LIBRARY})
if(MSVC AND USE_STATIC_LIBS)
# "If you link with static OpenSSL libraries then you're expected to additionally link your
# application with WS2_32.LIB, GDI32.LIB, ADVAPI32.LIB, CRYPT32.LIB and USER32.LIB"
# See https://github.com/openssl/openssl/blob/OpenSSL_1_1_1d/NOTES.WIN#L127
# Linking with crypt32.lib seems to be enough.
set(EXT_LIBS ${EXT_LIBS} crypt32.lib)
endif()
set(BOOST_COMPONENTS ${BOOST_COMPONENTS} system)
else(OpenSSL_FOUND)
message(WARNING "Cannot find OpenSSL library. Not compiling server.")

View File

@ -2,20 +2,25 @@
"environments": [
{
// Dependencies can be found automatically if you use vcpkg as your library manager.
// In this case, please set the VCPKG_ROOT variable to the directory that contains the vcpkg.exe
// In this case, please set the VCPKG_ROOT variable to the directory that contains the
// vcpkg.exe. If you used CheckDeps.bat to install dependencies, by default this should be an
// absolute path to marian-dev\vs\deps\vcpkg.
// If you prefer to manage the dependencies yourself, please fill in the other variables.
"VCPKG_ROOT": "D:\\Perso\\Dev\\vcpkg",
"VCPKG_ROOT": "C:\\path\\to\\marian-dev\\vs\\deps\\vcpkg",
// The MKL library can be automatically found by CMake. However, if you installed it in a
// custom directory, please set the MKLROOT to this directory path.
// Default is c:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl on
// Windows, or /opt/intel/mkl on Linux
"MKLROOT": "",
// Boost and OpenSSL are required if you compile with COMPILE_SERVER=TRUE
"BOOST_INCLUDEDIR": "",
"BOOST_LIBRARYDIR": "",
"ZLIB_ROOT": "",
"OPENSSL_ROOT_DIR": "",
// The MKL library can be automatically found by CMake. However, if you installed it in a custom
// directory, please set the MKLROOT to this directory path.
// Default is c:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl on Windows, or
// /opt/intel/mkl on Linux
"MKLROOT": ""
// Protobuf is required if you compile with USE_SENTENCEPIECE=TRUE
"Protobuf_SRC_ROOT_FOLDER": ""
}
],
"configurations": [
@ -32,18 +37,21 @@
"variables": [
{ "name": "CMAKE_TOOLCHAIN_FILE", "value": "${env.VCPKG_ROOT}\\scripts\\buildsystems\\vcpkg.cmake" },
{ "name": "VCPKG_TARGET_TRIPLET", "value": "x64-windows-static" },
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "USE_CUDNN:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" }
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "TRUE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "FALSE" },
{ "name": "USE_CUDNN:BOOL", "value": "FALSE" },
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
},
{
@ -59,19 +67,22 @@
"variables": [
{ "name": "CMAKE_TOOLCHAIN_FILE", "value": "${env.VCPKG_ROOT}\\scripts\\buildsystems\\vcpkg.cmake" },
{ "name": "VCPKG_TARGET_TRIPLET", "value": "x64-windows-static" },
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "USE_CUDNN:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" }
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "TRUE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "TRUE" },
{ "name": "USE_CUDNN:BOOL", "value": "FALSE" },
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
}
]
}
}

View File

@ -1,2 +1 @@
v1.9.33
v1.9.35

@ -1 +1 @@
Subproject commit 864ea3c605305a6c0623e3df91b85afe13d37a46
Subproject commit 0f8cabf13ec362d50544d33490024e00c3a763be

View File

@ -0,0 +1,85 @@
import numpy as np
import sys
import yaml
import argparse
import torch
parser = argparse.ArgumentParser(description='Convert LASER model to Marian weight file.')
parser.add_argument('--laser', help='Path to LASER PyTorch model', required=True)
parser.add_argument('--marian', help='Output path for Marian weight file', required=True)
args = parser.parse_args()
laser = torch.load(args.laser)
config = dict()
config["type"] = "laser"
config["input-types"] = ["sequence"]
config["dim-vocabs"] = [laser["params"]["num_embeddings"]]
config["version"] = "laser2marian.py conversion"
config["enc-depth"] = laser["params"]["num_layers"]
config["enc-cell"] = "lstm"
config["dim-emb"] = laser["params"]["embed_dim"]
config["dim-rnn"] = laser["params"]["hidden_size"]
yaml.dump(laser["dictionary"], open(args.marian + ".vocab.yml", "w"))
marianModel = dict()
def transposeOrder(mat):
matT = np.transpose(mat) # just a view with changed row order
return matT.flatten(order="C").reshape(matT.shape) # force row order change and reshape
def convert(pd, srcs, trg, transpose=True, bias=False, lstm=False):
num = pd[srcs[0]].detach().numpy()
for i in range(1, len(srcs)):
num += pd[srcs[i]].detach().numpy()
out = num
if bias:
num = np.atleast_2d(num)
else:
if transpose:
num = transposeOrder(num) # transpose with row order change
if lstm: # different order in pytorch than marian
stateDim = int(num.shape[-1] / 4)
i = np.copy(num[:, 0*stateDim:1*stateDim])
f = np.copy(num[:, 1*stateDim:2*stateDim])
num[:, 0*stateDim:1*stateDim] = f
num[:, 1*stateDim:2*stateDim] = i
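# PyTorch packs the LSTM gates as (input, forget, cell, output); the swap above moves the
# forget-gate block in front of the input-gate block to match the layout Marian expects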
marianModel[trg] = num
for k in laser:
print(k)
for k in laser["model"]:
print(k, laser["model"][k].shape)
convert(laser["model"], ["embed_tokens.weight"], "encoder_Wemb", transpose=False)
for i in range(laser["params"]["num_layers"]):
convert(laser["model"], [f"lstm.weight_ih_l{i}"], f"encoder_lstm_l{i}_W", lstm=True)
convert(laser["model"], [f"lstm.weight_hh_l{i}"], f"encoder_lstm_l{i}_U", lstm=True)
convert(laser["model"], [f"lstm.bias_ih_l{i}", f"lstm.bias_hh_l{i}"], f"encoder_lstm_l{i}_b", bias=True, lstm=True) # needs to be summed!
convert(laser["model"], [f"lstm.weight_ih_l{i}_reverse"], f"encoder_lstm_l{i}_reverse_W", lstm=True)
convert(laser["model"], [f"lstm.weight_hh_l{i}_reverse"], f"encoder_lstm_l{i}_reverse_U", lstm=True)
convert(laser["model"], [f"lstm.bias_ih_l{i}_reverse", f"lstm.bias_hh_l{i}_reverse"], f"encoder_lstm_l{i}_reverse_b", bias=True, lstm=True) # needs to be summed!
for m in marianModel:
print(m, marianModel[m].shape)
configYamlStr = yaml.dump(config, default_flow_style=False)
desc = list(configYamlStr)
npDesc = np.chararray((len(desc),))
npDesc[:] = desc
npDesc.dtype = np.int8
marianModel["special:model.yml"] = npDesc
print("\nMarian config:")
print(configYamlStr)
print("Saving Marian model to %s" % (args.marian,))
np.savez(args.marian, **marianModel)
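A minimal sketch for sanity-checking the converted file; the output name "laser.npz" is an assumed example (the script writes to whatever path --marian was given):
import numpy as np
import yaml
converted = np.load("laser.npz")  # hypothetical output path passed via --marian
for name in converted.files:
    print(name, converted[name].shape)
# the Marian config is stored as raw int8 bytes under the special key written above
recovered = yaml.safe_load(converted["special:model.yml"].tobytes().decode("utf-8"))
print(recovered["type"], recovered["enc-depth"])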

View File

@ -1,81 +0,0 @@
import onnxruntime as ort
import numpy as np
import onnx
import os, sys, time
os.environ['OMP_NUM_THREADS'] = '1'
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 1
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
def get_function(path, output_vars):
print("Reading ONNX function from", path)
#model = onnx.load(path)
#print("Done", flush=True)
#print(model)
ort_sess = ort.InferenceSession(path, sess_options)
output_defs = ort_sess.get_outputs()
for input in ort_sess.get_inputs():
print(" input: ", input.name, input.shape, input.type)
for output in output_defs:
print(" output: ", output.name, output.shape, output.type)
def invoke_model(**kwargs):
def to_numpy(val):
arr = np.array(val)
if arr.dtype == np.double:
arr = arr.astype(np.float32)
elif arr.dtype == np.int64:
arr = arr.astype(np.int32)
return arr
kwargs = { name: to_numpy(val) for name, val in kwargs.items() }
output_vals = ort_sess.run(None, kwargs)
output_dict = { output_def.name : output_val for output_val, output_def in zip(output_vals, output_defs) }
return [output_dict[output_var] for output_var in output_vars]
return invoke_model
id2word = { id : word.rstrip() for id, word in enumerate(open('c:/work/marian-dev/local/model/vocab_v1.wl', encoding='utf-8').readlines()) }
word2id = { word : id for id, word in id2word.items() }
unk_id = word2id["<unk>"]
model_path_prefix = "c:/work/marian-dev/local/model/model.npz.best-ce-mean-words-debug-sin-uniq-notrans-nounk"
encode_source = get_function(model_path_prefix + '.encode_source.onnx',
['encoder_context_0'])
decode_first = get_function(model_path_prefix + '.decode_first.onnx',
['first_logits', 'first_decoder_state_0', 'first_decoder_state_1', 'first_decoder_state_2', 'first_decoder_state_3', 'first_decoder_state_4', 'first_decoder_state_5'])
decode_next = get_function(model_path_prefix + '.decode_next.onnx',
['next_logits', 'next_decoder_state_0', 'next_decoder_state_1', 'next_decoder_state_2', 'next_decoder_state_3', 'next_decoder_state_4', 'next_decoder_state_5'])
def greedy_decode(data_0):
if len(data_0) == 1: # special handling for the empty sentence, like Marian
return data_0
data_0_mask = [[[1.]]] * len(data_0)
data_0_index_range = [[[float(t)]] for t in range(len(data_0))]
#print(data_0, data_0_mask, data_0_index_range)
max_len = len(data_0) * 3
Y = []
encoder_context_0, *_ = encode_source(data_0=data_0, data_0_mask=data_0_mask, data_0_posrange=data_0_index_range)
logp, *out_decoder_states = decode_first(data_1_posrange=[[[float(0)]]],
encoder_context_0=encoder_context_0, data_0_mask=data_0_mask)
logp[:,:,:,unk_id] = -1e8 # suppress <unk>, like Marian
Y.append(np.argmax(logp[0][0]))
while Y[-1] != 0 and len(Y) < max_len:
logp, *out_decoder_states = decode_next(prev_word=[Y[-1]], data_1_posrange=[[[float(len(Y))]]],
encoder_context_0=encoder_context_0, data_0_mask=data_0_mask,
decoder_state_0=out_decoder_states[0], decoder_state_1=out_decoder_states[1],
decoder_state_2=out_decoder_states[2], decoder_state_3=out_decoder_states[3],
decoder_state_4=out_decoder_states[4], decoder_state_5=out_decoder_states[5])
logp[:,:,:,unk_id] = -1e8
Y.append(np.argmax(logp[0][0]))
return Y
start_time = time.time()
with open("C:/work/marian-dev/local/model/predictions.out-onnx-debug-sin-notrans-first100-d.tok", 'wt', encoding='utf-8') as out_f:
for line in open("C:/work/marian-dev/local/model/predictions.in-first100.tok", encoding='utf-8').readlines():
data = [word2id.get(w, unk_id) for w in (line.rstrip() + " </s>").split(' ') if w]
Y = greedy_decode(data)
print("input: ", ' '.join(id2word[x] for x in data))
print("output:", ' '.join(id2word[y] for y in Y))
print(' '.join(id2word[y] for y in Y[:-1]), file=out_f, flush=True) # strip </s> for output to file
print("--- %s seconds ---" % (time.time() - start_time))

View File

@ -0,0 +1,276 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
Library for converting certain types of Marian models to a standalone ONNX model.
Because Marian and ONNX use very different philosophies, a conversion is not possible
for every Marian model. Specifically, we currently do not support recurrent
networks in the encoder, and we can only decode with greedy search (not beam search).
This works by running a Marian decode for 2 output steps, and capturing three pieces of
Marian's internal graph that correspond to the encoder, the first decoding steps, and the
second decoding step. The graph of the second decoding step can be applied repeatedly in
order to decode a variable-length sequence.
The three pieces are then composed with a greedy-search implementation, which is realized
directly via ONNX operators. This is facilitated by the onnx_fx library. As of this writing,
onnx_fx is still in an experimental stage and is not yet included in release branches of
the onnxconverter-common distribution. Hence, you must use the latest master branch, not
the release.
The code below assumes that the onnxconverter_common repo is cloned next to the marian-dev
repo, and that you use the standard CMake build process on Linux. If not, please make sure
that the onnxconverter-common repo is included in PYTHONPATH, and you may need to pass the
binary path of Marian to export_marian_model_components() explicitly.
Prerequisites:
```
pip install onnxruntime
git clone https://github.com/microsoft/onnxconverter-common.git
```
You will also need to compile Marian with -DUSE_ONNX=ON.
Known issue: If the number of decoder layers is not 6, you need to manually adjust one
line of code in loop_body() below.
"""
import os, sys, inspect, subprocess
from typing import List, Dict, Optional, Callable
# get the Marian root path
_marian_root_path = os.path.dirname(inspect.getfile(inspect.currentframe())) + "/../.."
# we assume onnxconverter-common to be available next to the marian-dev repo; you may need to adjust this
sys.path.append(_marian_root_path + "/../onnxconverter-common")
from onnxconverter_common.onnx_fx import Graph
from onnxconverter_common.onnx_fx import GraphFunctionType as _Ty
from onnxconverter_common import optimize_onnx_graph
import onnxruntime as _ort
from onnxruntime import quantization
def _ort_apply_model(model, inputs): # ORT execution is a callback so that Graph itself does not need to depend on ORT
sess = _ort.InferenceSession(model.SerializeToString())
return sess.run(None, inputs)
Graph.inference_runtime = _ort_apply_model
Graph.opset = 11
def _optimize_graph_in_place(graph: Graph):
# @TODO: This should really be methods on onnx_fx.Graph.
g = graph._oxml.graph
g_opt = optimize_onnx_graph(
onnx_nodes=g.node, # the onnx node list in onnx model.
nchw_inputs=None, # the name list of the inputs needed to be transposed as NCHW
inputs=g.input, # the model input
outputs=g.output, # the model output
initializers=g.initializer, # the model initializers
stop_initializers=None, # 'stop' optimization on these initializers
model_value_info=g.value_info, # the model value_info
model_name=g.name, # the internal name of model
target_opset=graph.opset)
graph._oxml.graph.CopyFrom(g_opt)
def export_marian_model_components(marian_model_path: str, marian_vocab_paths: List[str],
marian_executable_path: Optional[str]=None) -> Dict[str,Graph]:
"""
Export the Marian graph to a set of models.
Args:
marian_model_path: path to Marian model to convert
marian_vocab_paths: paths of vocab files (normally, this requires 2 entries, which may be identical)
marian_executable_path: path to Marian executable; will default to THIS_SCRIPT_PATH/../../build/marian
Returns:
Dict of onnx_fx.Graph instances corresponding to pieces of the Marian model.
"""
assert isinstance(marian_vocab_paths, list), "marian_vocab_paths must be a list of paths"
# default marian executable is found relative to location of this script (Linux/CMake only)
if marian_executable_path is None:
marian_executable_path = _marian_root_path + "/build/marian"
# partial models are written to /tmp
output_path_stem = "/tmp/" + os.path.basename(marian_model_path)
# exporting is done via invoking Marian via its command-line interface; models are written to tmp files
command = marian_executable_path
args = [
"convert",
"--from", marian_model_path,
"--vocabs", *marian_vocab_paths,
"--to", output_path_stem,
"--export-as", "onnx-encode"
]
subprocess.run([command] + args, check=True)
# load the tmp files into onnx_fx.Graph objects
graph_names = ["encode_source", "decode_first", "decode_next"] # Marian generates graphs with these names
output_paths = [output_path_stem + "." + graph_name + ".onnx" for graph_name in graph_names] # form pathnames under which Marian wrote the files
res = { graph_name: Graph.load(output_path) for graph_name, output_path in zip(graph_names, output_paths) }
# optimize the partial models in place, as Marian may not have used the most optimal way of expressing all operations
for graph_name in res.keys():
_optimize_graph_in_place(res[graph_name])
# clean up after ourselves
for output_path in output_paths:
os.unlink(output_path)
return res
def quantize_models_in_place(partial_models: Dict[str,Graph], to_bits: int=8):
"""
Quantize the partial models in place.
Args:
partial_models: models returned from export_marian_model_components()
to_bits: number of bits to quantize to, currently only supports 8
"""
for graph_name in partial_models.keys(): # quantize each partial model
partial_models[graph_name]._oxml = quantization.quantize(
partial_models[graph_name]._oxml,
nbits=to_bits,
quantization_mode=quantization.QuantizationMode.IntegerOps,
symmetric_weight=True,
force_fusions=True)
def compose_model_components_with_greedy_search(partial_models: Dict[str,Graph], num_decoder_layers: int):
"""
Create an ONNX model that implements greedy search over the exported Marian pieces.
Args:
partial_models: models returned from export_marian_model_components()
num_decoder_layers: must be specified, since it cannot be inferred from the model files presently (e.g. 6)
Returns:
ONNX model that can be called as
result_ids = greedy_search_fn(np.array(source_ids, dtype=np.int64), np.array([target_eos_id], dtype=np.int64))[0]
"""
decoder_state_dim = num_decoder_layers * 2 # each decoder has two state variables
# load our partial functions
# ONNX graph inputs and outputs are named but not ordered. Therefore, we must define the parameter order here.
def define_parameter_order(graph, inputs, outputs):
tmppath = "/tmp/tmpmodel.onnx"
graph.save(tmppath) # unfortunately, Graph.load() cannot load from another Graph, so use a tmp file
graph = Graph.load(tmppath, inputs=inputs, outputs=outputs)
os.unlink(tmppath)
return graph
encode_source = define_parameter_order(partial_models["encode_source"],
inputs=['data_0', 'data_0_mask', 'data_0_posrange'], # define the order of arguments
outputs=['encoder_context_0'])
decode_first = define_parameter_order(partial_models["decode_first"],
inputs=['data_1_posrange', 'encoder_context_0', 'data_0_mask'],
outputs=['first_logits'] +
[f"first_decoder_state_{i}" for i in range(decoder_state_dim)])
decode_next = define_parameter_order(partial_models["decode_next"],
inputs=['prev_word', 'data_1_posrange', 'encoder_context_0', 'data_0_mask'] +
[f"decoder_state_{i}" for i in range(decoder_state_dim)],
outputs=['next_logits'] +
[f"next_decoder_state_{i}" for i in range(decoder_state_dim)])
# create an ONNX graph that implements full greedy search
# The greedy search is implemented via the @onnx_fx.Graph.trace decorator, which allows us to
# author the greedy search in Python, similar to @CNTK.Function and PyTorch trace-based jit.
# The decorator executes greedy_search() below on a dummy input in order to generate an ONNX graph
# via invoking operators from the onnx.fx library.
# The partial functions exported from Marian are invoked (=inlined) by this.
# The result is a full ONNX graph that implements greedy search using the Marian model.
@Graph.trace(
input_types=[_Ty.I(shape=['N']), _Ty.I([1])],
output_types=[_Ty.I(shape=['T'])],
outputs="Y")
def greedy_search(X, eos_id):
"""
Args:
X: sequence of input tokens, including EOS symbol, as integer indices into the input vocabulary
eos_id: id of the EOS symbol in the output vocabulary
"""
ox = X.ox
data_0 = X
data_0_shape = data_0.shape()
data_0_mask = ox.constant_of_shape(data_0_shape, value=1.0)
seq_len = data_0_shape[-1]
data_0_index_range = ox.range([ox.constant(value=0), seq_len, ox.constant(value=1)]).cast(to=ox.float)
data_0_index_range = ox.unsqueeze(data_0_index_range, axes=[1, 2])
max_len = seq_len * 3
encoder_context_0 = encode_source(data_0=data_0, data_0_mask=data_0_mask,
data_0_posrange=data_0_index_range)
y_len_0 = ox.constant(value=0.0)
logp, *out_decoder_states = decode_first(data_1_posrange=y_len_0,
encoder_context_0=encoder_context_0, data_0_mask=data_0_mask)
y_t = logp[0, 0, 0].argmax(axis=-1, keepdims=True) # note: rank-1 tensor, not a scalar
eos_token = eos_id + 0
test_y_t = (y_t != eos_token)
@Graph.trace(outputs=['ty_t', 'y_t_o', *(f'ods_{i}' for i in range(decoder_state_dim)), 'y_t_o2'],
output_types=[_Ty.b, _Ty.i] + [_Ty.f] * decoder_state_dim + [_Ty.i],
input_types=[_Ty.I([1]), _Ty.b, _Ty.i] + [_Ty.f] * decoder_state_dim)
def loop_body(iteration_count, condition, # these are not actually used inside
y_t,
out_decoder_states_0, out_decoder_states_1, out_decoder_states_2, out_decoder_states_3, out_decoder_states_4, out_decoder_states_5,
out_decoder_states_6, out_decoder_states_7, out_decoder_states_8, out_decoder_states_9, out_decoder_states_10, out_decoder_states_11):
# @BUGBUG: Currently, we do not support variable number of arguments to the callable.
# @TODO: We have the information from the type signature in Graph.trace(), so this should be possible.
assert decoder_state_dim == 12, "Currently, decoder layers other than 6 require a manual code change"
out_decoder_states = [out_decoder_states_0, out_decoder_states_1, out_decoder_states_2, out_decoder_states_3, out_decoder_states_4, out_decoder_states_5,
out_decoder_states_6, out_decoder_states_7, out_decoder_states_8, out_decoder_states_9, out_decoder_states_10, out_decoder_states_11]
"""
Loop body follows the requirements of ONNX Loop:
"The graph run each iteration.
It has 2+N inputs: (iteration_num, condition, loop carried dependencies...).
It has 1+N+K outputs: (condition, loop carried dependencies..., scan_outputs...).
Each scan_output is created by concatenating the value of the specified output value at the end of each iteration of the loop.
It is an error if the dimensions or data type of these scan_outputs change across loop iterations."
Inputs:
iteration_num (not used by our function)
test_y_t: condition (not used as an input)
y_t, *out_decoder_states: N=(decoder_state_dim+1) loop-carried dependencies
Outputs:
test_y_t: condition, return True if there is more to decode
y_t, *out_decoder_states: N=(decoder_state_dim+1) loop-carried dependencies (same as in the Inputs section)
y_t: K=1 outputs
"""
pos = iteration_count + 1
data_1_posrange = pos.cast(to=1).unsqueeze(axes=[0, 1, 2])
logp, *out_decoder_states = decode_next(
prev_word=y_t, data_1_posrange=data_1_posrange,
encoder_context_0=encoder_context_0, data_0_mask=data_0_mask,
**{f"decoder_state_{i}": out_decoder_states[i] for i in range(len(out_decoder_states))})
y_t = logp[0, 0, 0].argmax(axis=-1, keepdims=True)
test_y_t = (y_t != eos_token)
return [test_y_t, y_t] + out_decoder_states + [y_t]
# "Final N loop carried dependency values then K scan_outputs"
ret_vals = ox.loop(max_len, test_y_t, loop_body,
inputs=[y_t] + out_decoder_states,
outputs=['gy_t_o', *[f"gods_{i}" for i in range(len(out_decoder_states))], 'greedy_out'])
y = ret_vals[-1] # scan_output
# we must prepend the very first token
Y = ox.concat([ox.unsqueeze(y_t), y], axis=0) # note: y_t are rank-1 tensors, not scalars (ORT concat fails with scalars)
return ox.squeeze(Y, axes=[1])
greedy_search.to_model() # this triggers the model tracing (which is lazy)
# optimize the final model as well
# @BUGBUG: This leads to a malformed or hanging model.
#_optimize_graph_in_place(greedy_search)
return greedy_search
def apply_model(greedy_search_fn: Graph, source_ids: List[int], target_eos_id: int) -> List[int]:
"""
Apply model to an input sequence, e.g. run translation.
This function is meant for quick testing, and as an example of how to invoke the final graph.
Args:
greedy_search_fn: ONNX model created with compose_model_components_with_greedy_search()
source_ids: list of source tokens, as indices into the source vocabulary, ending in the EOS symbol
target_eos_id: id of EOS symbol in target vocabulary
Returns:
Result as list of ids into target vocabulary
"""
import numpy as np
Y = greedy_search_fn(
np.array(source_ids, dtype=np.int64),
np.array([target_eos_id], dtype=np.int64))[0]
return Y

View File

@ -0,0 +1,47 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
Example program demonstrating how to convert a Marian model using the marian_to_onnx library
to a self-contained ONNX model that implements greedy search.
"""
import os, sys
import marian_to_onnx as mo
# The following variables would normally be command-line arguments.
# We use constants here to keep it simple. They reflect an example use. You must adjust these.
my_dir = os.path.expanduser("~/young/wngt 2019/")
marian_npz = my_dir + "model.base.npz" # path to the Marian model to convert
num_decoder_layers = 6 # number of decoder layers
marian_vocs = [my_dir + "en-de.wl"] * 2 # path to the vocabularies for source and target
onnx_model_path = my_dir + "model.base.opt.onnx" # resulting model gets written here
quantize_to_bits = 8 # None for no quantization
# export Marian model as multiple ONNX models
partial_models = mo.export_marian_model_components(marian_npz, marian_vocs)
# quantize if desired
if quantize_to_bits:
mo.quantize_models_in_place(partial_models, to_bits=quantize_to_bits)
# use the ONNX models in a greedy-search
# The result is a fully self-contained model that implements greedy search.
onnx_model = mo.compose_model_components_with_greedy_search(partial_models, num_decoder_layers)
# save as ONNX file
onnx_model.save(onnx_model_path)
# run a test sentence
w2is = [{ word.rstrip(): id for id, word in enumerate(open(voc_path, "r").readlines()) } for voc_path in marian_vocs]
i2ws = [{ id: tok for tok, id in w2i.items() } for w2i in w2is]
src_tokens = "▁Republican ▁leaders ▁justifie d ▁their ▁policy ▁by ▁the ▁need ▁to ▁combat ▁electoral ▁fraud ▁.".split()
src_ids = [w2is[0][tok] for tok in src_tokens]
print(src_tokens)
print(src_ids)
Y = mo.apply_model(greedy_search_fn=onnx_model,
source_ids=src_ids + [w2is[0]["</s>"]],
target_eos_id=w2is[1]["</s>"])
print(Y.shape, Y)
tgt_tokens = [i2ws[1][y] for y in Y]
print(" ".join(tgt_tokens))

View File

@ -18,9 +18,10 @@
#pragma warning(disable : 4100 4125 4127 4244 4267 4512 4456 4510 4610 4800)
#endif
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunused-variable" // note: GCC <6.0 ignores this when inside push/pop
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wsuggest-override"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#define AuxillaryParseTableField AuxiliaryParseTableField // in protobuf 3.12, the generated source has a spelling error

View File

@ -13,7 +13,7 @@
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3012000 < PROTOBUF_MIN_PROTOC_VERSION
#if 3012003 < PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.

View File

@ -70,6 +70,7 @@ add_library(marian STATIC
layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
layers/lsh.cpp
rnn/cells.cpp
rnn/attention.cpp
@ -82,7 +83,9 @@ add_library(marian STATIC
models/transformer_stub.cpp
rescorer/score_collector.cpp
embedder/vector_collector.cpp
translator/beam_search.cpp
translator/history.cpp
translator/output_collector.cpp
translator/output_printer.cpp
@ -100,17 +103,15 @@ add_library(marian STATIC
# this is only compiled to catch build errors, but not linked
microsoft/quicksand.cpp
microsoft/cosmos.cpp
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>
$<TARGET_OBJECTS:pathie-cpp>
$<TARGET_OBJECTS:zlib>
$<TARGET_OBJECTS:faiss>
)
if(BLAS_FOUND)
target_sources(marian PRIVATE ${CMAKE_CURRENT_LIST_DIR}/layers/lsh.cpp $<TARGET_OBJECTS:faiss>)
endif()
target_compile_options(marian PUBLIC ${ALL_WARNINGS})
# Generate git_revision.h to reflect current git revision information
@ -225,7 +226,7 @@ if(COMPILE_SERVER)
set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server)
if(MSVC)
# Disable warnings from the SimpleWebSocketServer library needed for compilation of marian-server
target_compile_options(marian_server PUBLIC ${ALL_WARNINGS})
target_compile_options(marian_server PUBLIC ${ALL_WARNINGS} /wd4267 /wd4244 /wd4456 /wd4458)
else(MSVC)
# -Wno-suggest-override disables warnings from Boost 1.69+
target_compile_options(marian_server PUBLIC ${ALL_WARNINGS} -Wno-suggest-override)
@ -238,6 +239,7 @@ if(APPLE) # This is a dependency of pathie but I can't seem to link it into that
endif()
foreach(exec ${EXECUTABLES})
# @TODO: consider adding MKL and other libs to the library rather than the executables if at all possible
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})

View File

@ -70,15 +70,17 @@ int main(int argc, char** argv) {
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(modelTo, configStr.str(), /* --gemm-type */ saveGemmType, Type::float32);
}
#ifdef USE_ONNX
else if (exportAs == "onnx-encode") {
#ifdef USE_ONNX
auto graph = New<ExpressionGraphONNXExporter>();
load(graph);
auto modelOptions = New<Options>(config)->with("vocabs", vocabPaths, "inference", true);
graph->exportToONNX(modelTo, modelOptions, vocabPaths);
}
#else
ABORT("--export-as onnx-encode requires Marian to be built with USE_ONNX=ON");
#endif // USE_ONNX
}
else
ABORT("Unknown --export-as value: {}", exportAs);

View File

@ -0,0 +1,14 @@
#include "marian.h"
#include "models/model_task.h"
#include "embedder/embedder.h"
#include "common/timer.h"
int main(int argc, char** argv) {
using namespace marian;
auto options = parseOptions(argc, argv, cli::mode::embedding);
New<Embed<Embedder>>(options)->run();
return 0;
}

View File

@ -11,6 +11,7 @@
// train
// decode
// score
// embed
// vocab
// convert
// Currently, marian_server is not supported, since it is a special use case with lots of extra dependencies.
@ -24,6 +25,9 @@
#define main mainScorer
#include "marian_scorer.cpp"
#undef main
#define main mainEmbedder
#include "marian_embedder.cpp"
#undef main
#define main mainVocab
#include "marian_vocab.cpp"
#undef main
@ -44,9 +48,10 @@ int main(int argc, char** argv) {
if(cmd == "train") return mainTrainer(argc, argv);
else if(cmd == "decode") return mainDecoder(argc, argv);
else if (cmd == "score") return mainScorer(argc, argv);
else if (cmd == "embed") return mainEmbedder(argc, argv);
else if (cmd == "vocab") return mainVocab(argc, argv);
else if (cmd == "convert") return mainConv(argc, argv);
std::cerr << "Command must be train, decode, score, vocab, or convert." << std::endl;
std::cerr << "Command must be train, decode, score, embed, vocab, or convert." << std::endl;
exit(1);
} else
return mainTrainer(argc, argv);

View File

@ -22,8 +22,8 @@ namespace marian {
*
* @see CLIWrapper::alias()
*
* The order of alias definitions *does* matter: options from later aliases override earlier
* regardless of its order in the command line or config file.
* The order of alias definitions *does* matter: options from an alias defined later override
* options defined in earlier aliases regardless of their order in the command line or config file.
*/
void ConfigParser::addAliases(cli::CLIWrapper& cli) {
cli.alias("fp16", "true", [&](YAML::Node& config) {

View File

@ -72,30 +72,45 @@ void Config::initialize(ConfigParser const& cp) {
}
}
// guess --tsv-fields (the number of streams) if not set
// guess --tsv-fields, i.e. the number of fields in a TSV input, if not set
if(get<bool>("tsv") && get<size_t>("tsv-fields") == 0) {
size_t tsvFields = 0;
if(loaded) {
// model.npz has properly set vocab dimensions in special:model.yml,
// so we may use them to determine the number of streams
for(auto dim : get<std::vector<size_t>>("dim-vocabs"))
if(dim != 0) // language models have a fake extra vocab
++tsvFields;
// For translation there is no target stream
if((mode == cli::mode::translation || mode == cli::mode::server) && tsvFields > 1)
--tsvFields;
} else {
// TODO: This is very brittle, find a better solution
// If parameters from model.npz special:model.yml were not loaded,
// guess the number of inputs and outputs based on the model type name.
auto modelType = get<std::string>("type");
tsvFields = 1;
if(modelType.find("multi-", 0) != std::string::npos) // is a dual-source model
tsvFields += 1;
if(mode == cli::mode::training || mode == cli::mode::scoring)
if(modelType.rfind("lm", 0) != 0) // unless it is a language model
// use the length of --input-types if given
auto inputTypes = get<std::vector<std::string>>("input-types");
if(!inputTypes.empty()) {
tsvFields = inputTypes.size();
} else {
if(loaded) {
// model.npz has properly set vocab dimensions in special:model.yml,
// so we may use them to determine the number of streams
for(auto dim : get<std::vector<size_t>>("dim-vocabs"))
if(dim != 0) // language models have a fake extra vocab
++tsvFields;
// For translation there is no target stream
if((mode == cli::mode::translation || mode == cli::mode::server) && tsvFields > 1)
--tsvFields;
} else {
// If parameters from model.npz special:model.yml were not loaded,
// guess the number of inputs and outputs based on the model type name.
// TODO: This is very brittle, find a better solution
auto modelType = get<std::string>("type");
tsvFields = 1;
if(modelType.find("multi-", 0) != std::string::npos) // is a dual-source model
tsvFields += 1;
if(mode == cli::mode::training || mode == cli::mode::scoring)
if(modelType.rfind("lm", 0) != 0) // unless it is a language model
tsvFields += 1;
}
// count fields with guided-alignment or data-weighting too
if(mode == cli::mode::training) {
if(has("guided-alignment") && get<std::string>("guided-alignment") != "none")
tsvFields += 1;
if(has("data-weighting") && !get<std::string>("data-weighting").empty())
tsvFields += 1;
}
}
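// e.g. a single-source model trained with --guided-alignment and without --input-types is
// guessed as 3 fields (source, target, alignment)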
config_["tsv-fields"] = tsvFields;

View File

@ -91,6 +91,9 @@ ConfigParser::ConfigParser(cli::mode mode)
case cli::mode::scoring:
addOptionsScoring(cli_);
break;
case cli::mode::embedding:
addOptionsEmbedding(cli_);
break;
default:
ABORT("wrong CLI mode");
break;
@ -218,7 +221,8 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Train right-to-left model");
cli.add<std::vector<std::string>>("--input-types",
"Provide type of input data if different than 'sequence'. "
"Possible values: sequence, class. You need to provide one type per input.",
"Possible values: sequence, class, alignment, weight. "
"You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).",
{});
cli.add<bool>("--best-deep",
"Use Edinburgh deep RNN configuration (s2s)");
@ -235,6 +239,8 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
8);
cli.add<bool>("--transformer-no-projection",
"Omit linear projection after multi-head attention (transformer)");
cli.add<bool>("--transformer-pool",
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
2048);
@ -462,7 +468,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Auto-adjusted to --mini-batch-words-ref if given.",
0.f)->implicit_val("1e-4");
cli.add<std::string>("--guided-alignment",
"Path to a file with word alignments. Use guided alignment to guide attention or 'none'",
"Path to a file with word alignments. Use guided alignment to guide attention or 'none'. "
"If --tsv it specifies the index of a TSV field that contains the alignments (0-based)",
"none");
cli.add<std::string>("--guided-alignment-cost",
"Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)",
@ -471,7 +478,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Weight for guided alignment cost",
0.1);
cli.add<std::string>("--data-weighting",
"Path to a file with sentence or word weights");
"Path to a file with sentence or word weights. "
"If --tsv it specifies the index of a TSV field that contains the weights (0-based)");
cli.add<std::string>("--data-weighting-type",
"Processing level for data weighting: sentence, word",
"sentence");
@ -707,6 +715,45 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
// clang-format on
}
void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) {
auto previous_group = cli.switchGroup("Scorer options");
// clang-format off
cli.add<bool>("--no-reload",
"Do not load existing model specified in --model arg");
// TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice
cli.add<std::vector<std::string>>("--train-sets,-t",
"Paths to corpora to be scored: source target");
cli.add<std::string>("--output,-o",
"Path to output file, stdout by default",
"stdout");
cli.add<std::vector<std::string>>("--vocabs,-v",
"Paths to vocabulary files have to correspond to --train-sets. "
"If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. "
"If these files do not exists they are created");
cli.add<bool>("--compute-similarity",
"Expect two inputs and compute cosine similarity instead of outputting embedding vector");
cli.add<bool>("--binary",
"Output vectors as binary floats");
addSuboptionsInputLength(cli);
addSuboptionsTSV(cli);
addSuboptionsDevices(cli);
addSuboptionsBatching(cli);
cli.add<bool>("--optimize",
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision for inference, set parameter type in expression graph",
{"float32"});
cli.switchGroup(previous_group);
// clang-format on
}
void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) {
// clang-format off
cli.add<std::vector<std::string>>("--devices,-d",
@ -804,7 +851,7 @@ void ConfigParser::addSuboptionsTSV(cli::CLIWrapper& cli) {
cli.add<bool>("--tsv",
"Tab-separated input");
cli.add<size_t>("--tsv-fields",
"Number of fields in the TSV input, guessed based on the model type");
"Number of fields in the TSV input. By default, it is guessed based on the model type");
// clang-format on
}
@ -904,6 +951,7 @@ Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate){
// remove extra config files from the config to avoid redundancy
config_.remove("config");
// dump config and exit
if(!get<std::string>("dump-config").empty() && get<std::string>("dump-config") != "false") {
auto dumpMode = get<std::string>("dump-config");
config_.remove("dump-config");
@ -917,6 +965,43 @@ Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate){
exit(0);
}
// For TSV input, it is possible to use --input-types to determine fields that contain alignments
// or weights. In such case, the position of 'alignment' input type in --input-types determines
// the index of a TSV field that contains word alignments, and respectively, the position of
// 'weight' in --input-types determines the index of a TSV field that contains weights.
// Marian will abort if both --guided-alignment and 'alignment' in --input-types are specified
// (or --data-weighting and 'weight').
//
// Note: this may modify the config, so it is safer to do it after --dump-config.
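// For example, --tsv --input-types sequence alignment sequence marks TSV field 1 as word
// alignments, which has the same effect as setting --guided-alignment 1.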
if(mode_ == cli::mode::training || get<bool>("tsv")) {
auto inputTypes = get<std::vector<std::string>>("input-types");
if(!inputTypes.empty()) {
bool seenAligns = false;
bool seenWeight = false;
YAML::Node config;
for(size_t i = 0; i < inputTypes.size(); ++i) {
if(inputTypes[i] == "alignment") {
ABORT_IF(seenAligns, "You can specify 'alignment' only once in input-types");
ABORT_IF(has("guided-alignment") && get<std::string>("guided-alignment") != "none",
"You must use either guided-alignment or 'alignment' in input-types");
config["guided-alignment"] = std::to_string(i);
seenAligns = true;
}
if(inputTypes[i] == "weight") {
ABORT_IF(seenWeight, "You can specify 'weight' only once in input-types");
ABORT_IF(has("data-weighting") && !get<std::string>("data-weighting").empty(),
"You must use either data-weighting or 'weight' in input-types");
config["data-weighting"] = std::to_string(i);
seenWeight = true;
}
}
if(!config.IsNull())
cli_.updateConfig(config,
cli::OptionPriority::CommandLine,
"Extracting 'alignment' and 'weight' types from input-types failed.");
}
}
cli_.parseAliases();
auto opts = New<Options>();
opts->merge(Config(*this).get());

View File

@ -14,7 +14,7 @@
namespace marian {
namespace cli {
enum struct mode { training, translation, scoring, server };
enum struct mode { training, translation, scoring, server, embedding };
} // namespace cli
/**
@ -129,6 +129,7 @@ private:
void addOptionsValidation(cli::CLIWrapper&);
void addOptionsTranslation(cli::CLIWrapper&);
void addOptionsScoring(cli::CLIWrapper&);
void addOptionsEmbedding(cli::CLIWrapper&);
void addAliases(cli::CLIWrapper&);

View File

@ -27,6 +27,10 @@ void ConfigValidator::validateOptions(cli::mode mode) const {
validateOptionsParallelData();
validateOptionsScoring();
break;
case cli::mode::embedding:
validateOptionsParallelData();
validateOptionsScoring();
break;
case cli::mode::training:
validateOptionsParallelData();
validateOptionsTraining();

View File

@ -1,8 +1,10 @@
#include "common/file_stream.h"
#include "common/utils.h"
#include <streambuf>
#include <string>
#include <vector>
#include <cstdio>
#ifdef _MSC_VER
#include <io.h>
#include <windows.h>
@ -18,23 +20,44 @@ namespace io {
///////////////////////////////////////////////////////////////////////////////////////////////
InputFileStream::InputFileStream(const std::string &file)
: std::istream(NULL), file_(file) {
ABORT_IF(!marian::filesystem::exists(file_), "File '{}' does not exist", file);
: std::istream(NULL) {
// the special syntax "command |" starts command in a sh shell and reads out its result
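// (e.g. an InputFileStream constructed with "gunzip -c corpus.gz |" reads the decompressed output of that command)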
if (marian::utils::endsWith(file, "|")) {
#ifdef __unix__
auto command = file.substr(0, file.size() - 1);
// open as a pipe
pipe_ = popen(command.c_str(), "r");
ABORT_IF(!pipe_, "Command failed to execute ({}): {}", errno, command);
// there is no official way to construct a filebuf from a FILE* or fd, so we use /proc/{pid}/fd/{fd}
// For now, this only works on Linux. There are similar workarounds for Windows.
file_ = "/proc/" + std::to_string(getpid()) + "/fd/" + std::to_string(fileno(pipe_));
#else
ABORT("Pipe syntax not supported in this build of Marian: {}", file);
#endif
}
else
file_ = file;
streamBuf1_.reset(new std::filebuf());
auto ret = static_cast<std::filebuf*>(streamBuf1_.get())->open(file.c_str(), std::ios::in | std::ios::binary);
ABORT_IF(!ret, "File cannot be opened", file);
auto ret = static_cast<std::filebuf*>(streamBuf1_.get())->open(file_.string().c_str(), std::ios::in | std::ios::binary);
ABORT_IF(!ret, "Error opening file ({}): {}", errno, file_.string());
ABORT_IF(ret != streamBuf1_.get(), "Return value is not equal to streambuf pointer, that is weird");
if(file_.extension() == marian::filesystem::Path(".gz")) {
streamBuf2_.reset(new zstr::istreambuf(streamBuf1_.get()));
this->init(streamBuf2_.get());
} else {
this->init(streamBuf1_.get());
// insert .gz decompression
if(marian::utils::endsWith(file, ".gz")) {
streamBuf2_ = std::move(streamBuf1_);
streamBuf1_.reset(new zstr::istreambuf(streamBuf2_.get()));
}
// initialize the underlying istream
this->init(streamBuf1_.get());
}
InputFileStream::~InputFileStream() {}
InputFileStream::~InputFileStream() {
#ifdef __unix__ // (pipe syntax is only supported on UNIX-like OS)
if (pipe_)
pclose(pipe_); // non-NULL if pipe syntax was used
#endif
}
bool InputFileStream::empty() {
return this->peek() == std::ifstream::traits_type::eof();

View File

@ -49,8 +49,9 @@ public:
protected:
marian::filesystem::Path file_;
std::unique_ptr<std::streambuf> streamBuf1_;
std::unique_ptr<std::streambuf> streamBuf2_;
std::unique_ptr<std::streambuf> streamBuf1_; // main streambuf
std::unique_ptr<std::streambuf> streamBuf2_; // in case of a .gz file
FILE* pipe_{}; // in case of pipe syntax
std::vector<char> readBuf_;
};

View File

@ -75,6 +75,8 @@ protected:
private:
Ptr<BatchStats> stats_;
bool runAsync_{true}; // use asynchronous batch pre-fetching by default. We want to be able to disable this when running in library mode and for exception-safety.
// state of fetching
std::deque<BatchPtr> bufferedBatches_; // current swath of batches that next() reads from
@ -84,7 +86,7 @@ private:
bool newlyPrepared_{ true }; // prepare() was just called: we need to reset current_ --@TODO: can we just reset it directly?
// variables for multi-threaded pre-fetching
mutable ThreadPool threadPool_; // (we only use one thread, but keep it around)
mutable UPtr<ThreadPool> threadPool_; // (we only use one thread, but keep it around)
std::future<std::deque<BatchPtr>> futureBufferedBatches_; // next swath of batches is returned via this
// this runs on a bg thread; sequencing is handled by caller, but locking is done in here
@ -230,26 +232,37 @@ private:
// this starts fillBatches() as a background operation
void fetchBatchesAsync() {
ABORT_IF(futureBufferedBatches_.valid(), "Attempted to restart futureBufferedBatches_ while still running");
futureBufferedBatches_ = threadPool_.enqueue([this]() {
ABORT_IF(!runAsync_, "Trying to run fetchBatchesAsync() but runAsync_ is false??");
ABORT_IF(!threadPool_, "Trying to run fetchBatchesAsync() without initialized threadPool_??");
futureBufferedBatches_ = threadPool_->enqueue([this]() {
return fetchBatches();
});
}
BatchPtr next() {
if(bufferedBatches_.empty()) {
// out of data: need to get next batch from background thread
// We only get here if the future has been scheduled to run; it must be valid.
ABORT_IF(!futureBufferedBatches_.valid(), "Attempted to wait for futureBufferedBatches_ when none pending.\n"
"This error often occurs when Marian tries to restore the training data iterator, but the corpus has been changed or replaced.\n"
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {
return nullptr;
if(runAsync_) { // by default we will run in asynchronous mode
// out of data: need to get next batch from background thread
// We only get here if the future has been scheduled to run; it must be valid.
ABORT_IF(!futureBufferedBatches_.valid(), "Attempted to wait for futureBufferedBatches_ when none pending.\n"
"This error often occurs when Marian tries to restore the training data iterator, but the corpus has been changed or replaced.\n"
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {
return nullptr;
}
// and kick off the next bg operation
fetchBatchesAsync();
} else { // don't spawn any threads, i.e. batch fetching is blocking.
bufferedBatches_ = fetchBatches();
// if bufferedBatches is empty we hit the end of the epoch
if (bufferedBatches_.empty()) {
return nullptr;
}
}
// and kick off the next bg operation
fetchBatchesAsync();
}
auto batch = bufferedBatches_.front();
bufferedBatches_.pop_front();
return batch;
@ -259,8 +272,10 @@ public:
BatchGenerator(Ptr<DataSet> data,
Ptr<Options> options,
Ptr<BatchStats> stats = nullptr)
: data_(data), options_(options), stats_(stats), threadPool_(1) {
Ptr<BatchStats> stats = nullptr,
bool runAsync = true)
: data_(data), options_(options), stats_(stats),
runAsync_(runAsync), threadPool_(runAsync ? new ThreadPool(1) : nullptr) {
auto shuffle = options_->get<std::string>("shuffle", "none");
shuffleData_ = shuffle == "data";
shuffleBatches_ = shuffleData_ || shuffle == "batches";
@ -287,8 +302,9 @@ public:
data_->reset();
newlyPrepared_ = true;
// start the background pre-fetch operation
fetchBatchesAsync();
// start the background pre-fetch operation when running in asynchronous mode, otherwise we will fetch on demand.
if(runAsync_)
fetchBatchesAsync();
}
// Used to restore the state of a BatchGenerator after

View File

@ -46,7 +46,14 @@ void Corpus::preprocessLine(std::string& line, size_t streamId) {
}
SentenceTuple Corpus::next() {
std::vector<std::string> fields(tsvNumFields_); // used for handling TSV inputs
// Used for handling TSV inputs
// Determine the total number of fields including alignments or weights
auto tsvNumAllFields = tsvNumInputFields_;
if(alignFileIdx_ > -1)
++tsvNumAllFields;
if(weightFileIdx_ > -1)
++tsvNumAllFields;
std::vector<std::string> fields(tsvNumAllFields);
for(;;) { // (this is a retry loop for skipping invalid sentences)
// get index of the current sentence
@ -86,11 +93,27 @@ SentenceTuple Corpus::next() {
addWeightsToSentenceTuple(line, tup);
} else {
if(tsv_) { // split TSV input and add each field into the sentence tuple
utils::splitTsv(line, fields, tsvNumFields_);
for(size_t j = 0; j < tsvNumFields_; ++j) {
preprocessLine(fields[j], j);
addWordsToSentenceTuple(fields[j], j, tup);
utils::splitTsv(line, fields, tsvNumAllFields);
size_t shift = 0;
for(size_t j = 0; j < tsvNumAllFields; ++j) {
// index j needs to be shifted to get the proper vocab index if guided-alignment or
// data-weighting fields precede the source or target sequences in the TSV input
if(j == alignFileIdx_ || j == weightFileIdx_) {
++shift;
} else {
size_t vocabId = j - shift;
preprocessLine(fields[j], vocabId);
addWordsToSentenceTuple(fields[j], vocabId, tup);
}
}
// weights are added last to the sentence tuple, because this runs a validation that needs
// the length of the target sequence
if(alignFileIdx_ > -1)
addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
if(weightFileIdx_ > -1)
addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
} else {
preprocessLine(line, i);
addWordsToSentenceTuple(line, i, tup);
@ -267,9 +290,10 @@ CorpusBase::batch_ptr Corpus::toBatch(const std::vector<Sample>& batchVector) {
auto batch = batch_ptr(new batch_type(subBatches));
batch->setSentenceIds(sentenceIds);
if(options_->get("guided-alignment", std::string("none")) != "none" && alignFileIdx_)
// Add prepared word alignments and weights if they are available
if(alignFileIdx_ > -1 && options_->get("guided-alignment", std::string("none")) != "none")
addAlignmentsToBatch(batch, batchVector);
if(options_->hasAndNotEmpty("data-weighting") && weightFileIdx_)
if(weightFileIdx_ > -1 && options_->hasAndNotEmpty("data-weighting"))
addWeightsToBatch(batch, batchVector);
return batch;

View File

@ -30,6 +30,9 @@ const SentenceTuple& CorpusIterator::dereference() const {
return tup_;
}
// These types of corpus constructors are used in in-training validators
// (only?), so do not load additional files for guided alignment or data
// weighting.
CorpusBase::CorpusBase(const std::vector<std::string>& paths,
const std::vector<Ptr<Vocab>>& vocabs,
Ptr<Options> options)
@ -39,11 +42,11 @@ CorpusBase::CorpusBase(const std::vector<std::string>& paths,
maxLengthCrop_(options_->get<bool>("max-length-crop")),
rightLeft_(options_->get<bool>("right-left")),
tsv_(options_->get<bool>("tsv", false)),
tsvNumFields_(options->get<size_t>("tsv-fields", 0)) {
tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
// TODO: support passing only one vocab file if we have fully-tied embeddings
if(tsv_) {
ABORT_IF(tsvNumFields_ != vocabs_.size(),
"Number of TSV fields and vocab files does not agree");
ABORT_IF(tsvNumInputFields_ != vocabs_.size(),
"Number of TSV input fields and vocab files does not agree");
} else {
ABORT_IF(paths_.size() != vocabs_.size(),
"Number of corpus files and vocab files does not agree");
@ -64,7 +67,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
maxLengthCrop_(options_->get<bool>("max-length-crop")),
rightLeft_(options_->get<bool>("right-left")),
tsv_(options_->get<bool>("tsv", false)),
tsvNumFields_(options->get<size_t>("tsv-fields", 0)) {
tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
bool training = !translate;
if(training)
@ -72,22 +75,65 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
else
paths_ = options_->get<std::vector<std::string>>("input");
initEOS(training);
std::vector<std::string> vocabPaths;
if(!options_->get<std::vector<std::string>>("vocabs").empty())
vocabPaths = options_->get<std::vector<std::string>>("vocabs");
if(training) {
if(tsv_) {
ABORT_IF(!vocabPaths.empty() && tsvNumFields_ != vocabPaths.size(),
"Number of TSV fields and vocab files does not agree");
ABORT_IF(!vocabPaths.empty() && tsvNumInputFields_ != vocabPaths.size(),
"Number of TSV input fields and vocab files does not agree");
} else {
ABORT_IF(!vocabPaths.empty() && paths_.size() != vocabPaths.size(),
"Number of corpus files and vocab files does not agree");
}
}
bool useGuidedAlignment = options_->get("guided-alignment", std::string("none")) != "none";
bool useDataWeighting = options_->hasAndNotEmpty("data-weighting");
if(training && tsv_) {
// For TSV input, we expect that guided-alignment or data-weighting provide the index of a TSV
// field that contains the alignments or weights.
//
// Alignments and weights for non TSV input are handled later, after vocab creation.
if(useGuidedAlignment) {
try {
alignFileIdx_ = std::stoul(options_->get<std::string>("guided-alignment"));
} catch(const std::invalid_argument& /*e*/) {
ABORT(
"For TSV input, guided-alignment must provide an index of a field with alignments. "
"The value '{}' could not be converted to an unsigned integer.",
options_->get<std::string>("guided-alignment"));
}
LOG(info, "[data] Using word alignments from TSV field no. {}", alignFileIdx_);
}
if(useDataWeighting) {
try {
weightFileIdx_ = std::stoul(options_->get<std::string>("data-weighting"));
} catch(const std::invalid_argument& /*e*/) {
ABORT(
"For TSV input, data-weighting must provide an index of a field with weights. "
"The value '{}' could not be converted to an unsigned integer.",
options_->get<std::string>("data-weighting"));
}
LOG(info, "[data] Using weights from TSV field no. {}", weightFileIdx_);
}
// check for identical or too large indices
size_t maxIndex = tsvNumInputFields_ + size_t(useGuidedAlignment) + size_t(useDataWeighting) - 1;
ABORT_IF((useGuidedAlignment && useDataWeighting && alignFileIdx_ == weightFileIdx_)
|| (useGuidedAlignment && (alignFileIdx_ > maxIndex))
|| (useDataWeighting && (weightFileIdx_ > maxIndex)),
"For TSV input, guided-alignment and data-weighting must provide an index <= {} "
"and be different",
maxIndex);
}
// run this after determining if guided alignment or data weighting is used in TSV input
initEOS(training);
// @TODO: check if size_t can be used instead of int
std::vector<int> maxVocabs = options_->get<std::vector<int>>("dim-vocabs");
@ -133,12 +179,23 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
// There is more cases for multi-encoder models not listed above.
//
if(vocabPaths.empty()) {
size_t numStreams = tsv_ ? tsvNumFields_ : paths_.size();
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
// Creating a vocabulary from stdin is not supported
ABORT_IF(tsv_ && (paths_[0] == "stdin" || paths_[0] == "-"),
"Creating vocabularies automatically from a data stream from STDIN is not supported. "
"Create vocabularies first and provide them with --vocabs");
if(tsv_) {
// Creating a vocabulary from stdin is not supported
ABORT_IF(paths_[0] == "stdin" || paths_[0] == "-",
"Creating vocabularies automatically from a data stream from STDIN is not "
"supported. Create vocabularies first and provide them with --vocabs");
// Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is
// not supported
ABORT_IF(useGuidedAlignment,
"Creating vocabularies automatically from TSV data with alignments is not "
"supported. Create vocabularies first and provide them with --vocabs");
ABORT_IF(useDataWeighting,
"Creating vocabularies automatically from TSV data with weights is not "
"supported. Create vocabularies first and provide them with --vocabs");
}
if(maxVocabs.size() < paths_.size())
maxVocabs.resize(paths_.size(), 0);
@ -170,7 +227,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths1);
} else { // Vocabulary paths are given
size_t numStreams = tsv_ ? tsvNumFields_ : paths_.size();
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
// Load all vocabs
size_t numVocs = vocabPaths.size();
@ -200,11 +257,22 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
vocabDims.resize(numVocs, 0); // make sure there is as many dims as vocab paths
for(size_t i = 0; i < numVocs; ++i) {
// Creating a vocabulary from stdin is not supported
ABORT_IF(tsv_ && (paths_[0] == "stdin" || paths_[0] == "-")
&& (vocabPaths[i].empty() || !filesystem::exists(vocabPaths[i])),
"Creating vocabulary automatically from a data stream from STDIN is not supported. "
"Create vocabularies first and provide them with --vocabs");
if(tsv_) {
bool noVocabGiven = (vocabPaths[i].empty() || !filesystem::exists(vocabPaths[i]));
// Creating a vocabulary from stdin is not supported
ABORT_IF(noVocabGiven && (paths_[0] == "stdin" || paths_[0] == "-"),
"Creating vocabulary automatically from a data stream from STDIN is not "
"supported. Create vocabularies first and provide them with --vocabs");
// Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is not supported
ABORT_IF(noVocabGiven && useGuidedAlignment,
"Creating vocabularies automatically from TSV data with alignments is not "
"supported. Create vocabularies first and provide them with --vocabs");
ABORT_IF(noVocabGiven && useDataWeighting,
"Creating vocabularies automatically from TSV data with weights is not "
"supported. Create vocabularies first and provide them with --vocabs");
}
// Get the set of files that corresponds to the vocab. If the next file is the same vocab,
// it will not be created again, but just correctly loaded.
@ -230,7 +298,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
fileutils::cut(groupedPaths[0], // Index 0 because there is only one TSV file
tsvTempFile,
vocabDetails.streams,
tsvNumFields_,
tsvNumInputFields_,
" "); // Notice that tab-separated fields are joined with a whitespace
groupedPaths.clear();
@ -288,30 +356,34 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
files_.size(),
vocabs_.size());
if(training && options_->get("guided-alignment", std::string("none")) != "none") {
auto path = options_->get<std::string>("guided-alignment");
// Handle guided alignment and data weighting files. Alignments and weights in TSV input were
// handled earlier.
if(training && !tsv_) {
if(useGuidedAlignment) {
auto path = options_->get<std::string>("guided-alignment");
ABORT_IF(!filesystem::exists(path), "Alignment file does not exist");
LOG(info, "[data] Using word alignments from file {}", path);
ABORT_IF(!filesystem::exists(path), "Alignment file does not exist");
LOG(info, "[data] Using word alignments from file {}", path);
alignFileIdx_ = paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with alignments '{}' is empty", path);
files_.emplace_back(strm);
}
alignFileIdx_ = paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with alignments '{}' is empty", path);
files_.emplace_back(strm);
}
if(training && options_->hasAndNotEmpty("data-weighting")) {
auto path = options_->get<std::string>("data-weighting");
if(useDataWeighting) {
auto path = options_->get<std::string>("data-weighting");
ABORT_IF(!filesystem::exists(path), "Weight file does not exist");
LOG(info, "[data] Using weights from file {}", path);
ABORT_IF(!filesystem::exists(path), "Weight file does not exist");
LOG(info, "[data] Using weights from file {}", path);
weightFileIdx_ = paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with weights '{}' is empty", path);
files_.emplace_back(strm);
weightFileIdx_ = paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with weights '{}' is empty", path);
files_.emplace_back(strm);
}
}
}
@ -412,23 +484,37 @@ void CorpusBase::initEOS(bool training = true) {
// add an EOS symbol. Hence the decision to add EOS is now based on input stream positions and
// corresponding input type.
size_t numStreams = tsv_ ? tsvNumFields_ : paths_.size(); // determine number of streams
// Determine the number of streams, i.e. the number of input files (if --train-sets) or fields in
// a TSV input (if --tsv). Notice that in case of a TSV input, fields that contain alignments and
// weights are *not* included.
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
addEOS_.resize(numStreams, true);
// @TODO: think if this should be checked and processed here or in a validation step in config?
// input-types provides the input type for each input file (if --train-sets) or for each TSV field
// (if --tsv), for example: sequence, class, alignment.
auto inputTypes = options_->get<std::vector<std::string>>("input-types", {}); // empty list by default
// make sure there is an input type for each stream
ABORT_IF(inputTypes.size() > 0 && inputTypes.size() < numStreams,
"Input types have been specified ({}), you need to specify one per input ({})",
inputTypes.size(),
numStreams);
// @TODO: think if this should be checked and processed here or in a validation step in config?
if(!inputTypes.empty()) {
if(tsv_) {
// Remove 'alignment' and 'weight' from input types.
// Note that these input types are not typical input streams with corresponding vocabularies.
// For a TSV input, they were used only to determine fields that contain alignments or weights
// and initialize guided-alignment and data-weighting options.
auto pos = std::find(inputTypes.begin(), inputTypes.end(), "alignment");
if(pos != inputTypes.end())
inputTypes.erase(pos);
pos = std::find(inputTypes.begin(), inputTypes.end(), "weight");
if(pos != inputTypes.end())
inputTypes.erase(pos);
}
// make sure there is an equal number of input types and streams when training
ABORT_IF(training && inputTypes.size() > 0 && inputTypes.size() != numStreams,
"Input types have been specified ({}), you need to specify one per input ({})",
inputTypes.size(),
numStreams);
// Make sure there is an input type for each stream
// and that there is an equal number of input types and streams when training
ABORT_IF((inputTypes.size() < numStreams) || (training && inputTypes.size() != numStreams),
"Input types have been specified ({}), you need to specify one per input stream ({})",
inputTypes.size(), numStreams);
}
for(int i = 0; i < numStreams; ++i)
if(inputTypes.size() > i) {
@ -444,6 +530,35 @@ void CorpusBase::initEOS(bool training = true) {
}
}
size_t CorpusBase::getNumberOfTSVInputFields(Ptr<Options> options) {
if(options->get<bool>("tsv", false)) {
size_t n = options->get<size_t>("tsv-fields", 0);
if(n > 0 && options->get("guided-alignment", std::string("none")) != "none")
--n;
if(n > 0 && options->hasAndNotEmpty("data-weighting"))
--n;
return n;
}
return 0;
}
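For illustration, here is a minimal standalone sketch (not part of this diff) of the field counting above, with hypothetical option values `--tsv-fields 4 --guided-alignment 3 --data-weighting 2`: one field is reserved for alignments and one for weights, so two input fields remain. With both options active, the index check earlier in this file then allows field indices up to maxIndex = 2 + 1 + 1 - 1 = 3 and requires the alignment and weight indices to differ.
#include <cstddef>
#include <iostream>
#include <string>
// Mirrors getNumberOfTSVInputFields(): alignment and weight fields are not input streams.
size_t numTSVInputFields(size_t tsvFields, const std::string& guidedAlignment, const std::string& dataWeighting) {
  size_t n = tsvFields;
  if(n > 0 && guidedAlignment != "none")
    --n; // one TSV field holds word alignments
  if(n > 0 && !dataWeighting.empty())
    --n; // one TSV field holds weights
  return n;
}
int main() {
  // hypothetical options: --tsv-fields 4 --guided-alignment 3 --data-weighting 2
  std::cout << numTSVInputFields(4, "3", "2") << std::endl; // prints 2
}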
void SentenceTuple::setWeights(const std::vector<float>& weights) {
if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
auto numWeights = weights.size();
auto numTrgWords = back().size();
// word-level weights may or may not contain a weight for EOS tokens
if(numWeights != numTrgWords && numWeights != numTrgWords - 1)
LOG(warn,
"[warn] "
"Number of weights ({}) does not match the number of target words ({}) in line #{}",
numWeights,
numTrgWords,
id_);
}
weights_ = weights;
}
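As a side note, the accepted weight shapes can be summarised with a small standalone check (assumed semantics, mirroring the warning above): a single value is treated as a sentence-level weight, while word-level weights must match the number of target words either with or without the EOS token.
#include <cstddef>
#include <iostream>
// Mirrors the shape check in SentenceTuple::setWeights() above.
bool weightsShapeOk(size_t numWeights, size_t numTrgWords) {
  if(numWeights == 1)
    return true;                        // sentence-level weight
  return numWeights == numTrgWords      // word-level weights including EOS
      || numWeights == numTrgWords - 1; // word-level weights excluding EOS
}
int main() {
  // target sentence with 5 words including EOS
  std::cout << weightsShapeOk(1, 5) << weightsShapeOk(5, 5)
            << weightsShapeOk(4, 5) << weightsShapeOk(3, 5) << std::endl; // prints 1110
}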
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{

File diff suppressed because it is too large

View File

@ -44,21 +44,24 @@ SentenceTuple TextInput::next() {
for(size_t i = 0; i < files_.size(); ++i) {
std::string line;
if(io::getline(*files_[i], line)) {
Words words = vocabs_[i]->encode(line, /*addEOS =*/ true, /*inference =*/ inference_);
Words words = vocabs_[i]->encode(line, /*addEOS=*/true, /*inference=*/inference_);
if(this->maxLengthCrop_ && words.size() > this->maxLength_) {
words.resize(maxLength_);
words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels
}
if(words.empty())
words.push_back(Word::ZERO); // @TODO: What is this for? @BUGBUG: addEOS=true, so this can never happen, right?
ABORT_IF(words.empty(), "No words (not even EOS) found in string??");
ABORT_IF(tup.size() != i, "Previous tuple elements are missing.");
tup.push_back(words);
}
}
// check if each input file provided an example
if(tup.size() == files_.size())
if(tup.size() == files_.size()) // check if each input file provided an example
return tup;
return SentenceTuple(0);
else if(tup.size() == 0) // if no file provided examples we are done
return SentenceTuple(0);
else // neither all nor none => we have at least one missing entry
ABORT("There are missing entries in the text tuples.");
}
} // namespace data

168
src/embedder/embedder.h Normal file
View File

@ -0,0 +1,168 @@
#pragma once
#include "marian.h"
#include "common/config.h"
#include "common/options.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "data/corpus_nbest.h"
#include "models/costs.h"
#include "models/model_task.h"
#include "embedder/vector_collector.h"
#include "training/scheduler.h"
#include "training/validator.h"
namespace marian {
using namespace data;
/*
* The tool is used to create output sentence embeddings from available
* Marian encoders. With --compute-similarity it can return the cosine
* similarity between two sentences provided from two sources.
*/
class Embedder {
private:
Ptr<models::IModel> model_;
public:
Embedder(Ptr<Options> options)
: model_(createModelFromOptions(options, models::usage::embedding)) {}
void load(Ptr<ExpressionGraph> graph, const std::string& modelFile) {
model_->load(graph, modelFile);
}
Expr build(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
auto embedder = std::dynamic_pointer_cast<EncoderPooler>(model_);
ABORT_IF(!embedder, "Could not cast to EncoderPooler");
return embedder->apply(graph, batch, /*clearGraph=*/true);
}
};
/*
* Actual Embed task. @TODO: this should be simplified in the future.
*/
template <class Model>
class Embed : public ModelTask {
private:
Ptr<Options> options_;
Ptr<CorpusBase> corpus_;
std::vector<Ptr<ExpressionGraph>> graphs_;
std::vector<Ptr<Model>> models_;
public:
Embed(Ptr<Options> options) : options_(options) {
options_ = options_->with("inference", true,
"shuffle", "none");
// if a similarity is computed then double the input types and vocabs for
// the two encoders that are used in the model.
if(options->get<bool>("compute-similarity")) {
auto vVocabs = options_->get<std::vector<std::string>>("vocabs");
auto vDimVocabs = options_->get<std::vector<size_t>>("dim-vocabs");
vVocabs.push_back(vVocabs.back());
vDimVocabs.push_back(vDimVocabs.back());
options_ = options_->with("vocabs", vVocabs,
"dim-vocabs", vDimVocabs);
}
corpus_ = New<Corpus>(options_);
corpus_->prepare();
auto devices = Config::getDevices(options_);
for(auto device : devices) {
auto graph = New<ExpressionGraph>(true);
auto precision = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precision[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);
}
auto modelFile = options_->get<std::string>("model");
models_.resize(graphs_.size());
ThreadPool pool(graphs_.size(), graphs_.size());
for(size_t i = 0; i < graphs_.size(); ++i) {
pool.enqueue(
[=](size_t j) {
models_[j] = New<Model>(options_);
models_[j]->load(graphs_[j], modelFile);
},
i);
}
}
void run() override {
LOG(info, "Embedding");
timer::Timer timer;
auto batchGenerator = New<BatchGenerator<CorpusBase>>(corpus_, options_);
batchGenerator->prepare();
auto output = New<VectorCollector>(options_);
size_t batchId = 0;
std::mutex smutex;
{
ThreadPool pool(graphs_.size(), graphs_.size());
for(auto batch : *batchGenerator) {
auto task = [=, &smutex](size_t id) {
thread_local Ptr<ExpressionGraph> graph;
thread_local Ptr<Model> builder;
if(!graph) {
graph = graphs_[id % graphs_.size()];
builder = models_[id % graphs_.size()];
}
auto embeddings = builder->build(graph, batch);
graph->forward();
std::vector<float> sentVectors;
embeddings->val()->get(sentVectors);
// collect embedding vector per sentence.
// if we compute similarities this is only one similarity per sentence pair.
for(size_t i = 0; i < batch->size(); ++i) {
auto embSize = embeddings->shape()[-1];
auto beg = i * embSize;
auto end = (i + 1) * embSize;
std::vector<float> sentVector(sentVectors.begin() + beg, sentVectors.begin() + end);
output->Write((long)batch->getSentenceIds()[i],
sentVector);
}
// progress heartbeat for MS-internal Philly compute cluster
// otherwise this job may be killed prematurely if no log for 4 hrs
if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster
&& id % 1000 == 0) // hard beat once every 1000 batches
{
auto progress = id / 10000.f; //fake progress for now, becomes >100 after 1M batches
fprintf(stderr, "PROGRESS: %.2f%%\n", progress);
fflush(stderr);
}
};
pool.enqueue(task, batchId++);
}
}
LOG(info, "Total time: {:.5f}s wall", timer.elapsed());
}
};
} // namespace marian

View File

@ -0,0 +1,71 @@
#include "embedder/vector_collector.h"
#include "common/logging.h"
#include "common/utils.h"
#include <iostream>
#include <iomanip>
namespace marian {
// This class manages multi-threaded writing of embedded vectors to stdout or an output file.
// It will either output string versions of float vectors or binary equal length versions depending
// on its binary_ flag.
VectorCollector::VectorCollector(const Ptr<Options>& options)
: nextId_(0), binary_{options->get<bool>("binary", false)} {
if(options->get<std::string>("output") == "stdout")
outStrm_.reset(new std::ostream(std::cout.rdbuf()));
else
outStrm_.reset(new io::OutputFileStream(options->get<std::string>("output")));
}
void VectorCollector::Write(long id, const std::vector<float>& vec) {
std::lock_guard<std::mutex> lock(mutex_);
if(id == nextId_) {
WriteVector(vec);
++nextId_;
typename Outputs::const_iterator iter, iterNext;
iter = outputs_.begin();
while(iter != outputs_.end()) {
long currId = iter->first;
if(currId == nextId_) {
// 1st element in the map is the next
WriteVector(iter->second);
++nextId_;
// delete current record, move iter on 1
iterNext = iter;
++iterNext;
outputs_.erase(iter);
iter = iterNext;
} else {
// not the next. stop iterating
assert(nextId_ < currId);
break;
}
}
} else {
// save for later
outputs_[id] = vec;
}
}
void VectorCollector::WriteVector(const std::vector<float>& vec) {
if(binary_) {
outStrm_->write((char*)vec.data(), vec.size() * sizeof(float));
} else {
std::stringstream ss;
ss << std::fixed << std::setprecision(8);
for(auto v : vec)
*outStrm_ << v << " ";
*outStrm_ << std::endl;
}
}
} // namespace marian

View File

@ -0,0 +1,32 @@
#pragma once
#include "common/options.h"
#include "common/definitions.h"
#include "common/file_stream.h"
#include <map>
#include <mutex>
namespace marian {
// This class manages multi-threaded writing of embedded vectors to stdout or an output file.
// It will either output string versions of float vectors or binary equal length versions depending
// on its binary_ flag.
class VectorCollector {
public:
VectorCollector(const Ptr<Options>& options);
virtual void Write(long id, const std::vector<float>& vec);
protected:
long nextId_{0};
UPtr<std::ostream> outStrm_;
bool binary_; // output binary floating point vectors if set
std::mutex mutex_;
typedef std::map<long, std::vector<float>> Outputs;
Outputs outputs_;
virtual void WriteVector(const std::vector<float>& vec);
};
} // namespace marian
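A hedged usage sketch of the ordering behaviour described above (the Options setup assumes the key/value constructor used elsewhere in this diff; the ids are hypothetical): Write() emits a vector immediately if it carries the next expected id, otherwise it buffers the vector until the gap is filled, so multiple worker threads can deliver results out of order and still get ordered output.
#include "embedder/vector_collector.h"
using namespace marian;
int main() {
  // Hypothetical usage: write three embedding vectors to stdout in id order.
  auto options = New<Options>("output", "stdout", "binary", false);
  auto collector = New<VectorCollector>(options);
  collector->Write(1, {0.1f, 0.2f}); // buffered: id 0 has not been written yet
  collector->Write(0, {0.3f, 0.4f}); // writes id 0, then flushes the buffered id 1
  collector->Write(2, {0.5f, 0.6f}); // next expected id, written immediately
}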

View File

@ -416,7 +416,7 @@ protected:
ABORT_IF(logits.getNumFactorGroups() > 1, "Unlikelihood loss is not implemented for factors");
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks, mask again to eliminate padding (might be obsolete)
// use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks BAD, mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {

187
src/microsoft/cosmos.cpp Normal file
View File

@ -0,0 +1,187 @@
#include "cosmos.h"
#include "models/model_base.h"
#include "models/model_factory.h"
#include "data/text_input.h"
#if MKL_FOUND
#include "mkl.h"
#endif
namespace marian {
// Thin wrapper around IModel that makes sure model can be cast to an EncoderPooler
// These poolers know how to collect embeddings from a seq2seq encoder.
class EmbedderModel {
private:
Ptr<models::IModel> model_;
public:
EmbedderModel(Ptr<Options> options)
: model_(createModelFromOptions(options, models::usage::embedding)) {}
void load(Ptr<ExpressionGraph> graph, const std::string& modelFile) {
model_->load(graph, modelFile);
}
Expr build(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
auto embedder = std::dynamic_pointer_cast<EncoderPooler>(model_);
ABORT_IF(!embedder, "Could not cast to EncoderPooler");
return embedder->apply(graph, batch, /*clearGraph=*/true);
}
};
namespace cosmos {
const size_t MAX_BATCH_SIZE = 32;
const size_t MAX_LENGTH = 256;
/**
* Single CPU-core implementation of an Embedder/Similarity scorer. Turns sets of '\n'-separated strings
* into parallel batches and either outputs embedding vectors or similarity scores.
*/
class Embedder {
private:
Ptr<Options> options_;
Ptr<ExpressionGraph> graph_;
Ptr<Vocab> vocab_;
Ptr<EmbedderModel> model_;
public:
Embedder(const std::string& modelPath, const std::string& vocabPath, bool computeSimilarity = false) {
options_ = New<Options>("inference", true,
"shuffle", "none",
"mini-batch", MAX_BATCH_SIZE,
"maxi-batch", 100,
"maxi-batch-sort", "src",
"max-length", MAX_LENGTH,
"max-length-crop", true,
"compute-similarity", computeSimilarity,
"vocabs", std::vector<std::string>(computeSimilarity ? 2 : 1, vocabPath));
vocab_ = New<Vocab>(options_, 0);
vocab_->load(vocabPath, 0);
graph_ = New<ExpressionGraph>(/*inference=*/true);
graph_->setDevice(CPU0);
graph_->reserveWorkspaceMB(512);
YAML::Node config;
io::getYamlFromModel(config, "special:model.yml", modelPath);
Ptr<Options> modelOpts = New<Options>();
modelOpts->merge(options_);
modelOpts->merge(config);
model_ = New<EmbedderModel>(modelOpts);
model_->load(graph_, modelPath);
}
// Compute embedding vectors for a batch of sentences
std::vector<std::vector<float>> embed(const std::string& input) {
auto text = New<data::TextInput>(std::vector<std::string>({input}),
std::vector<Ptr<Vocab>>({vocab_}),
options_);
// we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well.
data::BatchGenerator<data::TextInput> batchGenerator(text, options_, /*stats=*/nullptr, /*runAsync=*/false);
batchGenerator.prepare();
std::vector<std::vector<float>> output;
for(auto batch : batchGenerator) {
auto embeddings = model_->build(graph_, batch);
graph_->forward();
std::vector<float> sentVectors;
embeddings->val()->get(sentVectors);
// collect embedding vector per sentence.
// if we compute similarities this is only one similarity per sentence pair.
for(size_t i = 0; i < batch->size(); ++i) {
auto batchIdx = batch->getSentenceIds()[i];
if(output.size() <= batchIdx)
output.resize(batchIdx + 1);
int embSize = embeddings->shape()[-1];
size_t beg = i * embSize;
size_t end = (i + 1) * embSize;
std::vector<float> sentVector(sentVectors.begin() + beg, sentVectors.begin() + end);
output[batchIdx] = sentVector;
}
}
return output;
}
// Compute cosine similarity scores for two batches of corresponding sentences
std::vector<float> similarity(const std::string& input1, const std::string& input2) {
auto text = New<data::TextInput>(std::vector<std::string>({input1, input2}),
std::vector<Ptr<Vocab>>({vocab_, vocab_}),
options_);
// we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well.
data::BatchGenerator<data::TextInput> batchGenerator(text, options_, /*stats=*/nullptr, /*runAsync=*/false);
batchGenerator.prepare();
std::vector<float> output;
for(auto batch : batchGenerator) {
auto similarities = model_->build(graph_, batch);
graph_->forward();
std::vector<float> vSimilarities;
similarities->val()->get(vSimilarities);
// collect similarity score per sentence pair.
for(size_t i = 0; i < batch->size(); ++i) {
auto batchIdx = batch->getSentenceIds()[i];
if(output.size() <= batchIdx)
output.resize(batchIdx + 1);
output[batchIdx] = vSimilarities[i];
}
}
return output;
};
};
/* Interface functions ***************************************************************************/
MarianEmbedder::MarianEmbedder() {
#if MKL_FOUND
mkl_set_num_threads(1);
#endif
marian::setThrowExceptionOnAbort(true); // globally defined to throw now
}
std::vector<std::vector<float>> MarianEmbedder::embed(const std::string& input) {
ABORT_IF(!embedder_, "Embedder is not defined??");
return embedder_->embed(input);
}
bool MarianEmbedder::load(const std::string& modelPath, const std::string& vocabPath) {
embedder_ = New<Embedder>(modelPath, vocabPath, /*computeSimilarity*/false);
ABORT_IF(!embedder_, "Embedder is not defined??");
return true;
}
MarianCosineScorer::MarianCosineScorer() {
#if MKL_FOUND
mkl_set_num_threads(1);
#endif
marian::setThrowExceptionOnAbort(true); // globally defined to throw now
}
std::vector<float> MarianCosineScorer::score(const std::string& input1, const std::string& input2) {
ABORT_IF(!embedder_, "Embedder is not defined??");
return embedder_->similarity(input1, input2);
};
bool MarianCosineScorer::load(const std::string& modelPath, const std::string& vocabPath) {
embedder_ = New<Embedder>(modelPath, vocabPath, /*computeSimilarity*/true);
ABORT_IF(!embedder_, "Embedder is not defined??");
return true;
}
} // namespace cosmos
} // namespace marian

64
src/microsoft/cosmos.h Normal file
View File

@ -0,0 +1,64 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
namespace marian {
template <typename T>
using Ptr = std::shared_ptr<T>;
namespace cosmos {
class Embedder;
/**
* MarianEmbedder takes a Marian sequence2sequence transformer model and produces
* sentence embeddings collected from the encoder. Currently the model file is supposed
* to know how to do that.
*/
class MarianEmbedder {
private:
Ptr<Embedder> embedder_;
public:
MarianEmbedder();
/**
* `input` is a big string with multiple sentences separated by '\n'.
* Returns a vector of embedding vectors in order corresponding to input sentence order.
*/
std::vector<std::vector<float>> embed(const std::string& input);
/**
* `modelPath` is a Marian model, `vocabPath` a matching SentencePiece model with *.spm suffix.
*/
bool load(const std::string& modelPath, const std::string& vocabPath);
};
/**
* MarianCosineScorer takes a Marian sequence2sequence transformer model and produces
* sentence-wise cosine similarities for two sentence embeddings.
*/
class MarianCosineScorer {
private:
Ptr<Embedder> embedder_;
public:
MarianCosineScorer();
/**
* `input1` and `input2` are big strings with multiple sentences separated by '\n'.
* Both inputs have to have the same number of separated lines.
* Returns a vector of similarity scores in order corresponding to input sentence order.
*/
std::vector<float> score(const std::string& input1, const std::string& input2);
/**
* `modelPath` is a Marian model, `vocabPath` a matching SentencePiece model with *.spm suffix.
*/
bool load(const std::string& modelPath, const std::string& vocabPath);
};
}
}
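A short usage sketch of the interface declared above (the model and vocabulary paths are placeholders; error handling omitted), following the "<CLS>"-prefixed inputs used in the unit tests:
#include "microsoft/cosmos.h"
#include <iostream>
int main() {
  marian::cosmos::MarianEmbedder embedder;
  if(embedder.load("/path/to/model.npz", "/path/to/vocab.spm")) { // placeholder paths
    auto vectors = embedder.embed("<CLS> This is a test.\n<CLS> This is another test.");
    std::cout << vectors.size() << " sentence embeddings of size " << vectors[0].size() << std::endl;
  }
  marian::cosmos::MarianCosineScorer scorer;
  if(scorer.load("/path/to/model.npz", "/path/to/vocab.spm")) {
    auto scores = scorer.score("<CLS> This is a test.", "<CLS> This is another test.");
    std::cout << "cosine similarity: " << scores[0] << std::endl;
  }
}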

View File

@ -51,6 +51,7 @@ EncoderDecoder::EncoderDecoder(Ptr<ExpressionGraph> graph, Ptr<Options> options)
modelFeatures_.insert("transformer-tied-layers");
modelFeatures_.insert("transformer-guided-alignment-layer");
modelFeatures_.insert("transformer-train-position-embeddings");
modelFeatures_.insert("transformer-pool");
modelFeatures_.insert("bert-train-type-embeddings");
modelFeatures_.insert("bert-type-vocab-size");

219
src/models/encoder_pooler.h Normal file
View File

@ -0,0 +1,219 @@
#pragma once
#include "marian.h"
#include "models/encoder.h"
#include "models/pooler.h"
#include "models/model_base.h"
#include "models/states.h"
// @TODO: this introduces functionality to use LASER in Marian for the filtering workflow or for use in MS-internal
// COSMOS server-farm. There is a lot of code duplication with Classifier and EncoderDecoder and this needs to be fixed.
// This will be done after the new layer system has been finished.
namespace marian {
/**
* Combines sequence encoders with generic poolers
* Can be used to train sequence poolers like language detection, BERT-next-sentence-prediction etc.
* Already has support for multi-objective training.
*
* @TODO: this should probably be unified somehow with EncoderDecoder which could allow for decoder/pooler
* multi-objective training.
*/
class EncoderPoolerBase : public models::IModel {
public:
virtual ~EncoderPoolerBase() {}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) override
= 0;
virtual void mmap(Ptr<ExpressionGraph> graph,
const void* ptr,
bool markedReloaded = true)
= 0;
virtual void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false) override
= 0;
virtual void clear(Ptr<ExpressionGraph> graph) override = 0;
virtual Expr apply(Ptr<ExpressionGraph>, Ptr<data::CorpusBatch>, bool) = 0;
virtual Logits build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool clearGraph = true) override {
clearGraph;
ABORT("Poolers cannot produce Logits");
};
virtual Logits build(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
bool clearGraph = true) {
clearGraph;
ABORT("Poolers cannot produce Logits");
}
virtual Ptr<Options> getOptions() = 0;
};
class EncoderPooler : public EncoderPoolerBase {
protected:
Ptr<Options> options_;
std::string prefix_;
std::vector<Ptr<EncoderBase>> encoders_;
std::vector<Ptr<PoolerBase>> poolers_;
bool inference_{true};
std::set<std::string> modelFeatures_;
Config::YamlNode getModelParameters() {
Config::YamlNode modelParams;
auto clone = options_->cloneToYamlNode();
for(auto& key : modelFeatures_)
modelParams[key] = clone[key];
if(options_->has("original-type"))
modelParams["type"] = clone["original-type"];
modelParams["version"] = buildVersion();
return modelParams;
}
std::string getModelParametersAsString() {
auto yaml = getModelParameters();
YAML::Emitter out;
cli::OutputYaml(yaml, out);
return std::string(out.c_str());
}
public:
typedef data::Corpus dataset_type;
// @TODO: lots of code-duplication with EncoderDecoder
EncoderPooler(Ptr<Options> options)
: options_(options),
prefix_(options->get<std::string>("prefix", "")),
inference_(options->get<bool>("inference", false)) {
modelFeatures_ = {"type",
"dim-vocabs",
"dim-emb",
"dim-rnn",
"enc-cell",
"enc-type",
"enc-cell-depth",
"enc-depth",
"dec-depth",
"dec-cell",
"dec-cell-base-depth",
"dec-cell-high-depth",
"skip",
"layer-normalization",
"right-left",
"input-types",
"special-vocab",
"tied-embeddings",
"tied-embeddings-src",
"tied-embeddings-all"};
modelFeatures_.insert("transformer-heads");
modelFeatures_.insert("transformer-no-projection");
modelFeatures_.insert("transformer-dim-ffn");
modelFeatures_.insert("transformer-ffn-depth");
modelFeatures_.insert("transformer-ffn-activation");
modelFeatures_.insert("transformer-dim-aan");
modelFeatures_.insert("transformer-aan-depth");
modelFeatures_.insert("transformer-aan-activation");
modelFeatures_.insert("transformer-aan-nogate");
modelFeatures_.insert("transformer-preprocess");
modelFeatures_.insert("transformer-postprocess");
modelFeatures_.insert("transformer-postprocess-emb");
modelFeatures_.insert("transformer-decoder-autoreg");
modelFeatures_.insert("transformer-tied-layers");
modelFeatures_.insert("transformer-guided-alignment-layer");
modelFeatures_.insert("transformer-train-position-embeddings");
modelFeatures_.insert("transformer-pool");
modelFeatures_.insert("bert-train-type-embeddings");
modelFeatures_.insert("bert-type-vocab-size");
modelFeatures_.insert("ulr");
modelFeatures_.insert("ulr-trainable-transformation");
modelFeatures_.insert("ulr-dim-emb");
modelFeatures_.insert("lemma-dim-emb");
}
virtual Ptr<Options> getOptions() override { return options_; }
std::vector<Ptr<EncoderBase>>& getEncoders() { return encoders_; }
std::vector<Ptr<PoolerBase>>& getPoolers() { return poolers_; }
void push_back(Ptr<EncoderBase> encoder) { encoders_.push_back(encoder); }
void push_back(Ptr<PoolerBase> pooler) { poolers_.push_back(pooler); }
void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded) override {
graph->load(name, markedReloaded && !opt<bool>("ignore-model-config", false));
}
void mmap(Ptr<ExpressionGraph> graph,
const void* ptr,
bool markedReloaded) override {
graph->mmap(ptr, markedReloaded && !opt<bool>("ignore-model-config", false));
}
void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool /*saveModelConfig*/) override {
LOG(info, "Saving model weights and runtime parameters to {}", name);
graph->save(name , getModelParametersAsString());
}
void clear(Ptr<ExpressionGraph> graph) override {
graph->clear();
for(auto& enc : encoders_)
enc->clear();
for(auto& pooler : poolers_)
pooler->clear();
}
template <typename T>
T opt(const std::string& key) {
return options_->get<T>(key);
}
template <typename T>
T opt(const std::string& key, const T& def) {
return options_->get<T>(key, def);
}
template <typename T>
void set(std::string key, T value) {
options_->set(key, value);
}
/*********************************************************************/
virtual Expr apply(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, bool clearGraph) override {
if(clearGraph)
clear(graph);
std::vector<Ptr<EncoderState>> encoderStates;
for(auto& encoder : encoders_)
encoderStates.push_back(encoder->build(graph, batch));
ABORT_IF(poolers_.size() != 1, "Expected exactly one pooler");
return poolers_[0]->apply(graph, batch, encoderStates);
}
};
} // namespace marian

71
src/models/laser.h Normal file
View File

@ -0,0 +1,71 @@
#pragma once
#include "marian.h"
#include "layers/constructors.h"
#include "rnn/constructors.h"
namespace marian {
// Re-implements the LASER BiLSTM encoder from:
// Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond
// Mikel Artetxe, Holger Schwenk
// https://arxiv.org/abs/1812.10464
class EncoderLaser : public EncoderBase {
using EncoderBase::EncoderBase;
public:
Expr applyEncoderRNN(Ptr<ExpressionGraph> graph,
Expr embeddings,
Expr mask) {
int depth = opt<int>("enc-depth");
float dropoutRnn = inference_ ? 0 : opt<float>("dropout-rnn");
Expr output = embeddings;
auto applyRnn = [&](int layer, rnn::dir direction, Expr input, Expr mask) {
std::string paramPrefix = prefix_ + "_" + opt<std::string>("enc-cell");
paramPrefix += "_l" + std::to_string(layer);
if(direction == rnn::dir::backward)
paramPrefix += "_reverse";
auto rnnFactory = rnn::rnn()
("type", opt<std::string>("enc-cell"))
("direction", (int)direction)
("dimInput", input->shape()[-1])
("dimState", opt<int>("dim-rnn"))
("dropout", dropoutRnn)
("layer-normalization", opt<bool>("layer-normalization"))
("skip", opt<bool>("skip"))
.push_back(rnn::cell()("prefix", paramPrefix));
return rnnFactory.construct(graph)->transduce(input, mask);
};
for(int i = 0; i < depth; ++i) {
output = concatenate({applyRnn(i, rnn::dir::forward, output, mask),
applyRnn(i, rnn::dir::backward, output, mask)},
/*axis =*/ -1);
}
return output;
}
virtual Ptr<EncoderState> build(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) override {
graph_ = graph;
// select embeddings that occur in the batch
Expr batchEmbeddings, batchMask; std::tie
(batchEmbeddings, batchMask) = getEmbeddingLayer()->apply((*batch)[batchIndex_]);
Expr context = applyEncoderRNN(graph_, batchEmbeddings, batchMask);
return New<EncoderState>(context, batchMask, batch);
}
void clear() override {}
};
}

View File

@ -8,7 +8,7 @@
namespace marian {
namespace models {
enum struct usage { raw, training, scoring, translation };
enum struct usage { raw, training, scoring, translation, embedding };
}
} // namespace marian

View File

@ -10,6 +10,7 @@
#include "models/amun.h"
#include "models/nematus.h"
#include "models/s2s.h"
#include "models/laser.h"
#include "models/transformer_factory.h"
#ifdef CUDNN
@ -29,6 +30,9 @@ namespace models {
Ptr<EncoderBase> EncoderFactory::construct(Ptr<ExpressionGraph> graph) {
if(options_->get<std::string>("type") == "s2s")
return New<EncoderS2S>(graph, options_);
if(options_->get<std::string>("type") == "laser" || options_->get<std::string>("type") == "laser-sim")
return New<EncoderLaser>(graph, options_);
#ifdef CUDNN
if(options_->get<std::string>("type") == "char-s2s")
@ -61,6 +65,17 @@ Ptr<ClassifierBase> ClassifierFactory::construct(Ptr<ExpressionGraph> graph) {
ABORT("Unknown classifier type");
}
Ptr<PoolerBase> PoolerFactory::construct(Ptr<ExpressionGraph> graph) {
if(options_->get<std::string>("type") == "max-pooler")
return New<MaxPooler>(graph, options_);
if(options_->get<std::string>("type") == "slice-pooler")
return New<SlicePooler>(graph, options_);
else if(options_->get<std::string>("type") == "sim-pooler")
return New<SimPooler>(graph, options_);
else
ABORT("Unknown pooler type");
}
Ptr<IModel> EncoderDecoderFactory::construct(Ptr<ExpressionGraph> graph) {
Ptr<EncoderDecoder> encdec;
if(options_->get<std::string>("type") == "amun")
@ -97,9 +112,54 @@ Ptr<IModel> EncoderClassifierFactory::construct(Ptr<ExpressionGraph> graph) {
return enccls;
}
Ptr<IModel> EncoderPoolerFactory::construct(Ptr<ExpressionGraph> graph) {
Ptr<EncoderPooler> encpool = New<EncoderPooler>(options_);
for(auto& ef : encoders_)
encpool->push_back(ef(options_).construct(graph));
for(auto& pl : poolers_)
encpool->push_back(pl(options_).construct(graph));
return encpool;
}
Ptr<IModel> createBaseModelByType(std::string type, usage use, Ptr<Options> options) {
Ptr<ExpressionGraph> graph = nullptr; // graph unknown at this stage
// clang-format off
if(use == usage::embedding) { // hijacking an EncoderDecoder model for embedding only
int dimVocab = options->get<std::vector<int>>("dim-vocabs")[0];
Ptr<Options> newOptions;
if(options->get<bool>("compute-similarity")) {
newOptions = options->with("usage", use,
"original-type", type,
"input-types", std::vector<std::string>({"sequence", "sequence"}),
"dim-vocabs", std::vector<int>(2, dimVocab));
} else {
newOptions = options->with("usage", use,
"original-type", type,
"input-types", std::vector<std::string>({"sequence"}),
"dim-vocabs", std::vector<int>(1, dimVocab));
}
auto res = New<EncoderPooler>(newOptions);
if(options->get<bool>("compute-similarity")) {
res->push_back(models::encoder(newOptions->with("index", 0)).construct(graph));
res->push_back(models::encoder(newOptions->with("index", 1)).construct(graph));
res->push_back(New<SimPooler>(graph, newOptions->with("type", "sim-pooler")));
} else {
res->push_back(models::encoder(newOptions->with("index", 0)).construct(graph));
if(type == "laser")
res->push_back(New<MaxPooler>(graph, newOptions->with("type", "max-pooler")));
else
res->push_back(New<SlicePooler>(graph, newOptions->with("type", "slice-pooler")));
}
return res;
}
if(type == "s2s" || type == "amun" || type == "nematus") {
return models::encoder_decoder(options->with(
"usage", use,
@ -313,7 +373,7 @@ Ptr<IModel> createModelFromOptions(Ptr<Options> options, usage use) {
else
ABORT("'usage' parameter 'translation' cannot be applied to model type: {}", type);
}
else if (use == usage::raw)
else if (use == usage::raw || use == usage::embedding)
return baseModel;
else
ABORT("'Usage' parameter must be 'translation' or 'raw'");

View File

@ -5,6 +5,7 @@
#include "layers/factory.h"
#include "models/encoder_decoder.h"
#include "models/encoder_classifier.h"
#include "models/encoder_pooler.h"
namespace marian {
namespace models {
@ -33,6 +34,14 @@ public:
typedef Accumulator<ClassifierFactory> classifier;
class PoolerFactory : public Factory {
using Factory::Factory;
public:
virtual Ptr<PoolerBase> construct(Ptr<ExpressionGraph> graph);
};
typedef Accumulator<PoolerFactory> pooler;
class EncoderDecoderFactory : public Factory {
using Factory::Factory;
private:
@ -77,6 +86,28 @@ public:
typedef Accumulator<EncoderClassifierFactory> encoder_classifier;
class EncoderPoolerFactory : public Factory {
using Factory::Factory;
private:
std::vector<encoder> encoders_;
std::vector<pooler> poolers_;
public:
Accumulator<EncoderPoolerFactory> push_back(encoder enc) {
encoders_.push_back(enc);
return Accumulator<EncoderPoolerFactory>(*this);
}
Accumulator<EncoderPoolerFactory> push_back(pooler cls) {
poolers_.push_back(cls);
return Accumulator<EncoderPoolerFactory>(*this);
}
virtual Ptr<IModel> construct(Ptr<ExpressionGraph> graph);
};
typedef Accumulator<EncoderPoolerFactory> encoder_pooler;
Ptr<IModel> createBaseModelByType(std::string type, usage, Ptr<Options> options);
Ptr<IModel> createModelFromOptions(Ptr<Options> options, usage);

139
src/models/pooler.h Normal file
View File

@ -0,0 +1,139 @@
#pragma once
#include "marian.h"
#include "models/states.h"
#include "layers/constructors.h"
#include "layers/factory.h"
namespace marian {
/**
* Simple base class for Poolers to be used in EncoderPooler framework
* A pooler takes an encoder state (contextual word embeddings) and produces
* a single sentence embedding.
*/
class PoolerBase : public LayerBase {
using LayerBase::LayerBase;
protected:
const std::string prefix_{"pooler"};
const bool inference_{false};
const size_t batchIndex_{0};
public:
PoolerBase(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: LayerBase(graph, options),
prefix_(options->get<std::string>("prefix", "pooler")),
inference_(options->get<bool>("inference", true)),
batchIndex_(options->get<size_t>("index", 1)) {} // assume that training input has batch index 0 and labels have index 1
virtual ~PoolerBase() {}
virtual Expr apply(Ptr<ExpressionGraph>, Ptr<data::CorpusBatch>, const std::vector<Ptr<EncoderState>>&) = 0;
template <typename T>
T opt(const std::string& key) const {
return options_->get<T>(key);
}
// Should be used to clear any batch-wise temporary objects if present
virtual void clear() = 0;
};
/**
* Pool encoder state (contextual word embeddings) via max-pooling along sentence-length dimension.
*/
class MaxPooler : public PoolerBase {
public:
MaxPooler(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: PoolerBase(graph, options) {}
Expr apply(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, const std::vector<Ptr<EncoderState>>& encoderStates) override {
ABORT_IF(encoderStates.size() != 1, "Pooler expects exactly one encoder state");
auto context = encoderStates[0]->getContext();
auto batchMask = encoderStates[0]->getMask();
// do a max pool here
Expr logMask = (1.f - batchMask) * -9999.f;
Expr maxPool = max(context * batchMask + logMask, /*axis=*/-3);
return maxPool;
}
void clear() override {}
};
/**
* Pool encoder state (contextual word embeddings) by selecting 1st embedding along sentence-length dimension.
*/
class SlicePooler : public PoolerBase {
public:
SlicePooler(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: PoolerBase(graph, options) {}
Expr apply(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, const std::vector<Ptr<EncoderState>>& encoderStates) override {
ABORT_IF(encoderStates.size() != 1, "Pooler expects exactly one encoder state");
auto context = encoderStates[0]->getContext();
auto batchMask = encoderStates[0]->getMask();
// Corresponds to the way we do this in transformer.h
// @TODO: unify this better, this is currently hacky
Expr slicePool = slice(context * batchMask, /*axis=*/-3, 0);
return slicePool;
}
void clear() override {}
};
/**
* Not really a pooler but abusing the interface to compute a similarity of two pooled states
*/
class SimPooler : public PoolerBase {
public:
SimPooler(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: PoolerBase(graph, options) {}
Expr apply(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, const std::vector<Ptr<EncoderState>>& encoderStates) override {
ABORT_IF(encoderStates.size() != 2, "SimPooler expects exactly two encoder states");
std::vector<Expr> vecs;
for(auto encoderState : encoderStates) {
auto context = encoderState->getContext();
auto batchMask = encoderState->getMask();
Expr pool;
auto type = options_->get<std::string>("original-type");
if(type == "laser") {
// LASER models do a max pool here
Expr logMask = (1.f - batchMask) * -9999.f;
pool = max(context * batchMask + logMask, /*axis=*/-3);
} else if(type == "transformer") {
// Our own implementation in transformer.h uses a slice of the first element
pool = slice(context, -3, 0);
} else {
// @TODO: make SimPooler take Pooler objects as arguments then it won't need to know this.
ABORT("Don't know what type of pooler to use for model type {}", type);
}
vecs.push_back(pool);
}
auto scalars = scalar_product(vecs[0], vecs[1], /*axis*/-1);
auto length1 = sqrt(sum(square(vecs[0]), /*axis=*/-1));
auto length2 = sqrt(sum(square(vecs[1]), /*axis=*/-1));
auto cosine = scalars / ( length1 * length2 );
return cosine;
}
void clear() override {}
};
}
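To make the pooling and similarity operations concrete, a standalone numerical sketch on plain float vectors (no Marian tensors): the masked max-pool uses the same (1 - mask) * -9999 trick as MaxPooler so that padded positions never win the max, and the cosine matches what SimPooler computes from two pooled states.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
// Masked max over the time dimension, mirroring MaxPooler above:
// padded positions get a large negative offset so they never win the max.
std::vector<float> maskedMaxPool(const std::vector<std::vector<float>>& states,
                                 const std::vector<float>& mask) {
  std::vector<float> pooled(states[0].size(), -1e9f);
  for(size_t t = 0; t < states.size(); ++t)
    for(size_t d = 0; d < states[t].size(); ++d)
      pooled[d] = std::max(pooled[d], states[t][d] * mask[t] + (1.f - mask[t]) * -9999.f);
  return pooled;
}
// Cosine similarity of two pooled states, as in SimPooler above.
float cosine(const std::vector<float>& a, const std::vector<float>& b) {
  float dot = 0.f, na = 0.f, nb = 0.f;
  for(size_t i = 0; i < a.size(); ++i) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (std::sqrt(na) * std::sqrt(nb));
}
int main() {
  auto u = maskedMaxPool({{1, -2}, {3, 0}, {0, 0}}, {1, 1, 0}); // third position is padding
  auto v = maskedMaxPool({{2, -1}, {1, 1}, {0, 0}}, {1, 1, 0});
  std::cout << cosine(u, v) << std::endl; // identical inputs would give 1.0
}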

View File

@ -328,6 +328,29 @@ public:
return output;
}
// Reduce the encoder output to a single sentence vector; here we just take the contextual embedding of the first word per sentence
// Replaces cross-attention in LASER-like models
Expr LayerPooling(std::string prefix,
Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
const Expr& values) { // [-4: beam depth=1, -3: batch size, -2: max length (src or trg), -1: vector dim]
int dimModel = input->shape()[-1];
auto output = slice(values, -2, 0); // Select first word [-4: beam depth, -3: batch size, -2: 1, -1: vector dim]
int dimPool = output->shape()[-1];
bool project = !opt<bool>("transformer-no-projection");
if(project || dimPool != dimModel) {
auto Wo = graph_->param(prefix + "_Wo", {dimPool, dimModel}, inits::glorotUniform());
auto bo = graph_->param(prefix + "_bo", {1, dimModel}, inits::zeros());
output = affine(output, Wo, bo); // [-4: beam depth, -3: batch size, -2: 1, -1: vector dim]
}
auto opsPost = opt<std::string>("transformer-postprocess");
output = postProcess(prefix + "_Wo", opsPost, output, input, 0.f);
return output;
}
Expr LayerAttention(std::string prefix,
Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
@ -790,14 +813,20 @@ public:
saveAttentionWeights = i == attLayer;
}
query = LayerAttention(prefix,
if(options_->get<bool>("transformer-pool", false)) {
query = LayerPooling(prefix,
query,
encoderContexts[j], // keys
encoderContexts[j], // values
encoderMasks[j],
opt<int>("transformer-heads"),
/*cache=*/true,
saveAttentionWeights);
encoderContexts[j]); // values
} else {
query = LayerAttention(prefix,
query,
encoderContexts[j], // keys
encoderContexts[j], // values
encoderMasks[j],
opt<int>("transformer-heads"),
/*cache=*/true,
saveAttentionWeights);
}
}
}

View File

@ -622,7 +622,7 @@ namespace marian {
model.set_ir_version(IR_VERSION);
model.set_producer_name(producerName);
model.mutable_graph()->CopyFrom(graph);
#define OPSET_IMPORT_VERSION 9 // 9 is needed for some newer ops
#define OPSET_IMPORT_VERSION 11
model.add_opset_import()->set_version(OPSET_IMPORT_VERSION);
return model;
}
@ -833,7 +833,7 @@ namespace marian {
LOG(info, s);
}
// axis attribute
size_t axis;
size_t axis{};
std::vector<size_t> axes;
if (E::tryGetAxisAttribute<ConcatenateNodeOp>(expr, axis)// ||
//E::tryGetAxisAttribute<SelectNodeOp>(expr, axis)

View File

@ -651,7 +651,7 @@ public:
using LSTM = FastLSTM;
/******************************************************************************/
// Experimentak cells, use with care
// Experimental cells, use with care
template <class CellType>
class Multiplicative : public CellType {

View File

@ -6,7 +6,6 @@
#endif
#endif
#if BLAS_FOUND
inline void sgemm(bool transA,
bool transB,
int rows_a,
@ -20,6 +19,7 @@ inline void sgemm(bool transA,
float beta,
float* c,
int ldc) {
#if BLAS_FOUND
cblas_sgemm(CblasRowMajor,
transA ? CblasTrans : CblasNoTrans,
transB ? CblasTrans : CblasNoTrans,
@ -34,5 +34,7 @@ inline void sgemm(bool transA,
beta,
c,
ldc);
#else
ABORT("Marian must be compiled with a BLAS library");
#endif
}
#endif

View File

@ -1282,67 +1282,104 @@ void SetSparse(float* out,
}
}
void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) {
// should be implemented via slicing and elementwise
template <typename FType>
void LSTMCellForwardTyped(Tensor out_, const std::vector<Tensor>& inputs) {
int rows = out_->shape().elements() / out_->shape()[-1];
int cols = out_->shape()[-1];
float* out = out_->data();
const float* cell = inputs[0]->data();
const float* xW = inputs[1]->data();
const float* sU = inputs[2]->data();
const float* b = inputs[3]->data();
int fVecSize = sizeof(FType) / sizeof(float);
int cols = out_->shape()[-1] / fVecSize;
FType* out = out_->data<FType>();
const FType* cell = inputs[0]->data<FType>();
const FType* xW = inputs[1]->data<FType>();
const FType* sU = inputs[2]->data<FType>();
const FType* b = inputs[3]->data<FType>();
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
using fop = functional::Ops<FType>;
for(int j = 0; j < rows; ++j) {
float m = !mask || mask[j];
float* rowOut = out + j * cols;
const float* rowCell = cell + j * cols;
FType* rowOut = out + j * cols;
const FType* rowCell = cell + j * cols;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
const FType* xWrow = xW + j * cols * 4;
const FType* sUrow = sU + j * cols * 4;
for(int i = 0; i < cols; ++i) {
float gf = functional::Ops<float>::sigmoid(xWrow[i] + sUrow[i] + b[i]);
FType gf = fop::sigmoid(fop::add(fop::add(xWrow[i], sUrow[i]), b[i]));
int k = i + cols;
float gi = functional::Ops<float>::sigmoid(xWrow[k] + sUrow[k] + b[k]);
FType gi = fop::sigmoid(fop::add(fop::add(xWrow[k], sUrow[k]), b[k]));
int l = i + 2 * cols;
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
FType gc = fop::tanh(fop::add(fop::add(xWrow[l], sUrow[l]), b[l]));
float cout = gf * rowCell[i] + gi * gc;
rowOut[i] = m * cout + (1 - m) * rowCell[i];
FType cout = fop::add(fop::mul(gf, rowCell[i]), fop::mul(gi, gc));
rowOut[i] = fop::add(fop::mul(m, cout), fop::mul(fop::sub(1.f, m), rowCell[i]));
}
}
}
void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) {
int rows = out_->shape().elements() / out_->shape()[-1];
int cols = out_->shape()[-1];
void LSTMCellForward(Tensor out, std::vector<Tensor> inputs) {
int cols = out->shape()[-1];
#ifdef __AVX__
if(cols % 8 == 0)
LSTMCellForwardTyped<float32x8>(out, inputs);
else
#endif
if(cols % 4 == 0)
LSTMCellForwardTyped<float32x4>(out, inputs);
else
LSTMCellForwardTyped<float>(out, inputs);
}
float* out = out_->data();
const float* cell = inputs[0]->data();
const float* xW = inputs[1]->data();
const float* sU = inputs[2]->data();
const float* b = inputs[3]->data();
template <typename FType>
void LSTMOutputForwardTyped(Tensor out_, const std::vector<Tensor>& inputs) {
int rows = out_->shape().elements() / out_->shape()[-1];
int fVecSize = sizeof(FType) / sizeof(float);
int cols = out_->shape()[-1] / fVecSize;
FType* out = out_->data<FType>();
const FType* cell = inputs[0]->data<FType>();
const FType* xW = inputs[1]->data<FType>();
const FType* sU = inputs[2]->data<FType>();
const FType* b = inputs[3]->data<FType>();
using fop = functional::Ops<FType>;
for(int j = 0; j < rows; ++j) {
float* rowOut = out + j * cols;
const float* rowCell = cell + j * cols;
FType* rowOut = out + j * cols;
const FType* rowCell = cell + j * cols;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
const FType* xWrow = xW + j * cols * 4;
const FType* sUrow = sU + j * cols * 4;
for(int i = 0; i < cols; ++i) {
int k = i + 3 * cols;
float go = functional::Ops<float>::sigmoid(xWrow[k] + sUrow[k] + b[k]);
rowOut[i] = go * std::tanh(rowCell[i]);
FType go = fop::sigmoid(fop::add(fop::add(xWrow[k], sUrow[k]), b[k]));
rowOut[i] = fop::mul(go, fop::tanh(rowCell[i]));
}
}
}
void LSTMOutputForward(Tensor out, std::vector<Tensor> inputs) {
int cols = out->shape()[-1];
#ifdef __AVX__
if(cols % 8 == 0)
LSTMOutputForwardTyped<float32x8>(out, inputs);
else
#endif
if(cols % 4 == 0)
LSTMOutputForwardTyped<float32x4>(out, inputs);
else
LSTMOutputForwardTyped<float>(out, inputs);
}
void LSTMCellBackward(std::vector<Tensor> outputs,
std::vector<Tensor> inputs,
Tensor adj_) {

View File

@ -12,6 +12,63 @@
#include "tensors/gpu/cuda_helpers.h"
// clang-format on
// recreations of a few cusparse functions that were deprecated in CUDA 11
// @TODO: Fill these in. This is not trivial. Until then, using these with CUDA 11 will fail.
#if CUDA_VERSION >= 11000
cusparseStatus_t
cusparseSgemmi10(cusparseHandle_t handle,
int m,
int n,
int k,
int nnz,
const float* alpha,
const float* A,
int lda,
const float* cscValB,
const int* cscColPtrB,
const int* cscRowIndB,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#define cusparseSgemmi cusparseSgemmi10
cusparseStatus_t
cusparseScsr2csc(cusparseHandle_t handle,
int m,
int n,
int nnz,
const float* csrVal,
const int* csrRowPtr,
const int* csrColInd,
float* cscVal,
int* cscRowInd,
int* cscColPtr,
cusparseAction_t copyValues,
cusparseIndexBase_t idxBase) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
cusparseStatus_t
cusparseScsrmm(cusparseHandle_t handle,
cusparseOperation_t transA,
int m,
int n,
int k,
int nnz,
const float* alpha,
const cusparseMatDescr_t descrA,
const float* csrValA,
const int* csrRowPtrA,
const int* csrColIndA,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#endif
namespace marian {
namespace gpu {

View File

@ -6,6 +6,7 @@ set(UNIT_TESTS
attention_tests
fastopt_tests
utils_tests
# cosmos_tests # optional, uncomment to test with specific files.
)
foreach(test ${UNIT_TESTS})

View File

@ -0,0 +1,141 @@
#include "catch.hpp"
#include "microsoft/cosmos.h"
#include "common/definitions.h"
#include "common/filesystem.h"
using namespace marian;
TEST_CASE("microsoft::cosmos::cosine_scorer", "[cosmos]") {
using namespace cosmos;
auto logger = spdlog::get("general");
if(!logger) {
std::vector<std::string> generalLogs;
logger = createStderrLogger("general", "[%Y-%m-%d %T] %v", generalLogs, /*quiet=*/true);
}
setThrowExceptionOnAbort(true);
auto floatApprox = [](float x, float y) -> bool {
return x == Approx(y).margin(0.001f);
};
auto createScorer = [&]() {
std::string path = "/home/marcinjd/data2/cosmos/embedder/";
std::string modelPath = path + "2020-07-24.laser.model.npz";
std::string vocabPath = path + "2020-07-24.laser.vocab.spm";
CHECK( filesystem::exists(modelPath) );
CHECK( filesystem::exists(vocabPath) );
auto scorer = New<MarianCosineScorer>();
CHECK( scorer->load(modelPath, vocabPath) );
return scorer;
};
auto scorer = createScorer();
SECTION("Compare two identical sentences") {
std::string input1 = "<CLS> This is a test.";
std::string input2 = "<CLS> This is a test.";
auto similarities = scorer->score(input1, input2);
CHECK( similarities.size() == 1 );
CHECK( floatApprox(similarities[0], 1.f) );
}
SECTION("Compare two different sentences") {
std::string input1 = "<CLS> This is a test.";
std::string input2 = "<CLS> This is another test.";
auto similarities = scorer->score(input1, input2);
CHECK( similarities.size() == 1 );
CHECK( floatApprox(similarities[0], 0.94101) );
}
SECTION("Compare small batches of sentences") {
std::string input1 = "<CLS> This is a test.\n<CLS> This is a test.";
std::string input2 = "<CLS> This is a test.\n<CLS> This is another test.";
auto similarities = scorer->score(input1, input2);
CHECK( similarities.size() == 2 );
CHECK( floatApprox(similarities[0], 1.f) );
CHECK( floatApprox(similarities[1], 0.94101) );
}
SECTION("Throw exception when there is a mismatch in number of sentences (first is shorter)") {
std::string input1 = "<CLS> This is a test.\n";
std::string input2 = "<CLS> This is a test.\n<CLS> This is another test.";
try {
marian::setThrowExceptionOnAbort(true);
auto similarities = scorer->score(input1, input2);
CHECK( false ); // we shouldn't reach this check, hence a failed test if we do.
} catch(MarianRuntimeException& e) {
CHECK( e.what() == std::string("Previous tuple elements are missing.") );
}
}
SECTION("Throw exception when there is a mismatch in number of sentences (second is shorter)") {
std::string input1 = "<CLS> This is a test.\n<CLS> This is a test.";
std::string input2 = "<CLS> This is a test.\n";
try {
marian::setThrowExceptionOnAbort(true);
auto similarities = scorer->score(input1, input2);
CHECK( false ); // we shouldn't reach this check, hence a failed test if we do.
} catch(MarianRuntimeException& e) {
CHECK( e.what() == std::string("There are missing entries in the text tuples.") );
}
}
}
TEST_CASE("microsoft::cosmos::embedder", "[cosmos]") {
using namespace cosmos;
auto floatApprox = [](float x, float y) -> bool {
return x == Approx(y).margin(0.001f);
};
auto createEmbedder = [&]() {
std::string path = "/home/marcinjd/data2/cosmos/embedder/";
std::string modelPath = path + "2020-07-24.laser.model.npz";
std::string vocabPath = path + "2020-07-24.laser.vocab.spm";
CHECK( filesystem::exists(modelPath) );
CHECK( filesystem::exists(vocabPath) );
auto embedder = New<MarianEmbedder>();
CHECK( embedder->load(modelPath, vocabPath) );
return embedder;
};
auto embedder = createEmbedder();
SECTION("Embed a single sentence") {
std::string input = "<CLS> This is a test.";
auto embeddings = embedder->embed(input);
CHECK( embeddings.size() == 1 );
CHECK( embeddings[0].size() == 512 );
CHECK( floatApprox(embeddings[0][0], -0.04813f) );
}
SECTION("Embed two sentences") {
std::string input = "<CLS> This is a test.\n<CLS> This is another test.";
auto embeddings = embedder->embed(input);
CHECK( embeddings.size() == 2 );
CHECK( embeddings[0].size() == 512 );
CHECK( embeddings[1].size() == 512 );
CHECK( floatApprox(embeddings[0][0], -0.04813f) );
CHECK( floatApprox(embeddings[1][0], -0.04775f) );
}
}

View File

@ -17,10 +17,7 @@ Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier) {
auto stats = New<data::BatchStats>();
size_t numFiles = options_->get<bool>("tsv", false)
? options_->get<size_t>("tsv-fields")
: options_->get<std::vector<std::string>>("train-sets").size();
size_t numFiles = numberOfInputFiles();
// Initialize first batch to step size
size_t first = options_->get<size_t>("mini-batch-fit-step");
@ -77,7 +74,7 @@ Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
} else {
end = current - 1;
}
} while(end - start > step);
} while(end - start > step); // @TODO: better replace with `end >= start` to remove the step here
maxBatch = start;
}
@@ -88,4 +85,16 @@ void GraphGroup::setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // neede
typicalTrgBatchWords_ = typicalTrgBatchWords;
}
size_t GraphGroup::numberOfInputFiles() {
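// In TSV mode a single input file provides all streams as tab-separated fields; fields that
// carry guided alignments or data weights are not model inputs, so they are subtracted below.
// Illustrative example: --tsv-fields 4 with guided alignment and data weighting enabled
// leaves 4 - 1 - 1 = 2 actual input streams (e.g. source and target text).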
if(options_->get<bool>("tsv", false)) {
size_t n = options_->get<size_t>("tsv-fields");
if(n > 0 && options_->get("guided-alignment", std::string("none")) != "none")
--n;
if(n > 0 && options_->hasAndNotEmpty("data-weighting"))
--n;
return n;
}
return options_->get<std::vector<std::string>>("train-sets").size();
}
} // namespace marian

View File

@@ -23,7 +23,11 @@ protected:
Ptr<Scheduler> scheduler_; // scheduler that keeps track of how much has been processed
bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed)
size_t typicalTrgBatchWords_{ 0 }; // for dynamic batch sizing: typical batch size in words
size_t typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words
// determines the number of input streams (i.e. input files or fields in the TSV input) that need
// to be included in the batch, i.e. without alignments and weights
size_t numberOfInputFiles();
public:
GraphGroup(Ptr<Options> options);

515 src/translator/beam_search.cpp Executable file
View File

@@ -0,0 +1,515 @@
#include "translator/beam_search.h"
#include "data/factored_vocab.h"
#include "translator/helpers.h"
#include "translator/nth_element.h"
#include "data/shortlist.h"
namespace marian {
// combine new expandedPathScores and previous beams into new set of beams
Beams BeamSearch::toHyps(const std::vector<unsigned int>& nBestKeys, // [currentDimBatch, beamSize] flattened -> ((batchIdx, beamHypIdx) flattened, word idx) flattened
const std::vector<float>& nBestPathScores, // [currentDimBatch, beamSize] flattened
const size_t nBestBeamSize, // for interpretation of nBestKeys
const size_t vocabSize, // ditto.
const Beams& beams,
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
const auto origDimBatch = beams.size(); // see function search for definition of origDimBatch and currentDimBatch etc.
Beams newBeams(origDimBatch); // return value of this function goes here. There are always origDimBatch beams.
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
// which is expected due to down-shifting
if(!beams[i].empty())
currentDimBatch++;
}
}
for(size_t i = 0; i < nBestKeys.size(); ++i) { // [currentDimBatch, beamSize] flattened
// Keys encode batchIdx, beamHypIdx, and word index in the entire beam.
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
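// Illustrative example (made-up numbers): with vocabSize = 32000 and nBestBeamSize = 6,
// key = (currentBatchIdx * nBestBeamSize + beamHypIdx) * vocabSize + wordIdx, so
// key = 389123 decomposes into wordIdx = 5123, beamHypIdx = 0 and currentBatchIdx = 2.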
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx] && factorGroup == 0;
WordIndex wordIdx;
if(dropHyp) { // if we force-drop the hypothesis, assign EOS, otherwise the expected word id.
if(factoredVocab) { // when using factoredVocab, extract the EOS lemma index from the word id; we are predicting factors one by one here, hence lemma only
std::vector<size_t> eosFactors;
factoredVocab->word2factors(factoredVocab->getEosId(), eosFactors);
wordIdx = (WordIndex)eosFactors[0];
} else { // without factoredVocab lemma index and word index are the same. Safe cruising.
wordIdx = trgVocab_->getEosId().toWordIndex();
}
} else { // we are not dropping anything, just assign the normal index
wordIdx = (WordIndex)(key % vocabSize);
}
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
if(newBeam.size() >= beam.size()) // getNBestList() generates N for all batch entries incl. those that already have a narrower beam
continue;
if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(pathScore < INVALID_PATH_SCORE, "Actual pathScore ({}) is lower than INVALID_PATH_SCORE ({})??", pathScore, INVALID_PATH_SCORE); // This should not happen in valid situations. Currently the only smaller value would be -inf (effect of overflow in summation?)
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
auto prevHyp = beam[prevBeamHypIdx];
Word word;
// If short list has been set, then wordIdx is an index into the short-listed word set,
// rather than the true word index.
auto shortlist = scorers_[0]->getShortlist();
if (factoredVocab) {
// For factored decoding, the word is built over multiple decoding steps,
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
word = factoredVocab->expandFactoredWord(word, factorGroup, wordIdx);
prevBeamHypIdx = prevHyp->getPrevStateIndex();
prevHyp = prevHyp->getPrevHyp(); // short-circuit the backpointer, so that the traceback does not contain partially factored words
}
}
else if (shortlist)
word = Word::fromWordIndex(shortlist->reverseMap(wordIdx));
else
word = Word::fromWordIndex(wordIdx);
auto hyp = Hypothesis::New(prevHyp, word, prevBeamHypIdx, pathScore);
// Set score breakdown for n-best lists
if(options_->get<bool>("n-best")) {
auto breakDown = beam[beamHypIdx]->getScoreBreakdown();
ABORT_IF(factoredVocab && factorGroup > 0 && !factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
breakDown.resize(states.size(), 0); // at start, this is empty, so this will set the initial score to 0
for(size_t j = 0; j < states.size(); ++j) {
auto lval = states[j]->getLogProbs().getFactoredLogitsTensor(factorGroup); // [maxBeamSize, 1, currentDimBatch, dimFactorVocab]
// The flattening happens based on actual (current) batch size and batch index computed with batch-pruning as we are looking into the pruned tensor
size_t flattenedLogitIndex = (beamHypIdx * currentDimBatch + currentBatchIdx) * vocabSize + wordIdx; // (beam idx, batch idx, word idx); note: beam and batch are transposed, compared to 'key'
// @TODO: use a function on shape() to index, or new method val->at({i1, i2, i3, i4}) with broadcasting
ABORT_IF(lval->shape() != Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}) &&
(beamHypIdx == 0 && lval->shape() != Shape({1, 1, (int)currentDimBatch, (int)vocabSize})),
"Unexpected shape of logits?? {} != {}", lval->shape(), Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}));
breakDown[j] += lval->get(flattenedLogitIndex);
}
hyp->setScoreBreakdown(breakDown);
}
// Set alignments
if(!align.empty())
hyp->setAlignment(getAlignmentsForHypothesis(align, batch, (int)beamHypIdx, (int)currentBatchIdx, (int)origBatchIdx, (int)currentDimBatch));
else // not first factor: just copy
hyp->setAlignment(beam[beamHypIdx]->getAlignment());
newBeam.push_back(hyp);
}
// if factored vocab and this is not the first factor, we need to
// also propagate factored hypotheses that do not get expanded in this step because they don't have this factor
if (factorGroup > 0) {
for (size_t batchIdx = 0; batchIdx < beams.size(); batchIdx++) {
const auto& beam = beams[batchIdx];
auto& newBeam = newBeams[batchIdx];
for (const auto& beamHyp : beam) {
auto word = beamHyp->getWord();
//LOG(info, "Checking {}", factoredVocab->word2string(word));
if (factoredVocab->canExpandFactoredWord(word, factorGroup)) // handled above
continue;
//LOG(info, "Forwarded {}", factoredVocab->word2string(word));
newBeam.push_back(beamHyp);
}
if (newBeam.size() > beam.size()) {
//LOG(info, "Size {}, sorting...", newBeam.size());
std::nth_element(newBeam.begin(), newBeam.begin() + beam.size(), newBeam.end(), [](Hypothesis::PtrType a, Hypothesis::PtrType b) {
return a->getPathScore() > b->getPathScore(); // (sort highest score first)
});
//LOG(info, "Size {}, sorted...", newBeam.size());
newBeam.resize(beam.size());
}
}
}
return newBeams;
}
std::vector<float> BeamSearch::getAlignmentsForHypothesis( // -> P(s|t) for current t and given beam and batch dim
const std::vector<float> alignAll, // [beam depth, max src length, batch size, 1], flattened vector of all attention probabilities
Ptr<data::CorpusBatch> batch,
int beamHypIdx,
int currentBatchIdx,
int origBatchIdx,
int currentDimBatch) const {
// Let B be the beam size, N be the number of batched sentences,
// and L the number of words in the longest sentence in the batch.
// The alignment vector:
//
// if(first)
// * has length of N x L if it's the first beam
// * stores elements in the following order:
// beam1 = [word1-batch1, word1-batch2, ..., word2-batch1, ...]
// else
// * has length of N x L x B
// * stores elements in the following order:
// beams = [beam1, beam2, ..., beam_n]
//
// The mask vector is always of length N x L and has 1/0s stored like
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
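// Illustrative example (made-up numbers): with batchWidth = 7, currentDimBatch = 2,
// beamHypIdx = 2, currentBatchIdx = 1 and srcPos = 3, the attention score is read from
// alignAll at currentAttIdx = (7 * 2 + 3) * 2 + 1 = 35.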
size_t origDimBatch = batch->size(); // number of sentences in batch
size_t batchWidth = batch->width(); // max src length
// loop over words of batch entry 'currentBatchIdx' and beam entry 'beamHypIdx'
std::vector<float> align;
for(size_t srcPos = 0; srcPos < batchWidth; ++srcPos) { // loop over source positions
// We are looking into the probabilities from an actual tensor, hence we need to use currentDimBatch and currentBatchIdx.
size_t currentAttIdx = (batchWidth * beamHypIdx + srcPos) * currentDimBatch + currentBatchIdx; // = flatten [beam index, s, batch index, 0]
// We are looking into the mask from the original batch, hence we need to use origDimBatch and origBatchIdx.
size_t origAttIdx = (batchWidth * beamHypIdx + srcPos) * origDimBatch + origBatchIdx; // = flatten [beam index, s, batch index, 0]
size_t origMaskIdx = origAttIdx % (batchWidth * origDimBatch); // == batchIdx + (batchSize * srcPos) = flatten [0, s, batch index, 0]
// If the original position is not masked out, use the corresponding current attention score.
if(batch->front()->mask()[origMaskIdx] != 0)
align.emplace_back(alignAll[currentAttIdx]);
}
return align;
}
// remove all beam entries that have reached EOS
Beams BeamSearch::purgeBeams(const Beams& beams, /*in/out=*/std::vector<IndexType>& batchIdxMap) {
const auto trgEosId = trgVocab_->getEosId();
Beams newBeams;
size_t beamIdx = 0; // beam index
for(auto beam : beams) {
Beam newBeam; // a beam of surviving hyps
for(auto hyp : beam)
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
newBeam.push_back(hyp); // move over to beam of surviving hyps
if(PURGE_BATCH)
if(newBeam.empty() && !beam.empty()) { // previous beam had hyps, but all were finished in this step, newBeam will now stay empty
for(size_t i = beamIdx + 1; i < beams.size(); ++i) // for all entries above this beam
batchIdxMap[i] = batchIdxMap[i] - 1; // make them look at one batch index below, as the current entry will be removed from the batch.
}
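// Illustrative example: with three sentences and batchIdxMap = {0, 1, 2}, if the beam of
// sentence 1 finishes completely in this step, only batchIdxMap[2] is decremented and the
// map becomes {0, 1, 1}: sentence 2 will read from batch position 1 once entry 1 is purged.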
newBeams.push_back(newBeam);
beamIdx++; // move to next beam index
}
return newBeams;
}
//**********************************************************************
// main decoding function
Histories BeamSearch::search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
auto factoredVocab = trgVocab_->tryAs<FactoredVocab>();
size_t numFactorGroups = factoredVocab ? factoredVocab->getNumGroups() : 1;
if (numFactorGroups == 1) // if no factors then we didn't need this object in the first place
factoredVocab.reset();
// We will use the prefix "origBatch..." whenever we refer to batch dimensions of the original batch. These do not change during search.
// We will use the prefix "currentBatch.." whenever we refer to batch dimension that can change due to batch-pruning.
const int origDimBatch = (int)batch->size();
const auto trgEosId = trgVocab_->getEosId();
const auto trgUnkId = trgVocab_->getUnkId();
auto getNBestList = createGetNBestListFn(beamSize_, origDimBatch, graph->getDeviceId());
for(auto scorer : scorers_) {
scorer->clear(graph);
}
Histories histories(origDimBatch);
for(int i = 0; i < origDimBatch; ++i) {
size_t sentId = batch->getSentenceIds()[i];
histories[i] = New<History>(sentId,
options_->get<float>("normalize"),
options_->get<float>("word-penalty"));
}
// start states
std::vector<Ptr<ScorerState>> states;
for(auto scorer : scorers_) {
states.push_back(scorer->startState(graph, batch));
}
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Mark batch entries that consist only of source <EOS> i.e. these are empty lines. They will be forced to EOS and purged from batch
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
int unkColId = -1;
if (trgUnkId != Word::NONE && !options_->get<bool>("allow-unk", false)) { // do we need to suppress unk?
unkColId = factoredVocab ? factoredVocab->getUnkIndex() : trgUnkId.toWordIndex(); // what's the raw index of unk in the log prob vector?
auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers?
if (shortlist)
unkColId = shortlist->tryForwardMap(unkColId); // use shifted position of unk in case of using a shortlist; the shortlist may have removed unk, which results in -1
}
// the decoding process updates the following state information in each output time step:
// - beams: array [origDimBatch] of array [maxBeamSize] of Hypothesis
// - current output time step's set of active hypotheses, aka active search space
// - states[.]: ScorerState
// - NN state; one per scorer, e.g. 2 for ensemble of 2
// and it forms the following return value
// - histories: array [origDimBatch] of History
// with History: vector [t] of array [maxBeamSize] of Hypothesis
// with Hypothesis: (last word, aggregate score, prev Hypothesis)
IndexType currentDimBatch = origDimBatch;
auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
// main loop over output time steps
for (size_t t = 0; ; t++) {
ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
// determine beam size for next output time step, as max over still-active sentences
// E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
// switch to beam of 4 for all. If all are done, then beam ends up being 0, and we are done.
size_t maxBeamSize = 0; // @TODO: is there some std::algorithm for this?
for(auto& beam : beams)
if(beam.size() > maxBeamSize)
maxBeamSize = beam.size();
// done if all batch entries have reached EOS on all beam entries
if (maxBeamSize == 0)
break;
for (size_t factorGroup = 0; factorGroup < numFactorGroups; factorGroup++) {
// for factored vocabs, we do one factor at a time, but without updating the scorer for secondary factors
//**********************************************************************
// create constant containing previous path scores for current beam
// Also create mapping of hyp indices, for reordering the decoder-state tensors.
std::vector<IndexType> batchIndices; // [1, 1, currentDimBatch, 1] indices of currently used batch indices with regard to current, actual tensors
std::vector<IndexType> hypIndices; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) tensor index ((beamHypIdx, batchIdx), flattened) of prev hyp that a hyp originated from
std::vector<Word> prevWords; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) word that a hyp ended in, for advancing the decoder-model's history
Expr prevPathScores; // [maxBeamSize, 1, currentDimBatch, 1], path score that a hyp ended in (last axis will broadcast into vocab size when adding expandedPathScores)
bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions
if(t == 0 && factorGroup == 0) { // no scores yet
prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0));
anyCanExpand = true;
// at the beginning all batch entries are used
batchIndices.resize(origDimBatch);
std::iota(batchIndices.begin(), batchIndices.end(), 0);
} else {
if(factorGroup == 0) // only factorGroup==0 can subselect neural state
for(int currentBatchIdx = 0; currentBatchIdx < beams.size(); ++currentBatchIdx) // loop over batch entries (active sentences)
if(!beams[currentBatchIdx].empty() || !PURGE_BATCH) // for each beam check
batchIndices.push_back(prevBatchIdxMap[currentBatchIdx]); // which batch entries were active in previous step
std::vector<float> prevScores;
for(size_t beamHypIdx = 0; beamHypIdx < maxBeamSize; ++beamHypIdx) { // loop over globally maximal beam-size (maxBeamSize)
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) { // loop over all batch entries (active and inactive)
auto& beam = beams[origBatchIdx];
if(beamHypIdx < beam.size()) {
auto hyp = beam[beamHypIdx];
auto word = hyp->getWord();
auto canExpand = (!factoredVocab || factoredVocab->canExpandFactoredWord(hyp->getWord(), factorGroup));
//LOG(info, "[{}, {}] Can expand {} with {} -> {}", batchIdx, beamHypIdx, (*batch->back()->vocab())[hyp->getWord()], factorGroup, canExpand);
anyCanExpand |= canExpand;
auto currentBatchIdx = origBatchIdx;
if(PURGE_BATCH) {
if(factorGroup == 0)
currentBatchIdx = prevBatchIdxMap[origBatchIdx]; // subselection may happen for factorGroup == 0
else
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
// happened for factorGroup == 0
}
auto hypIndex = (IndexType)(hyp->getPrevStateIndex() * currentDimBatch + currentBatchIdx); // (beamHypIdx, batchIdx), flattened, for index_select() operation
hypIndices.push_back(hypIndex); // (beamHypIdx, batchIdx), flattened as said above.
prevWords .push_back(word);
prevScores.push_back(canExpand ? hyp->getPathScore() : INVALID_PATH_SCORE);
} else { // pad to maxBeamSize (dummy hypothesis)
if(!PURGE_BATCH || !beam.empty()) { // but only if we are not pruning and the beam is not deactivated yet
hypIndices.push_back(0);
prevWords.push_back(trgEosId); // (unused, but must be valid)
prevScores.push_back((float)INVALID_PATH_SCORE);
}
}
}
}
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
if (!anyCanExpand) // all words cannot expand this factor: skip
continue;
//**********************************************************************
// compute expanded path scores with word prediction probs from all scorers
auto expandedPathScores = prevPathScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab]
Expr logProbs;
for(size_t i = 0; i < scorers_.size(); ++i) {
if (factorGroup == 0) {
// compute output probabilities for current output time step
// - uses hypIndices[index in beam, 1, batch index, 1] to reorder scorer state to reflect the top-N in beams[][]
// - adds prevWords [index in beam, 1, batch index, 1] to the scorer's target history
// - performs one step of the scorer
// - returns new NN state for use in next output time step
// - returns vector of prediction probabilities over output vocab via newState
// update state in-place for next output time step
//if (t > 0) for (size_t kk = 0; kk < prevWords.size(); kk++)
// LOG(info, "prevWords[{},{}]={} -> {}", t/numFactorGroups, factorGroup,
// factoredVocab ? factoredVocab->word2string(prevWords[kk]) : (*batch->back()->vocab())[prevWords[kk]],
// prevScores[kk]);
states[i] = scorers_[i]->step(graph, states[i], hypIndices, prevWords, batchIndices, (int)maxBeamSize);
if (numFactorGroups == 1) // @TODO: this branch can go away
logProbs = states[i]->getLogProbs().getLogits(); // [maxBeamSize, 1, currentDimBatch, dimVocab]
else
{
auto shortlist = scorers_[i]->getShortlist();
logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, shortlist); // [maxBeamSize, 1, currentDimBatch, dimVocab]
}
}
else {
// add secondary factors
// For those, we don't update the decoder-model state in any way.
// Instead, we just keep expanding with the factors.
// We will have temporary Word entries in hyps with some factors set to FACTOR_NOT_SPECIFIED.
// For some lemmas, a factor is not applicable. For those, the factor score is the same (zero)
// for all factor values. This would thus unnecessarily pollute the beam with identical copies,
// and push out other hypotheses. Hence, we exclude those here by setting the path score to
// INVALID_PATH_SCORE. Instead, toHyps() explicitly propagates those hyps by simply copying the
// previous hypothesis.
logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, /*shortlist=*/ nullptr, hypIndices, maxBeamSize); // [maxBeamSize, 1, currentDimBatch, dimVocab]
}
// expand all hypotheses, [maxBeamSize, 1, currentDimBatch, 1] -> [maxBeamSize, 1, currentDimBatch, dimVocab]
expandedPathScores = expandedPathScores + scorers_[i]->getWeight() * logProbs;
}
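// Taken together, for an ensemble of e.g. two scorers with weights w0 and w1 (illustrative),
// expandedPathScores = prevPathScores + w0 * logProbs_0 + w1 * logProbs_1, where the last
// axis of prevPathScores (size 1) broadcasts to dimVocab.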
// make beams continuous
expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab]
// perform NN computation
if(t == 0 && factorGroup == 0)
graph->forward();
else
graph->forwardNext();
//**********************************************************************
// suppress specific symbols if not at right positions
if(unkColId != -1 && factorGroup == 0)
suppressWord(expandedPathScores, unkColId);
for(auto state : states)
state->blacklist(expandedPathScores, batch);
//**********************************************************************
// perform beam search
// find N best amongst the (maxBeamSize * dimVocab) hypotheses
std::vector<unsigned int> nBestKeys; // [currentDimBatch, maxBeamSize] flattened -> (batchIdx, beamHypIdx, word idx) flattened
std::vector<float> nBestPathScores; // [currentDimBatch, maxBeamSize] flattened
getNBestList(/*in*/ expandedPathScores->val(), // [currentDimBatch, 1, maxBeamSize, dimVocab or dimShortlist]
/*N=*/ maxBeamSize, // desired beam size
/*out*/ nBestPathScores,
/*out*/ nBestKeys,
/*first=*/t == 0 && factorGroup == 0); // @TODO: this is only used for checking presently, and should be removed altogether
// Now, nBestPathScores contain N-best expandedPathScores for each batch and beam,
// and nBestKeys for each their original location (batchIdx, beamHypIdx, word).
// combine N-best sets with existing search space (beams) to updated search space
beams = toHyps(nBestKeys, nBestPathScores,
/*nBestBeamSize*/expandedPathScores->shape()[-2], // used for interpretation of keys
/*vocabSize=*/expandedPathScores->shape()[-1], // used for interpretation of keys
beams,
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1
prevBatchIdxMap = batchIdxMap; // save current batchIdx map to be used in next step; we are then going to look one step back
// remove all hyps that end in EOS
// The position of a hyp in the beam may change.
// in/out = shifts the batch index map if a beam gets fully purged
const auto purgedNewBeams = purgeBeams(beams, /*in/out=*/batchIdxMap);
// add updated search space (beams) to our return value
bool maxLengthReached = false;
for(int batchIdx = 0; batchIdx < origDimBatch; ++batchIdx) {
// if this batch entry has surviving hyps then add them to the traceback grid
if(!beams[batchIdx].empty()) { // if the beam is not empty expand the history object associated with the beam
if (histories[batchIdx]->size() >= options_->get<float>("max-length-factor") * batch->front()->batchWidth())
maxLengthReached = true;
histories[batchIdx]->add(beams[batchIdx], trgEosId, purgedNewBeams[batchIdx].empty() || maxLengthReached);
}
}
if (maxLengthReached) // early exit if max length limit was reached
break;
// this is the search space for the next output time step
beams = purgedNewBeams;
} // end of main loop over output time steps
return histories; // [origDimBatch][t][N best hyps]
}
} // namespace marian
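To round this off, a minimal sketch of how the pieces above fit together. decodeBatch is a hypothetical helper, not part of this commit; all of its arguments are assumed to be constructed elsewhere, and only the two calls inside it are taken from this file and beam_search.h:

// Hedged sketch: driving BeamSearch for one batch; inputs are assumed to exist already.
Histories decodeBatch(Ptr<Options> options,
                      const std::vector<Ptr<Scorer>>& scorers,
                      Ptr<const Vocab> trgVocab,
                      Ptr<ExpressionGraph> graph,
                      Ptr<data::CorpusBatch> batch) {
  auto search = New<BeamSearch>(options, scorers, trgVocab); // constructor as declared in beam_search.h
  return search->search(graph, batch);                       // [origDimBatch][t][N best hyps]
}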

508 src/translator/beam_search.h Executable file → Normal file
View File

@@ -1,14 +1,8 @@
#pragma once
#include <algorithm>
#include "marian.h"
#include "translator/history.h"
#include "translator/scorers.h"
#include "data/factored_vocab.h"
#include "data/shortlist.h"
#include "translator/helpers.h"
#include "translator/nth_element.h"
namespace marian {
@@ -23,13 +17,9 @@ private:
const bool PURGE_BATCH = true; // @TODO: diagnostic, to-be-removed once confirmed there are no issues.
public:
BeamSearch(Ptr<Options> options,
const std::vector<Ptr<Scorer>>& scorers,
const Ptr<const Vocab> trgVocab)
: options_(options),
scorers_(scorers),
beamSize_(options_->get<size_t>("beam-size")),
trgVocab_(trgVocab) {}
BeamSearch(Ptr<Options> options, const std::vector<Ptr<Scorer>>& scorers, const Ptr<const Vocab> trgVocab)
: options_(options), scorers_(scorers), beamSize_(options_->get<size_t>("beam-size")), trgVocab_(trgVocab)
{}
// combine new expandedPathScores and previous beams into new set of beams
Beams toHyps(const std::vector<unsigned int>& nBestKeys, // [currentDimBatch, beamSize] flattened -> ((batchIdx, beamHypIdx) flattened, word idx) flattened
@@ -39,168 +29,9 @@ public:
const Beams& beams,
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
Ptr<class FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
const auto origDimBatch = beams.size(); // see function search for definition of origDimBatch and currentDimBatch etc.
Beams newBeams(origDimBatch); // return value of this function goes here. There are always origDimBatch beams.
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurences get overwritten with the last one,
// which is expected due to down-shifting
if(!beams[i].empty())
currentDimBatch++;
}
}
for(size_t i = 0; i < nBestKeys.size(); ++i) { // [currentDimBatch, beamSize] flattened
// Keys encode batchIdx, beamHypIdx, and word index in the entire beam.
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx] && factorGroup == 0;
WordIndex wordIdx;
if(dropHyp) { // if we force=drop the hypothesis, assign EOS, otherwise the expected word id.
if(factoredVocab) { // when using factoredVocab, extract the EOS lemma index from the word id, we predicting factors one by one here, hence lemma only
std::vector<size_t> eosFactors;
factoredVocab->word2factors(factoredVocab->getEosId(), eosFactors);
wordIdx = (WordIndex)eosFactors[0];
} else { // without factoredVocab lemma index and word index are the same. Safe cruising.
wordIdx = trgVocab_->getEosId().toWordIndex();
}
} else { // we are not dropping anything, just assign the normal index
wordIdx = (WordIndex)(key % vocabSize);
}
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
if(newBeam.size() >= beam.size()) // getNBestList() generates N for all batch entries incl. those that already have a narrower beam
continue;
if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(pathScore < INVALID_PATH_SCORE, "Actual pathScore ({}) is lower than INVALID_PATH_SCORE ({})??", pathScore, INVALID_PATH_SCORE); // This should not happen in valid situations. Currently the only smaller value would be -inf (effect of overflow in summation?)
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
auto prevHyp = beam[prevBeamHypIdx];
Word word;
// If short list has been set, then wordIdx is an index into the short-listed word set,
// rather than the true word index.
auto shortlist = scorers_[0]->getShortlist();
if (factoredVocab) {
// For factored decoding, the word is built over multiple decoding steps,
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
word = factoredVocab->expandFactoredWord(word, factorGroup, wordIdx);
prevBeamHypIdx = prevHyp->getPrevStateIndex();
prevHyp = prevHyp->getPrevHyp(); // short-circuit the backpointer, so that the traceback does not contain partially factored words
}
}
else if (shortlist)
word = Word::fromWordIndex(shortlist->reverseMap(wordIdx));
else
word = Word::fromWordIndex(wordIdx);
auto hyp = Hypothesis::New(prevHyp, word, prevBeamHypIdx, pathScore);
// Set score breakdown for n-best lists
if(options_->get<bool>("n-best")) {
auto breakDown = beam[beamHypIdx]->getScoreBreakdown();
ABORT_IF(factoredVocab && factorGroup > 0 && !factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
breakDown.resize(states.size(), 0); // at start, this is empty, so this will set the initial score to 0
for(size_t j = 0; j < states.size(); ++j) {
auto lval = states[j]->getLogProbs().getFactoredLogitsTensor(factorGroup); // [maxBeamSize, 1, currentDimBatch, dimFactorVocab]
// The flatting happens based on actual (current) batch size and batch index computed with batch-pruning as we are looking into the pruned tensor
size_t flattenedLogitIndex = (beamHypIdx * currentDimBatch + currentBatchIdx) * vocabSize + wordIdx; // (beam idx, batch idx, word idx); note: beam and batch are transposed, compared to 'key'
// @TODO: use a function on shape() to index, or new method val->at({i1, i2, i3, i4}) with broadcasting
ABORT_IF(lval->shape() != Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}) &&
(beamHypIdx == 0 && lval->shape() != Shape({1, 1, (int)currentDimBatch, (int)vocabSize})),
"Unexpected shape of logits?? {} != {}", lval->shape(), Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}));
breakDown[j] += lval->get(flattenedLogitIndex);
}
hyp->setScoreBreakdown(breakDown);
}
// Set alignments
if(!align.empty())
hyp->setAlignment(getAlignmentsForHypothesis(align, batch, (int)beamHypIdx, (int)currentBatchIdx, (int)origBatchIdx, (int)currentDimBatch));
else // not first factor: just copy
hyp->setAlignment(beam[beamHypIdx]->getAlignment());
newBeam.push_back(hyp);
}
// if factored vocab and this is not the first factor, we need to
// also propagate factored hypotheses that do not get expanded in this step because they don't have this factor
if (factorGroup > 0) {
for (size_t batchIdx = 0; batchIdx < beams.size(); batchIdx++) {
const auto& beam = beams[batchIdx];
auto& newBeam = newBeams[batchIdx];
for (const auto& beamHyp : beam) {
auto word = beamHyp->getWord();
//LOG(info, "Checking {}", factoredVocab->word2string(word));
if (factoredVocab->canExpandFactoredWord(word, factorGroup)) // handled above
continue;
//LOG(info, "Forwarded {}", factoredVocab->word2string(word));
newBeam.push_back(beamHyp);
}
if (newBeam.size() > beam.size()) {
//LOG(info, "Size {}, sorting...", newBeam.size());
std::nth_element(newBeam.begin(), newBeam.begin() + beam.size(), newBeam.end(), [](Hypothesis::PtrType a, Hypothesis::PtrType b) {
return a->getPathScore() > b->getPathScore(); // (sort highest score first)
});
//LOG(info, "Size {}, sorted...", newBeam.size());
newBeam.resize(beam.size());
}
}
}
return newBeams;
}
const std::vector<IndexType>& batchIdxMap) const;
std::vector<float> getAlignmentsForHypothesis( // -> P(s|t) for current t and given beam and batch dim
const std::vector<float> alignAll, // [beam depth, max src length, batch size, 1], flattened vector of all attention probabilities
@@ -208,334 +39,13 @@ public:
int beamHypIdx,
int currentBatchIdx,
int origBatchIdx,
int currentDimBatch) const {
// Let's B be the beam size, N be the number of batched sentences,
// and L the number of words in the longest sentence in the batch.
// The alignment vector:
//
// if(first)
// * has length of N x L if it's the first beam
// * stores elements in the following order:
// beam1 = [word1-batch1, word1-batch2, ..., word2-batch1, ...]
// else
// * has length of N x L x B
// * stores elements in the following order:
// beams = [beam1, beam2, ..., beam_n]
//
// The mask vector is always of length N x L and has 1/0s stored like
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
size_t origDimBatch = batch->size(); // number of sentences in batch
size_t batchWidth = batch->width(); // max src length
// loop over words of batch entry 'currentBatchIdx' and beam entry 'beamHypIdx'
std::vector<float> align;
for(size_t srcPos = 0; srcPos < batchWidth; ++srcPos) { // loop over source positions
// We are looking into the probabilites from an actual tensor, hence we need to use currentDimBatch and currentBatchIdx.
size_t currentAttIdx = (batchWidth * beamHypIdx + srcPos) * currentDimBatch + currentBatchIdx; // = flatten [beam index, s, batch index, 0]
// We are looking into the mask from the orginal batch, hence we need to use origDmBatch and origBatchIdx.
size_t origAttIdx = (batchWidth * beamHypIdx + srcPos) * origDimBatch + origBatchIdx;; // = flatten [beam index, s, batch index, 0]
size_t origMaskIdx = origAttIdx % (batchWidth * origDimBatch); // == batchIdx + (batchSize * srcPos) = flatten [0, s, batch index, 0]
// If the original position is not masked out used the corresponding current attention score.
if(batch->front()->mask()[origMaskIdx] != 0)
align.emplace_back(alignAll[currentAttIdx]);
}
return align;
}
int currentDimBatch) const;
// remove all beam entries that have reached EOS
Beams purgeBeams(const Beams& beams, /*in/out=*/std::vector<IndexType>& batchIdxMap) {
const auto trgEosId = trgVocab_->getEosId();
Beams newBeams;
size_t beamIdx = 0; // beam index
for(auto beam : beams) {
Beam newBeam; // a beam of surviving hyps
for(auto hyp : beam)
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
newBeam.push_back(hyp); // move over to beam of surviving hyps
Beams purgeBeams(const Beams& beams, /*in/out=*/std::vector<IndexType>& batchIdxMap);
if(PURGE_BATCH)
if(newBeam.empty() && !beam.empty()) { // previous beam had hyps, but all were finished in this step, newBeam will now stay empty
for(size_t i = beamIdx + 1; i < beams.size(); ++i) // for all entries above this beam
batchIdxMap[i] = batchIdxMap[i] - 1; // make them look at one batch index below, as the current entry will be removed from the batch.
}
newBeams.push_back(newBeam);
beamIdx++; // move to next beam index
}
return newBeams;
}
//**********************************************************************
// main decoding function
Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
auto factoredVocab = trgVocab_->tryAs<FactoredVocab>();
#if 0 // use '1' here to disable factored decoding, e.g. for comparisons
factoredVocab.reset();
#endif
size_t numFactorGroups = factoredVocab ? factoredVocab->getNumGroups() : 1;
if (numFactorGroups == 1) // if no factors then we didn't need this object in the first place
factoredVocab.reset();
// We will use the prefix "origBatch..." whenever we refer to batch dimensions of the original batch. These do not change during search.
// We will use the prefix "currentBatch.." whenever we refer to batch dimension that can change due to batch-pruning.
const int origDimBatch = (int)batch->size();
const auto trgEosId = trgVocab_->getEosId();
const auto trgUnkId = trgVocab_->getUnkId();
auto getNBestList = createGetNBestListFn(beamSize_, origDimBatch, graph->getDeviceId());
for(auto scorer : scorers_) {
scorer->clear(graph);
}
Histories histories(origDimBatch);
for(int i = 0; i < origDimBatch; ++i) {
size_t sentId = batch->getSentenceIds()[i];
histories[i] = New<History>(sentId,
options_->get<float>("normalize"),
options_->get<float>("word-penalty"));
}
// start states
std::vector<Ptr<ScorerState>> states;
for(auto scorer : scorers_) {
states.push_back(scorer->startState(graph, batch));
}
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Mark batch entries that consist only of source <EOS> i.e. these are empty lines. They will be forced to EOS and purged from batch
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
int unkColId = -1;
if (trgUnkId != Word::NONE && !options_->get<bool>("allow-unk", false)) { // do we need to suppress unk?
unkColId = factoredVocab ? factoredVocab->getUnkIndex() : trgUnkId.toWordIndex(); // what's the raw index of unk in the log prob vector?
auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers?
if (shortlist)
unkColId = shortlist->tryForwardMap(unkColId); // use shifted postion of unk in case of using a shortlist, shortlist may have removed unk which results in -1
}
// the decoding process updates the following state information in each output time step:
// - beams: array [origDimBatch] of array [maxBeamSize] of Hypothesis
// - current output time step's set of active hypotheses, aka active search space
// - states[.]: ScorerState
// - NN state; one per scorer, e.g. 2 for ensemble of 2
// and it forms the following return value
// - histories: array [origDimBatch] of History
// with History: vector [t] of array [maxBeamSize] of Hypothesis
// with Hypothesis: (last word, aggregate score, prev Hypothesis)
IndexType currentDimBatch = origDimBatch;
auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
// main loop over output time steps
for (size_t t = 0; ; t++) {
ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
// determine beam size for next output time step, as max over still-active sentences
// E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
// switch to beam of 4 for all. If all are done, then beam ends up being 0, and we are done.
size_t maxBeamSize = 0; // @TODO: is there some std::algorithm for this?
for(auto& beam : beams)
if(beam.size() > maxBeamSize)
maxBeamSize = beam.size();
// done if all batch entries have reached EOS on all beam entries
if (maxBeamSize == 0)
break;
for (size_t factorGroup = 0; factorGroup < numFactorGroups; factorGroup++) {
// for factored vocabs, we do one factor at a time, but without updating the scorer for secondary factors
//**********************************************************************
// create constant containing previous path scores for current beam
// Also create mapping of hyp indices, for reordering the decoder-state tensors.
std::vector<IndexType> batchIndices; // [1, 1, currentDimBatch, 1] indices of currently used batch indices with regard to current, actual tensors
std::vector<IndexType> hypIndices; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) tensor index ((beamHypIdx, batchIdx), flattened) of prev hyp that a hyp originated from
std::vector<Word> prevWords; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) word that a hyp ended in, for advancing the decoder-model's history
Expr prevPathScores; // [maxBeamSize, 1, currentDimBatch, 1], path score that a hyp ended in (last axis will broadcast into vocab size when adding expandedPathScores)
bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions
if(t == 0 && factorGroup == 0) { // no scores yet
prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0));
anyCanExpand = true;
// at the beginning all batch entries are used
batchIndices.resize(origDimBatch);
std::iota(batchIndices.begin(), batchIndices.end(), 0);
} else {
if(factorGroup == 0) // only factorGroup==0 can subselect neural state
for(int currentBatchIdx = 0; currentBatchIdx < beams.size(); ++currentBatchIdx) // loop over batch entries (active sentences)
if(!beams[currentBatchIdx].empty() || !PURGE_BATCH) // for each beam check
batchIndices.push_back(prevBatchIdxMap[currentBatchIdx]); // which batch entries were active in previous step
std::vector<float> prevScores;
for(size_t beamHypIdx = 0; beamHypIdx < maxBeamSize; ++beamHypIdx) { // loop over globally maximal beam-size (maxBeamSize)
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) { // loop over all batch entries (active and inactive)
auto& beam = beams[origBatchIdx];
if(beamHypIdx < beam.size()) {
auto hyp = beam[beamHypIdx];
auto word = hyp->getWord();
auto canExpand = (!factoredVocab || factoredVocab->canExpandFactoredWord(hyp->getWord(), factorGroup));
//LOG(info, "[{}, {}] Can expand {} with {} -> {}", batchIdx, beamHypIdx, (*batch->back()->vocab())[hyp->getWord()], factorGroup, canExpand);
anyCanExpand |= canExpand;
auto currentBatchIdx = origBatchIdx;
if(PURGE_BATCH) {
if(factorGroup == 0)
currentBatchIdx = prevBatchIdxMap[origBatchIdx]; // subselection may happen for factorGroup == 0
else
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
// happened for factorGroup == 0
}
auto hypIndex = (IndexType)(hyp->getPrevStateIndex() * currentDimBatch + currentBatchIdx); // (beamHypIdx, batchIdx), flattened, for index_select() operation
hypIndices.push_back(hypIndex); // (beamHypIdx, batchIdx), flattened as said above.
prevWords .push_back(word);
prevScores.push_back(canExpand ? hyp->getPathScore() : INVALID_PATH_SCORE);
} else { // pad to maxBeamSize (dummy hypothesis)
if(!PURGE_BATCH || !beam.empty()) { // but only if we are not pruning and the beam is not deactivated yet
hypIndices.push_back(0);
prevWords.push_back(trgEosId); // (unused, but must be valid)
prevScores.push_back((float)INVALID_PATH_SCORE);
}
}
}
}
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
if (!anyCanExpand) // all words cannot expand this factor: skip
continue;
//**********************************************************************
// compute expanded path scores with word prediction probs from all scorers
auto expandedPathScores = prevPathScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab]
Expr logProbs;
for(size_t i = 0; i < scorers_.size(); ++i) {
if (factorGroup == 0) {
// compute output probabilities for current output time step
// - uses hypIndices[index in beam, 1, batch index, 1] to reorder scorer state to reflect the top-N in beams[][]
// - adds prevWords [index in beam, 1, batch index, 1] to the scorer's target history
// - performs one step of the scorer
// - returns new NN state for use in next output time step
// - returns vector of prediction probabilities over output vocab via newState
// update state in-place for next output time step
//if (t > 0) for (size_t kk = 0; kk < prevWords.size(); kk++)
// LOG(info, "prevWords[{},{}]={} -> {}", t/numFactorGroups, factorGroup,
// factoredVocab ? factoredVocab->word2string(prevWords[kk]) : (*batch->back()->vocab())[prevWords[kk]],
// prevScores[kk]);
states[i] = scorers_[i]->step(graph, states[i], hypIndices, prevWords, batchIndices, (int)maxBeamSize);
if (numFactorGroups == 1) // @TODO: this branch can go away
logProbs = states[i]->getLogProbs().getLogits(); // [maxBeamSize, 1, currentDimBatch, dimVocab]
else
{
auto shortlist = scorers_[i]->getShortlist();
logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, shortlist); // [maxBeamSize, 1, currentDimBatch, dimVocab]
}
}
else {
// add secondary factors
// For those, we don't update the decoder-model state in any way.
// Instead, we just keep expanding with the factors.
// We will have temporary Word entries in hyps with some factors set to FACTOR_NOT_SPECIFIED.
// For some lemmas, a factor is not applicable. For those, the factor score is the same (zero)
// for all factor values. This would thus unnecessarily pollute the beam with identical copies,
// and push out other hypotheses. Hence, we exclude those here by setting the path score to
// INVALID_PATH_SCORE. Instead, toHyps() explicitly propagates those hyps by simply copying the
// previous hypothesis.
logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, /*shortlist=*/ nullptr, hypIndices, maxBeamSize); // [maxBeamSize, 1, currentDimBatch, dimVocab]
}
// expand all hypotheses, [maxBeamSize, 1, currentDimBatch, 1] -> [maxBeamSize, 1, currentDimBatch, dimVocab]
expandedPathScores = expandedPathScores + scorers_[i]->getWeight() * logProbs;
}
// make beams continuous
expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab]
// perform NN computation
if(t == 0 && factorGroup == 0)
graph->forward();
else
graph->forwardNext();
//**********************************************************************
// suppress specific symbols if not at right positions
if(unkColId != -1 && factorGroup == 0)
suppressWord(expandedPathScores, unkColId);
for(auto state : states)
state->blacklist(expandedPathScores, batch);
//**********************************************************************
// perform beam search
// find N best amongst the (maxBeamSize * dimVocab) hypotheses
std::vector<unsigned int> nBestKeys; // [currentDimBatch, maxBeamSize] flattened -> (batchIdx, beamHypIdx, word idx) flattened
std::vector<float> nBestPathScores; // [currentDimBatch, maxBeamSize] flattened
getNBestList(/*in*/ expandedPathScores->val(), // [currentDimBatch, 1, maxBeamSize, dimVocab or dimShortlist]
/*N=*/ maxBeamSize, // desired beam size
/*out*/ nBestPathScores, /*out*/ nBestKeys,
/*first=*/t == 0 && factorGroup == 0); // @TODO: this is only used for checking presently, and should be removed altogether
// Now, nBestPathScores contain N-best expandedPathScores for each batch and beam,
// and nBestKeys for each their original location (batchIdx, beamHypIdx, word).
// combine N-best sets with existing search space (beams) to updated search space
beams = toHyps(nBestKeys, nBestPathScores,
/*nBestBeamSize*/expandedPathScores->shape()[-2], // used for interpretation of keys
/*vocabSize=*/expandedPathScores->shape()[-1], // used for interpretation of keys
beams,
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1
prevBatchIdxMap = batchIdxMap; // save current batchIdx map to be used in next step; we are then going to look one step back
// remove all hyps that end in EOS
// The position of a hyp in the beam may change.
// in/out = shifts the batch index map if a beam gets fully purged
const auto purgedNewBeams = purgeBeams(beams, /*in/out=*/batchIdxMap);
// add updated search space (beams) to our return value
bool maxLengthReached = false;
for(int batchIdx = 0; batchIdx < origDimBatch; ++batchIdx) {
// if this batch entry has surviving hyps then add them to the traceback grid
if(!beams[batchIdx].empty()) { // if the beam is not empty expand the history object associated with the beam
if (histories[batchIdx]->size() >= options_->get<float>("max-length-factor") * batch->front()->batchWidth())
maxLengthReached = true;
histories[batchIdx]->add(beams[batchIdx], trgEosId, purgedNewBeams[batchIdx].empty() || maxLengthReached);
}
}
if (maxLengthReached) // early exit if max length limit was reached
break;
// this is the search space for the next output time step
beams = purgedNewBeams;
} // end of main loop over output time steps
return histories; // [origDimBatch][t][N best hyps]
}
Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch);
};
} // namespace marian

View File

@@ -3,7 +3,8 @@
::
:: This script is used to verify that all the dependencies required to build Marian are available.
:: The Cuda SDK and the Intel MKL must be installed beforehand by the user.
:: The Boost and OpenSSH libraries, if not found, will be installed by this script using vcpkg
:: The rest of the libraries (see README.md), if not found, will be installed by this script using
:: vcpkg.
::
::
@echo off
@@ -28,6 +29,7 @@ set ROOT=%~dp0
::set BOOST_INCLUDEDIR=
::set BOOST_LIBRARYDIR=
::set OPENSSL_ROOT_DIR=
::set Protobuf_SRC_ROOT_FOLDER=
:: If all the variables are empty and vcpkg is found in a known path, the script will download and
@@ -41,9 +43,9 @@ set ROOT=%~dp0
::set MKLROOT=
if "%BOOST_INCLUDEDIR%" == "" goto :needVcPkg
if "%OPENSSL_ROOT_DIR%" == "" goto :needVcPkg
if "%Protobuf_SRC_ROOT_FOLDER%"=="" goto :needVcPkg
goto :checkDeps
@ -64,6 +66,7 @@ if not exist %VCPKG_ROOT% (
echo --- Cloning vcpkg...
git clone https://github.com/Microsoft/vcpkg.git %VCPKG_ROOT%
set USE_BOOST_172=1
set BOOTSTRAP_VCPKG=1
) else (
@ -73,6 +76,7 @@ if not exist %VCPKG_ROOT% (
echo --- Updating vcpkg...
for /f "delims=" %%p in ('git pull') do (
if not "%%p" == "Already up to date." (
set USE_BOOST_172=1
set BOOTSTRAP_VCPKG=1
)
)
@ -80,6 +84,17 @@ if not exist %VCPKG_ROOT% (
popd
)
:: Check out the last version of Boost that is supported by the 3rd party library
:: SimpleWebSocketServer. Vcpkg does not allow installing a specific version of a library yet, but
:: the feature has a PR and should be available soon.
:: For more details, see https://github.com/microsoft/vcpkg/issues/1681
if "%USE_BOOST_172%"=="1" (
pushd %VCPKG_ROOT%
echo --- Checkout to Boost version 1.72...
git checkout 597038559647776ee39d02dcf159da05d9342f1d --pathspec-from-file=../../pathspec-boost-1.72.txt
popd
)
if "%BOOTSTRAP_VCPKG%"=="1" (
pushd %VCPKG_ROOT%
call bootstrap-vcpkg.bat
@ -91,7 +106,6 @@ set VCPKG_INSTALL=%VCPKG_ROOT%\installed\%VCPKG_DEFAULT_TRIPLET%
set VCPKG=%VCPKG_ROOT%\vcpkg
:: -------------------------------------------------------
:: Check dependencies and configure CMake
:checkDeps
@ -117,12 +131,12 @@ else (
:: -------------------------
:: The MKL setup does not set any environment variable to the installation path.
:: The script look into the standard default installation dir
:: The script looks into the standard default installation directory.
:: If you installed MKL in a custom directory, please set the variable MKLROOT at the top of this file.
::
echo.
echo ... Intel MKL
if "%MKLROOT%" == "" (
if "%MKLROOT%" == "" (
set "MKLROOT=C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl"
)
if not exist "%MKLROOT%" (
@ -153,7 +167,7 @@ echo Found Intel MKL library in %MKLROOT%
echo.
echo ... Boost (1.58+)
if "%BOOST_INCLUDEDIR%" == "" (
"%VCPKG%" install boost-chrono boost-filesystem boost-iostreams boost-program-options boost-regex boost-system boost-thread boost-timer boost-asio
"%VCPKG%" install boost-system boost-asio
set BOOST_INCLUDEDIR=%VCPKG_INSTALL%\include
set BOOST_LIBRARYDIR=%VCPKG_INSTALL%\lib
)
@ -197,17 +211,15 @@ if "%OPENSSL_ROOT_DIR%"=="" (
set OPENSSL_ROOT_DIR=%VCPKG_INSTALL%
)
if not exist "%VCPKG_INSTALL%/bin/protoc.exe" (
mkdir build
cd build
git clone https://github.com/protocolbuffers/protobuf
cd protobuf
git checkout v3.6.1
cd cmake
cmake . -A x64 -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=%VCPKG_INSTALL%
cmake --build . --config Release --target install
cd ..\..\..
:: -------------------------
:: Protobuf_SRC_ROOT_FOLDER can be set to an existing Protobuf installation.
:: If not, we use vcpkg to install the library
::
echo.
echo ... Protobuf
if "%Protobuf_SRC_ROOT_FOLDER%"=="" (
%VCPKG% install protobuf
set Protobuf_SRC_ROOT_FOLDER=%VCPKG_INSTALL%
)
set CMAKE_PREFIX_PATH=%VCPKG_INSTALL%
@ -215,13 +227,14 @@ set CMAKE_PREFIX_PATH=%VCPKG_INSTALL%
echo.
echo.
echo --------------------------------------------------
echo CUDA_PATH ^| %CUDA_PATH%
echo MKLROOT ^| %MKLROOT%
echo VCPKG_ROOT ^| %VCPKG_ROOT%
echo BOOST_INCLUDEDIR ^| %BOOST_INCLUDEDIR%
echo BOOST_LIBRARYDIR ^| %BOOST_LIBRARYDIR%
echo OPENSSL_ROOT_DIR ^| %OPENSSL_ROOT_DIR%
echo CMAKE_PREFIX_PATH ^| %CMAKE_PREFIX_PATH%
echo BOOST_INCLUDEDIR ^| %BOOST_INCLUDEDIR%
echo BOOST_LIBRARYDIR ^| %BOOST_LIBRARYDIR%
echo CMAKE_PREFIX_PATH ^| %CMAKE_PREFIX_PATH%
echo CUDA_PATH ^| %CUDA_PATH%
echo MKLROOT ^| %MKLROOT%
echo OPENSSL_ROOT_DIR ^| %OPENSSL_ROOT_DIR%
echo Protobuf_SRC_ROOT_FOLDER ^| %Protobuf_SRC_ROOT_FOLDER%
echo VCPKG_ROOT ^| %VCPKG_ROOT%
echo --------------------------------------------------
echo.
echo.

View File

@ -1379,6 +1379,10 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClCompile Include="..\src\command\marian_embedder.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\src\command\marian_main.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
@ -1414,6 +1418,7 @@
<ClCompile Include="..\src\data\corpus_nbest.cpp" />
<ClCompile Include="..\src\data\text_input.cpp" />
<ClCompile Include="..\src\3rd_party\cnpy\cnpy.cpp" />
<ClCompile Include="..\src\embedder\vector_collector.cpp" />
<ClCompile Include="..\src\examples\iris\helper.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -1556,7 +1561,12 @@
<ClCompile Include="..\src\3rd_party\yaml-cpp\tag.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\contrib\graphbuilder.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\contrib\graphbuilderadapter.cpp" />
<ClInclude Include="..\src\embedder\embedder.h" />
<ClInclude Include="..\src\embedder\vector_collector.h" />
<ClInclude Include="..\src\layers\lsh.h" />
<ClInclude Include="..\src\models\encoder_pooler.h" />
<ClInclude Include="..\src\models\laser.h" />
<ClInclude Include="..\src\models\pooler.h" />
<ClInclude Include="..\src\onnx\expression_graph_onnx_exporter.h" />
<ClInclude Include="resource.h" />
</ItemGroup>

View File

@ -931,6 +931,12 @@
<ClCompile Include="..\src\layers\lsh.cpp">
<Filter>layers</Filter>
</ClCompile>
<ClCompile Include="..\src\command\marian_embedder.cpp">
<Filter>command</Filter>
</ClCompile>
<ClCompile Include="..\src\embedder\vector_collector.cpp">
<Filter>embedder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -2348,6 +2354,21 @@
<ClInclude Include="..\src\layers\lsh.h">
<Filter>layers</Filter>
</ClInclude>
<ClInclude Include="..\src\models\encoder_pooler.h">
<Filter>models</Filter>
</ClInclude>
<ClInclude Include="..\src\models\laser.h">
<Filter>models</Filter>
</ClInclude>
<ClInclude Include="..\src\models\pooler.h">
<Filter>models</Filter>
</ClInclude>
<ClInclude Include="..\src\embedder\embedder.h">
<Filter>embedder</Filter>
</ClInclude>
<ClInclude Include="..\src\embedder\vector_collector.h">
<Filter>embedder</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="3rd_party">
@ -2614,6 +2635,9 @@
<Filter Include="3rd_party\faiss\utils">
<UniqueIdentifier>{b100324b-a506-45fa-948e-40be75b239fc}</UniqueIdentifier>
</Filter>
<Filter Include="embedder">
<UniqueIdentifier>{7a11415f-9bc5-4fe4-8bf2-95fa4114736d}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="..\src\3rd_party\nccl\src\bootstrap.cu">

58
vs/NOTES.md Normal file
View File

@ -0,0 +1,58 @@
# How to build Marian on Windows with GPU support
This is mainly of interest to developers; extracted from the README.
---
## Changes from the master branch
This part gives more information on all changes done in this PR. Refer to [this page](https://github.com/cedrou/marian-dev/commits/build_on_win) for commits.
1. __Fix Cuda error : Unsupported Visual Studio Version Error__
See above for justification and fixes
2. __Fix VS compiler flags / Build in Release, with improved debug info__
Added VS specific compile and link flags
3. __Fix Warning: D9002: ignoring unknown option '-m64'__
This one is related to a compiler flag, added while finding the MKL package, that does not exist for the MS compiler.
4. __Fix marian::Backend, marian::cpu::Backend and marian::gpu::Backend conflicts__
There were name conflicts between the 3 `Backend` classes that confused the compiler:
> template instantiation resulted in unexpected function type of "void(Ptr\<marian::gpu::Backend\> backend, [...])" (the meaning of a name may have changed since the template declaration -- the type of the template is "void(Ptr\<marian::Backend\> backend, [...]").
To solve this, I changed the declaration of 3 methods to specify the full name with namespace (`marian::Backend`, instead of `Backend`).
5. __Fix error : identifier "CUDA_FLT_MAX" is undefined in device code__
`CUDA_FLT_MAX` is not seen by CUDA from the device code and I had to declare it as `__constant__`.
From [StackOverflow](https://stackoverflow.com/questions/20111409/how-to-pass-structures-into-cuda-device#comment29972423_20112013):
> Undecorated constants get compiled into both host and device code with gcc based toolchains, but not with the Microsoft compiler.
6. __Fix fatal error C1019: unexpected #else__
There were preprocessor instructions (`#ifdef ... #else ... #endif`) in the middle of a call to a macro function (`CUDNN_CALL`), which is not allowed with the MS compiler.
7. __Fix mismatched class/struct forward declarations__
Microsoft's C++ name mangling makes a distinction between `class` and `struct` objects, so definitions and forward declarations must match.
See [this pdf](https://www.agner.org/optimize/calling_conventions.pdf), page 27, for more information.
_Note_: This fix was invalidated by commit # from @frankseide
8. __Fix unresolved external due to a removed #include directive__
An include directive had been removed from MSVC compilation, but this prevented the project from building.
I'm not sure why this was removed; the comment is:
#ifndef _WIN32 // TODO: remove this once I updated the Linux-side makefile
9. __Fix CUDA+MSVC incompatibility with Boost.Preprocessor__
The toolchain nvcc+msvc is not correctly handled in Boost.Preprocessor module. See [this issue](https://github.com/boostorg/preprocessor/issues/15). In the meantime, the recommended workaround is to disable Variadic Macro support in Boost.
I created a [PR](https://github.com/boostorg/preprocessor/pull/18) in the Boost repo on GitHub to fix this.
_Note_: The library sources have been fixed, but this fix is still needed until the next release of Boost.Preprocessor
10. __Provide implementation for mkstemp / Fix temporary file creation__
The code explicitly disabled the creation of temporary files because "mkstemp not available in Windows". In fact, `mktemp` and `unlink` are both implemented, but they don't work as expected. I used `tempnam` to replace `mkstemp`, and added the flag `_O_TEMPORARY` to the parameters of `open` to automatically delete the file when it is closed. If `unlinkEarly` is not set, I added a call to `remove` in the destructor to delete the file after its closure (see the sketch at the end of this note).
I also handled the case of the default value for the `base` parameter: the path `\tmp` does not exist on Windows, so it is replaced by the value of the `%TMP%` environment variable in `NormalizeTempPrefix`.
11. __Revert commit #2f8b093 + Fix copy/paste error while fixing #301 + restrict fix to MSVC compiler.__
cf. [Issue #301](https://github.com/marian-nmt/marian-dev/issues/301)
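
For illustration, here is a minimal, self-contained sketch of the Windows temporary-file approach described in item 10. It is not the actual Marian implementation; the function name and the exact flag combination are assumptions for this example:

    #include <cstdio>
    #include <cstdlib>
    #include <fcntl.h>
    #include <io.h>
    #include <string>
    #include <sys/stat.h>

    // Sketch only (assumes Windows + the MSVC CRT): a unique name is generated with _tempnam()
    // instead of mkstemp(), and _O_TEMPORARY asks the OS to delete the file automatically when
    // its last handle is closed (the analogue of the "unlinkEarly" behaviour).
    int openSelfDeletingTempFile(const std::string& prefix, std::string& outName) {
      const char* tmpDir = std::getenv("TMP");   // "\tmp" does not exist on Windows, use %TMP% instead
      char* name = _tempnam(tmpDir ? tmpDir : ".", prefix.c_str());
      if(!name)
        return -1;
      outName = name;
      int fd = _open(name, _O_RDWR | _O_CREAT | _O_EXCL | _O_TEMPORARY | _O_BINARY,
                     _S_IREAD | _S_IWRITE);
      free(name);  // _tempnam() allocates the returned string with malloc()
      return fd;   // without _O_TEMPORARY, the owner would call remove(outName.c_str()) in its destructor
    }

The sketch only shows the OS-level mechanism; the actual code additionally handles `unlinkEarly` and the `%TMP%` substitution in `NormalizeTempPrefix` as described above.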

View File

@ -3,81 +3,72 @@
## Install prerequisites
The following SDK are required to build Marian with GPU support. At least one of them needs to be installed. If only CUDA is installed but not MKL,
a GPU-only version will be build. If only MKL is installed and not CUDA, only the CPU version will be built. So if you are interested in only one
functionality, you can ommit one of them. Install both for full functionality.
The following SDKs are required to build Marian with GPU support. At least one of them needs to be
installed. If only CUDA is installed but not MKL, a GPU-only version will be built. If only MKL is
installed and not CUDA, only the CPU version will be built. So if you are interested in only one
functionality, you can omit one of them. Install both for full functionality.
- [Cuda 10](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal)
- Base installer
- [MKL](https://software.intel.com/en-us/mkl)
__Note: Patch for CUDA 9.2 error: Unsupported Visual Studio Version Error__
This seems to work fine with CUDA 10.0.
When using CUDA 9.2, the latest versions of Visual Studio 2017 are not officially supported by CUDA. Two fixes are proposed:
- Downgrade Visual Studio to a supported version
- Edit the file `<CUDA install path>\include\crt\host_config.h` and change the line 131:
131 #if _MSC_VER < 1600 || _MSC_VER > 1914
into:
131 #if _MSC_VER < 1600 || _MSC_VER > 1915
For more information, read this [nVidia forum](https://devtalk.nvidia.com/default/topic/1022648/cuda-setup-and-installation/cuda-9-unsupported-visual-studio-version-error/4)
- [Cuda 10](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal):
Base installer; CUDA 10.0+ is recommended, as there might be issues with CUDA 9.2, see below
- [Intel MKL](https://software.intel.com/en-us/mkl)
---
## Check dependencies : `CheckDeps.bat`
In addition to the 2 previous prerequisites, Marian needs 2 libraries that you may already have on your system:
In addition to the 2 previous prerequisites, Marian may need the following libraries that you may
already have on your system:
- Boost (1.58+)
- OpenSSL (optional for server)
- Boost (1.58-1.72), optional for marian-server (`COMPILE_SERVER=TRUE` in CMake)
- OpenSSL, optional for marian-server
- Protobuf, optional for compiling with SentencePiece (`USE_SENTENCEPIECE=TRUE` in CMake),
recommended
The script `CheckDeps.bat` can be used to verify that all dependencies are found on your system. If not, it will use the `vcpkg` library manager to download and manage your dependencies for CMake.
The script `CheckDeps.bat` can be used to verify that all dependencies are found on your system. If
not, it will use the `vcpkg` library manager to download and manage your dependencies for CMake.
If you already have a working `vcpkg` installation, this script can use it:
- If vcpkg is in your `PATH` environment variable, the script will find it and use it automatically.
- Otherwise, you need to edit the script and set the `VCPKG_ROOT` variable to the directory that contains the vcpkg.exe
If you prefer to manage yourself the dependencies, you can edit the script file to set the following variables to the respective installation paths. These variable can also be already set in your environment.
- `BOOST_INCLUDE_PATH` and `BOOST_LIB_PATH`
- `OPENSSL_PATH`
If you already have a working `vcpkg` installation, this script can use it.
If vcpkg is in your `PATH` environment variable, the script will find it and use it automatically.
Otherwise, you need to edit the script and set the `VCPKG_ROOT` variable.
Please see the script for more details.
---
## Build the project
There are 3 alternatives to build the project:
1. Use Visual Studio 2017 built-in support for CMake
1. Use Visual Studio 2017+ built-in support for CMake
2. Create a Solution file for Visual Studio
3. Use a script (MSBuild)
### 1. Use VS2017 with built-in support for CMake
VS2017 now allows to develop projects built with CMake without the need to generate VS projects and solutions. For more information, please read [this article](https://blogs.msdn.microsoft.com/vcblog/2016/10/05/cmake-support-in-visual-studio/) from the Visual C++ Team.
### 1. Use VS2017+ with built-in support for CMake
VS2017 or newer allows you to develop projects built with CMake without the need to generate VS
projects and solutions. For more information, please read [this article](https://blogs.msdn.microsoft.com/vcblog/2016/10/05/cmake-support-in-visual-studio/)
from the Visual C++ Team.
You just need to open the root folder of the git repository in VS (which contains the file
`CMakeSettings.json`):
You just need to open the root folder of the git repository in VS (which contains the file `CMakeSettings.json`):
- In an Explorer window, right-click then `Open in Visual Studio`
- In a VS2017 instance, `File > Open > Folder...`
You may need to edit the file `CMakeSettings.json` to set the environment variable for the dependencies.
You may need to edit the file `CMakeSettings.json` to set the environment variables for the
dependencies.
The developing experience is very similar than when using a solution file (Intellisense, build project with `F7`, debug, set breakpoints and watch variables, ...), except that the project configuration is done in 3 different files:
The development experience is very similar to that of using a solution file (IntelliSense, building
the project with `F7`, debugging, setting breakpoints and watching variables, ...), except that the
project configuration is done in 3 different files:
- `CMakeList.txt`: this is the CMake source file from the original project.
It is used to configure the build targets, add/remove files to compile and configure the compiler flags.
- `CMakeLists.txt`: this is the CMake source file from the original project.
It is used to configure the build targets, add/remove files to compile and configure the
compiler flags.
- `CMakeSettings.json`: this file is required to enable CMake integration in VS2017.
Use this file to configure the environment variables and the parameters passed to CMake to generate the project.
- `.vs\launch.vs.json`: this is a user specific file and it is not commited in the Git repo
Use this file to configure the debugging targets.
- `CMakeSettings.json`: this file is required to enable CMake integration in VS2017.
Use this file to configure the environment variables and the parameters passed to CMake to
generate the project.
- `.vs\launch.vs.json`: this is a user-specific file and it is not committed to the Git repo.
Use this file to configure the debugging targets.
For example:
{
@ -116,8 +107,8 @@ The developing experience is very similar than when using a solution file (Intel
"--keep-best",
"--seed 1111",
"--exponential-smoothing",
"--normalize=1",
"--beam-size=12",
"--normalize 1",
"--beam-size 12",
"--quiet-translation"
]
}
@ -125,12 +116,13 @@ The developing experience is very similar than when using a solution file (Intel
}
### 2. Create solution and project files for Visual Studio : `CreateVSProjects.bat`
If you have a previous version of Visual Studio, you will need to use CMake to generate the projects files.
If you have a previous version of Visual Studio, you will need to use CMake to generate the project
files.
The provided script `CreateVSProjects.bat` runs the dependency checks then invokes CMake with the right parameters to create the solutions for Visual Studio.
The provided script `CreateVSProjects.bat` runs the dependency checks then invokes CMake with the
right parameters to create the solutions for Visual Studio.
### 3. Use MSBuild : `BuildRelease.bat`
@ -140,61 +132,31 @@ The last alternative is to use the script `BuildRelease.bat` that will:
- Create the VS project files
- Invoke MSBuild on these projects to build the targets in Release.
<!--
This is interesting for developers, hiding away from users.
---
## Changes from the master branch
This part gives more information on all changes done in this PR. Refer to [this page](https://github.com/cedrou/marian-dev/commits/build_on_win) for commits.
## Known issues
1. __Fix Cuda error : Unsupported Visual Studio Version Error__
See above for justification and fixes
1. __Patch for CUDA 9.2 error: Unsupported Visual Studio Version Error__
2. __Fix VS compiler flags / Build in Release, with improved debug info__
Added VS specific compile and link flags
When using CUDA 9.2, the latest versions of Visual Studio 2017 are not officially supported by
CUDA. Two fixes are proposed:
- Downgrade Visual Studio to a supported version
- Edit the file `<CUDA install path>\include\crt\host_config.h` and change the line 131:
3. __Fix Warning: D9002: ignoring unknown option '-m64'__
This one is related to a compiler flag added while finding the package MKL that does not exists for MS compiler.
131 #if _MSC_VER < 1600 || _MSC_VER > 1914
4. __Fix marian::Backend, marian::cpu::Backend and marian::gpu::Backend conflicts__
There were name conflicts between the 3 `Backend` classes that confused the compiler:
> template instantiation resulted in unexpected function type of "void(Ptr\<marian::gpu::Backend\> backend, [...])" (the meaning of a name may have changed since the template declaration -- the type of the template is "void(Ptr\<marian::Backend\> backend, [...]").
into:
To solve this, I changed the declaration of 3 methods to specify the full name with namespace (`marian::Backend`, instead of `Backend`).
131 #if _MSC_VER < 1600 || _MSC_VER > 1915
5. __Fix error : identifier "CUDA_FLT_MAX" is undefined in device code__
`CUDA_FLT_MAX` is not seen by CUDA from the device code and I had to declare it as `__constant__`.
For more information, read this [nVidia forum](https://devtalk.nvidia.com/default/topic/1022648/cuda-setup-and-installation/cuda-9-unsupported-visual-studio-version-error/4)
From [StackOverflow](https://stackoverflow.com/questions/20111409/how-to-pass-structures-into-cuda-device#comment29972423_20112013):
> Undecorated constants get compiled into both host and device code with gcc based toolchains, but not with the Microsoft compiler.
2. __It does not compile with Boost 1.73 or newer__
6. __Fix fatal error C1019: unexpected #else__
There was preprocessor instructions (`#ifdef ... #else ... #endif`) in the middle of a call of a macro function (`CUDNN_CALL`), which is not allowed with MS compiler.
At the moment (version 1.9.26) SimpleWebSocketServer, a 3rd party library that Marian uses for
marian-server, does not support Boost newer than 1.72. Since vcpkg does not allow installing
specific library versions, you need to revert the `ports/boost*` directories to install an older Boost.
See `CheckDeps.bat` for an example.
7. __Fix mismatched class/struct forward declarations__
Microsoft's C++ name mangling makes a distinction between `class` and `struct` objects, so definitions and forward declaration must match.
See [this pdf](https://www.agner.org/optimize/calling_conventions.pdf), page 27, for more information.
Note that Boost is required only if you compile marian-server, i.e. if you set `COMPILE_SERVER` to
`TRUE` in CMakeSettings.json when compiling with CMake.
_Note_: This fix was invalidated by commit # from @frankseide
8. __Fix unresolved external due to a removed #include directive__
There was an include directive removed from MSVC compilation, but this prevented the build of the project.
I'm not sure why this was removed; the comment is:
#ifndef _WIN32 // TODO: remove this once I updated the Linux-side makefile
9. __Fix CUDA+MSVC incompatibility with Boost.Preprocessor__
The toolchain nvcc+msvc is not correctly handled in Boost.Preprocessor module. See [this issue](https://github.com/boostorg/preprocessor/issues/15). In the meantime, the recommended workaround is to disable Variadic Macro support in Boost.
I created a [PR](https://github.com/boostorg/preprocessor/pull/18) in the Boost repo on GitHub to fix this.
_Note_: The library sources have been fixed, but this fix is still needed until the next release of Boost.Preprocessor
10. __Provide implementation for mkstemp / Fix temporary file creation__
The code explicitely disabled the creation of temporary files because "mkstemp not available in Windows". In fact, `mktemp` and `unlink` are both implemented, but they don't work as expected. I used `tempnam` to replace `mkstemp`, and added the flag `_O_TEMPORARY` to the parameters of `open` to automatically delete the file when it is closed. If `unlinkEarly` is not set, I added a call to `remove` in the destructor to delete the file after its closure.
I also handled the case of the default value for the `base` parameter: the path `\tmp` doesnot exist on Windows, so it is replaced by the value of the `%TMP%` environment variable in `NormalizeTempPrefix`.
11. __Revert commit #2f8b093 + Fix copy/paste error while fixing #301 + restrict fix to MSVC compiler.__
cf [Issue #301](https://github.com/marian-nmt/marian-dev/issues/301) -->

142
vs/pathspec-boost-1.72.txt Executable file
View File

@ -0,0 +1,142 @@
ports/boost/
ports/boost-accumulators/
ports/boost-algorithm/
ports/boost-align/
ports/boost-any/
ports/boost-array/
ports/boost-asio/
ports/boost-assert/
ports/boost-assign/
ports/boost-atomic/
ports/boost-beast/
ports/boost-bimap/
ports/boost-bind/
ports/boost-build/
ports/boost-callable-traits/
ports/boost-chrono/
ports/boost-circular-buffer/
ports/boost-compatibility/
ports/boost-compute/
ports/boost-concept-check/
ports/boost-config/
ports/boost-container/
ports/boost-container-hash/
ports/boost-context/
ports/boost-contract/
ports/boost-conversion/
ports/boost-convert/
ports/boost-core/
ports/boost-coroutine/
ports/boost-coroutine2/
ports/boost-crc/
ports/boost-date-time/
ports/boost-detail/
ports/boost-di/
ports/boost-dll/
ports/boost-dynamic-bitset/
ports/boost-endian/
ports/boost-exception/
ports/boost-fiber/
ports/boost-filesystem/
ports/boost-flyweight/
ports/boost-foreach/
ports/boost-format/
ports/boost-function/
ports/boost-functional/
ports/boost-function-types/
ports/boost-fusion/
ports/boost-geometry/
ports/boost-gil/
ports/boost-graph/
ports/boost-graph-parallel/
ports/boost-hana/
ports/boost-heap/
ports/boost-histogram/
ports/boost-hof/
ports/boost-icl/
ports/boost-integer/
ports/boost-interprocess/
ports/boost-interval/
ports/boost-intrusive/
ports/boost-io/
ports/boost-iostreams/
ports/boost-iterator/
ports/boost-lambda/
ports/boost-lexical-cast/
ports/boost-locale/
ports/boost-local-function/
ports/boost-lockfree/
ports/boost-log/
ports/boost-logic/
ports/boost-math/
ports/boost-metaparse/
ports/boost-modular-build-helper/
ports/boost-move/
ports/boost-mp11/
ports/boost-mpi/
ports/boost-mpl/
ports/boost-msm/
ports/boost-multi-array/
ports/boost-multi-index/
ports/boost-multiprecision/
ports/boost-numeric-conversion/
ports/boost-odeint/
ports/boost-optional/
ports/boost-outcome/
ports/boost-parameter/
ports/boost-parameter-python/
ports/boost-phoenix/
ports/boost-poly-collection/
ports/boost-polygon/
ports/boost-pool/
ports/boost-predef/
ports/boost-preprocessor/
ports/boost-process/
ports/boost-program-options/
ports/boost-property-map/
ports/boost-property-tree/
ports/boost-proto/
ports/boost-ptr-container/
ports/boost-python/
ports/boost-qvm/
ports/boost-random/
ports/boost-range/
ports/boost-ratio/
ports/boost-rational/
ports/boost-regex/
ports/boost-safe-numerics/
ports/boost-scope-exit/
ports/boost-serialization/
ports/boost-signals/
ports/boost-signals2/
ports/boost-smart-ptr/
ports/boost-sort/
ports/boost-spirit/
ports/boost-stacktrace/
ports/boost-statechart/
ports/boost-static-assert/
ports/boost-system/
ports/boost-test/
ports/boost-thread/
ports/boost-throw-exception/
ports/boost-timer/
ports/boost-tokenizer/
ports/boost-tti/
ports/boost-tuple/
ports/boost-type-erasure/
ports/boost-type-index/
ports/boost-typeof/
ports/boost-type-traits/
ports/boost-ublas/
ports/boost-units/
ports/boost-unordered/
ports/boost-utility/
ports/boost-uuid/
ports/boost-variant/
ports/boost-variant2/
ports/boost-vcpkg-helpers/
ports/boost-vmd/
ports/boost-wave/
ports/boost-winapi/
ports/boost-xpressive/
ports/boost-yap/