Merge branch 'master' of github.com:marian-nmt/marian

This commit is contained in:
Marcin Junczys-Dowmunt 2017-11-21 11:12:10 +01:00
commit 4874c0d0fd
42 changed files with 977 additions and 12928 deletions

3
.gitmodules vendored
View File

@ -2,3 +2,6 @@
path = src/marian
url = https://github.com/marian-nmt/marian-dev
branch = stable
[submodule "examples"]
path = examples
url = https://github.com/marian-nmt/marian-examples.git

View File

@ -1 +1 @@
v0.0.0-test+3.13905be*v0.0.0-test*0*0*0*test*3*13905be
v1.0.0

View File

@ -1,10 +0,0 @@
model/
moses-scripts/
subword-nmt/
data/corpus.*
data/europarl-*
data/SETIMES2.*
data/*.tok.*
data/*.tc.*
data/*.bpe.*
data/*.tgz

View File

@ -1,21 +0,0 @@
The MIT License (MIT)
Copyright (c) 2016 University of Edinburgh
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,84 +0,0 @@
# Example for training with Marian
Files and scripts in this folder have been adapted from the Romanian-English sample
from https://github.com/rsennrich/wmt16-scripts. We also add the back-translated data from
http://data.statmt.org/rsennrich/wmt16_backtranslations/ as described in
http://www.aclweb.org/anthology/W16-2323. The resulting system should be competitive
or even slightly better than reported in the Edinburgh WMT2016 paper.
To execute the complete example type:
```
./run-me.sh
```
which downloads the Romanian-English training files and preprocesses them (tokenization,
truecasing, segmentation into subword units).
To use with a different GPU than device 0 or more GPUs (here 0 1 2 3) type the command below.
Training time on 1 NVIDIA GTX 1080 GPU should be roughly 24 hours.
```
./run-me.sh 0 1 2 3
```
Next it executes a training run with `marian`:
```
../../build/marian \
--model model/model.npz \
--devices $GPUS \
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
--vocabs model/vocab.ro.yml model/vocab.en.yml \
--dim-vocabs 66000 50000 \
--mini-batch 80 \
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
--early-stopping 5 --moving-average \
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
--valid-metrics cross-entropy valid-script \
--valid-script-path ./scripts/validate.sh \
--log model/train.log --valid-log model/valid.log
```
After training (the training should stop if cross-entropy on the validation set stops improving) a final model
`model/model.avg.npz` is created from the 4 best models on the validation sets (by element-wise averaging). This model is used to
translate the WMT2016 dev set and test set with `amun`:
```
cat data/newstest2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -b 12 -n --mini-batch 100 --maxi-batch 1000 \
| sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl \
> data/newstest2016.bpe.ro.output
```
after which BLEU scores for the dev and test set are reported. Results should be somewhere in the area of:
```
newsdev2016:
BLEU = 35.88, 67.4/42.3/28.8/20.2 (BP=1.000, ratio=1.012, hyp_len=51085, ref_len=50483)
newstest2016:
BLEU = 34.53, 66.0/40.7/27.5/19.2 (BP=1.000, ratio=1.015, hyp_len=49258, ref_len=48531)
```
## Custom validation script
The validation script `scripts/validate.sh` is a quick example how to write a custom validation script. The training pauses until the validation script finishes executing. A validation script should not output anything to `stdout` apart from the final single score (last line):
```
#!/bin/bash
#model prefix
prefix=model/model.npz
dev=data/newsdev2016.bpe.ro
ref=data/newsdev2016.tok.en
# decode
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml --mini-batch 10 --maxi-batch 100 2>/dev/null \
| sed 's/\@\@ //g' | ./mosesdecoder/scripts/recaser/detruecase.perl > $dev.output.postprocessed
## get BLEU
./mosesdecoder/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed \
| cut -f 3 -d ' ' | cut -f 1 -d ','
```

View File

@ -1,4 +0,0 @@
#!/bin/bash -v
# Reset the ro-en example directory so the pipeline can be re-run from scratch:
# drop untracked files, then remove the cloned helper repositories, the trained
# model directory and the downloaded corpus archive.
git clean -f
rm -rf moses-scripts subword-nmt model data/ro-en.tgz

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,86 +0,0 @@
#!/bin/bash -v
# End-to-end Ro-En training example: verifies marian/amun are built, fetches
# helper scripts and data, preprocesses, trains, averages the 4 best
# checkpoints and reports BLEU on the dev and test sets.
# Optional arguments are GPU device ids (default: device 0).
# set chosen gpus
GPUS=0
if [ $# -ne 0 ]
then
GPUS=$@
fi
echo Using gpus $GPUS
# abort early if the decoder binary is missing
if [ ! -e ../../build/amun ]
then
echo "amun is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# abort early if the trainer binary is missing
if [ ! -e ../../build/marian ]
then
echo "marian is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# download dependencies and data
# (each step is skipped if its output already exists, so the script is resumable)
if [ ! -e "moses-scripts" ]
then
git clone https://github.com/amunmt/moses-scripts
fi
if [ ! -e "subword-nmt" ]
then
git clone https://github.com/rsennrich/subword-nmt
fi
if [ ! -e "data/ro-en.tgz" ]
then
./scripts/download-files.sh
fi
mkdir -p model
# preprocess data
if [ ! -e "data/corpus.bpe.en" ]
then
./scripts/preprocess.sh
fi
# train model
if [ ! -e "model/model.npz" ]
then
../../build/marian \
--model model/model.npz \
--devices $GPUS --seed 0 \
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
--vocabs model/vocab.ro.yml model/vocab.en.yml \
--dim-vocabs 66000 50000 \
--dynamic-batching -w 3000 \
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
--early-stopping 5 --moving-average \
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
--valid-metrics cross-entropy valid-script \
--valid-script-path ./scripts/validate.sh \
--log model/train.log --valid-log model/valid.log
fi
# collect 4 best models on dev set
# (sort valid.log lines by the valid-script score and keep the iteration
# numbers of the top four checkpoints)
MODELS=`cat model/valid.log | grep valid-script | sort -rg -k8,8 -t ' ' | cut -f 4 -d ' ' | head -n 4 | xargs -I {} echo model/model.iter{}.npz | xargs`
# average 4 best models into single model
../../scripts/average.py -m $MODELS -o model/model.avg.npz
# translate dev set with averaged model
# (sed strips BPE continuation markers "@@ " before detruecasing)
cat data/newsdev2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newsdev2016.bpe.ro.output.postprocessed
# translate test set with averaged model
cat data/newstest2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newstest2016.bpe.ro.output.postprocessed
# calculate bleu scores for dev and test set
./moses-scripts/scripts/generic/multi-bleu.perl data/newsdev2016.tok.en < data/newsdev2016.bpe.ro.output.postprocessed
./moses-scripts/scripts/generic/multi-bleu.perl data/newstest2016.tok.en < data/newstest2016.bpe.ro.output.postprocessed

View File

@ -1,29 +0,0 @@
#!/bin/bash -v
# Download the Romanian-English training data used by this example:
# Europarl v7, SETIMES2, and the WMT16 back-translated corpus.
# Each download is skipped when the target file already exists.
# get En-Ro training data for WMT16
if [ ! -f data/ro-en.tgz ];
then
wget http://www.statmt.org/europarl/v7/ro-en.tgz -O data/ro-en.tgz
fi
if [ ! -f data/SETIMES2.ro-en.txt.zip ];
then
wget http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip -O data/SETIMES2.ro-en.txt.zip
fi
if [ ! -f data/corpus.bt.ro-en.en.gz ];
then
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.en.gz -O data/corpus.bt.ro-en.en.gz
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.ro.gz -O data/corpus.bt.ro-en.ro.gz
fi
# unpack everything and concatenate the three sources into one line-aligned
# parallel corpus (corpus.en / corpus.ro)
cd data/
tar -xf ro-en.tgz
unzip SETIMES2.ro-en.txt.zip
gzip -d corpus.bt.ro-en.en.gz corpus.bt.ro-en.ro.gz
cat europarl-v7.ro-en.en SETIMES2.en-ro.en corpus.bt.ro-en.en > corpus.en
cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro corpus.bt.ro-en.ro > corpus.ro
cd ..

View File

@ -1,17 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license
#
# Normalise Romanian s-comma and t-comma: stdin is copied to stdout with the
# legacy cedilla code points mapped onto the correct comma-below letters.
import io
import sys

# One-shot translation table instead of chained str.replace calls.
_CEDILLA_TO_COMMA = str.maketrans({
    "\u015e": "\u0218",  # S-cedilla  -> S-comma
    "\u015f": "\u0219",  # s-cedilla  -> s-comma
    "\u0162": "\u021a",  # T-cedilla  -> T-comma
    "\u0163": "\u021b",  # t-cedilla  -> t-comma
})

istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for text in istream:
    ostream.write(text.translate(_CEDILLA_TO_COMMA))

View File

@ -1,75 +0,0 @@
#!/bin/bash -v
# this sample script preprocesses a sample corpus, including tokenization,
# truecasing, and subword segmentation.
# for application to a different language pair,
# change source and target prefix, optionally the number of BPE operations,
# and the file names (currently, data/corpus and data/newsdev2016 are being processed)
# in the tokenization step, you will want to remove Romanian-specific normalization / diacritic removal,
# and you may want to add your own.
# also, you may want to learn BPE segmentations separately for each language,
# especially if they differ in their alphabet
# suffix of source language files
SRC=ro
# suffix of target language files
TRG=en
# number of merge operations. Network vocabulary should be slightly larger (to include characters),
# or smaller if the operations are learned on the joint vocabulary
bpe_operations=85000
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=moses-scripts
# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
subword_nmt=subword-nmt
# tokenize
# (only the Romanian source side gets cedilla normalisation and diacritic
# removal before tokenization; the English target side does not)
for prefix in corpus newsdev2016 newstest2016
do
cat data/$prefix.$SRC \
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
| ./scripts/normalise-romanian.py \
| ./scripts/remove-diacritics.py \
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
cat data/$prefix.$TRG \
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
done
# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80
# train truecaser
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/truecase-model.$SRC
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/truecase-model.$TRG
# apply truecaser (cleaned training corpus)
for prefix in corpus
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
done
# apply truecaser (dev/test files)
for prefix in newsdev2016 newstest2016
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
done
# train BPE
# (a single joint BPE model is learned on the concatenation of both languages)
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
# apply BPE
for prefix in corpus newsdev2016 newstest2016
do
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
done

View File

@ -1,20 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license
#
# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised.
# stdin is copied to stdout with each diacritic replaced by its plain letter.
import io
import sys

# Single-pass translation table instead of chained str.replace calls.
_DIACRITIC_MAP = str.maketrans({
    "\u0218": "S",  # S-comma
    "\u0219": "s",  # s-comma
    "\u021a": "T",  # T-comma
    "\u021b": "t",  # t-comma
    "\u0102": "A",  # A-breve
    "\u0103": "a",  # a-breve
    "\u00C2": "A",  # A-circumflex
    "\u00E2": "a",  # a-circumflex
    "\u00CE": "I",  # I-circumflex
    "\u00EE": "i",  # i-circumflex
})

istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for text in istream:
    ostream.write(text.translate(_DIACRITIC_MAP))

View File

@ -1,15 +0,0 @@
#!/bin/bash
# Validation hook invoked by marian via --valid-script-path.
# Translates the dev set with amun and prints a single BLEU score as the last
# line of stdout (the only thing marian reads from a validation script).
# model prefix
prefix=model/model.npz
dev=data/newsdev2016.bpe.ro
ref=data/newsdev2016.tok.en
# decode
# (sed strips BPE continuation markers "@@ "; amun's stderr is discarded)
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml -b 12 -n --mini-batch 10 --maxi-batch 100 2>/dev/null \
| sed 's/\@\@ //g' | ./moses-scripts/scripts/recaser/detruecase.perl > $dev.output.postprocessed
# get BLEU
# multi-bleu prints e.g. "BLEU = 35.88, ..."; the cuts extract the bare number
./moses-scripts/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed | cut -f 3 -d ' ' | cut -f 1 -d ','

View File

@ -1,4 +0,0 @@
moses-scripts
en-de
data/*.out
*.yml

View File

@ -1,2 +0,0 @@
Translation example
===================

View File

@ -1,3 +0,0 @@
#!/bin/bash -v
# Reset the translation example: remove the cloned moses scripts, the
# downloaded en-de system, translation outputs and generated config files.
rm -rf moses-scripts en-de data/*.out *.yml

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,60 +0,0 @@
#!/bin/bash
# En-De translation example: downloads the pre-trained WMT16 en-de system and
# moses helper scripts, translates newstest2015 with a single model and with
# an ensemble, then reports BLEU for both outputs.
# Optional arguments are GPU device ids (default: device 0).
# set chosen gpus
GPUS=0
if [ $# -ne 0 ]
then
GPUS=$@
fi
echo Using gpus $GPUS
# abort early if the decoder binary is missing
if [ ! -e ../../build/amun ]
then
echo "amun is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# download dependencies and data
if [ ! -e "moses-scripts" ]
then
git clone https://github.com/amunmt/moses-scripts
fi
# fetch the pre-trained en-de system (models, vocabularies, BPE codes)
if [ ! -e "en-de/model.npz" ]
then
wget -r -l 1 --cut-dirs=2 -e robots=off -nH -np -R index.html* http://data.statmt.org/rsennrich/wmt16_systems/en-de/
fi
# translate test set with single model
cat data/newstest2015.ende.en | \
# preprocess
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
# translate
../../build/amun -m en-de/model.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
--mini-batch 50 --maxi-batch 1000 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe | \
# postprocess
moses-scripts/scripts/recaser/detruecase.perl | \
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.single.out
# create configuration file for model ensemble
# (the model-ens?.npz glob expands to the ensemble member checkpoints)
../../build/amun -m en-de/model-ens?.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
--mini-batch 1 --maxi-batch 1 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe \
--relative-paths --dump-config > ensemble.yml
# translate test set with ensemble
cat data/newstest2015.ende.en | \
# preprocess
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
# translate
../../build/amun -c ensemble.yml --gpu-threads 1 | \
# postprocess
moses-scripts/scripts/recaser/detruecase.perl | \
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.ensemble.out
# score single-model and ensemble output against the reference translation
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.single.out
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.ensemble.out

View File

@ -9,17 +9,19 @@
namespace amunmt {
class God;
class BestHypsBase
{
public:
BestHypsBase(
const God &god,
bool forbidUNK,
bool returnNBestList,
bool isInputFiltered,
bool returnAttentionWeights,
const std::map<std::string, float>& weights)
: forbidUNK_(forbidUNK),
returnNBestList_(returnNBestList),
: god_(god),
forbidUNK_(forbidUNK),
isInputFiltered_(isInputFiltered),
returnAttentionWeights_(returnAttentionWeights),
weights_(weights)
@ -35,8 +37,8 @@ class BestHypsBase
std::vector<uint>& beamSizes) = 0;
protected:
const God &god_;
const bool forbidUNK_;
const bool returnNBestList_;
const bool isInputFiltered_;
const bool returnAttentionWeights_;
const std::map<std::string, float> weights_;

View File

@ -83,6 +83,19 @@ God& God::Init(int argc, char** argv) {
LoadScorers();
LoadFiltering();
returnNBestList_ = Get<bool>("return-alignment")
|| Get<bool>("return-soft-alignment")
|| Get<bool>("return-nematus-alignment");
useFusedSoftmax_ = true;
if (returnNBestList_ ||
gpuLoaders_.size() != 1 || // more than 1 scorer
God::Get<size_t>("beam-size") > 11 // beam size affect shared mem alloc in gLogSoftMax()
) {
useFusedSoftmax_ = false;
}
//cerr << "useFusedSoftmax_=" << useFusedSoftmax_ << endl;
if (Has("input-file")) {
LOG(info)->info("Reading from {}", Get<std::string>("input-file"));
inputStream_.reset(new InputFileStream(Get<std::string>("input-file")));

View File

@ -77,6 +77,12 @@ class God {
ThreadPool &GetThreadPool()
{ return *pool_; }
bool ReturnNBestList() const
{ return returnNBestList_; }
bool UseFusedSoftmax() const
{ return useFusedSoftmax_; }
private:
void LoadScorers();
void LoadFiltering();
@ -107,6 +113,9 @@ class God {
mutable boost::shared_mutex accessLock_;
std::unique_ptr<ThreadPool> pool_;
bool returnNBestList_;
bool useFusedSoftmax_;
};
}

View File

@ -5,7 +5,10 @@ namespace amunmt {
Scorer::Scorer(const God &god,
const std::string& name,
const YAML::Node& config, size_t tab)
: name_(name), config_(config), tab_(tab)
:god_(god)
,name_(name)
,config_(config)
,tab_(tab)
{
}

View File

@ -64,8 +64,11 @@ class Scorer {
}
virtual BaseMatrix& GetProbs() = 0;
virtual void *GetNBest() = 0; // hack - need to return matrix<NthOut> but NthOut contain cuda code
virtual const BaseMatrix *GetBias() const = 0;
protected:
const God &god_;
const std::string& name_;
const YAML::Node& config_;
size_t tab_;

View File

@ -26,9 +26,8 @@ class BestHyps : public BestHypsBase
{
public:
BestHyps(const God &god)
: BestHypsBase(
: BestHypsBase(god,
!god.Get<bool>("allow-unk"),
god.Get<bool>("n-best"),
god.Get<std::vector<std::string>>("softmax-filter").size(),
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
god.GetScorerWeights())
@ -83,7 +82,7 @@ class BestHyps : public BestHypsBase
}
std::vector<std::vector<float>> breakDowns;
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
breakDowns.push_back(bestCosts);
for (auto& scorer : scorers) {
std::vector<float> modelCosts(beamSize);
@ -123,7 +122,7 @@ class BestHyps : public BestHypsBase
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
}
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
hyp->GetCostBreakdown().resize(scorers.size());
float sum = 0;
for(size_t j = 0; j < scorers.size(); ++j) {

View File

@ -25,6 +25,18 @@ class CPUEncoderDecoderBase : public Scorer {
virtual void GetAttention(mblas::Matrix& Attention) = 0;
virtual mblas::Matrix& GetAttention() = 0;
virtual void *GetNBest()
{
assert(false);
return nullptr;
}
virtual const BaseMatrix *GetBias() const
{
assert(false);
return nullptr;
}
protected:
mblas::Matrix SourceContext_;
};

View File

@ -4,16 +4,20 @@ namespace amunmt {
namespace GPU {
BestHyps::BestHyps(const God &god)
: BestHypsBase(
: BestHypsBase(god,
!god.Get<bool>("allow-unk"),
god.Get<bool>("n-best"),
god.Get<std::vector<std::string>>("softmax-filter").size(),
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
god.GetScorerWeights()),
nthElement_(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch")),
keys(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch"))
{}
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
maxBeamSize_(god.Get<uint>("beam-size"))
{
if (!god_.UseFusedSoftmax()) {
NthElement *obj = new NthElement(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch"));
nthElement_.reset(obj);
}
}
void BestHyps::DisAllowUNK(mblas::Matrix& Prob) {
SetColumn(Prob, UNK_ID, std::numeric_limits<float>::lowest());
@ -24,7 +28,17 @@ void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Prob
std::vector<unsigned>& outKeys,
const bool isFirst)
{
nthElement_.getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
nthElement_->getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
}
// fast fused softmax and nth_element
void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
const bool isFirst)
{
getNBestList(beamSizes, Probs, nBest, outCosts, outKeys, isFirst);
}
std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPtr>& scorers,
@ -52,6 +66,7 @@ std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPt
return alignments;
}
// standard nth_element
void BestHyps::CalcBeam(
const Beam& prevHyps,
const std::vector<ScorerPtr>& scorers,
@ -77,35 +92,49 @@ void BestHyps::CalcBeam(
cudaMemcpyHostToDevice);
//mblas::copy(vCosts.begin(), vCosts.end(), Costs.begin());
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
for (size_t i = 1; i < scorers.size(); ++i) {
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
}
if (forbidUNK_) {
DisAllowUNK(Probs);
}
size_t beamSizeSum = std::accumulate(beamSizes.begin(), beamSizes.end(), 0);
std::vector<float> bestCosts;
std::vector<unsigned> bestKeys;
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
if (god_.UseFusedSoftmax()) {
const mblas::Matrix& b4 = *static_cast<const mblas::Matrix*>(scorers[0]->GetBias());
DeviceVector<NthOutBatch> &nBest = *static_cast<DeviceVector<NthOutBatch>*>(scorers[0]->GetNBest());
nBest.resize(beamSizeSum);
BEGIN_TIMER("GetProbs.LogSoftmaxAndNBest");
mblas::LogSoftmaxAndNBest(nBest, Probs, b4, Costs, forbidUNK_, maxBeamSize_, beamSizes, beamSizeSum, isFirst);
PAUSE_TIMER("GetProbs.LogSoftmaxAndNBest");
//std::cerr << "2Probs=" << Probs.Debug(1) << std::endl;
FindBests(beamSizes, Probs, nBest, bestCosts, bestKeys, isFirst);
}
else {
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
for (size_t i = 1; i < scorers.size(); ++i) {
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
}
if (forbidUNK_) {
DisAllowUNK(Probs);
}
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
}
std::vector<HostVector<float>> breakDowns;
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
breakDowns.push_back(bestCosts);
for (size_t i = 1; i < scorers.size(); ++i) {
std::vector<float> modelCosts(beamSizeSum);
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
nthElement_.getValueByKey(modelCosts, currProbs);
nthElement_->getValueByKey(modelCosts, currProbs);
breakDowns.push_back(modelCosts);
}
}
@ -135,7 +164,7 @@ void BestHyps::CalcBeam(
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
}
if(returnNBestList_) {
if(god_.ReturnNBestList()) {
hyp->GetCostBreakdown().resize(scorers.size());
float sum = 0;
for (size_t j = 0; j < scorers.size(); ++j) {
@ -162,5 +191,43 @@ void BestHyps::CalcBeam(
PAUSE_TIMER("CalcBeam");
}
//////////////////////////////////////////////////////////////////////////
void BestHyps::getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst) const
{
GetPairs(nBest, outKeys, outCosts);
assert(outCosts.size() == outKeys.size());
/*
cerr << "outCosts/outKeys=";
for (size_t i = 0; i < outKeys.size(); ++i) {
cerr << "(" << outCosts[i] << "," << outKeys[i] << ") ";
}
cerr << endl;
*/
//cerr << endl;
}
void BestHyps::GetPairs(DeviceVector<NthOutBatch> &nBest,
std::vector<uint>& outKeys,
std::vector<float>& outValues) const
{
//cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
outKeys.resize(nBest.size());
outValues.resize(nBest.size());
HostVector<NthOutBatch> hostVec(nBest.size());
mblas::copy(thrust::raw_pointer_cast(nBest.data()), nBest.size(), thrust::raw_pointer_cast(hostVec.data()), cudaMemcpyDeviceToHost);
for (size_t i = 0; i < nBest.size(); ++i) {
outKeys[i] = hostVec[i].ind;
outValues[i] = hostVec[i].score;
}
}
} // namespace
}

View File

@ -24,6 +24,7 @@ class BestHyps : public BestHypsBase
void DisAllowUNK(mblas::Matrix& Prob);
// standard nth_element
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
@ -31,6 +32,7 @@ class BestHyps : public BestHypsBase
std::vector<SoftAlignmentPtr> GetAlignments(const std::vector<ScorerPtr>& scorers,
size_t hypIndex);
void CalcBeam(
const Beam& prevHyps,
const std::vector<ScorerPtr>& scorers,
@ -39,9 +41,29 @@ class BestHyps : public BestHypsBase
std::vector<uint>& beamSizes);
private:
NthElement nthElement_;
std::unique_ptr<NthElement> nthElement_;
DeviceVector<unsigned> keys;
DeviceVector<float> Costs;
uint maxBeamSize_;
// fast fused softmax and nth_element
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
const bool isFirst);
void getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst=false) const;
void GetPairs(DeviceVector<NthOutBatch> &nBest,
std::vector<uint>& outKeys,
std::vector<float>& outValues) const;
};
}

View File

@ -70,8 +70,9 @@ void EncoderDecoder::Decode(const State& in, State& out, const std::vector<uint>
edIn.GetStates(),
edIn.GetEmbeddings(),
*SourceContext_,
sentencesMask_,
beamSizes);
sentenceLengths_,
beamSizes,
god_.UseFusedSoftmax());
PAUSE_TIMER("Decode");
}
@ -81,7 +82,7 @@ State* EncoderDecoder::NewState() const {
void EncoderDecoder::Encode(const Sentences& source) {
BEGIN_TIMER("Encode");
encoder_->Encode(source, tab_, *SourceContext_, sentencesMask_);
encoder_->Encode(source, tab_, *SourceContext_, sentenceLengths_);
//cerr << "GPU SourceContext_=" << SourceContext_.Debug(1) << endl;
PAUSE_TIMER("Encode");
}
@ -89,7 +90,7 @@ void EncoderDecoder::Encode(const Sentences& source) {
void EncoderDecoder::BeginSentenceState(State& state, size_t batchSize) {
//BEGIN_TIMER("BeginSentenceState");
EDState& edState = state.get<EDState>();
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentencesMask_);
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentenceLengths_);
decoder_->EmptyEmbedding(edState.GetEmbeddings(), batchSize);
//PAUSE_TIMER("BeginSentenceState");
@ -143,6 +144,16 @@ BaseMatrix& EncoderDecoder::GetProbs() {
return decoder_->GetProbs();
}
void *EncoderDecoder::GetNBest()
{
return &decoder_->GetNBest();
}
const BaseMatrix *EncoderDecoder::GetBias() const
{
return decoder_->GetBias();
}
mblas::Matrix& EncoderDecoder::GetAttention() {
return decoder_->GetAttention();
}

View File

@ -50,6 +50,9 @@ class EncoderDecoder : public Scorer {
mblas::Matrix& GetAttention();
virtual BaseMatrix& GetProbs();
virtual void *GetNBest();
virtual const BaseMatrix *GetBias() const;
size_t GetVocabSize() const;
void Filter(const std::vector<size_t>& filterIds);
@ -59,7 +62,7 @@ class EncoderDecoder : public Scorer {
std::unique_ptr<Encoder> encoder_;
std::unique_ptr<Decoder> decoder_;
DeviceVector<uint> indices_;
mblas::IMatrix sentencesMask_;
mblas::IMatrix sentenceLengths_;
// set in Encoder::GetContext() to length (maxSentenceLength * batchSize). 1 if it's a word, 0 otherwise
std::unique_ptr<mblas::Matrix> SourceContext_;

View File

@ -66,7 +66,7 @@ class Decoder {
void InitializeState(CellState& State,
const mblas::Matrix& SourceContext,
const size_t batchSize,
const mblas::IMatrix &sentencesMask)
const mblas::IMatrix &sentenceLengths)
{
using namespace mblas;
@ -81,7 +81,7 @@ class Decoder {
//std::cerr << "SourceContext=" << SourceContext.Debug(1) << std::endl;
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
Mean(Temp2_, SourceContext, sentencesMask);
Mean(Temp2_, SourceContext, sentenceLengths);
//std::cerr << "1State=" << State.Debug(1) << std::endl;
//std::cerr << "3Temp2_=" << Temp2_.Debug(1) << std::endl;
@ -156,7 +156,7 @@ class Decoder {
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
const CellState& HiddenState,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes)
{
// mapping = 1/0 whether each position, in each sentence in the batch is actually a valid word
@ -165,6 +165,7 @@ class Decoder {
using namespace mblas;
size_t maxLength = SourceContext.dim(0);
size_t batchSize = SourceContext.dim(3);
//std::cerr << "batchSize=" << batchSize << std::endl;
//std::cerr << "HiddenState=" << HiddenState.Debug(0) << std::endl;
@ -182,11 +183,13 @@ class Decoder {
batchMapping.size(),
thrust::raw_pointer_cast(dBatchMapping_.data()),
cudaMemcpyHostToDevice);
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
//std::cerr << "batchMapping=" << Debug(batchMapping, 2) << std::endl;
//std::cerr << "dBatchMapping_=" << Debug(dBatchMapping_, 2) << std::endl;
const size_t srcSize = sentencesMask.size() / beamSizes.size();
/*
std::cerr << "SourceContext=" << SourceContext.Debug(0) << std::endl;
std::cerr << "AlignedSourceContext=" << AlignedSourceContext.Debug(0) << std::endl;
std::cerr << "A_=" << A_.Debug(0) << std::endl;
std::cerr << "sentenceLengths=" << sentenceLengths.Debug(2) << std::endl;
*/
Prod(/*h_[1],*/ Temp2_, *(HiddenState.output), *w_.W_);
//std::cerr << "1Temp2_=" << Temp2_.Debug() << std::endl;
@ -198,14 +201,14 @@ class Decoder {
}
//std::cerr << "2Temp2_=" << Temp2_.Debug() << std::endl;
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, srcSize);
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, maxLength);
//std::cerr << "w_.V_=" << w_.V_->Debug(0) << std::endl;
//std::cerr << "3Temp1_=" << Temp1_.Debug(0) << std::endl;
Prod(A_, *w_.V_, Temp1_, false, true);
mblas::Softmax(A_, dBatchMapping_, sentencesMask, batchSize);
mblas::Softmax(A_, dBatchMapping_, sentenceLengths, batchSize);
mblas::WeightedMean(AlignedSourceContext, A_, SourceContext, dBatchMapping_);
/*
@ -253,9 +256,12 @@ class Decoder {
}
void GetProbs(mblas::Matrix& Probs,
std::shared_ptr<mblas::Matrix> &b4,
const CellState& State,
const mblas::Matrix& Embedding,
const mblas::Matrix& AlignedSourceContext) {
const mblas::Matrix& AlignedSourceContext,
bool useFusedSoftmax)
{
using namespace mblas;
//BEGIN_TIMER("GetProbs.Prod");
@ -298,7 +304,7 @@ class Decoder {
Element(Tanh(_1 + _2 + _3), T1_, T2_, T3_);
//PAUSE_TIMER("GetProbs.Element");
std::shared_ptr<mblas::Matrix> w4, b4;
std::shared_ptr<mblas::Matrix> w4;
if(!filtered_) {
w4 = w_.W4_;
b4 = w_.B4_;
@ -315,13 +321,15 @@ class Decoder {
Prod(Probs, T1_, *w4);
PAUSE_TIMER("GetProbs.Prod4");
BEGIN_TIMER("GetProbs.BroadcastVec");
BroadcastVec(_1 + _2, Probs, *b4);
PAUSE_TIMER("GetProbs.BroadcastVec");
if (!useFusedSoftmax) {
BEGIN_TIMER("GetProbs.BroadcastVec");
BroadcastVec(_1 + _2, Probs, *b4);
PAUSE_TIMER("GetProbs.BroadcastVec");
BEGIN_TIMER("GetProbs.LogSoftMax");
mblas::LogSoftmax(Probs);
PAUSE_TIMER("GetProbs.LogSoftMax");
BEGIN_TIMER("GetProbs.LogSoftMax");
mblas::LogSoftmax(Probs);
PAUSE_TIMER("GetProbs.LogSoftMax");
}
}
void Filter(const std::vector<size_t>& ids) {
@ -365,8 +373,9 @@ class Decoder {
const CellState& State,
const mblas::Matrix& Embeddings,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const std::vector<uint>& beamSizes)
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes,
bool useFusedSoftmax)
{
//BEGIN_TIMER("Decode");
@ -379,7 +388,11 @@ class Decoder {
//PAUSE_TIMER("GetHiddenState");
//BEGIN_TIMER("GetAlignedSourceContext");
GetAlignedSourceContext(AlignedSourceContext_, HiddenState_, SourceContext, sentencesMask, beamSizes);
GetAlignedSourceContext(AlignedSourceContext_,
HiddenState_,
SourceContext,
sentenceLengths,
beamSizes);
//std::cerr << "AlignedSourceContext_=" << AlignedSourceContext_.Debug(1) << std::endl;
//PAUSE_TIMER("GetAlignedSourceContext");
@ -389,7 +402,7 @@ class Decoder {
//PAUSE_TIMER("GetNextState");
//BEGIN_TIMER("GetProbs");
GetProbs(NextState, Embeddings, AlignedSourceContext_);
GetProbs(NextState, Embeddings, AlignedSourceContext_, useFusedSoftmax);
//std::cerr << "Probs_=" << Probs_.Debug(1) << std::endl;
//PAUSE_TIMER("GetProbs");
@ -403,9 +416,9 @@ class Decoder {
void EmptyState(CellState& State,
const mblas::Matrix& SourceContext,
size_t batchSize,
const mblas::IMatrix &sentencesMask)
const mblas::IMatrix &sentenceLengths)
{
rnn1_.InitializeState(State, SourceContext, batchSize, sentencesMask);
rnn1_.InitializeState(State, SourceContext, batchSize, sentenceLengths);
alignment_.Init(SourceContext);
}
@ -435,6 +448,14 @@ class Decoder {
return alignment_.GetAttention();
}
DeviceVector<NthOutBatch>& GetNBest() {
return nBest_;
}
const mblas::Matrix *GetBias() const {
return b4_.get();
}
private:
void GetHiddenState(CellState& HiddenState,
@ -446,10 +467,13 @@ class Decoder {
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
const CellState& HiddenState,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes) {
alignment_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext,
sentencesMask, beamSizes);
alignment_.GetAlignedSourceContext(AlignedSourceContext,
HiddenState,
SourceContext,
sentenceLengths,
beamSizes);
}
void GetNextState(CellState& State,
@ -461,8 +485,10 @@ class Decoder {
void GetProbs(const CellState& State,
const mblas::Matrix& Embedding,
const mblas::Matrix& AlignedSourceContext) {
softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext);
const mblas::Matrix& AlignedSourceContext,
bool useFusedSoftmax)
{
softmax_.GetProbs(Probs_, b4_, State, Embedding, AlignedSourceContext, useFusedSoftmax);
}
std::unique_ptr<Cell> InitHiddenCell(const Weights& model, const YAML::Node& config){
@ -505,6 +531,9 @@ class Decoder {
Alignment<Weights::DecAlignment> alignment_;
Softmax<Weights::DecSoftmax> softmax_;
DeviceVector<NthOutBatch> nBest_;
std::shared_ptr<mblas::Matrix> b4_;
Decoder(const Decoder&) = delete;
};

View File

@ -63,22 +63,19 @@ std::vector<std::vector<size_t>> GetBatchInput(const Sentences& source, size_t t
}
void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context,
mblas::IMatrix &sentencesMask)
mblas::IMatrix &sentenceLengths)
{
size_t maxSentenceLength = GetMaxLength(source, tab);
//cerr << "1dMapping=" << mblas::Debug(dMapping, 2) << endl;
HostVector<uint> hMapping(maxSentenceLength * source.size(), 0);
HostVector<uint> hSentenceLengths(source.size());
for (size_t i = 0; i < source.size(); ++i) {
for (size_t j = 0; j < source.at(i)->GetWords(tab).size(); ++j) {
hMapping[i * maxSentenceLength + j] = 1;
}
hSentenceLengths[i] = source.at(i)->GetWords(tab).size();
}
sentencesMask.NewSize(maxSentenceLength, source.size(), 1, 1);
mblas::copy(thrust::raw_pointer_cast(hMapping.data()),
hMapping.size(),
sentencesMask.data(),
sentenceLengths.NewSize(source.size(), 1, 1, 1);
mblas::copy(thrust::raw_pointer_cast(hSentenceLengths.data()),
hSentenceLengths.size(),
sentenceLengths.data(),
cudaMemcpyHostToDevice);
//cerr << "GetContext1=" << context.Debug(1) << endl;
@ -106,7 +103,7 @@ void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context
backwardRnn_.Encode(embeddedWords_.crend() - maxSentenceLength,
embeddedWords_.crend() ,
context, source.size(), true, &sentencesMask);
context, source.size(), true, &sentenceLengths);
//cerr << "GetContext5=" << context.Debug(1) << endl;
}

View File

@ -71,8 +71,9 @@ class Encoder {
}
template <class It>
void Encode(It it, It end, mblas::Matrix& Context, size_t batchSize, bool invert,
const mblas::IMatrix *sentencesMask=nullptr)
void Encode(It it, It end, mblas::Matrix& Context,
size_t batchSize, bool invert,
const mblas::IMatrix *sentenceLengths=nullptr)
{
InitializeState(batchSize);
@ -86,12 +87,15 @@ class Encoder {
//std::cerr << "invert=" << invert << std::endl;
if(invert) {
assert(sentencesMask);
assert(sentenceLengths);
//std::cerr << "1State_=" << State_.Debug(1) << std::endl;
//std::cerr << "mapping=" << mblas::Debug(*mapping) << std::endl;
//mblas::MapMatrix(*(State_.cell), *sentencesMask, n - i - 1);
mblas::MapMatrix(*(State_.output), *sentencesMask, n - i - 1);
mblas::MapMatrix(*(State_.output), *sentenceLengths, n - i - 1);
if (State_.cell->size()) {
mblas::MapMatrix(*(State_.cell), *sentenceLengths, n - i - 1);
}
//std::cerr << "2State_=" << State_.Debug(1) << std::endl;
mblas::PasteRows(Context, *(State_.output), (n - i - 1), gru_->GetStateLength().output);
@ -124,7 +128,7 @@ class Encoder {
Encoder(const Weights& model, const YAML::Node& config);
void Encode(const Sentences& words, size_t tab, mblas::Matrix& context,
mblas::IMatrix &sentencesMask);
mblas::IMatrix &sentenceLengths);
private:
std::unique_ptr<Cell> InitForwardCell(const Weights& model, const YAML::Node& config);

View File

@ -17,7 +17,7 @@ Matrix& Swap(Matrix& Out, Matrix& In) {
__global__ void gMean(MatrixWrapper<float> out,
const MatrixWrapper<float> in,
const MatrixWrapper<uint> mapping)
const MatrixWrapper<uint> sentenceLengths)
{
// out = batches * states
// in = max sentence length * states * 1 * batches
@ -37,7 +37,7 @@ __global__ void gMean(MatrixWrapper<float> out,
float sum = 0.0f;
int counter = 0;
for (size_t row = 0; row < in.dim(0); ++row) {
int isWord = mapping(row, batch, 0, 0);
bool isWord = row < sentenceLengths[batch];
//printf("batch=%lu startMapInd=%lu mapOffset=%lu -> %d \n", batch, startMapInd, mapOffset, isWord);
if (isWord) {
sum += in(row, state, 0, batch);
@ -50,13 +50,14 @@ __global__ void gMean(MatrixWrapper<float> out,
}
}
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
void Mean(Matrix& Out,
const Matrix& In,
const mblas::IMatrix &sentenceLengths)
{
assert(Out.dim(2) == 1);
assert(Out.dim(3) == 1);
assert(Out.dim(0) == In.dim(3));
assert(Out.dim(1) == In.dim(1));
assert(In.dim(0) * In.dim(3) == sentencesMask.size());
// mean of each ROW
size_t batchNum = Out.dim(0) * Out.dim(2) * Out.dim(3);
@ -67,14 +68,14 @@ void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
MatrixWrapper<float> inWrap(In);
//cerr << "outWrap=" << outWrap.Debug() << endl;
MatrixWrapper<uint> mappingWrap(sentencesMask, false);
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
uint size = outWrap.size();
uint threads = std::min((uint)MAX_THREADS, size);
uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
gMean<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(outWrap, inWrap, mappingWrap);
(outWrap, inWrap, sentenceLengthsWrap);
}
@ -432,27 +433,27 @@ Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
__global__ void gSoftMax(MatrixWrapper<float> out,
const MatrixWrapper<uint> batchIdsWrap,
const MatrixWrapper<uint> sentencesMappingWrap,
const MatrixWrapper<uint> sentenceLengthsWrap,
uint shareSize)
{
extern __shared__ float _share[];
size_t numHypos = out.dim(0);
size_t srcLen = out.dim(1);
size_t maxLength = out.dim(1);
int hypoInd = blockIdx.x;
int origSrcPos = threadIdx.x;
while (hypoInd < numHypos) {
MatrixWrapper<float> _max(_share, shareSize);
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
_max[origSrcPos] = out(hypoInd, origSrcPos, 0, 0);
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
float value = out(hypoInd, srcPos, 0, 0);
int batch = batchIdsWrap[hypoInd];
value *= sentencesMappingWrap(srcPos, batch, 0, 0);
value *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0;
if (value > _max[origSrcPos]) {
_max[origSrcPos] = value;
}
@ -475,16 +476,16 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
__syncthreads();
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, shareSize);
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
_sum[origSrcPos] = 0.0f;
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
out(hypoInd, srcPos, 0, 0) = __expf(out(hypoInd, srcPos, 0, 0) - max);
int batch = batchIdsWrap[hypoInd];
out(hypoInd, srcPos, 0, 0) *= sentencesMappingWrap(srcPos, batch, 0, 0);
out(hypoInd, srcPos, 0, 0) *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0; // sentencesMappingWrap(srcPos, batch, 0, 0);
_sum[origSrcPos] += out(hypoInd, srcPos, 0, 0);
}
}
@ -504,9 +505,9 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
__syncthreads();
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
out(hypoInd, srcPos, 0, 0) /= _sum[0];
}
}
@ -515,20 +516,23 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
}
}
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize)
Matrix& Softmax(Matrix& Out,
const DeviceVector<uint>& batchIds,
const mblas::IMatrix &sentenceLengths,
size_t batchSize)
{
size_t srcSize = Out.dim(1);
size_t maxLength = Out.dim(1);
MatrixWrapper<float> outWrap(Out);
const MatrixWrapper<uint> batchIdsWrap(batchIds);
const MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
const MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
int blocks = batchSize;
int threads = std::min(MAX_THREADS, (int)srcSize);
int threads = std::min(MAX_THREADS, (int)maxLength);
int shared = sizeof(float) * threads;
gSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(outWrap, batchIdsWrap, sentencesMappingWrap, threads);
(outWrap, batchIdsWrap, sentenceLengthsWrap, threads);
return Out;
}
@ -544,7 +548,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
while (rowIdx < rows) {
//float* _max = _share;
MatrixWrapper<float> _max(_share, shareSize);
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
_max[threadIdx.x] = out(rowIdx, threadIdx.x, 0, 0);
for (int tid = 0; tid < cols; tid += blockDim.x) {
@ -573,7 +577,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
__syncthreads();
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, shareSize);
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
_sum[threadIdx.x] = 0.0f;
for (int tid = 0; tid < cols; tid += blockDim.x) {
@ -675,8 +679,8 @@ void Fill(Matrix& In, float value) {
__global__
void gMapMatrix(MatrixWrapper<float> in,
const MatrixWrapper<uint> sentencesMappingWrap,
int mappingCols, int i)
const MatrixWrapper<uint> sentenceLengthsWrap,
int i)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < in.size()) {
@ -685,27 +689,28 @@ void gMapMatrix(MatrixWrapper<float> in,
int col = tid % numCols;
//in[tid] *= mappingWrap(i, batchIdx, 0, 0);
in(batchIdx, col, 0, 0) *= sentencesMappingWrap(i, batchIdx, 0, 0); // [mappingCols * batchIdx + i];
in(batchIdx, col, 0, 0) *= (i < sentenceLengthsWrap[batchIdx] ? 1 : 0);
}
}
void MapMatrix(Matrix& state, const mblas::IMatrix &sentencesMask, size_t i)
void MapMatrix(Matrix& state,
const mblas::IMatrix &sentenceLengths,
size_t i)
{
// blank out rows in the state matrix where the word position i does not exist
// mapping is a concatenated array of 1 & 0 of each sentence in the batch to say whether word exists or not.
int batchSize = state.dim(0);
int stateLength = state.dim(1);
int sentenceLength = sentencesMask.size() / batchSize;
int numThreads = std::min((int)state.size(), MAX_THREADS);
int numBlocks = (state.size() / numThreads) + ((state.size() % numThreads == 0) ? 0 : 1);
MatrixWrapper<float> stateWrap(state);
MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths);
gMapMatrix<<<numBlocks, numThreads, 0, CudaStreamHandler::GetStream()>>>
(stateWrap, sentencesMappingWrap, sentenceLength, i);
(stateWrap, sentenceLengthsWrap, i);
/*
cerr << "nBlocks=" << numBlocks << endl;
@ -852,20 +857,569 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps
Normalization(out, in, alpha, nullptr, eps);
}
__global__ void gRandomizeMemory(int *data)
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__
void gBeamSizeInit(MatrixWrapper<uint> hypo2BeamSizeWrap,
MatrixWrapper<uint> batch2HypoWrap,
MatrixWrapper<uint> hypo2CandidateWrap,
bool isFirst,
uint beamSizeSum,
const MatrixWrapper<uint> beamSizesWrap)
{
clock_t start = clock();
uint hypoInd = 0;
uint candidateInd = 0;
uint a = 0, b = 0;
//printf("beamSizesWrap.size()=%u \n", beamSizesWrap.size());
for (size_t batchInd = 0; batchInd < beamSizesWrap.size(); ++batchInd) {
uint beamSize = beamSizesWrap[batchInd];
/*
printf("batchInd=%u ", batchInd);
printf("beamSize=%u ", beamSize);
printf("a=%u ", a);
printf("b=%u \n", b);
*/
if (beamSize) {
if (isFirst) {
assert(a < hypo2BeamSizeWrap.size());
assert(a < hypo2CandidateWrap.size());
hypo2BeamSizeWrap[a] = beamSize;
hypo2CandidateWrap[a] = candidateInd;
++a;
assert(b < batch2HypoWrap.size());
batch2HypoWrap[b] = batchInd;
++b;
candidateInd += beamSize;
}
else {
for (size_t j = 0; j < beamSize; ++j) {
assert(a < hypo2BeamSizeWrap.size());
assert(a < hypo2CandidateWrap.size());
hypo2BeamSizeWrap[a] = beamSize;
hypo2CandidateWrap[a] = candidateInd;
++a;
candidateInd += beamSize;
}
assert(b < batch2HypoWrap.size());
batch2HypoWrap[b] = hypoInd;
++b;
}
hypoInd += beamSize;
}
}
}
void RandomizeMemory()
__device__
float GetMaxScore(const MatrixWrapper<NthOutBatch> &nBestMatrix)
{
int *data;
HANDLE_ERROR( cudaMalloc((void**)&data, 8 * 1024 ^ 3) );
float ret = -1111111111111;
for (uint i = 0; i < nBestMatrix.dim(1); ++i) {
const NthOutBatch &curr = nBestMatrix[i];
if (curr.score > ret) {
ret = curr.score;
}
}
uint threads = 1024;
uint blocks = 8 * 1024 ^ 3 / threads;
gRandomizeMemory<<<blocks, threads>>>(data);
return ret;
}
__device__
void AddElement(float &minScore,
uint &i,
NthOutBatch *arr,
bool forbidUNK,
uint vocabInd,
const NthOutBatch &ele)
{
const float score = ele.score;
if (forbidUNK && vocabInd == UNK_ID) {
arr[i].score = -1111111111111;
minScore = -1111111111111;
}
else {
arr[i] = ele;
if (score < minScore) {
minScore = score;
}
++i;
}
}
__device__
void MergeElement(float &minScore,
NthOutBatch *arr,
uint arrSize,
const NthOutBatch &ele)
{
float newMinScore = +1111111111;
bool found = false;
for (uint i = 0; i < arrSize; ++i) {
NthOutBatch &currEle = arr[i];
if (!found && minScore == currEle.score) {
currEle = ele;
found = true;
}
// update min score
if (currEle.score < newMinScore) {
newMinScore = currEle.score;
}
}
minScore = newMinScore;
}
__device__
void MergeElement(float &minScore,
NthOutBatch *arr,
uint arrSize,
const NthOutBatch &ele,
bool forbidUNK,
uint vocabInd)
{
if (forbidUNK && vocabInd == UNK_ID) {
// do nothing
}
else if (ele.score > minScore) {
// replace element with min score
MergeElement(minScore, arr, arrSize, ele);
/*
printf("arrInd=%d ind=%d vocabId=%d \n",
arrInd,
_max[threadIdx.x].ind,
_max[threadIdx.x].vocabId);
*/
}
}
__device__
void NBestAndMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
float &topScore,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint hypoInd,
uint maxBeamSize,
bool forbidUNK,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
extern __shared__ char _sharePtr[];
MatrixWrapper<float> maxMatrix((float*)_sharePtr, blockDim.x, 1, 1, 1);
void *ptrOffset = _sharePtr + sizeof(float) * blockDim.x;
MatrixWrapper<NthOutBatch> nBestMatrix((NthOutBatch*)ptrOffset, blockDim.x, maxBeamSize, 1, 1);
NthOutBatch *arr = &nBestMatrix(threadIdx.x, 0, 0, 0);
uint vocabSize = in.dim(1);
assert(hypoInd < hypo2BeamSizeWrap.size());
uint beamSize = hypo2BeamSizeWrap[hypoInd];
float minScore = +1111111111;
// init
uint vocabInd = threadIdx.x;
uint i = 0;
while (vocabInd < vocabSize && i < beamSize) {
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
uint arrInd = hypoInd * vocabSize + vocabInd;
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
AddElement(minScore, i, arr, forbidUNK, vocabInd, ele);
vocabInd += blockDim.x;
}
// MAIN LOOP
while (vocabInd < vocabSize) {
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
uint arrInd = hypoInd * vocabSize + vocabInd;
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
MergeElement(minScore, arr, beamSize, ele, forbidUNK, vocabInd);
vocabInd += blockDim.x;
} // while (vocabInd < vocabSize) {
// merge nbest from different threads
int len = blockDim.x;
while (len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
NthOutBatch *dest = &nBestMatrix(threadIdx.x, 0, 0, 0);
for (uint i = 0; i < beamSize; ++i) {
const NthOutBatch &ele = nBestMatrix(threadIdx.x + skip, i, 0, 0);
if (ele.score > minScore) {
MergeElement(minScore, dest, beamSize, ele);
}
}
}
len = (len + 1) >> 1;
}
__syncthreads();
if (threadIdx.x == 0) {
// copy to output array
assert(hypoInd < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
const NthOutBatch &curr = nBestMatrix(0, i, 0, 0);
//printf("vocabInd=%u \n", best.vocabInd);
assert(candidateInd + i < nBestCandidatesWrap.size());
nBestCandidatesWrap[candidateInd + i] = curr;
}
}
__syncthreads();
topScore = GetMaxScore(nBestMatrix);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__device__
void SumAndLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint hypoInd,
uint maxBeamSize,
float topScore,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
extern __shared__ float _share[];
size_t vocabSize = in.dim(1);
//assert(nBestCandidatesWrap.dim(0) == rows);
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, blockDim.x, 1, 1, 1);
// calc sum
_sum[threadIdx.x] = 0.0f;
for (int id = threadIdx.x; id < vocabSize; id += blockDim.x) {
//row[id] = exp(row[id] - max);
float val = in(hypoInd, id, 0, 0) + b4Wrap(0, id, 0, 0);
val = __expf(val - topScore);
_sum[threadIdx.x] += val;
}
int len = blockDim.x;
while (len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
// apply partition and log to top
if (threadIdx.x == 0) {
//__syncthreads();
//printf("val=%f %f \n", in(rowIdx, ele.vocabId, 0, 0), val);
// nbest
uint beamSize = hypo2BeamSizeWrap[hypoInd];
uint startPos = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
//__syncthreads();
NthOutBatch &ele = nBestCandidatesWrap[startPos + i];
float &val = ele.score;
val = __expf(val - topScore);
val = __logf(val /_sum[0]);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint maxBeamSize,
bool forbidUNK,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
uint hypos = in.dim(0);
uint vocabSize = in.dim(1);
uint hypoInd = blockIdx.x; // index of previous hypo
while (hypoInd < hypos) {
float topScore;
NBestAndMax(nBestCandidatesWrap,
topScore,
in,
b4Wrap,
hypoInd,
maxBeamSize,
forbidUNK,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
SumAndLogSoftMax(nBestCandidatesWrap,
in,
b4Wrap,
hypoInd,
maxBeamSize,
topScore,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
__syncthreads();
hypoInd += gridDim.x;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gNBestPerBatch(MatrixWrapper<NthOutBatch> nBestWrap,
MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> costsWrap,
uint maxBeamSize,
bool forbidUNK,
bool isFirst,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> batch2HypoWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
//uint rows = in.dim(0);
uint batchSize = batch2HypoWrap.dim(0);
uint batchInd = blockIdx.x;
while (batchInd < batchSize) {
assert(batchInd < batch2HypoWrap.size());
assert(batchInd < hypo2BeamSizeWrap.size());
assert(batchInd < nBestWrap.size());
uint hypoInd = batch2HypoWrap[batchInd];
uint beamSize = hypo2BeamSizeWrap[hypoInd];
assert(beamSize);
uint nextHypoInd;
if (isFirst) {
nextHypoInd = batchInd * beamSize;
}
else {
nextHypoInd = hypoInd;
}
// candiate from 1st hypo
float minScore = +999999;
assert(hypoInd < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
float prevCost;
if (isFirst) {
assert(batchInd < costsWrap.size());
prevCost = costsWrap[batchInd];
}
else {
//printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
assert(hypoInd < costsWrap.size());
prevCost = costsWrap[hypoInd];
}
assert((nextHypoInd + i) < nBestWrap.size());
assert(candidateInd + i < nBestCandidatesWrap.size());
nBestWrap[nextHypoInd + i] = nBestCandidatesWrap[candidateInd + i];
float &score = nBestWrap[nextHypoInd + i].score;
score += prevCost;
if (score < minScore) {
minScore = score;
}
}
// candidates from other previous hypos
if (!isFirst) {
for (uint hypoOffset = 1; hypoOffset < beamSize; ++hypoOffset) {
//printf("hypoInd=%d \n", (hypoInd + hypoOffset));
//printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
assert((hypoInd + hypoOffset) < costsWrap.size());
float prevCost = costsWrap[hypoInd + hypoOffset];
assert((hypoInd + hypoOffset) < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd + hypoOffset];
for (uint candidateOffset = 0; candidateOffset < beamSize; ++candidateOffset) {
assert((candidateInd + candidateOffset) < nBestCandidatesWrap.size());
NthOutBatch &candidate = nBestCandidatesWrap[candidateInd + candidateOffset];
candidate.score += prevCost;
assert(nextHypoInd < nBestWrap.size());
NthOutBatch *arr = &nBestWrap[nextHypoInd];
if (candidate.score > minScore) {
MergeElement(minScore, arr, beamSize, candidate);
}
}
}
}
batchInd += gridDim.x;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
const Matrix& in,
const Matrix& b4,
const DeviceVector<float> &costs,
bool forbidUNK,
uint maxBeamSize,
const std::vector<uint>& beamSizes,
uint beamSizeSum,
bool isFirst)
{
//BEGIN_TIMER("LogSoftmax excl kernels");
//cerr << "in=" << in.Debug(0) << endl;
//cerr << "beamSizes=" << beamSizes.size() << endl;
// create beam size vectors on GPU but exclude empty beams
uint batchSize = 0;
uint candidateInd = 0;
for (size_t batchInd = 0; batchInd < beamSizes.size(); ++batchInd) {
uint beamSize = beamSizes[batchInd];
//cerr << "(" << beamSize << "," << hypoInd << ") ";
if (beamSize) {
if (isFirst) {
candidateInd += beamSize;
}
else {
candidateInd += beamSize * beamSize;
}
++batchSize;
}
}
DeviceVector<uint> d_beamSizes(beamSizes);
DeviceVector<uint> hypo2BeamSize(in.dim(0));
DeviceVector<uint> hypo2Candidate(in.dim(0));
DeviceVector<uint> batch2Hypo(batchSize);
DeviceVector<NthOutBatch> nBestCandidates(candidateInd);
/*
cerr << "in=" << in.Debug(0) << endl;
cerr << "beamSizes=" << beamSizes.size() << endl;
cerr << "beamSizeSum=" << beamSizeSum << endl;
cerr << "batchSize=" << batchSize << endl;
cerr << "candidateInd=" << candidateInd << endl;
cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl;
cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl;
cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl;
cerr << "nBest=" << Debug(nBest, 0) << endl;
cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl;
cerr << endl;
*/
//DeviceVector<NthOutBatch> nBest(beamSizeSum);
//cerr << "nBest=" << nBest.size() << endl;
MatrixWrapper<float> inWrap(in);
MatrixWrapper<float> b4Wrap(b4);
MatrixWrapper<uint> hypo2BeamSizeWrap(hypo2BeamSize);
MatrixWrapper<uint> hypo2CandidateWrap(hypo2Candidate);
MatrixWrapper<uint> batch2HypoWrap(batch2Hypo);
MatrixWrapper<NthOutBatch> nBestWrap(nBest);
MatrixWrapper<NthOutBatch> nBestCandidatesWrap(nBestCandidates);
MatrixWrapper<float> costsWrap(costs);
MatrixWrapper<uint> beamSizesWrap(d_beamSizes);
//PAUSE_TIMER("LogSoftmax excl kernels");
int blocks = std::min(MAX_BLOCKS, (int)in.dim(0));
int threads = std::min(MAX_THREADS, (int)in.dim(1));
int shared = sizeof(NthOutBatch) * threads * maxBeamSize
+ sizeof(float) * threads;
//cerr << "shared=" << shared << endl;
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step0" << endl;
//BEGIN_TIMER("gBeamSizeInit");
gBeamSizeInit<<<1, 1, 0, CudaStreamHandler::GetStream()>>>
(hypo2BeamSizeWrap,
batch2HypoWrap,
hypo2CandidateWrap,
isFirst,
beamSizeSum,
beamSizesWrap
);
//PAUSE_TIMER("gBeamSizeInit");
/*
cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 2) << endl;
cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 2) << endl;
cerr << "batch2Hypo=" << Debug(batch2Hypo, 2) << endl;
cerr << endl;
*/
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step1" << endl;
//BEGIN_TIMER("gLogSoftMax");
gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(nBestCandidatesWrap,
inWrap,
b4Wrap,
maxBeamSize,
forbidUNK,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
//PAUSE_TIMER("gLogSoftMax");
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step2" << endl;
threads = 1;
//BEGIN_TIMER("gNBestPerBatch");
gNBestPerBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(nBestWrap,
nBestCandidatesWrap,
inWrap,
costsWrap,
maxBeamSize,
forbidUNK,
isFirst,
hypo2BeamSizeWrap,
batch2HypoWrap,
hypo2CandidateWrap);
//PAUSE_TIMER("gNBestPerBatch");
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step3" << endl;
//cerr << "3costs=" << Debug(costs, 0) << endl;
}
} // namespace mblas

View File

@ -13,6 +13,7 @@
#include "gpu/mblas/matrix.h"
#include "gpu/mblas/matrix_wrapper.h"
#include "gpu/mblas/handles.h"
#include "gpu/mblas/nth_element_kernels.h"
namespace amunmt {
namespace GPU {
@ -83,6 +84,26 @@ std::string Debug(const HostVector<T> &vec, size_t verbosity = 1)
return strm.str();
}
template<typename T>
std::string Debug(const std::vector<T> &vec, size_t verbosity = 1)
{
std::stringstream strm;
strm << "size=" << vec.size();
if (verbosity) {
T sum = Sum(vec.data(), vec.size());
strm << " sum=" << sum;
}
if (verbosity == 2) {
for (size_t i = 0; i < vec.size(); ++i) {
strm << " " << vec[i];
}
}
return strm.str();
}
template<typename T>
void copy(const T *in, size_t count, T *out, cudaMemcpyKind kind) {
@ -93,7 +114,9 @@ void Fill(Matrix& In, float value=0.0f);
Matrix& Swap(Matrix& Out, Matrix& In);
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask);
void Mean(Matrix& Out,
const Matrix& In,
const mblas::IMatrix &sentenceLengths);
void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const DeviceVector<uint>& mapping);
@ -117,7 +140,7 @@ Matrix& CopyRow(Matrix& Out,
Matrix& Concat(Matrix& Out, const Matrix& In);
void MapMatrix(Matrix& state,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
size_t i);
Matrix& CopyRows(Matrix& Out,
@ -135,7 +158,10 @@ Matrix& Slice(Matrix& Out,
Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
bool transA = false, bool transB = false);
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize);
Matrix& Softmax(Matrix& Out,
const DeviceVector<uint>& batchIds,
const mblas::IMatrix &sentenceLengths,
size_t batchSize);
Matrix& LogSoftmax(Matrix& Out);
@ -232,7 +258,7 @@ __global__ void gBroadcastVecColumn(Functor functor,
size_t rows = outWrap.dim(0);
size_t cols = outWrap.dim(1);
MatrixWrapper<float> sdata(sdataOrig, rows);
MatrixWrapper<float> sdata(sdataOrig, rows, 1, 1, 1);
if (threadIdx.x == 0) {
for (int i = 0; i < rows; ++i)
@ -422,7 +448,15 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, const Mat
void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps);
void RandomizeMemory();
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
const Matrix& in,
const Matrix& b4,
const DeviceVector<float> &costs,
bool forbidUNK,
uint maxBeamSize,
const std::vector<uint>& beamSizes,
uint beamSizeSum,
bool isFirst);
} // namespace mblas
} // namespace GPU

View File

@ -94,39 +94,14 @@ public:
dataConst_ = data_;
}
MatrixWrapper(DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
{
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStridesRowMajor();
assert(size() == vec.size());
data_ = thrust::raw_pointer_cast(vec.data());
dataConst_ = data_;
}
MatrixWrapper(const DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
{
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStridesRowMajor();
assert(size() == vec.size());
data_ = nullptr;
dataConst_ = thrust::raw_pointer_cast(vec.data());
}
__device__
MatrixWrapper(T *ptr, uint size)
MatrixWrapper(T *ptr, uint a, uint b, uint c, uint d)
{
dim_[0] = size;
size_ = size;
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStrides();
data_ = ptr;
dataConst_ = ptr;
@ -148,7 +123,7 @@ public:
return stride_[i];
}
__host__
__device__ __host__
void updateStrides()
{
stride_[0] = dim_[1];
@ -159,7 +134,7 @@ public:
size_ = stride_[3] * dim_[3];
}
__host__
__device__ __host__
void updateStridesRowMajor()
{
stride_[0] = 1;

View File

@ -166,5 +166,7 @@ void NthElement::getValueByKey(std::vector<float>& out, const mblas::Matrix &d_i
HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
}
//////////////////////////////////////////////////////////////////////////
} // namespace GPU
} // namespace amunmt

View File

@ -18,8 +18,11 @@ class NthElement {
NthElement(uint maxBeamSize, uint maxBatchSize);
virtual ~NthElement();
void getNBestList(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
std::vector<float>& outCosts, std::vector<uint>& outKeys,
// standard nth_element
void getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst=false);
void GetPairs(uint number,
@ -47,6 +50,7 @@ class NthElement {
const HostVector<uint>& batchFirstElementIdxs,
const HostVector<uint>& cummulatedBeamSizes);
};
} // namespace GPU

View File

@ -1,3 +1,5 @@
#pragma once
#include "matrix_wrapper.h"
namespace amunmt {
@ -32,6 +34,60 @@ struct NthOut
}
};
/////////////////////////////////////////////////////////////////////////////////////////
struct NthOutBatch
{
uint ind;
float score;
//uint hypoInd;
//uint vocabInd;
__device__ __host__
NthOutBatch(const float& rhs)
{
// only to be used to init variable in matrix.h gSum
assert(rhs == 0.0f);
ind = rhs;
score = rhs;
//hypoInd = rhs;
//vocabInd = rhs;
}
__device__ __host__
NthOutBatch() {}
__device__ __host__
NthOutBatch(uint vInd, float vScore, uint vHypoInd, uint vVocabInd)
:ind(vInd)
,score(vScore)
//,hypoInd(vHypoInd)
//,vocabInd(vVocabInd)
{}
__device__ __host__
NthOutBatch& operator=(const NthOutBatch& rhs)
{
ind = rhs.ind;
score = rhs.score;
//hypoInd = rhs.hypoInd;
//vocabInd = rhs.vocabInd;
return *this;
}
__device__ __host__
NthOutBatch& operator+=(const NthOutBatch& rhs)
{
ind += rhs.ind;
score += rhs.score;
//hypoInd += rhs.hypoInd;
//vocabInd += rhs.vocabInd;
return *this;
}
};
/////////////////////////////////////////////////////////////////////////////////////////
inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
@ -40,6 +96,17 @@ inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
return out;
}
inline std::ostream& operator<<(std::ostream &out, const NthOutBatch &obj)
{
out << "("
<< obj.ind << ","
<< obj.score << ","
//<< obj.hypoInd << ","
//<< obj.vocabInd
<< ")";
return out;
}
/////////////////////////////////////////////////////////////////////////////////////////
__global__ void gMaxElement(mblas::MatrixWrapper<NthOut> out,