Merge branch 'master' of github.com:marian-nmt/marian

This commit is contained in:
Marcin Junczys-Dowmunt 2017-11-21 11:12:10 +01:00
commit 4874c0d0fd
42 changed files with 977 additions and 12928 deletions

3
.gitmodules vendored
View File

@ -2,3 +2,6 @@
path = src/marian
url = https://github.com/marian-nmt/marian-dev
branch = stable
[submodule "examples"]
path = examples
url = https://github.com/marian-nmt/marian-examples.git

View File

@ -1 +1 @@
v0.0.0-test+3.13905be*v0.0.0-test*0*0*0*test*3*13905be
v1.0.0

View File

@ -1,10 +0,0 @@
model/
moses-scripts/
subword-nmt/
data/corpus.*
data/europarl-*
data/SETIMES2.*
data/*.tok.*
data/*.tc.*
data/*.bpe.*
data/*.tgz

View File

@ -1,21 +0,0 @@
The MIT License (MIT)
Copyright (c) 2016 University of Edinburgh
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,84 +0,0 @@
# Example for training with Marian
Files and scripts in this folder have been adapted from the Romanian-English sample
from https://github.com/rsennrich/wmt16-scripts. We also add the back-translated data from
http://data.statmt.org/rsennrich/wmt16_backtranslations/ as described in
http://www.aclweb.org/anthology/W16-2323. The resulting system should be competitive
or even slightly better than reported in the Edinburgh WMT2016 paper.
To execute the complete example type:
```
./run-me.sh
```
which downloads the Romanian-English training files and preprocesses them (tokenization,
truecasing, segmentation into subword units).
To use with a different GPU than device 0 or more GPUs (here 0 1 2 3) type the command below.
Training time on 1 NVIDIA GTX 1080 GPU should be roughly 24 hours.
```
./run-me.sh 0 1 2 3
```
Next it executes a training run with `marian`:
```
../../build/marian \
--model model/model.npz \
--devices $GPUS \
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
--vocabs model/vocab.ro.yml model/vocab.en.yml \
--dim-vocabs 66000 50000 \
--mini-batch 80 \
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
--early-stopping 5 --moving-average \
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
--valid-metrics cross-entropy valid-script \
--valid-script-path ./scripts/validate.sh \
--log model/train.log --valid-log model/valid.log
```
After training (the training should stop if cross-entropy on the validation set stops improving) a final model
`model/model.avg.npz` is created from the 4 best models on the validation sets (by element-wise averaging). This model is used to
translate the WMT2016 dev set and test set with `amun`:
```
cat data/newstest2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -b 12 -n --mini-batch 100 --maxi-batch 1000 \
| sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl \
> data/newstest2016.bpe.ro.output
```
after which BLEU scores for the dev and test set are reported. Results should be somewhere in the area of:
```
newsdev2016:
BLEU = 35.88, 67.4/42.3/28.8/20.2 (BP=1.000, ratio=1.012, hyp_len=51085, ref_len=50483)
newstest2016:
BLEU = 34.53, 66.0/40.7/27.5/19.2 (BP=1.000, ratio=1.015, hyp_len=49258, ref_len=48531)
```
## Custom validation script
The validation script `scripts/validate.sh` is a quick example how to write a custom validation script. The training pauses until the validation script finishes executing. A validation script should not output anything to `stdout` apart from the final single score (last line):
```
#!/bin/bash
#model prefix
prefix=model/model.npz
dev=data/newsdev2016.bpe.ro
ref=data/newsdev2016.tok.en
# decode
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml --mini-batch 10 --maxi-batch 100 2>/dev/null \
| sed 's/\@\@ //g' | ./mosesdecoder/scripts/recaser/detruecase.perl > $dev.output.postprocessed
## get BLEU
./mosesdecoder/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed \
| cut -f 3 -d ' ' | cut -f 1 -d ','
```

View File

@ -1,4 +0,0 @@
#!/bin/bash -v
# Reset the ro-en example directory so the pipeline can be re-run from scratch:
# drop untracked files, then remove the cloned helper repositories, the trained
# model directory and the downloaded corpus archive.
git clean -f
rm -rf moses-scripts subword-nmt model data/ro-en.tgz

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,86 +0,0 @@
#!/bin/bash -v
# End-to-end Ro-En training example: verifies marian/amun are built, fetches
# helper scripts and data, preprocesses, trains, averages the 4 best
# checkpoints and reports BLEU on the dev and test sets.
# Optional arguments are GPU device ids (default: device 0).
# set chosen gpus
GPUS=0
if [ $# -ne 0 ]
then
GPUS=$@
fi
echo Using gpus $GPUS
# abort early if the decoder binary is missing
if [ ! -e ../../build/amun ]
then
echo "amun is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# abort early if the trainer binary is missing
if [ ! -e ../../build/marian ]
then
echo "marian is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# download dependencies and data
# (each step is skipped if its output already exists, so the script is resumable)
if [ ! -e "moses-scripts" ]
then
git clone https://github.com/amunmt/moses-scripts
fi
if [ ! -e "subword-nmt" ]
then
git clone https://github.com/rsennrich/subword-nmt
fi
if [ ! -e "data/ro-en.tgz" ]
then
./scripts/download-files.sh
fi
mkdir -p model
# preprocess data
if [ ! -e "data/corpus.bpe.en" ]
then
./scripts/preprocess.sh
fi
# train model
if [ ! -e "model/model.npz" ]
then
../../build/marian \
--model model/model.npz \
--devices $GPUS --seed 0 \
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
--vocabs model/vocab.ro.yml model/vocab.en.yml \
--dim-vocabs 66000 50000 \
--dynamic-batching -w 3000 \
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
--early-stopping 5 --moving-average \
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
--valid-metrics cross-entropy valid-script \
--valid-script-path ./scripts/validate.sh \
--log model/train.log --valid-log model/valid.log
fi
# collect 4 best models on dev set
# (sort valid.log lines by the valid-script score and keep the iteration
# numbers of the top four checkpoints)
MODELS=`cat model/valid.log | grep valid-script | sort -rg -k8,8 -t ' ' | cut -f 4 -d ' ' | head -n 4 | xargs -I {} echo model/model.iter{}.npz | xargs`
# average 4 best models into single model
../../scripts/average.py -m $MODELS -o model/model.avg.npz
# translate dev set with averaged model
# (sed strips BPE continuation markers "@@ " before detruecasing)
cat data/newsdev2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newsdev2016.bpe.ro.output.postprocessed
# translate test set with averaged model
cat data/newstest2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newstest2016.bpe.ro.output.postprocessed
# calculate bleu scores for dev and test set
./moses-scripts/scripts/generic/multi-bleu.perl data/newsdev2016.tok.en < data/newsdev2016.bpe.ro.output.postprocessed
./moses-scripts/scripts/generic/multi-bleu.perl data/newstest2016.tok.en < data/newstest2016.bpe.ro.output.postprocessed

View File

@ -1,29 +0,0 @@
#!/bin/bash -v
# Download the Romanian-English training data used by this example:
# Europarl v7, SETIMES2, and the WMT16 back-translated corpus.
# Each download is skipped when the target file already exists.
# get En-Ro training data for WMT16
if [ ! -f data/ro-en.tgz ];
then
wget http://www.statmt.org/europarl/v7/ro-en.tgz -O data/ro-en.tgz
fi
if [ ! -f data/SETIMES2.ro-en.txt.zip ];
then
wget http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip -O data/SETIMES2.ro-en.txt.zip
fi
if [ ! -f data/corpus.bt.ro-en.en.gz ];
then
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.en.gz -O data/corpus.bt.ro-en.en.gz
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.ro.gz -O data/corpus.bt.ro-en.ro.gz
fi
# unpack everything and concatenate the three sources into one line-aligned
# parallel corpus (corpus.en / corpus.ro)
cd data/
tar -xf ro-en.tgz
unzip SETIMES2.ro-en.txt.zip
gzip -d corpus.bt.ro-en.en.gz corpus.bt.ro-en.ro.gz
cat europarl-v7.ro-en.en SETIMES2.en-ro.en corpus.bt.ro-en.en > corpus.en
cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro corpus.bt.ro-en.ro > corpus.ro
cd ..

View File

@ -1,17 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license
#
# Normalise Romanian s-comma and t-comma: stdin is copied to stdout with the
# legacy cedilla code points mapped onto the correct comma-below letters.
import io
import sys

# One-shot translation table instead of chained str.replace calls.
_CEDILLA_TO_COMMA = str.maketrans({
    "\u015e": "\u0218",  # S-cedilla  -> S-comma
    "\u015f": "\u0219",  # s-cedilla  -> s-comma
    "\u0162": "\u021a",  # T-cedilla  -> T-comma
    "\u0163": "\u021b",  # t-cedilla  -> t-comma
})

istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for text in istream:
    ostream.write(text.translate(_CEDILLA_TO_COMMA))

View File

@ -1,75 +0,0 @@
#!/bin/bash -v
# this sample script preprocesses a sample corpus, including tokenization,
# truecasing, and subword segmentation.
# for application to a different language pair,
# change source and target prefix, optionally the number of BPE operations,
# and the file names (currently, data/corpus and data/newsdev2016 are being processed)
# in the tokenization step, you will want to remove Romanian-specific normalization / diacritic removal,
# and you may want to add your own.
# also, you may want to learn BPE segmentations separately for each language,
# especially if they differ in their alphabet
# suffix of source language files
SRC=ro
# suffix of target language files
TRG=en
# number of merge operations. Network vocabulary should be slightly larger (to include characters),
# or smaller if the operations are learned on the joint vocabulary
bpe_operations=85000
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=moses-scripts
# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
subword_nmt=subword-nmt
# tokenize
# (only the Romanian source side gets cedilla normalisation and diacritic
# removal before tokenization; the English target side does not)
for prefix in corpus newsdev2016 newstest2016
do
cat data/$prefix.$SRC \
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
| ./scripts/normalise-romanian.py \
| ./scripts/remove-diacritics.py \
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
cat data/$prefix.$TRG \
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
done
# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80
# train truecaser
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/truecase-model.$SRC
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/truecase-model.$TRG
# apply truecaser (cleaned training corpus)
for prefix in corpus
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
done
# apply truecaser (dev/test files)
for prefix in newsdev2016 newstest2016
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
done
# train BPE
# (a single joint BPE model is learned on the concatenation of both languages)
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
# apply BPE
for prefix in corpus newsdev2016 newstest2016
do
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
done

View File

@ -1,20 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license
#
# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised.
# stdin is copied to stdout with each diacritic replaced by its plain letter.
import io
import sys

# Single-pass translation table instead of chained str.replace calls.
_DIACRITIC_MAP = str.maketrans({
    "\u0218": "S",  # S-comma
    "\u0219": "s",  # s-comma
    "\u021a": "T",  # T-comma
    "\u021b": "t",  # t-comma
    "\u0102": "A",  # A-breve
    "\u0103": "a",  # a-breve
    "\u00C2": "A",  # A-circumflex
    "\u00E2": "a",  # a-circumflex
    "\u00CE": "I",  # I-circumflex
    "\u00EE": "i",  # i-circumflex
})

istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for text in istream:
    ostream.write(text.translate(_DIACRITIC_MAP))

View File

@ -1,15 +0,0 @@
#!/bin/bash
# Validation hook invoked by marian via --valid-script-path.
# Translates the dev set with amun and prints a single BLEU score as the last
# line of stdout (the only thing marian reads from a validation script).
# model prefix
prefix=model/model.npz
dev=data/newsdev2016.bpe.ro
ref=data/newsdev2016.tok.en
# decode
# (sed strips BPE continuation markers "@@ "; amun's stderr is discarded)
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml -b 12 -n --mini-batch 10 --maxi-batch 100 2>/dev/null \
| sed 's/\@\@ //g' | ./moses-scripts/scripts/recaser/detruecase.perl > $dev.output.postprocessed
# get BLEU
# multi-bleu prints e.g. "BLEU = 35.88, ..."; the cuts extract the bare number
./moses-scripts/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed | cut -f 3 -d ' ' | cut -f 1 -d ','

View File

@ -1,4 +0,0 @@
moses-scripts
en-de
data/*.out
*.yml

View File

@ -1,2 +0,0 @@
Translation example
===================

View File

@ -1,3 +0,0 @@
#!/bin/bash -v
# Reset the translation example: remove the cloned moses scripts, the
# downloaded en-de system, translation outputs and generated config files.
rm -rf moses-scripts en-de data/*.out *.yml

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,60 +0,0 @@
#!/bin/bash
# En-De translation example: downloads the pre-trained WMT16 en-de system and
# moses helper scripts, translates newstest2015 with a single model and with
# an ensemble, then reports BLEU for both outputs.
# Optional arguments are GPU device ids (default: device 0).
# set chosen gpus
GPUS=0
if [ $# -ne 0 ]
then
GPUS=$@
fi
echo Using gpus $GPUS
# abort early if the decoder binary is missing
if [ ! -e ../../build/amun ]
then
echo "amun is not installed in ../../build, you need to compile the toolkit first."
exit 1
fi
# download dependencies and data
if [ ! -e "moses-scripts" ]
then
git clone https://github.com/amunmt/moses-scripts
fi
# fetch the pre-trained en-de system (models, vocabularies, BPE codes)
if [ ! -e "en-de/model.npz" ]
then
wget -r -l 1 --cut-dirs=2 -e robots=off -nH -np -R index.html* http://data.statmt.org/rsennrich/wmt16_systems/en-de/
fi
# translate test set with single model
cat data/newstest2015.ende.en | \
# preprocess
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
# translate
../../build/amun -m en-de/model.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
--mini-batch 50 --maxi-batch 1000 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe | \
# postprocess
moses-scripts/scripts/recaser/detruecase.perl | \
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.single.out
# create configuration file for model ensemble
# (the model-ens?.npz glob expands to the ensemble member checkpoints)
../../build/amun -m en-de/model-ens?.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
--mini-batch 1 --maxi-batch 1 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe \
--relative-paths --dump-config > ensemble.yml
# translate test set with ensemble
cat data/newstest2015.ende.en | \
# preprocess
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
# translate
../../build/amun -c ensemble.yml --gpu-threads 1 | \
# postprocess
moses-scripts/scripts/recaser/detruecase.perl | \
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.ensemble.out
# score single-model and ensemble output against the reference translation
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.single.out
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.ensemble.out

View File

@ -9,17 +9,19 @@
namespace amunmt {
class God;
class BestHypsBase
{
public:
BestHypsBase(
const God &god,
bool forbidUNK,
bool returnNBestList,
bool isInputFiltered,
bool returnAttentionWeights,
const std::map<std::string, float>& weights)
: forbidUNK_(forbidUNK),
returnNBestList_(returnNBestList),
: god_(god),
forbidUNK_(forbidUNK),
isInputFiltered_(isInputFiltered),
returnAttentionWeights_(returnAttentionWeights),
weights_(weights)
@ -35,8 +37,8 @@ class BestHypsBase
std::vector<uint>& beamSizes) = 0;
protected:
const God &god_;
const bool forbidUNK_;
const bool returnNBestList_;
const bool isInputFiltered_;
const bool returnAttentionWeights_;
const std::map<std::string, float> weights_;

View File

@ -83,6 +83,19 @@ God& God::Init(int argc, char** argv) {
LoadScorers();
LoadFiltering();
returnNBestList_ = Get<bool>("return-alignment")
|| Get<bool>("return-soft-alignment")
|| Get<bool>("return-nematus-alignment");
useFusedSoftmax_ = true;
if (returnNBestList_ ||
gpuLoaders_.size() != 1 || // more than 1 scorer
God::Get<size_t>("beam-size") > 11 // beam size affect shared mem alloc in gLogSoftMax()
) {
useFusedSoftmax_ = false;
}
//cerr << "useFusedSoftmax_=" << useFusedSoftmax_ << endl;
if (Has("input-file")) {
LOG(info)->info("Reading from {}", Get<std::string>("input-file"));
inputStream_.reset(new InputFileStream(Get<std::string>("input-file")));

View File

@ -77,6 +77,12 @@ class God {
ThreadPool &GetThreadPool()
{ return *pool_; }
bool ReturnNBestList() const
{ return returnNBestList_; }
bool UseFusedSoftmax() const
{ return useFusedSoftmax_; }
private:
void LoadScorers();
void LoadFiltering();
@ -107,6 +113,9 @@ class God {
mutable boost::shared_mutex accessLock_;
std::unique_ptr<ThreadPool> pool_;
bool returnNBestList_;
bool useFusedSoftmax_;
};
}

View File

@ -5,7 +5,10 @@ namespace amunmt {
Scorer::Scorer(const God &god,
const std::string& name,
const YAML::Node& config, size_t tab)
: name_(name), config_(config), tab_(tab)
:god_(god)
,name_(name)
,config_(config)
,tab_(tab)
{
}

View File

@ -64,8 +64,11 @@ class Scorer {
}
virtual BaseMatrix& GetProbs() = 0;
virtual void *GetNBest() = 0; // hack - need to return matrix<NthOut> but NthOut contain cuda code
virtual const BaseMatrix *GetBias() const = 0;
protected:
const God &god_;
const std::string& name_;
const YAML::Node& config_;
size_t tab_;

View File

@ -26,9 +26,8 @@ class BestHyps : public BestHypsBase
{
public:
BestHyps(const God &god)
: BestHypsBase(
: BestHypsBase(god,
!god.Get<bool>("allow-unk"),
god.Get<bool>("n-best"),
god.Get<std::vector<std::string>>("softmax-filter").size(),
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
god.GetScorerWeights())
@ -83,7 +82,7 @@ class BestHyps : public BestHypsBase
}
std::vector<std::vector<float>> breakDowns;
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
breakDowns.push_back(bestCosts);
for (auto& scorer : scorers) {
std::vector<float> modelCosts(beamSize);
@ -123,7 +122,7 @@ class BestHyps : public BestHypsBase
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
}
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
hyp->GetCostBreakdown().resize(scorers.size());
float sum = 0;
for(size_t j = 0; j < scorers.size(); ++j) {

View File

@ -25,6 +25,18 @@ class CPUEncoderDecoderBase : public Scorer {
virtual void GetAttention(mblas::Matrix& Attention) = 0;
virtual mblas::Matrix& GetAttention() = 0;
virtual void *GetNBest()
{
assert(false);
return nullptr;
}
virtual const BaseMatrix *GetBias() const
{
assert(false);
return nullptr;
}
protected:
mblas::Matrix SourceContext_;
};

View File

@ -4,16 +4,20 @@ namespace amunmt {
namespace GPU {
BestHyps::BestHyps(const God &god)
: BestHypsBase(
: BestHypsBase(god,
!god.Get<bool>("allow-unk"),
god.Get<bool>("n-best"),
god.Get<std::vector<std::string>>("softmax-filter").size(),
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
god.GetScorerWeights()),
nthElement_(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch")),
keys(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch"))
{}
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
maxBeamSize_(god.Get<uint>("beam-size"))
{
if (!god_.UseFusedSoftmax()) {
NthElement *obj = new NthElement(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch"));
nthElement_.reset(obj);
}
}
void BestHyps::DisAllowUNK(mblas::Matrix& Prob) {
SetColumn(Prob, UNK_ID, std::numeric_limits<float>::lowest());
@ -24,7 +28,17 @@ void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Prob
std::vector<unsigned>& outKeys,
const bool isFirst)
{
nthElement_.getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
nthElement_->getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
}
// fast fused softmax and nth_element
void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
const bool isFirst)
{
getNBestList(beamSizes, Probs, nBest, outCosts, outKeys, isFirst);
}
std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPtr>& scorers,
@ -52,6 +66,7 @@ std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPt
return alignments;
}
// standard nth_element
void BestHyps::CalcBeam(
const Beam& prevHyps,
const std::vector<ScorerPtr>& scorers,
@ -77,35 +92,49 @@ void BestHyps::CalcBeam(
cudaMemcpyHostToDevice);
//mblas::copy(vCosts.begin(), vCosts.end(), Costs.begin());
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
for (size_t i = 1; i < scorers.size(); ++i) {
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
}
if (forbidUNK_) {
DisAllowUNK(Probs);
}
size_t beamSizeSum = std::accumulate(beamSizes.begin(), beamSizes.end(), 0);
std::vector<float> bestCosts;
std::vector<unsigned> bestKeys;
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
if (god_.UseFusedSoftmax()) {
const mblas::Matrix& b4 = *static_cast<const mblas::Matrix*>(scorers[0]->GetBias());
DeviceVector<NthOutBatch> &nBest = *static_cast<DeviceVector<NthOutBatch>*>(scorers[0]->GetNBest());
nBest.resize(beamSizeSum);
BEGIN_TIMER("GetProbs.LogSoftmaxAndNBest");
mblas::LogSoftmaxAndNBest(nBest, Probs, b4, Costs, forbidUNK_, maxBeamSize_, beamSizes, beamSizeSum, isFirst);
PAUSE_TIMER("GetProbs.LogSoftmaxAndNBest");
//std::cerr << "2Probs=" << Probs.Debug(1) << std::endl;
FindBests(beamSizes, Probs, nBest, bestCosts, bestKeys, isFirst);
}
else {
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
for (size_t i = 1; i < scorers.size(); ++i) {
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
}
if (forbidUNK_) {
DisAllowUNK(Probs);
}
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
}
std::vector<HostVector<float>> breakDowns;
if (returnNBestList_) {
if (god_.ReturnNBestList()) {
breakDowns.push_back(bestCosts);
for (size_t i = 1; i < scorers.size(); ++i) {
std::vector<float> modelCosts(beamSizeSum);
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
nthElement_.getValueByKey(modelCosts, currProbs);
nthElement_->getValueByKey(modelCosts, currProbs);
breakDowns.push_back(modelCosts);
}
}
@ -135,7 +164,7 @@ void BestHyps::CalcBeam(
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
}
if(returnNBestList_) {
if(god_.ReturnNBestList()) {
hyp->GetCostBreakdown().resize(scorers.size());
float sum = 0;
for (size_t j = 0; j < scorers.size(); ++j) {
@ -162,5 +191,43 @@ void BestHyps::CalcBeam(
PAUSE_TIMER("CalcBeam");
}
//////////////////////////////////////////////////////////////////////////
void BestHyps::getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst) const
{
GetPairs(nBest, outKeys, outCosts);
assert(outCosts.size() == outKeys.size());
/*
cerr << "outCosts/outKeys=";
for (size_t i = 0; i < outKeys.size(); ++i) {
cerr << "(" << outCosts[i] << "," << outKeys[i] << ") ";
}
cerr << endl;
*/
//cerr << endl;
}
void BestHyps::GetPairs(DeviceVector<NthOutBatch> &nBest,
std::vector<uint>& outKeys,
std::vector<float>& outValues) const
{
//cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
outKeys.resize(nBest.size());
outValues.resize(nBest.size());
HostVector<NthOutBatch> hostVec(nBest.size());
mblas::copy(thrust::raw_pointer_cast(nBest.data()), nBest.size(), thrust::raw_pointer_cast(hostVec.data()), cudaMemcpyDeviceToHost);
for (size_t i = 0; i < nBest.size(); ++i) {
outKeys[i] = hostVec[i].ind;
outValues[i] = hostVec[i].score;
}
}
} // namespace
}

View File

@ -24,6 +24,7 @@ class BestHyps : public BestHypsBase
void DisAllowUNK(mblas::Matrix& Prob);
// standard nth_element
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
@ -31,6 +32,7 @@ class BestHyps : public BestHypsBase
std::vector<SoftAlignmentPtr> GetAlignments(const std::vector<ScorerPtr>& scorers,
size_t hypIndex);
void CalcBeam(
const Beam& prevHyps,
const std::vector<ScorerPtr>& scorers,
@ -39,9 +41,29 @@ class BestHyps : public BestHypsBase
std::vector<uint>& beamSizes);
private:
NthElement nthElement_;
std::unique_ptr<NthElement> nthElement_;
DeviceVector<unsigned> keys;
DeviceVector<float> Costs;
uint maxBeamSize_;
// fast fused softmax and nth_element
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
const bool isFirst);
void getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
DeviceVector<NthOutBatch> &nBest,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst=false) const;
void GetPairs(DeviceVector<NthOutBatch> &nBest,
std::vector<uint>& outKeys,
std::vector<float>& outValues) const;
};
}

View File

@ -70,8 +70,9 @@ void EncoderDecoder::Decode(const State& in, State& out, const std::vector<uint>
edIn.GetStates(),
edIn.GetEmbeddings(),
*SourceContext_,
sentencesMask_,
beamSizes);
sentenceLengths_,
beamSizes,
god_.UseFusedSoftmax());
PAUSE_TIMER("Decode");
}
@ -81,7 +82,7 @@ State* EncoderDecoder::NewState() const {
void EncoderDecoder::Encode(const Sentences& source) {
BEGIN_TIMER("Encode");
encoder_->Encode(source, tab_, *SourceContext_, sentencesMask_);
encoder_->Encode(source, tab_, *SourceContext_, sentenceLengths_);
//cerr << "GPU SourceContext_=" << SourceContext_.Debug(1) << endl;
PAUSE_TIMER("Encode");
}
@ -89,7 +90,7 @@ void EncoderDecoder::Encode(const Sentences& source) {
void EncoderDecoder::BeginSentenceState(State& state, size_t batchSize) {
//BEGIN_TIMER("BeginSentenceState");
EDState& edState = state.get<EDState>();
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentencesMask_);
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentenceLengths_);
decoder_->EmptyEmbedding(edState.GetEmbeddings(), batchSize);
//PAUSE_TIMER("BeginSentenceState");
@ -143,6 +144,16 @@ BaseMatrix& EncoderDecoder::GetProbs() {
return decoder_->GetProbs();
}
void *EncoderDecoder::GetNBest()
{
return &decoder_->GetNBest();
}
const BaseMatrix *EncoderDecoder::GetBias() const
{
return decoder_->GetBias();
}
mblas::Matrix& EncoderDecoder::GetAttention() {
return decoder_->GetAttention();
}

View File

@ -50,6 +50,9 @@ class EncoderDecoder : public Scorer {
mblas::Matrix& GetAttention();
virtual BaseMatrix& GetProbs();
virtual void *GetNBest();
virtual const BaseMatrix *GetBias() const;
size_t GetVocabSize() const;
void Filter(const std::vector<size_t>& filterIds);
@ -59,7 +62,7 @@ class EncoderDecoder : public Scorer {
std::unique_ptr<Encoder> encoder_;
std::unique_ptr<Decoder> decoder_;
DeviceVector<uint> indices_;
mblas::IMatrix sentencesMask_;
mblas::IMatrix sentenceLengths_;
// set in Encoder::GetContext() to length (maxSentenceLength * batchSize). 1 if it's a word, 0 otherwise
std::unique_ptr<mblas::Matrix> SourceContext_;

View File

@ -66,7 +66,7 @@ class Decoder {
void InitializeState(CellState& State,
const mblas::Matrix& SourceContext,
const size_t batchSize,
const mblas::IMatrix &sentencesMask)
const mblas::IMatrix &sentenceLengths)
{
using namespace mblas;
@ -81,7 +81,7 @@ class Decoder {
//std::cerr << "SourceContext=" << SourceContext.Debug(1) << std::endl;
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
Mean(Temp2_, SourceContext, sentencesMask);
Mean(Temp2_, SourceContext, sentenceLengths);
//std::cerr << "1State=" << State.Debug(1) << std::endl;
//std::cerr << "3Temp2_=" << Temp2_.Debug(1) << std::endl;
@ -156,7 +156,7 @@ class Decoder {
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
const CellState& HiddenState,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes)
{
// mapping = 1/0 whether each position, in each sentence in the batch is actually a valid word
@ -165,6 +165,7 @@ class Decoder {
using namespace mblas;
size_t maxLength = SourceContext.dim(0);
size_t batchSize = SourceContext.dim(3);
//std::cerr << "batchSize=" << batchSize << std::endl;
//std::cerr << "HiddenState=" << HiddenState.Debug(0) << std::endl;
@ -182,11 +183,13 @@ class Decoder {
batchMapping.size(),
thrust::raw_pointer_cast(dBatchMapping_.data()),
cudaMemcpyHostToDevice);
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
//std::cerr << "batchMapping=" << Debug(batchMapping, 2) << std::endl;
//std::cerr << "dBatchMapping_=" << Debug(dBatchMapping_, 2) << std::endl;
const size_t srcSize = sentencesMask.size() / beamSizes.size();
/*
std::cerr << "SourceContext=" << SourceContext.Debug(0) << std::endl;
std::cerr << "AlignedSourceContext=" << AlignedSourceContext.Debug(0) << std::endl;
std::cerr << "A_=" << A_.Debug(0) << std::endl;
std::cerr << "sentenceLengths=" << sentenceLengths.Debug(2) << std::endl;
*/
Prod(/*h_[1],*/ Temp2_, *(HiddenState.output), *w_.W_);
//std::cerr << "1Temp2_=" << Temp2_.Debug() << std::endl;
@ -198,14 +201,14 @@ class Decoder {
}
//std::cerr << "2Temp2_=" << Temp2_.Debug() << std::endl;
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, srcSize);
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, maxLength);
//std::cerr << "w_.V_=" << w_.V_->Debug(0) << std::endl;
//std::cerr << "3Temp1_=" << Temp1_.Debug(0) << std::endl;
Prod(A_, *w_.V_, Temp1_, false, true);
mblas::Softmax(A_, dBatchMapping_, sentencesMask, batchSize);
mblas::Softmax(A_, dBatchMapping_, sentenceLengths, batchSize);
mblas::WeightedMean(AlignedSourceContext, A_, SourceContext, dBatchMapping_);
/*
@ -253,9 +256,12 @@ class Decoder {
}
void GetProbs(mblas::Matrix& Probs,
std::shared_ptr<mblas::Matrix> &b4,
const CellState& State,
const mblas::Matrix& Embedding,
const mblas::Matrix& AlignedSourceContext) {
const mblas::Matrix& AlignedSourceContext,
bool useFusedSoftmax)
{
using namespace mblas;
//BEGIN_TIMER("GetProbs.Prod");
@ -298,7 +304,7 @@ class Decoder {
Element(Tanh(_1 + _2 + _3), T1_, T2_, T3_);
//PAUSE_TIMER("GetProbs.Element");
std::shared_ptr<mblas::Matrix> w4, b4;
std::shared_ptr<mblas::Matrix> w4;
if(!filtered_) {
w4 = w_.W4_;
b4 = w_.B4_;
@ -315,13 +321,15 @@ class Decoder {
Prod(Probs, T1_, *w4);
PAUSE_TIMER("GetProbs.Prod4");
BEGIN_TIMER("GetProbs.BroadcastVec");
BroadcastVec(_1 + _2, Probs, *b4);
PAUSE_TIMER("GetProbs.BroadcastVec");
if (!useFusedSoftmax) {
BEGIN_TIMER("GetProbs.BroadcastVec");
BroadcastVec(_1 + _2, Probs, *b4);
PAUSE_TIMER("GetProbs.BroadcastVec");
BEGIN_TIMER("GetProbs.LogSoftMax");
mblas::LogSoftmax(Probs);
PAUSE_TIMER("GetProbs.LogSoftMax");
BEGIN_TIMER("GetProbs.LogSoftMax");
mblas::LogSoftmax(Probs);
PAUSE_TIMER("GetProbs.LogSoftMax");
}
}
void Filter(const std::vector<size_t>& ids) {
@ -365,8 +373,9 @@ class Decoder {
const CellState& State,
const mblas::Matrix& Embeddings,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const std::vector<uint>& beamSizes)
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes,
bool useFusedSoftmax)
{
//BEGIN_TIMER("Decode");
@ -379,7 +388,11 @@ class Decoder {
//PAUSE_TIMER("GetHiddenState");
//BEGIN_TIMER("GetAlignedSourceContext");
GetAlignedSourceContext(AlignedSourceContext_, HiddenState_, SourceContext, sentencesMask, beamSizes);
GetAlignedSourceContext(AlignedSourceContext_,
HiddenState_,
SourceContext,
sentenceLengths,
beamSizes);
//std::cerr << "AlignedSourceContext_=" << AlignedSourceContext_.Debug(1) << std::endl;
//PAUSE_TIMER("GetAlignedSourceContext");
@ -389,7 +402,7 @@ class Decoder {
//PAUSE_TIMER("GetNextState");
//BEGIN_TIMER("GetProbs");
GetProbs(NextState, Embeddings, AlignedSourceContext_);
GetProbs(NextState, Embeddings, AlignedSourceContext_, useFusedSoftmax);
//std::cerr << "Probs_=" << Probs_.Debug(1) << std::endl;
//PAUSE_TIMER("GetProbs");
@ -403,9 +416,9 @@ class Decoder {
void EmptyState(CellState& State,
const mblas::Matrix& SourceContext,
size_t batchSize,
const mblas::IMatrix &sentencesMask)
const mblas::IMatrix &sentenceLengths)
{
rnn1_.InitializeState(State, SourceContext, batchSize, sentencesMask);
rnn1_.InitializeState(State, SourceContext, batchSize, sentenceLengths);
alignment_.Init(SourceContext);
}
@ -435,6 +448,14 @@ class Decoder {
return alignment_.GetAttention();
}
DeviceVector<NthOutBatch>& GetNBest() {
return nBest_;
}
const mblas::Matrix *GetBias() const {
return b4_.get();
}
private:
void GetHiddenState(CellState& HiddenState,
@ -446,10 +467,13 @@ class Decoder {
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
const CellState& HiddenState,
const mblas::Matrix& SourceContext,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
const std::vector<uint>& beamSizes) {
alignment_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext,
sentencesMask, beamSizes);
alignment_.GetAlignedSourceContext(AlignedSourceContext,
HiddenState,
SourceContext,
sentenceLengths,
beamSizes);
}
void GetNextState(CellState& State,
@ -461,8 +485,10 @@ class Decoder {
void GetProbs(const CellState& State,
const mblas::Matrix& Embedding,
const mblas::Matrix& AlignedSourceContext) {
softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext);
const mblas::Matrix& AlignedSourceContext,
bool useFusedSoftmax)
{
softmax_.GetProbs(Probs_, b4_, State, Embedding, AlignedSourceContext, useFusedSoftmax);
}
std::unique_ptr<Cell> InitHiddenCell(const Weights& model, const YAML::Node& config){
@ -505,6 +531,9 @@ class Decoder {
Alignment<Weights::DecAlignment> alignment_;
Softmax<Weights::DecSoftmax> softmax_;
DeviceVector<NthOutBatch> nBest_;
std::shared_ptr<mblas::Matrix> b4_;
Decoder(const Decoder&) = delete;
};

View File

@ -63,22 +63,19 @@ std::vector<std::vector<size_t>> GetBatchInput(const Sentences& source, size_t t
}
void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context,
mblas::IMatrix &sentencesMask)
mblas::IMatrix &sentenceLengths)
{
size_t maxSentenceLength = GetMaxLength(source, tab);
//cerr << "1dMapping=" << mblas::Debug(dMapping, 2) << endl;
HostVector<uint> hMapping(maxSentenceLength * source.size(), 0);
HostVector<uint> hSentenceLengths(source.size());
for (size_t i = 0; i < source.size(); ++i) {
for (size_t j = 0; j < source.at(i)->GetWords(tab).size(); ++j) {
hMapping[i * maxSentenceLength + j] = 1;
}
hSentenceLengths[i] = source.at(i)->GetWords(tab).size();
}
sentencesMask.NewSize(maxSentenceLength, source.size(), 1, 1);
mblas::copy(thrust::raw_pointer_cast(hMapping.data()),
hMapping.size(),
sentencesMask.data(),
sentenceLengths.NewSize(source.size(), 1, 1, 1);
mblas::copy(thrust::raw_pointer_cast(hSentenceLengths.data()),
hSentenceLengths.size(),
sentenceLengths.data(),
cudaMemcpyHostToDevice);
//cerr << "GetContext1=" << context.Debug(1) << endl;
@ -106,7 +103,7 @@ void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context
backwardRnn_.Encode(embeddedWords_.crend() - maxSentenceLength,
embeddedWords_.crend() ,
context, source.size(), true, &sentencesMask);
context, source.size(), true, &sentenceLengths);
//cerr << "GetContext5=" << context.Debug(1) << endl;
}

View File

@ -71,8 +71,9 @@ class Encoder {
}
template <class It>
void Encode(It it, It end, mblas::Matrix& Context, size_t batchSize, bool invert,
const mblas::IMatrix *sentencesMask=nullptr)
void Encode(It it, It end, mblas::Matrix& Context,
size_t batchSize, bool invert,
const mblas::IMatrix *sentenceLengths=nullptr)
{
InitializeState(batchSize);
@ -86,12 +87,15 @@ class Encoder {
//std::cerr << "invert=" << invert << std::endl;
if(invert) {
assert(sentencesMask);
assert(sentenceLengths);
//std::cerr << "1State_=" << State_.Debug(1) << std::endl;
//std::cerr << "mapping=" << mblas::Debug(*mapping) << std::endl;
//mblas::MapMatrix(*(State_.cell), *sentencesMask, n - i - 1);
mblas::MapMatrix(*(State_.output), *sentencesMask, n - i - 1);
mblas::MapMatrix(*(State_.output), *sentenceLengths, n - i - 1);
if (State_.cell->size()) {
mblas::MapMatrix(*(State_.cell), *sentenceLengths, n - i - 1);
}
//std::cerr << "2State_=" << State_.Debug(1) << std::endl;
mblas::PasteRows(Context, *(State_.output), (n - i - 1), gru_->GetStateLength().output);
@ -124,7 +128,7 @@ class Encoder {
Encoder(const Weights& model, const YAML::Node& config);
void Encode(const Sentences& words, size_t tab, mblas::Matrix& context,
mblas::IMatrix &sentencesMask);
mblas::IMatrix &sentenceLengths);
private:
std::unique_ptr<Cell> InitForwardCell(const Weights& model, const YAML::Node& config);

View File

@ -17,7 +17,7 @@ Matrix& Swap(Matrix& Out, Matrix& In) {
__global__ void gMean(MatrixWrapper<float> out,
const MatrixWrapper<float> in,
const MatrixWrapper<uint> mapping)
const MatrixWrapper<uint> sentenceLengths)
{
// out = batches * states
// in = max sentence length * states * 1 * batches
@ -37,7 +37,7 @@ __global__ void gMean(MatrixWrapper<float> out,
float sum = 0.0f;
int counter = 0;
for (size_t row = 0; row < in.dim(0); ++row) {
int isWord = mapping(row, batch, 0, 0);
bool isWord = row < sentenceLengths[batch];
//printf("batch=%lu startMapInd=%lu mapOffset=%lu -> %d \n", batch, startMapInd, mapOffset, isWord);
if (isWord) {
sum += in(row, state, 0, batch);
@ -50,13 +50,14 @@ __global__ void gMean(MatrixWrapper<float> out,
}
}
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
void Mean(Matrix& Out,
const Matrix& In,
const mblas::IMatrix &sentenceLengths)
{
assert(Out.dim(2) == 1);
assert(Out.dim(3) == 1);
assert(Out.dim(0) == In.dim(3));
assert(Out.dim(1) == In.dim(1));
assert(In.dim(0) * In.dim(3) == sentencesMask.size());
// mean of each ROW
size_t batchNum = Out.dim(0) * Out.dim(2) * Out.dim(3);
@ -67,14 +68,14 @@ void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
MatrixWrapper<float> inWrap(In);
//cerr << "outWrap=" << outWrap.Debug() << endl;
MatrixWrapper<uint> mappingWrap(sentencesMask, false);
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
uint size = outWrap.size();
uint threads = std::min((uint)MAX_THREADS, size);
uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
gMean<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(outWrap, inWrap, mappingWrap);
(outWrap, inWrap, sentenceLengthsWrap);
}
@ -432,27 +433,27 @@ Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
__global__ void gSoftMax(MatrixWrapper<float> out,
const MatrixWrapper<uint> batchIdsWrap,
const MatrixWrapper<uint> sentencesMappingWrap,
const MatrixWrapper<uint> sentenceLengthsWrap,
uint shareSize)
{
extern __shared__ float _share[];
size_t numHypos = out.dim(0);
size_t srcLen = out.dim(1);
size_t maxLength = out.dim(1);
int hypoInd = blockIdx.x;
int origSrcPos = threadIdx.x;
while (hypoInd < numHypos) {
MatrixWrapper<float> _max(_share, shareSize);
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
_max[origSrcPos] = out(hypoInd, origSrcPos, 0, 0);
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
float value = out(hypoInd, srcPos, 0, 0);
int batch = batchIdsWrap[hypoInd];
value *= sentencesMappingWrap(srcPos, batch, 0, 0);
value *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0;
if (value > _max[origSrcPos]) {
_max[origSrcPos] = value;
}
@ -475,16 +476,16 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
__syncthreads();
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, shareSize);
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
_sum[origSrcPos] = 0.0f;
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
out(hypoInd, srcPos, 0, 0) = __expf(out(hypoInd, srcPos, 0, 0) - max);
int batch = batchIdsWrap[hypoInd];
out(hypoInd, srcPos, 0, 0) *= sentencesMappingWrap(srcPos, batch, 0, 0);
out(hypoInd, srcPos, 0, 0) *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0; // sentencesMappingWrap(srcPos, batch, 0, 0);
_sum[origSrcPos] += out(hypoInd, srcPos, 0, 0);
}
}
@ -504,9 +505,9 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
__syncthreads();
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
int srcPos = tid + origSrcPos;
if (srcPos < srcLen) {
if (srcPos < maxLength) {
out(hypoInd, srcPos, 0, 0) /= _sum[0];
}
}
@ -515,20 +516,23 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
}
}
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize)
Matrix& Softmax(Matrix& Out,
const DeviceVector<uint>& batchIds,
const mblas::IMatrix &sentenceLengths,
size_t batchSize)
{
size_t srcSize = Out.dim(1);
size_t maxLength = Out.dim(1);
MatrixWrapper<float> outWrap(Out);
const MatrixWrapper<uint> batchIdsWrap(batchIds);
const MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
const MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
int blocks = batchSize;
int threads = std::min(MAX_THREADS, (int)srcSize);
int threads = std::min(MAX_THREADS, (int)maxLength);
int shared = sizeof(float) * threads;
gSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(outWrap, batchIdsWrap, sentencesMappingWrap, threads);
(outWrap, batchIdsWrap, sentenceLengthsWrap, threads);
return Out;
}
@ -544,7 +548,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
while (rowIdx < rows) {
//float* _max = _share;
MatrixWrapper<float> _max(_share, shareSize);
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
_max[threadIdx.x] = out(rowIdx, threadIdx.x, 0, 0);
for (int tid = 0; tid < cols; tid += blockDim.x) {
@ -573,7 +577,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
__syncthreads();
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, shareSize);
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
_sum[threadIdx.x] = 0.0f;
for (int tid = 0; tid < cols; tid += blockDim.x) {
@ -675,8 +679,8 @@ void Fill(Matrix& In, float value) {
__global__
void gMapMatrix(MatrixWrapper<float> in,
const MatrixWrapper<uint> sentencesMappingWrap,
int mappingCols, int i)
const MatrixWrapper<uint> sentenceLengthsWrap,
int i)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < in.size()) {
@ -685,27 +689,28 @@ void gMapMatrix(MatrixWrapper<float> in,
int col = tid % numCols;
//in[tid] *= mappingWrap(i, batchIdx, 0, 0);
in(batchIdx, col, 0, 0) *= sentencesMappingWrap(i, batchIdx, 0, 0); // [mappingCols * batchIdx + i];
in(batchIdx, col, 0, 0) *= (i < sentenceLengthsWrap[batchIdx] ? 1 : 0);
}
}
void MapMatrix(Matrix& state, const mblas::IMatrix &sentencesMask, size_t i)
void MapMatrix(Matrix& state,
const mblas::IMatrix &sentenceLengths,
size_t i)
{
// blank out rows in the state matrix where the word position i does not exist
// mapping is a concatenated array of 1 & 0 of each sentence in the batch to say whether word exists or not.
int batchSize = state.dim(0);
int stateLength = state.dim(1);
int sentenceLength = sentencesMask.size() / batchSize;
int numThreads = std::min((int)state.size(), MAX_THREADS);
int numBlocks = (state.size() / numThreads) + ((state.size() % numThreads == 0) ? 0 : 1);
MatrixWrapper<float> stateWrap(state);
MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths);
gMapMatrix<<<numBlocks, numThreads, 0, CudaStreamHandler::GetStream()>>>
(stateWrap, sentencesMappingWrap, sentenceLength, i);
(stateWrap, sentenceLengthsWrap, i);
/*
cerr << "nBlocks=" << numBlocks << endl;
@ -852,20 +857,569 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps
Normalization(out, in, alpha, nullptr, eps);
}
__global__ void gRandomizeMemory(int *data)
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__
void gBeamSizeInit(MatrixWrapper<uint> hypo2BeamSizeWrap,
MatrixWrapper<uint> batch2HypoWrap,
MatrixWrapper<uint> hypo2CandidateWrap,
bool isFirst,
uint beamSizeSum,
const MatrixWrapper<uint> beamSizesWrap)
{
clock_t start = clock();
uint hypoInd = 0;
uint candidateInd = 0;
uint a = 0, b = 0;
//printf("beamSizesWrap.size()=%u \n", beamSizesWrap.size());
for (size_t batchInd = 0; batchInd < beamSizesWrap.size(); ++batchInd) {
uint beamSize = beamSizesWrap[batchInd];
/*
printf("batchInd=%u ", batchInd);
printf("beamSize=%u ", beamSize);
printf("a=%u ", a);
printf("b=%u \n", b);
*/
if (beamSize) {
if (isFirst) {
assert(a < hypo2BeamSizeWrap.size());
assert(a < hypo2CandidateWrap.size());
hypo2BeamSizeWrap[a] = beamSize;
hypo2CandidateWrap[a] = candidateInd;
++a;
assert(b < batch2HypoWrap.size());
batch2HypoWrap[b] = batchInd;
++b;
candidateInd += beamSize;
}
else {
for (size_t j = 0; j < beamSize; ++j) {
assert(a < hypo2BeamSizeWrap.size());
assert(a < hypo2CandidateWrap.size());
hypo2BeamSizeWrap[a] = beamSize;
hypo2CandidateWrap[a] = candidateInd;
++a;
candidateInd += beamSize;
}
assert(b < batch2HypoWrap.size());
batch2HypoWrap[b] = hypoInd;
++b;
}
hypoInd += beamSize;
}
}
}
void RandomizeMemory()
__device__
float GetMaxScore(const MatrixWrapper<NthOutBatch> &nBestMatrix)
{
int *data;
HANDLE_ERROR( cudaMalloc((void**)&data, 8 * 1024 ^ 3) );
float ret = -1111111111111;
for (uint i = 0; i < nBestMatrix.dim(1); ++i) {
const NthOutBatch &curr = nBestMatrix[i];
if (curr.score > ret) {
ret = curr.score;
}
}
uint threads = 1024;
uint blocks = 8 * 1024 ^ 3 / threads;
gRandomizeMemory<<<blocks, threads>>>(data);
return ret;
}
__device__
void AddElement(float &minScore,
uint &i,
NthOutBatch *arr,
bool forbidUNK,
uint vocabInd,
const NthOutBatch &ele)
{
const float score = ele.score;
if (forbidUNK && vocabInd == UNK_ID) {
arr[i].score = -1111111111111;
minScore = -1111111111111;
}
else {
arr[i] = ele;
if (score < minScore) {
minScore = score;
}
++i;
}
}
__device__
void MergeElement(float &minScore,
NthOutBatch *arr,
uint arrSize,
const NthOutBatch &ele)
{
float newMinScore = +1111111111;
bool found = false;
for (uint i = 0; i < arrSize; ++i) {
NthOutBatch &currEle = arr[i];
if (!found && minScore == currEle.score) {
currEle = ele;
found = true;
}
// update min score
if (currEle.score < newMinScore) {
newMinScore = currEle.score;
}
}
minScore = newMinScore;
}
__device__
void MergeElement(float &minScore,
NthOutBatch *arr,
uint arrSize,
const NthOutBatch &ele,
bool forbidUNK,
uint vocabInd)
{
if (forbidUNK && vocabInd == UNK_ID) {
// do nothing
}
else if (ele.score > minScore) {
// replace element with min score
MergeElement(minScore, arr, arrSize, ele);
/*
printf("arrInd=%d ind=%d vocabId=%d \n",
arrInd,
_max[threadIdx.x].ind,
_max[threadIdx.x].vocabId);
*/
}
}
__device__
void NBestAndMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
float &topScore,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint hypoInd,
uint maxBeamSize,
bool forbidUNK,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
extern __shared__ char _sharePtr[];
MatrixWrapper<float> maxMatrix((float*)_sharePtr, blockDim.x, 1, 1, 1);
void *ptrOffset = _sharePtr + sizeof(float) * blockDim.x;
MatrixWrapper<NthOutBatch> nBestMatrix((NthOutBatch*)ptrOffset, blockDim.x, maxBeamSize, 1, 1);
NthOutBatch *arr = &nBestMatrix(threadIdx.x, 0, 0, 0);
uint vocabSize = in.dim(1);
assert(hypoInd < hypo2BeamSizeWrap.size());
uint beamSize = hypo2BeamSizeWrap[hypoInd];
float minScore = +1111111111;
// init
uint vocabInd = threadIdx.x;
uint i = 0;
while (vocabInd < vocabSize && i < beamSize) {
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
uint arrInd = hypoInd * vocabSize + vocabInd;
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
AddElement(minScore, i, arr, forbidUNK, vocabInd, ele);
vocabInd += blockDim.x;
}
// MAIN LOOP
while (vocabInd < vocabSize) {
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
uint arrInd = hypoInd * vocabSize + vocabInd;
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
MergeElement(minScore, arr, beamSize, ele, forbidUNK, vocabInd);
vocabInd += blockDim.x;
} // while (vocabInd < vocabSize) {
// merge nbest from different threads
int len = blockDim.x;
while (len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
NthOutBatch *dest = &nBestMatrix(threadIdx.x, 0, 0, 0);
for (uint i = 0; i < beamSize; ++i) {
const NthOutBatch &ele = nBestMatrix(threadIdx.x + skip, i, 0, 0);
if (ele.score > minScore) {
MergeElement(minScore, dest, beamSize, ele);
}
}
}
len = (len + 1) >> 1;
}
__syncthreads();
if (threadIdx.x == 0) {
// copy to output array
assert(hypoInd < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
const NthOutBatch &curr = nBestMatrix(0, i, 0, 0);
//printf("vocabInd=%u \n", best.vocabInd);
assert(candidateInd + i < nBestCandidatesWrap.size());
nBestCandidatesWrap[candidateInd + i] = curr;
}
}
__syncthreads();
topScore = GetMaxScore(nBestMatrix);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__device__
void SumAndLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint hypoInd,
uint maxBeamSize,
float topScore,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
extern __shared__ float _share[];
size_t vocabSize = in.dim(1);
//assert(nBestCandidatesWrap.dim(0) == rows);
//float* _sum = _share;// + blockDim.x;
MatrixWrapper<float> _sum(_share, blockDim.x, 1, 1, 1);
// calc sum
_sum[threadIdx.x] = 0.0f;
for (int id = threadIdx.x; id < vocabSize; id += blockDim.x) {
//row[id] = exp(row[id] - max);
float val = in(hypoInd, id, 0, 0) + b4Wrap(0, id, 0, 0);
val = __expf(val - topScore);
_sum[threadIdx.x] += val;
}
int len = blockDim.x;
while (len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
// apply partition and log to top
if (threadIdx.x == 0) {
//__syncthreads();
//printf("val=%f %f \n", in(rowIdx, ele.vocabId, 0, 0), val);
// nbest
uint beamSize = hypo2BeamSizeWrap[hypoInd];
uint startPos = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
//__syncthreads();
NthOutBatch &ele = nBestCandidatesWrap[startPos + i];
float &val = ele.score;
val = __expf(val - topScore);
val = __logf(val /_sum[0]);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> b4Wrap,
uint maxBeamSize,
bool forbidUNK,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
uint hypos = in.dim(0);
uint vocabSize = in.dim(1);
uint hypoInd = blockIdx.x; // index of previous hypo
while (hypoInd < hypos) {
float topScore;
NBestAndMax(nBestCandidatesWrap,
topScore,
in,
b4Wrap,
hypoInd,
maxBeamSize,
forbidUNK,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
SumAndLogSoftMax(nBestCandidatesWrap,
in,
b4Wrap,
hypoInd,
maxBeamSize,
topScore,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
__syncthreads();
hypoInd += gridDim.x;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gNBestPerBatch(MatrixWrapper<NthOutBatch> nBestWrap,
MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
const MatrixWrapper<float> in,
const MatrixWrapper<float> costsWrap,
uint maxBeamSize,
bool forbidUNK,
bool isFirst,
const MatrixWrapper<uint> hypo2BeamSizeWrap,
const MatrixWrapper<uint> batch2HypoWrap,
const MatrixWrapper<uint> hypo2CandidateWrap)
{
//uint rows = in.dim(0);
uint batchSize = batch2HypoWrap.dim(0);
uint batchInd = blockIdx.x;
while (batchInd < batchSize) {
assert(batchInd < batch2HypoWrap.size());
assert(batchInd < hypo2BeamSizeWrap.size());
assert(batchInd < nBestWrap.size());
uint hypoInd = batch2HypoWrap[batchInd];
uint beamSize = hypo2BeamSizeWrap[hypoInd];
assert(beamSize);
uint nextHypoInd;
if (isFirst) {
nextHypoInd = batchInd * beamSize;
}
else {
nextHypoInd = hypoInd;
}
// candiate from 1st hypo
float minScore = +999999;
assert(hypoInd < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd];
for (uint i = 0; i < beamSize; ++i) {
float prevCost;
if (isFirst) {
assert(batchInd < costsWrap.size());
prevCost = costsWrap[batchInd];
}
else {
//printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
assert(hypoInd < costsWrap.size());
prevCost = costsWrap[hypoInd];
}
assert((nextHypoInd + i) < nBestWrap.size());
assert(candidateInd + i < nBestCandidatesWrap.size());
nBestWrap[nextHypoInd + i] = nBestCandidatesWrap[candidateInd + i];
float &score = nBestWrap[nextHypoInd + i].score;
score += prevCost;
if (score < minScore) {
minScore = score;
}
}
// candidates from other previous hypos
if (!isFirst) {
for (uint hypoOffset = 1; hypoOffset < beamSize; ++hypoOffset) {
//printf("hypoInd=%d \n", (hypoInd + hypoOffset));
//printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
assert((hypoInd + hypoOffset) < costsWrap.size());
float prevCost = costsWrap[hypoInd + hypoOffset];
assert((hypoInd + hypoOffset) < hypo2CandidateWrap.size());
uint candidateInd = hypo2CandidateWrap[hypoInd + hypoOffset];
for (uint candidateOffset = 0; candidateOffset < beamSize; ++candidateOffset) {
assert((candidateInd + candidateOffset) < nBestCandidatesWrap.size());
NthOutBatch &candidate = nBestCandidatesWrap[candidateInd + candidateOffset];
candidate.score += prevCost;
assert(nextHypoInd < nBestWrap.size());
NthOutBatch *arr = &nBestWrap[nextHypoInd];
if (candidate.score > minScore) {
MergeElement(minScore, arr, beamSize, candidate);
}
}
}
}
batchInd += gridDim.x;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
const Matrix& in,
const Matrix& b4,
const DeviceVector<float> &costs,
bool forbidUNK,
uint maxBeamSize,
const std::vector<uint>& beamSizes,
uint beamSizeSum,
bool isFirst)
{
//BEGIN_TIMER("LogSoftmax excl kernels");
//cerr << "in=" << in.Debug(0) << endl;
//cerr << "beamSizes=" << beamSizes.size() << endl;
// create beam size vectors on GPU but exclude empty beams
uint batchSize = 0;
uint candidateInd = 0;
for (size_t batchInd = 0; batchInd < beamSizes.size(); ++batchInd) {
uint beamSize = beamSizes[batchInd];
//cerr << "(" << beamSize << "," << hypoInd << ") ";
if (beamSize) {
if (isFirst) {
candidateInd += beamSize;
}
else {
candidateInd += beamSize * beamSize;
}
++batchSize;
}
}
DeviceVector<uint> d_beamSizes(beamSizes);
DeviceVector<uint> hypo2BeamSize(in.dim(0));
DeviceVector<uint> hypo2Candidate(in.dim(0));
DeviceVector<uint> batch2Hypo(batchSize);
DeviceVector<NthOutBatch> nBestCandidates(candidateInd);
/*
cerr << "in=" << in.Debug(0) << endl;
cerr << "beamSizes=" << beamSizes.size() << endl;
cerr << "beamSizeSum=" << beamSizeSum << endl;
cerr << "batchSize=" << batchSize << endl;
cerr << "candidateInd=" << candidateInd << endl;
cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl;
cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl;
cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl;
cerr << "nBest=" << Debug(nBest, 0) << endl;
cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl;
cerr << endl;
*/
//DeviceVector<NthOutBatch> nBest(beamSizeSum);
//cerr << "nBest=" << nBest.size() << endl;
MatrixWrapper<float> inWrap(in);
MatrixWrapper<float> b4Wrap(b4);
MatrixWrapper<uint> hypo2BeamSizeWrap(hypo2BeamSize);
MatrixWrapper<uint> hypo2CandidateWrap(hypo2Candidate);
MatrixWrapper<uint> batch2HypoWrap(batch2Hypo);
MatrixWrapper<NthOutBatch> nBestWrap(nBest);
MatrixWrapper<NthOutBatch> nBestCandidatesWrap(nBestCandidates);
MatrixWrapper<float> costsWrap(costs);
MatrixWrapper<uint> beamSizesWrap(d_beamSizes);
//PAUSE_TIMER("LogSoftmax excl kernels");
int blocks = std::min(MAX_BLOCKS, (int)in.dim(0));
int threads = std::min(MAX_THREADS, (int)in.dim(1));
int shared = sizeof(NthOutBatch) * threads * maxBeamSize
+ sizeof(float) * threads;
//cerr << "shared=" << shared << endl;
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step0" << endl;
//BEGIN_TIMER("gBeamSizeInit");
gBeamSizeInit<<<1, 1, 0, CudaStreamHandler::GetStream()>>>
(hypo2BeamSizeWrap,
batch2HypoWrap,
hypo2CandidateWrap,
isFirst,
beamSizeSum,
beamSizesWrap
);
//PAUSE_TIMER("gBeamSizeInit");
/*
cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 2) << endl;
cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 2) << endl;
cerr << "batch2Hypo=" << Debug(batch2Hypo, 2) << endl;
cerr << endl;
*/
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step1" << endl;
//BEGIN_TIMER("gLogSoftMax");
gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(nBestCandidatesWrap,
inWrap,
b4Wrap,
maxBeamSize,
forbidUNK,
hypo2BeamSizeWrap,
hypo2CandidateWrap);
//PAUSE_TIMER("gLogSoftMax");
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step2" << endl;
threads = 1;
//BEGIN_TIMER("gNBestPerBatch");
gNBestPerBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(nBestWrap,
nBestCandidatesWrap,
inWrap,
costsWrap,
maxBeamSize,
forbidUNK,
isFirst,
hypo2BeamSizeWrap,
batch2HypoWrap,
hypo2CandidateWrap);
//PAUSE_TIMER("gNBestPerBatch");
//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
//cerr << "step3" << endl;
//cerr << "3costs=" << Debug(costs, 0) << endl;
}
} // namespace mblas

View File

@ -13,6 +13,7 @@
#include "gpu/mblas/matrix.h"
#include "gpu/mblas/matrix_wrapper.h"
#include "gpu/mblas/handles.h"
#include "gpu/mblas/nth_element_kernels.h"
namespace amunmt {
namespace GPU {
@ -83,6 +84,26 @@ std::string Debug(const HostVector<T> &vec, size_t verbosity = 1)
return strm.str();
}
template<typename T>
std::string Debug(const std::vector<T> &vec, size_t verbosity = 1)
{
std::stringstream strm;
strm << "size=" << vec.size();
if (verbosity) {
T sum = Sum(vec.data(), vec.size());
strm << " sum=" << sum;
}
if (verbosity == 2) {
for (size_t i = 0; i < vec.size(); ++i) {
strm << " " << vec[i];
}
}
return strm.str();
}
template<typename T>
void copy(const T *in, size_t count, T *out, cudaMemcpyKind kind) {
@ -93,7 +114,9 @@ void Fill(Matrix& In, float value=0.0f);
Matrix& Swap(Matrix& Out, Matrix& In);
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask);
void Mean(Matrix& Out,
const Matrix& In,
const mblas::IMatrix &sentenceLengths);
void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const DeviceVector<uint>& mapping);
@ -117,7 +140,7 @@ Matrix& CopyRow(Matrix& Out,
Matrix& Concat(Matrix& Out, const Matrix& In);
void MapMatrix(Matrix& state,
const mblas::IMatrix &sentencesMask,
const mblas::IMatrix &sentenceLengths,
size_t i);
Matrix& CopyRows(Matrix& Out,
@ -135,7 +158,10 @@ Matrix& Slice(Matrix& Out,
Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
bool transA = false, bool transB = false);
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize);
Matrix& Softmax(Matrix& Out,
const DeviceVector<uint>& batchIds,
const mblas::IMatrix &sentenceLengths,
size_t batchSize);
Matrix& LogSoftmax(Matrix& Out);
@ -232,7 +258,7 @@ __global__ void gBroadcastVecColumn(Functor functor,
size_t rows = outWrap.dim(0);
size_t cols = outWrap.dim(1);
MatrixWrapper<float> sdata(sdataOrig, rows);
MatrixWrapper<float> sdata(sdataOrig, rows, 1, 1, 1);
if (threadIdx.x == 0) {
for (int i = 0; i < rows; ++i)
@ -422,7 +448,15 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, const Mat
void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps);
void RandomizeMemory();
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
const Matrix& in,
const Matrix& b4,
const DeviceVector<float> &costs,
bool forbidUNK,
uint maxBeamSize,
const std::vector<uint>& beamSizes,
uint beamSizeSum,
bool isFirst);
} // namespace mblas
} // namespace GPU

View File

@ -94,39 +94,14 @@ public:
dataConst_ = data_;
}
MatrixWrapper(DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
{
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStridesRowMajor();
assert(size() == vec.size());
data_ = thrust::raw_pointer_cast(vec.data());
dataConst_ = data_;
}
MatrixWrapper(const DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
{
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStridesRowMajor();
assert(size() == vec.size());
data_ = nullptr;
dataConst_ = thrust::raw_pointer_cast(vec.data());
}
__device__
MatrixWrapper(T *ptr, uint size)
MatrixWrapper(T *ptr, uint a, uint b, uint c, uint d)
{
dim_[0] = size;
size_ = size;
dim_[0] = a;
dim_[1] = b;
dim_[2] = c;
dim_[3] = d;
updateStrides();
data_ = ptr;
dataConst_ = ptr;
@ -148,7 +123,7 @@ public:
return stride_[i];
}
__host__
__device__ __host__
void updateStrides()
{
stride_[0] = dim_[1];
@ -159,7 +134,7 @@ public:
size_ = stride_[3] * dim_[3];
}
__host__
__device__ __host__
void updateStridesRowMajor()
{
stride_[0] = 1;

View File

@ -166,5 +166,7 @@ void NthElement::getValueByKey(std::vector<float>& out, const mblas::Matrix &d_i
HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
}
//////////////////////////////////////////////////////////////////////////
} // namespace GPU
} // namespace amunmt

View File

@ -18,8 +18,11 @@ class NthElement {
NthElement(uint maxBeamSize, uint maxBatchSize);
virtual ~NthElement();
void getNBestList(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
std::vector<float>& outCosts, std::vector<uint>& outKeys,
// standard nth_element
void getNBestList(const std::vector<uint>& beamSizes,
mblas::Matrix& Probs,
std::vector<float>& outCosts,
std::vector<uint>& outKeys,
const bool isFirst=false);
void GetPairs(uint number,
@ -47,6 +50,7 @@ class NthElement {
const HostVector<uint>& batchFirstElementIdxs,
const HostVector<uint>& cummulatedBeamSizes);
};
} // namespace GPU

View File

@ -1,3 +1,5 @@
#pragma once
#include "matrix_wrapper.h"
namespace amunmt {
@ -32,6 +34,60 @@ struct NthOut
}
};
/////////////////////////////////////////////////////////////////////////////////////////
struct NthOutBatch
{
uint ind;
float score;
//uint hypoInd;
//uint vocabInd;
__device__ __host__
NthOutBatch(const float& rhs)
{
// only to be used to init variable in matrix.h gSum
assert(rhs == 0.0f);
ind = rhs;
score = rhs;
//hypoInd = rhs;
//vocabInd = rhs;
}
__device__ __host__
NthOutBatch() {}
__device__ __host__
NthOutBatch(uint vInd, float vScore, uint vHypoInd, uint vVocabInd)
:ind(vInd)
,score(vScore)
//,hypoInd(vHypoInd)
//,vocabInd(vVocabInd)
{}
__device__ __host__
NthOutBatch& operator=(const NthOutBatch& rhs)
{
ind = rhs.ind;
score = rhs.score;
//hypoInd = rhs.hypoInd;
//vocabInd = rhs.vocabInd;
return *this;
}
__device__ __host__
NthOutBatch& operator+=(const NthOutBatch& rhs)
{
ind += rhs.ind;
score += rhs.score;
//hypoInd += rhs.hypoInd;
//vocabInd += rhs.vocabInd;
return *this;
}
};
/////////////////////////////////////////////////////////////////////////////////////////
inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
@ -40,6 +96,17 @@ inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
return out;
}
inline std::ostream& operator<<(std::ostream &out, const NthOutBatch &obj)
{
out << "("
<< obj.ind << ","
<< obj.score << ","
//<< obj.hypoInd << ","
//<< obj.vocabInd
<< ")";
return out;
}
/////////////////////////////////////////////////////////////////////////////////////////
__global__ void gMaxElement(mblas::MatrixWrapper<NthOut> out,