mirror of
https://github.com/marian-nmt/marian.git
synced 2024-11-27 10:33:14 +03:00
Merge branch 'master' of github.com:marian-nmt/marian
This commit is contained in:
commit
4874c0d0fd
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -2,3 +2,6 @@
|
||||
path = src/marian
|
||||
url = https://github.com/marian-nmt/marian-dev
|
||||
branch = stable
|
||||
[submodule "examples"]
|
||||
path = examples
|
||||
url = https://github.com/marian-nmt/marian-examples.git
|
||||
|
10
examples/training/.gitignore
vendored
10
examples/training/.gitignore
vendored
@ -1,10 +0,0 @@
|
||||
model/
|
||||
moses-scripts/
|
||||
subword-nmt/
|
||||
data/corpus.*
|
||||
data/europarl-*
|
||||
data/SETIMES2.*
|
||||
data/*.tok.*
|
||||
data/*.tc.*
|
||||
data/*.bpe.*
|
||||
data/*.tgz
|
@ -1,21 +0,0 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 University of Edinburgh
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,84 +0,0 @@
|
||||
# Example for training with Marian
|
||||
|
||||
Files and scripts in this folder have been adapted from the Romanian-English sample
|
||||
from https://github.com/rsennrich/wmt16-scripts. We also add the back-translated data from
|
||||
http://data.statmt.org/rsennrich/wmt16_backtranslations/ as desribed in
|
||||
http://www.aclweb.org/anthology/W16-2323. The resulting system should be competitive
|
||||
or even slightly better than reported in the Edinburgh WMT2016 paper.
|
||||
|
||||
To execute the complete example type:
|
||||
|
||||
```
|
||||
./run-me.sh
|
||||
```
|
||||
|
||||
which downloads the Romanian-English training files and preprocesses them (tokenization,
|
||||
truecasing, segmentation into subwords units).
|
||||
|
||||
To use with a different GPU than device 0 or more GPUs (here 0 1 2 3) type the command below.
|
||||
Training time on 1 NVIDIA GTX 1080 GPU should be roughly 24 hours.
|
||||
|
||||
```
|
||||
./run-me.sh 0 1 2 3
|
||||
```
|
||||
|
||||
Next it executes a training run with `marian`:
|
||||
|
||||
```
|
||||
../../build/marian \
|
||||
--model model/model.npz \
|
||||
--devices $GPUS \
|
||||
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
|
||||
--vocabs model/vocab.ro.yml model/vocab.en.yml \
|
||||
--dim-vocabs 66000 50000 \
|
||||
--mini-batch 80 \
|
||||
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
|
||||
--early-stopping 5 --moving-average \
|
||||
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
|
||||
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
|
||||
--valid-metrics cross-entropy valid-script \
|
||||
--valid-script-path ./scripts/validate.sh \
|
||||
--log model/train.log --valid-log model/valid.log
|
||||
```
|
||||
After training (the training should stop if cross-entropy on the validation set stops improving) a final model
|
||||
`model/model.avg.npz` is created from the 4 best models on the validation sets (by element-wise averaging). This model is used to
|
||||
translate the WMT2016 dev set and test set with `amun`:
|
||||
|
||||
```
|
||||
cat data/newstest2016.bpe.ro \
|
||||
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -b 12 -n --mini-batch 100 --maxi-batch 1000 \
|
||||
| sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl \
|
||||
> data/newstest2016.bpe.ro.output
|
||||
```
|
||||
after which BLEU scores for the dev and test set are reported. Results should be somewhere in the area of:
|
||||
|
||||
```
|
||||
newsdev2016:
|
||||
BLEU = 35.88, 67.4/42.3/28.8/20.2 (BP=1.000, ratio=1.012, hyp_len=51085, ref_len=50483)
|
||||
|
||||
newstest2016:
|
||||
BLEU = 34.53, 66.0/40.7/27.5/19.2 (BP=1.000, ratio=1.015, hyp_len=49258, ref_len=48531)
|
||||
```
|
||||
|
||||
## Custom validation script
|
||||
|
||||
The validation script `scripts/validate.sh` is a quick example how to write a custom validation script. The training pauses until the validation script finishes executing. A validation script should not output anything to `stdout` apart from the final single score (last line):
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
|
||||
#model prefix
|
||||
prefix=model/model.npz
|
||||
|
||||
dev=data/newsdev2016.bpe.ro
|
||||
ref=data/newsdev2016.tok.en
|
||||
|
||||
# decode
|
||||
|
||||
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml --mini-batch 10 --maxi-batch 100 2>/dev/null \
|
||||
| sed 's/\@\@ //g' | ./mosesdecoder/scripts/recaser/detruecase.perl > $dev.output.postprocessed
|
||||
|
||||
## get BLEU
|
||||
./mosesdecoder/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed \
|
||||
| cut -f 3 -d ' ' | cut -f 1 -d ','
|
||||
```
|
@ -1,4 +0,0 @@
|
||||
#!/bin/bash -v
|
||||
|
||||
git clean -f
|
||||
rm -rf moses-scripts subword-nmt model data/ro-en.tgz
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,86 +0,0 @@
|
||||
#!/bin/bash -v
|
||||
|
||||
# set chosen gpus
|
||||
GPUS=0
|
||||
if [ $# -ne 0 ]
|
||||
then
|
||||
GPUS=$@
|
||||
fi
|
||||
echo Using gpus $GPUS
|
||||
|
||||
if [ ! -e ../../build/amun ]
|
||||
then
|
||||
echo "amun is not installed in ../../build, you need to compile the toolkit first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -e ../../build/marian ]
|
||||
then
|
||||
echo "marian is not installed in ../../build, you need to compile the toolkit first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# download dependencies and data
|
||||
if [ ! -e "moses-scripts" ]
|
||||
then
|
||||
git clone https://github.com/amunmt/moses-scripts
|
||||
fi
|
||||
|
||||
if [ ! -e "subword-nmt" ]
|
||||
then
|
||||
git clone https://github.com/rsennrich/subword-nmt
|
||||
fi
|
||||
|
||||
if [ ! -e "data/ro-en.tgz" ]
|
||||
then
|
||||
./scripts/download-files.sh
|
||||
fi
|
||||
|
||||
mkdir -p model
|
||||
|
||||
# preprocess data
|
||||
if [ ! -e "data/corpus.bpe.en" ]
|
||||
then
|
||||
./scripts/preprocess.sh
|
||||
fi
|
||||
|
||||
# train model
|
||||
if [ ! -e "model/model.npz" ]
|
||||
then
|
||||
|
||||
../../build/marian \
|
||||
--model model/model.npz \
|
||||
--devices $GPUS --seed 0 \
|
||||
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
|
||||
--vocabs model/vocab.ro.yml model/vocab.en.yml \
|
||||
--dim-vocabs 66000 50000 \
|
||||
--dynamic-batching -w 3000 \
|
||||
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
|
||||
--early-stopping 5 --moving-average \
|
||||
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
|
||||
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
|
||||
--valid-metrics cross-entropy valid-script \
|
||||
--valid-script-path ./scripts/validate.sh \
|
||||
--log model/train.log --valid-log model/valid.log
|
||||
|
||||
fi
|
||||
|
||||
# collect 4 best models on dev set
|
||||
MODELS=`cat model/valid.log | grep valid-script | sort -rg -k8,8 -t ' ' | cut -f 4 -d ' ' | head -n 4 | xargs -I {} echo model/model.iter{}.npz | xargs`
|
||||
|
||||
# average 4 best models into single model
|
||||
../../scripts/average.py -m $MODELS -o model/model.avg.npz
|
||||
|
||||
# translate dev set with averaged model
|
||||
cat data/newsdev2016.bpe.ro \
|
||||
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
|
||||
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newsdev2016.bpe.ro.output.postprocessed
|
||||
|
||||
# translate test set with averaged model
|
||||
cat data/newstest2016.bpe.ro \
|
||||
| ../../build/amun -c model/model.npz.amun.yml -m model/model.avg.npz -d $GPUS -b 12 -n --mini-batch 10 --maxi-batch 1000 \
|
||||
| sed 's/\@\@ //g' | moses-scripts/scripts/recaser/detruecase.perl > data/newstest2016.bpe.ro.output.postprocessed
|
||||
|
||||
# calculate bleu scores for dev and test set
|
||||
./moses-scripts/scripts/generic/multi-bleu.perl data/newsdev2016.tok.en < data/newsdev2016.bpe.ro.output.postprocessed
|
||||
./moses-scripts/scripts/generic/multi-bleu.perl data/newstest2016.tok.en < data/newstest2016.bpe.ro.output.postprocessed
|
@ -1,29 +0,0 @@
|
||||
#!/bin/bash -v
|
||||
|
||||
# get En-Ro training data for WMT16
|
||||
|
||||
if [ ! -f data/ro-en.tgz ];
|
||||
then
|
||||
wget http://www.statmt.org/europarl/v7/ro-en.tgz -O data/ro-en.tgz
|
||||
fi
|
||||
|
||||
if [ ! -f data/SETIMES2.ro-en.txt.zip ];
|
||||
then
|
||||
wget http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip -O data/SETIMES2.ro-en.txt.zip
|
||||
fi
|
||||
|
||||
if [ ! -f data/corpus.bt.ro-en.en.gz ];
|
||||
then
|
||||
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.en.gz -O data/corpus.bt.ro-en.en.gz
|
||||
wget http://data.statmt.org/rsennrich/wmt16_backtranslations/ro-en/corpus.bt.ro-en.ro.gz -O data/corpus.bt.ro-en.ro.gz
|
||||
fi
|
||||
|
||||
cd data/
|
||||
tar -xf ro-en.tgz
|
||||
unzip SETIMES2.ro-en.txt.zip
|
||||
gzip -d corpus.bt.ro-en.en.gz corpus.bt.ro-en.ro.gz
|
||||
|
||||
cat europarl-v7.ro-en.en SETIMES2.en-ro.en corpus.bt.ro-en.en > corpus.en
|
||||
cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro corpus.bt.ro-en.ro > corpus.ro
|
||||
|
||||
cd ..
|
@ -1,17 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Barry Haddow
|
||||
# Distributed under MIT license
|
||||
|
||||
#
|
||||
# Normalise Romanian s-comma and t-comma
|
||||
|
||||
import io
|
||||
import sys
|
||||
istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
|
||||
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
|
||||
for line in istream:
|
||||
line = line.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
|
||||
line = line.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
|
||||
ostream.write(line)
|
@ -1,75 +0,0 @@
|
||||
#!/bin/bash -v
|
||||
|
||||
# this sample script preprocesses a sample corpus, including tokenization,
|
||||
# truecasing, and subword segmentation.
|
||||
# for application to a different language pair,
|
||||
# change source and target prefix, optionally the number of BPE operations,
|
||||
# and the file names (currently, data/corpus and data/newsdev2016 are being processed)
|
||||
|
||||
# in the tokenization step, you will want to remove Romanian-specific normalization / diacritic removal,
|
||||
# and you may want to add your own.
|
||||
# also, you may want to learn BPE segmentations separately for each language,
|
||||
# especially if they differ in their alphabet
|
||||
|
||||
# suffix of source language files
|
||||
SRC=ro
|
||||
|
||||
# suffix of target language files
|
||||
TRG=en
|
||||
|
||||
# number of merge operations. Network vocabulary should be slightly larger (to include characters),
|
||||
# or smaller if the operations are learned on the joint vocabulary
|
||||
bpe_operations=85000
|
||||
|
||||
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
|
||||
mosesdecoder=moses-scripts
|
||||
|
||||
# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
|
||||
subword_nmt=subword-nmt
|
||||
|
||||
# tokenize
|
||||
for prefix in corpus newsdev2016 newstest2016
|
||||
do
|
||||
cat data/$prefix.$SRC \
|
||||
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
|
||||
| ./scripts/normalise-romanian.py \
|
||||
| ./scripts/remove-diacritics.py \
|
||||
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
|
||||
|
||||
cat data/$prefix.$TRG \
|
||||
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
|
||||
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
|
||||
|
||||
done
|
||||
|
||||
# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
|
||||
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80
|
||||
|
||||
# train truecaser
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/truecase-model.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/truecase-model.$TRG
|
||||
|
||||
# apply truecaser (cleaned training corpus)
|
||||
for prefix in corpus
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# apply truecaser (dev/test files)
|
||||
for prefix in newsdev2016 newstest2016
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# train BPE
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
|
||||
|
||||
# apply BPE
|
||||
|
||||
for prefix in corpus newsdev2016 newstest2016
|
||||
do
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
done
|
@ -1,20 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Barry Haddow
|
||||
# Distributed under MIT license
|
||||
|
||||
#
|
||||
# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised
|
||||
|
||||
import io
|
||||
import sys
|
||||
istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
|
||||
ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
|
||||
for line in istream:
|
||||
line = line.replace("\u0218", "S").replace("\u0219", "s") #s-comma
|
||||
line = line.replace("\u021a", "T").replace("\u021b", "t") #t-comma
|
||||
line = line.replace("\u0102", "A").replace("\u0103", "a")
|
||||
line = line.replace("\u00C2", "A").replace("\u00E2", "a")
|
||||
line = line.replace("\u00CE", "I").replace("\u00EE", "i")
|
||||
ostream.write(line)
|
@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# model prefix
|
||||
prefix=model/model.npz
|
||||
|
||||
dev=data/newsdev2016.bpe.ro
|
||||
ref=data/newsdev2016.tok.en
|
||||
|
||||
# decode
|
||||
|
||||
cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml -b 12 -n --mini-batch 10 --maxi-batch 100 2>/dev/null \
|
||||
| sed 's/\@\@ //g' | ./moses-scripts/scripts/recaser/detruecase.perl > $dev.output.postprocessed
|
||||
|
||||
# get BLEU
|
||||
./moses-scripts/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed | cut -f 3 -d ' ' | cut -f 1 -d ','
|
4
examples/translate/.gitignore
vendored
4
examples/translate/.gitignore
vendored
@ -1,4 +0,0 @@
|
||||
moses-scripts
|
||||
en-de
|
||||
data/*.out
|
||||
*.yml
|
@ -1,2 +0,0 @@
|
||||
Translation example
|
||||
===================
|
@ -1,3 +0,0 @@
|
||||
#!/bin/bash -v
|
||||
|
||||
rm -rf moses-scripts en-de data/*.out *.yml
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,60 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# set chosen gpus
|
||||
GPUS=0
|
||||
if [ $# -ne 0 ]
|
||||
then
|
||||
GPUS=$@
|
||||
fi
|
||||
echo Using gpus $GPUS
|
||||
|
||||
if [ ! -e ../../build/amun ]
|
||||
then
|
||||
echo "amun is not installed in ../../build, you need to compile the toolkit first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# download dependencies and data
|
||||
if [ ! -e "moses-scripts" ]
|
||||
then
|
||||
git clone https://github.com/amunmt/moses-scripts
|
||||
fi
|
||||
|
||||
if [ ! -e "en-de/model.npz" ]
|
||||
then
|
||||
wget -r -l 1 --cut-dirs=2 -e robots=off -nH -np -R index.html* http://data.statmt.org/rsennrich/wmt16_systems/en-de/
|
||||
fi
|
||||
|
||||
|
||||
# translate test set with single model
|
||||
cat data/newstest2015.ende.en | \
|
||||
# preprocess
|
||||
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
|
||||
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
|
||||
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
|
||||
# translate
|
||||
../../build/amun -m en-de/model.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
|
||||
--mini-batch 50 --maxi-batch 1000 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe | \
|
||||
# postprocess
|
||||
moses-scripts/scripts/recaser/detruecase.perl | \
|
||||
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.single.out
|
||||
|
||||
# create configuration file for model ensemble
|
||||
../../build/amun -m en-de/model-ens?.npz -s en-de/vocab.en.json -t en-de/vocab.de.json \
|
||||
--mini-batch 1 --maxi-batch 1 -d $GPUS --gpu-threads 1 -b 12 -n --bpe en-de/ende.bpe \
|
||||
--relative-paths --dump-config > ensemble.yml
|
||||
|
||||
# translate test set with ensemble
|
||||
cat data/newstest2015.ende.en | \
|
||||
# preprocess
|
||||
moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en | \
|
||||
moses-scripts/scripts/tokenizer/tokenizer.perl -l en -penn | \
|
||||
moses-scripts/scripts/recaser/truecase.perl -model en-de/truecase-model.en | \
|
||||
# translate
|
||||
../../build/amun -c ensemble.yml --gpu-threads 1 | \
|
||||
# postprocess
|
||||
moses-scripts/scripts/recaser/detruecase.perl | \
|
||||
moses-scripts/scripts/tokenizer/detokenizer.perl -l de > data/newstest2015.ensemble.out
|
||||
|
||||
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.single.out
|
||||
moses-scripts/scripts/generic/multi-bleu.perl data/newstest2015.ende.de < data/newstest2015.ensemble.out
|
@ -9,17 +9,19 @@
|
||||
|
||||
namespace amunmt {
|
||||
|
||||
class God;
|
||||
|
||||
class BestHypsBase
|
||||
{
|
||||
public:
|
||||
BestHypsBase(
|
||||
const God &god,
|
||||
bool forbidUNK,
|
||||
bool returnNBestList,
|
||||
bool isInputFiltered,
|
||||
bool returnAttentionWeights,
|
||||
const std::map<std::string, float>& weights)
|
||||
: forbidUNK_(forbidUNK),
|
||||
returnNBestList_(returnNBestList),
|
||||
: god_(god),
|
||||
forbidUNK_(forbidUNK),
|
||||
isInputFiltered_(isInputFiltered),
|
||||
returnAttentionWeights_(returnAttentionWeights),
|
||||
weights_(weights)
|
||||
@ -35,8 +37,8 @@ class BestHypsBase
|
||||
std::vector<uint>& beamSizes) = 0;
|
||||
|
||||
protected:
|
||||
const God &god_;
|
||||
const bool forbidUNK_;
|
||||
const bool returnNBestList_;
|
||||
const bool isInputFiltered_;
|
||||
const bool returnAttentionWeights_;
|
||||
const std::map<std::string, float> weights_;
|
||||
|
@ -83,6 +83,19 @@ God& God::Init(int argc, char** argv) {
|
||||
LoadScorers();
|
||||
LoadFiltering();
|
||||
|
||||
returnNBestList_ = Get<bool>("return-alignment")
|
||||
|| Get<bool>("return-soft-alignment")
|
||||
|| Get<bool>("return-nematus-alignment");
|
||||
|
||||
useFusedSoftmax_ = true;
|
||||
if (returnNBestList_ ||
|
||||
gpuLoaders_.size() != 1 || // more than 1 scorer
|
||||
God::Get<size_t>("beam-size") > 11 // beam size affect shared mem alloc in gLogSoftMax()
|
||||
) {
|
||||
useFusedSoftmax_ = false;
|
||||
}
|
||||
//cerr << "useFusedSoftmax_=" << useFusedSoftmax_ << endl;
|
||||
|
||||
if (Has("input-file")) {
|
||||
LOG(info)->info("Reading from {}", Get<std::string>("input-file"));
|
||||
inputStream_.reset(new InputFileStream(Get<std::string>("input-file")));
|
||||
|
@ -77,6 +77,12 @@ class God {
|
||||
ThreadPool &GetThreadPool()
|
||||
{ return *pool_; }
|
||||
|
||||
bool ReturnNBestList() const
|
||||
{ return returnNBestList_; }
|
||||
|
||||
bool UseFusedSoftmax() const
|
||||
{ return useFusedSoftmax_; }
|
||||
|
||||
private:
|
||||
void LoadScorers();
|
||||
void LoadFiltering();
|
||||
@ -107,6 +113,9 @@ class God {
|
||||
mutable boost::shared_mutex accessLock_;
|
||||
|
||||
std::unique_ptr<ThreadPool> pool_;
|
||||
|
||||
bool returnNBestList_;
|
||||
bool useFusedSoftmax_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -5,7 +5,10 @@ namespace amunmt {
|
||||
Scorer::Scorer(const God &god,
|
||||
const std::string& name,
|
||||
const YAML::Node& config, size_t tab)
|
||||
: name_(name), config_(config), tab_(tab)
|
||||
:god_(god)
|
||||
,name_(name)
|
||||
,config_(config)
|
||||
,tab_(tab)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -64,8 +64,11 @@ class Scorer {
|
||||
}
|
||||
|
||||
virtual BaseMatrix& GetProbs() = 0;
|
||||
virtual void *GetNBest() = 0; // hack - need to return matrix<NthOut> but NthOut contain cuda code
|
||||
virtual const BaseMatrix *GetBias() const = 0;
|
||||
|
||||
protected:
|
||||
const God &god_;
|
||||
const std::string& name_;
|
||||
const YAML::Node& config_;
|
||||
size_t tab_;
|
||||
|
@ -26,9 +26,8 @@ class BestHyps : public BestHypsBase
|
||||
{
|
||||
public:
|
||||
BestHyps(const God &god)
|
||||
: BestHypsBase(
|
||||
: BestHypsBase(god,
|
||||
!god.Get<bool>("allow-unk"),
|
||||
god.Get<bool>("n-best"),
|
||||
god.Get<std::vector<std::string>>("softmax-filter").size(),
|
||||
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
|
||||
god.GetScorerWeights())
|
||||
@ -83,7 +82,7 @@ class BestHyps : public BestHypsBase
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> breakDowns;
|
||||
if (returnNBestList_) {
|
||||
if (god_.ReturnNBestList()) {
|
||||
breakDowns.push_back(bestCosts);
|
||||
for (auto& scorer : scorers) {
|
||||
std::vector<float> modelCosts(beamSize);
|
||||
@ -123,7 +122,7 @@ class BestHyps : public BestHypsBase
|
||||
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
|
||||
}
|
||||
|
||||
if (returnNBestList_) {
|
||||
if (god_.ReturnNBestList()) {
|
||||
hyp->GetCostBreakdown().resize(scorers.size());
|
||||
float sum = 0;
|
||||
for(size_t j = 0; j < scorers.size(); ++j) {
|
||||
|
@ -25,6 +25,18 @@ class CPUEncoderDecoderBase : public Scorer {
|
||||
virtual void GetAttention(mblas::Matrix& Attention) = 0;
|
||||
virtual mblas::Matrix& GetAttention() = 0;
|
||||
|
||||
virtual void *GetNBest()
|
||||
{
|
||||
assert(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
virtual const BaseMatrix *GetBias() const
|
||||
{
|
||||
assert(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
protected:
|
||||
mblas::Matrix SourceContext_;
|
||||
};
|
||||
|
@ -4,16 +4,20 @@ namespace amunmt {
|
||||
namespace GPU {
|
||||
|
||||
BestHyps::BestHyps(const God &god)
|
||||
: BestHypsBase(
|
||||
: BestHypsBase(god,
|
||||
!god.Get<bool>("allow-unk"),
|
||||
god.Get<bool>("n-best"),
|
||||
god.Get<std::vector<std::string>>("softmax-filter").size(),
|
||||
god.Get<bool>("return-alignment") || god.Get<bool>("return-soft-alignment") || god.Get<bool>("return-nematus-alignment"),
|
||||
god.GetScorerWeights()),
|
||||
nthElement_(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch")),
|
||||
keys(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
|
||||
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch"))
|
||||
{}
|
||||
Costs(god.Get<size_t>("beam-size") * god.Get<size_t>("mini-batch")),
|
||||
maxBeamSize_(god.Get<uint>("beam-size"))
|
||||
{
|
||||
if (!god_.UseFusedSoftmax()) {
|
||||
NthElement *obj = new NthElement(god.Get<size_t>("beam-size"), god.Get<size_t>("mini-batch"));
|
||||
nthElement_.reset(obj);
|
||||
}
|
||||
}
|
||||
|
||||
void BestHyps::DisAllowUNK(mblas::Matrix& Prob) {
|
||||
SetColumn(Prob, UNK_ID, std::numeric_limits<float>::lowest());
|
||||
@ -24,7 +28,17 @@ void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Prob
|
||||
std::vector<unsigned>& outKeys,
|
||||
const bool isFirst)
|
||||
{
|
||||
nthElement_.getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
|
||||
nthElement_->getNBestList(beamSizes, Probs, outCosts, outKeys, isFirst);
|
||||
}
|
||||
|
||||
// fast fused softmax and nth_element
|
||||
void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
|
||||
DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<unsigned>& outKeys,
|
||||
const bool isFirst)
|
||||
{
|
||||
getNBestList(beamSizes, Probs, nBest, outCosts, outKeys, isFirst);
|
||||
}
|
||||
|
||||
std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPtr>& scorers,
|
||||
@ -52,6 +66,7 @@ std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPt
|
||||
return alignments;
|
||||
}
|
||||
|
||||
// standard nth_element
|
||||
void BestHyps::CalcBeam(
|
||||
const Beam& prevHyps,
|
||||
const std::vector<ScorerPtr>& scorers,
|
||||
@ -77,35 +92,49 @@ void BestHyps::CalcBeam(
|
||||
cudaMemcpyHostToDevice);
|
||||
//mblas::copy(vCosts.begin(), vCosts.end(), Costs.begin());
|
||||
|
||||
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
|
||||
|
||||
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
|
||||
|
||||
for (size_t i = 1; i < scorers.size(); ++i) {
|
||||
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
|
||||
|
||||
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
|
||||
}
|
||||
|
||||
if (forbidUNK_) {
|
||||
DisAllowUNK(Probs);
|
||||
}
|
||||
|
||||
size_t beamSizeSum = std::accumulate(beamSizes.begin(), beamSizes.end(), 0);
|
||||
|
||||
std::vector<float> bestCosts;
|
||||
std::vector<unsigned> bestKeys;
|
||||
|
||||
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
|
||||
const bool isFirst = (vCosts[0] == 0.0f) ? true : false;
|
||||
|
||||
if (god_.UseFusedSoftmax()) {
|
||||
const mblas::Matrix& b4 = *static_cast<const mblas::Matrix*>(scorers[0]->GetBias());
|
||||
DeviceVector<NthOutBatch> &nBest = *static_cast<DeviceVector<NthOutBatch>*>(scorers[0]->GetNBest());
|
||||
nBest.resize(beamSizeSum);
|
||||
|
||||
BEGIN_TIMER("GetProbs.LogSoftmaxAndNBest");
|
||||
mblas::LogSoftmaxAndNBest(nBest, Probs, b4, Costs, forbidUNK_, maxBeamSize_, beamSizes, beamSizeSum, isFirst);
|
||||
PAUSE_TIMER("GetProbs.LogSoftmaxAndNBest");
|
||||
//std::cerr << "2Probs=" << Probs.Debug(1) << std::endl;
|
||||
|
||||
FindBests(beamSizes, Probs, nBest, bestCosts, bestKeys, isFirst);
|
||||
}
|
||||
else {
|
||||
BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, Costs);
|
||||
|
||||
for (size_t i = 1; i < scorers.size(); ++i) {
|
||||
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
|
||||
|
||||
Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs);
|
||||
}
|
||||
|
||||
if (forbidUNK_) {
|
||||
DisAllowUNK(Probs);
|
||||
}
|
||||
|
||||
FindBests(beamSizes, Probs, bestCosts, bestKeys, isFirst);
|
||||
}
|
||||
|
||||
std::vector<HostVector<float>> breakDowns;
|
||||
if (returnNBestList_) {
|
||||
if (god_.ReturnNBestList()) {
|
||||
breakDowns.push_back(bestCosts);
|
||||
for (size_t i = 1; i < scorers.size(); ++i) {
|
||||
std::vector<float> modelCosts(beamSizeSum);
|
||||
mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs());
|
||||
|
||||
nthElement_.getValueByKey(modelCosts, currProbs);
|
||||
nthElement_->getValueByKey(modelCosts, currProbs);
|
||||
breakDowns.push_back(modelCosts);
|
||||
}
|
||||
}
|
||||
@ -135,7 +164,7 @@ void BestHyps::CalcBeam(
|
||||
hyp.reset(new Hypothesis(prevHyps[hypIndex], wordIndex, hypIndex, cost));
|
||||
}
|
||||
|
||||
if(returnNBestList_) {
|
||||
if(god_.ReturnNBestList()) {
|
||||
hyp->GetCostBreakdown().resize(scorers.size());
|
||||
float sum = 0;
|
||||
for (size_t j = 0; j < scorers.size(); ++j) {
|
||||
@ -162,5 +191,43 @@ void BestHyps::CalcBeam(
|
||||
PAUSE_TIMER("CalcBeam");
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
void BestHyps::getNBestList(const std::vector<uint>& beamSizes,
|
||||
mblas::Matrix& Probs,
|
||||
DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<uint>& outKeys,
|
||||
const bool isFirst) const
|
||||
{
|
||||
GetPairs(nBest, outKeys, outCosts);
|
||||
assert(outCosts.size() == outKeys.size());
|
||||
|
||||
/*
|
||||
cerr << "outCosts/outKeys=";
|
||||
for (size_t i = 0; i < outKeys.size(); ++i) {
|
||||
cerr << "(" << outCosts[i] << "," << outKeys[i] << ") ";
|
||||
}
|
||||
cerr << endl;
|
||||
*/
|
||||
//cerr << endl;
|
||||
}
|
||||
|
||||
void BestHyps::GetPairs(DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<uint>& outKeys,
|
||||
std::vector<float>& outValues) const
|
||||
{
|
||||
//cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
|
||||
outKeys.resize(nBest.size());
|
||||
outValues.resize(nBest.size());
|
||||
|
||||
HostVector<NthOutBatch> hostVec(nBest.size());
|
||||
mblas::copy(thrust::raw_pointer_cast(nBest.data()), nBest.size(), thrust::raw_pointer_cast(hostVec.data()), cudaMemcpyDeviceToHost);
|
||||
|
||||
for (size_t i = 0; i < nBest.size(); ++i) {
|
||||
outKeys[i] = hostVec[i].ind;
|
||||
outValues[i] = hostVec[i].score;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ class BestHyps : public BestHypsBase
|
||||
|
||||
void DisAllowUNK(mblas::Matrix& Prob);
|
||||
|
||||
// standard nth_element
|
||||
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<unsigned>& outKeys,
|
||||
@ -31,6 +32,7 @@ class BestHyps : public BestHypsBase
|
||||
|
||||
std::vector<SoftAlignmentPtr> GetAlignments(const std::vector<ScorerPtr>& scorers,
|
||||
size_t hypIndex);
|
||||
|
||||
void CalcBeam(
|
||||
const Beam& prevHyps,
|
||||
const std::vector<ScorerPtr>& scorers,
|
||||
@ -39,9 +41,29 @@ class BestHyps : public BestHypsBase
|
||||
std::vector<uint>& beamSizes);
|
||||
|
||||
private:
|
||||
NthElement nthElement_;
|
||||
std::unique_ptr<NthElement> nthElement_;
|
||||
DeviceVector<unsigned> keys;
|
||||
DeviceVector<float> Costs;
|
||||
uint maxBeamSize_;
|
||||
|
||||
// fast fused softmax and nth_element
|
||||
void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
|
||||
DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<unsigned>& outKeys,
|
||||
const bool isFirst);
|
||||
|
||||
void getNBestList(const std::vector<uint>& beamSizes,
|
||||
mblas::Matrix& Probs,
|
||||
DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<uint>& outKeys,
|
||||
const bool isFirst=false) const;
|
||||
|
||||
void GetPairs(DeviceVector<NthOutBatch> &nBest,
|
||||
std::vector<uint>& outKeys,
|
||||
std::vector<float>& outValues) const;
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -70,8 +70,9 @@ void EncoderDecoder::Decode(const State& in, State& out, const std::vector<uint>
|
||||
edIn.GetStates(),
|
||||
edIn.GetEmbeddings(),
|
||||
*SourceContext_,
|
||||
sentencesMask_,
|
||||
beamSizes);
|
||||
sentenceLengths_,
|
||||
beamSizes,
|
||||
god_.UseFusedSoftmax());
|
||||
PAUSE_TIMER("Decode");
|
||||
}
|
||||
|
||||
@ -81,7 +82,7 @@ State* EncoderDecoder::NewState() const {
|
||||
|
||||
void EncoderDecoder::Encode(const Sentences& source) {
|
||||
BEGIN_TIMER("Encode");
|
||||
encoder_->Encode(source, tab_, *SourceContext_, sentencesMask_);
|
||||
encoder_->Encode(source, tab_, *SourceContext_, sentenceLengths_);
|
||||
//cerr << "GPU SourceContext_=" << SourceContext_.Debug(1) << endl;
|
||||
PAUSE_TIMER("Encode");
|
||||
}
|
||||
@ -89,7 +90,7 @@ void EncoderDecoder::Encode(const Sentences& source) {
|
||||
void EncoderDecoder::BeginSentenceState(State& state, size_t batchSize) {
|
||||
//BEGIN_TIMER("BeginSentenceState");
|
||||
EDState& edState = state.get<EDState>();
|
||||
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentencesMask_);
|
||||
decoder_->EmptyState(edState.GetStates(), *SourceContext_, batchSize, sentenceLengths_);
|
||||
|
||||
decoder_->EmptyEmbedding(edState.GetEmbeddings(), batchSize);
|
||||
//PAUSE_TIMER("BeginSentenceState");
|
||||
@ -143,6 +144,16 @@ BaseMatrix& EncoderDecoder::GetProbs() {
|
||||
return decoder_->GetProbs();
|
||||
}
|
||||
|
||||
void *EncoderDecoder::GetNBest()
|
||||
{
|
||||
return &decoder_->GetNBest();
|
||||
}
|
||||
|
||||
const BaseMatrix *EncoderDecoder::GetBias() const
|
||||
{
|
||||
return decoder_->GetBias();
|
||||
}
|
||||
|
||||
mblas::Matrix& EncoderDecoder::GetAttention() {
|
||||
return decoder_->GetAttention();
|
||||
}
|
||||
|
@ -50,6 +50,9 @@ class EncoderDecoder : public Scorer {
|
||||
mblas::Matrix& GetAttention();
|
||||
virtual BaseMatrix& GetProbs();
|
||||
|
||||
virtual void *GetNBest();
|
||||
virtual const BaseMatrix *GetBias() const;
|
||||
|
||||
size_t GetVocabSize() const;
|
||||
|
||||
void Filter(const std::vector<size_t>& filterIds);
|
||||
@ -59,7 +62,7 @@ class EncoderDecoder : public Scorer {
|
||||
std::unique_ptr<Encoder> encoder_;
|
||||
std::unique_ptr<Decoder> decoder_;
|
||||
DeviceVector<uint> indices_;
|
||||
mblas::IMatrix sentencesMask_;
|
||||
mblas::IMatrix sentenceLengths_;
|
||||
// set in Encoder::GetContext() to length (maxSentenceLength * batchSize). 1 if it's a word, 0 otherwise
|
||||
|
||||
std::unique_ptr<mblas::Matrix> SourceContext_;
|
||||
|
@ -66,7 +66,7 @@ class Decoder {
|
||||
void InitializeState(CellState& State,
|
||||
const mblas::Matrix& SourceContext,
|
||||
const size_t batchSize,
|
||||
const mblas::IMatrix &sentencesMask)
|
||||
const mblas::IMatrix &sentenceLengths)
|
||||
{
|
||||
using namespace mblas;
|
||||
|
||||
@ -81,7 +81,7 @@ class Decoder {
|
||||
|
||||
//std::cerr << "SourceContext=" << SourceContext.Debug(1) << std::endl;
|
||||
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
|
||||
Mean(Temp2_, SourceContext, sentencesMask);
|
||||
Mean(Temp2_, SourceContext, sentenceLengths);
|
||||
|
||||
//std::cerr << "1State=" << State.Debug(1) << std::endl;
|
||||
//std::cerr << "3Temp2_=" << Temp2_.Debug(1) << std::endl;
|
||||
@ -156,7 +156,7 @@ class Decoder {
|
||||
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
|
||||
const CellState& HiddenState,
|
||||
const mblas::Matrix& SourceContext,
|
||||
const mblas::IMatrix &sentencesMask,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
const std::vector<uint>& beamSizes)
|
||||
{
|
||||
// mapping = 1/0 whether each position, in each sentence in the batch is actually a valid word
|
||||
@ -165,6 +165,7 @@ class Decoder {
|
||||
|
||||
using namespace mblas;
|
||||
|
||||
size_t maxLength = SourceContext.dim(0);
|
||||
size_t batchSize = SourceContext.dim(3);
|
||||
//std::cerr << "batchSize=" << batchSize << std::endl;
|
||||
//std::cerr << "HiddenState=" << HiddenState.Debug(0) << std::endl;
|
||||
@ -182,11 +183,13 @@ class Decoder {
|
||||
batchMapping.size(),
|
||||
thrust::raw_pointer_cast(dBatchMapping_.data()),
|
||||
cudaMemcpyHostToDevice);
|
||||
//std::cerr << "mapping=" << Debug(mapping, 2) << std::endl;
|
||||
//std::cerr << "batchMapping=" << Debug(batchMapping, 2) << std::endl;
|
||||
//std::cerr << "dBatchMapping_=" << Debug(dBatchMapping_, 2) << std::endl;
|
||||
|
||||
const size_t srcSize = sentencesMask.size() / beamSizes.size();
|
||||
/*
|
||||
std::cerr << "SourceContext=" << SourceContext.Debug(0) << std::endl;
|
||||
std::cerr << "AlignedSourceContext=" << AlignedSourceContext.Debug(0) << std::endl;
|
||||
std::cerr << "A_=" << A_.Debug(0) << std::endl;
|
||||
std::cerr << "sentenceLengths=" << sentenceLengths.Debug(2) << std::endl;
|
||||
*/
|
||||
|
||||
Prod(/*h_[1],*/ Temp2_, *(HiddenState.output), *w_.W_);
|
||||
//std::cerr << "1Temp2_=" << Temp2_.Debug() << std::endl;
|
||||
@ -198,14 +201,14 @@ class Decoder {
|
||||
}
|
||||
//std::cerr << "2Temp2_=" << Temp2_.Debug() << std::endl;
|
||||
|
||||
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, srcSize);
|
||||
Broadcast(Tanh(_1 + _2), Temp1_, SCU_, Temp2_, dBatchMapping_, maxLength);
|
||||
|
||||
//std::cerr << "w_.V_=" << w_.V_->Debug(0) << std::endl;
|
||||
//std::cerr << "3Temp1_=" << Temp1_.Debug(0) << std::endl;
|
||||
|
||||
Prod(A_, *w_.V_, Temp1_, false, true);
|
||||
|
||||
mblas::Softmax(A_, dBatchMapping_, sentencesMask, batchSize);
|
||||
mblas::Softmax(A_, dBatchMapping_, sentenceLengths, batchSize);
|
||||
mblas::WeightedMean(AlignedSourceContext, A_, SourceContext, dBatchMapping_);
|
||||
|
||||
/*
|
||||
@ -253,9 +256,12 @@ class Decoder {
|
||||
}
|
||||
|
||||
void GetProbs(mblas::Matrix& Probs,
|
||||
std::shared_ptr<mblas::Matrix> &b4,
|
||||
const CellState& State,
|
||||
const mblas::Matrix& Embedding,
|
||||
const mblas::Matrix& AlignedSourceContext) {
|
||||
const mblas::Matrix& AlignedSourceContext,
|
||||
bool useFusedSoftmax)
|
||||
{
|
||||
using namespace mblas;
|
||||
|
||||
//BEGIN_TIMER("GetProbs.Prod");
|
||||
@ -298,7 +304,7 @@ class Decoder {
|
||||
Element(Tanh(_1 + _2 + _3), T1_, T2_, T3_);
|
||||
//PAUSE_TIMER("GetProbs.Element");
|
||||
|
||||
std::shared_ptr<mblas::Matrix> w4, b4;
|
||||
std::shared_ptr<mblas::Matrix> w4;
|
||||
if(!filtered_) {
|
||||
w4 = w_.W4_;
|
||||
b4 = w_.B4_;
|
||||
@ -315,13 +321,15 @@ class Decoder {
|
||||
Prod(Probs, T1_, *w4);
|
||||
PAUSE_TIMER("GetProbs.Prod4");
|
||||
|
||||
BEGIN_TIMER("GetProbs.BroadcastVec");
|
||||
BroadcastVec(_1 + _2, Probs, *b4);
|
||||
PAUSE_TIMER("GetProbs.BroadcastVec");
|
||||
if (!useFusedSoftmax) {
|
||||
BEGIN_TIMER("GetProbs.BroadcastVec");
|
||||
BroadcastVec(_1 + _2, Probs, *b4);
|
||||
PAUSE_TIMER("GetProbs.BroadcastVec");
|
||||
|
||||
BEGIN_TIMER("GetProbs.LogSoftMax");
|
||||
mblas::LogSoftmax(Probs);
|
||||
PAUSE_TIMER("GetProbs.LogSoftMax");
|
||||
BEGIN_TIMER("GetProbs.LogSoftMax");
|
||||
mblas::LogSoftmax(Probs);
|
||||
PAUSE_TIMER("GetProbs.LogSoftMax");
|
||||
}
|
||||
}
|
||||
|
||||
void Filter(const std::vector<size_t>& ids) {
|
||||
@ -365,8 +373,9 @@ class Decoder {
|
||||
const CellState& State,
|
||||
const mblas::Matrix& Embeddings,
|
||||
const mblas::Matrix& SourceContext,
|
||||
const mblas::IMatrix &sentencesMask,
|
||||
const std::vector<uint>& beamSizes)
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
const std::vector<uint>& beamSizes,
|
||||
bool useFusedSoftmax)
|
||||
{
|
||||
//BEGIN_TIMER("Decode");
|
||||
|
||||
@ -379,7 +388,11 @@ class Decoder {
|
||||
//PAUSE_TIMER("GetHiddenState");
|
||||
|
||||
//BEGIN_TIMER("GetAlignedSourceContext");
|
||||
GetAlignedSourceContext(AlignedSourceContext_, HiddenState_, SourceContext, sentencesMask, beamSizes);
|
||||
GetAlignedSourceContext(AlignedSourceContext_,
|
||||
HiddenState_,
|
||||
SourceContext,
|
||||
sentenceLengths,
|
||||
beamSizes);
|
||||
//std::cerr << "AlignedSourceContext_=" << AlignedSourceContext_.Debug(1) << std::endl;
|
||||
//PAUSE_TIMER("GetAlignedSourceContext");
|
||||
|
||||
@ -389,7 +402,7 @@ class Decoder {
|
||||
//PAUSE_TIMER("GetNextState");
|
||||
|
||||
//BEGIN_TIMER("GetProbs");
|
||||
GetProbs(NextState, Embeddings, AlignedSourceContext_);
|
||||
GetProbs(NextState, Embeddings, AlignedSourceContext_, useFusedSoftmax);
|
||||
//std::cerr << "Probs_=" << Probs_.Debug(1) << std::endl;
|
||||
//PAUSE_TIMER("GetProbs");
|
||||
|
||||
@ -403,9 +416,9 @@ class Decoder {
|
||||
void EmptyState(CellState& State,
|
||||
const mblas::Matrix& SourceContext,
|
||||
size_t batchSize,
|
||||
const mblas::IMatrix &sentencesMask)
|
||||
const mblas::IMatrix &sentenceLengths)
|
||||
{
|
||||
rnn1_.InitializeState(State, SourceContext, batchSize, sentencesMask);
|
||||
rnn1_.InitializeState(State, SourceContext, batchSize, sentenceLengths);
|
||||
alignment_.Init(SourceContext);
|
||||
}
|
||||
|
||||
@ -435,6 +448,14 @@ class Decoder {
|
||||
return alignment_.GetAttention();
|
||||
}
|
||||
|
||||
DeviceVector<NthOutBatch>& GetNBest() {
|
||||
return nBest_;
|
||||
}
|
||||
|
||||
const mblas::Matrix *GetBias() const {
|
||||
return b4_.get();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void GetHiddenState(CellState& HiddenState,
|
||||
@ -446,10 +467,13 @@ class Decoder {
|
||||
void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
|
||||
const CellState& HiddenState,
|
||||
const mblas::Matrix& SourceContext,
|
||||
const mblas::IMatrix &sentencesMask,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
const std::vector<uint>& beamSizes) {
|
||||
alignment_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext,
|
||||
sentencesMask, beamSizes);
|
||||
alignment_.GetAlignedSourceContext(AlignedSourceContext,
|
||||
HiddenState,
|
||||
SourceContext,
|
||||
sentenceLengths,
|
||||
beamSizes);
|
||||
}
|
||||
|
||||
void GetNextState(CellState& State,
|
||||
@ -461,8 +485,10 @@ class Decoder {
|
||||
|
||||
void GetProbs(const CellState& State,
|
||||
const mblas::Matrix& Embedding,
|
||||
const mblas::Matrix& AlignedSourceContext) {
|
||||
softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext);
|
||||
const mblas::Matrix& AlignedSourceContext,
|
||||
bool useFusedSoftmax)
|
||||
{
|
||||
softmax_.GetProbs(Probs_, b4_, State, Embedding, AlignedSourceContext, useFusedSoftmax);
|
||||
}
|
||||
|
||||
std::unique_ptr<Cell> InitHiddenCell(const Weights& model, const YAML::Node& config){
|
||||
@ -505,6 +531,9 @@ class Decoder {
|
||||
Alignment<Weights::DecAlignment> alignment_;
|
||||
Softmax<Weights::DecSoftmax> softmax_;
|
||||
|
||||
DeviceVector<NthOutBatch> nBest_;
|
||||
std::shared_ptr<mblas::Matrix> b4_;
|
||||
|
||||
Decoder(const Decoder&) = delete;
|
||||
};
|
||||
|
||||
|
@ -63,22 +63,19 @@ std::vector<std::vector<size_t>> GetBatchInput(const Sentences& source, size_t t
|
||||
}
|
||||
|
||||
void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context,
|
||||
mblas::IMatrix &sentencesMask)
|
||||
mblas::IMatrix &sentenceLengths)
|
||||
{
|
||||
size_t maxSentenceLength = GetMaxLength(source, tab);
|
||||
|
||||
//cerr << "1dMapping=" << mblas::Debug(dMapping, 2) << endl;
|
||||
HostVector<uint> hMapping(maxSentenceLength * source.size(), 0);
|
||||
HostVector<uint> hSentenceLengths(source.size());
|
||||
for (size_t i = 0; i < source.size(); ++i) {
|
||||
for (size_t j = 0; j < source.at(i)->GetWords(tab).size(); ++j) {
|
||||
hMapping[i * maxSentenceLength + j] = 1;
|
||||
}
|
||||
hSentenceLengths[i] = source.at(i)->GetWords(tab).size();
|
||||
}
|
||||
|
||||
sentencesMask.NewSize(maxSentenceLength, source.size(), 1, 1);
|
||||
mblas::copy(thrust::raw_pointer_cast(hMapping.data()),
|
||||
hMapping.size(),
|
||||
sentencesMask.data(),
|
||||
sentenceLengths.NewSize(source.size(), 1, 1, 1);
|
||||
mblas::copy(thrust::raw_pointer_cast(hSentenceLengths.data()),
|
||||
hSentenceLengths.size(),
|
||||
sentenceLengths.data(),
|
||||
cudaMemcpyHostToDevice);
|
||||
|
||||
//cerr << "GetContext1=" << context.Debug(1) << endl;
|
||||
@ -106,7 +103,7 @@ void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context
|
||||
|
||||
backwardRnn_.Encode(embeddedWords_.crend() - maxSentenceLength,
|
||||
embeddedWords_.crend() ,
|
||||
context, source.size(), true, &sentencesMask);
|
||||
context, source.size(), true, &sentenceLengths);
|
||||
//cerr << "GetContext5=" << context.Debug(1) << endl;
|
||||
}
|
||||
|
||||
|
@ -71,8 +71,9 @@ class Encoder {
|
||||
}
|
||||
|
||||
template <class It>
|
||||
void Encode(It it, It end, mblas::Matrix& Context, size_t batchSize, bool invert,
|
||||
const mblas::IMatrix *sentencesMask=nullptr)
|
||||
void Encode(It it, It end, mblas::Matrix& Context,
|
||||
size_t batchSize, bool invert,
|
||||
const mblas::IMatrix *sentenceLengths=nullptr)
|
||||
{
|
||||
InitializeState(batchSize);
|
||||
|
||||
@ -86,12 +87,15 @@ class Encoder {
|
||||
|
||||
//std::cerr << "invert=" << invert << std::endl;
|
||||
if(invert) {
|
||||
assert(sentencesMask);
|
||||
assert(sentenceLengths);
|
||||
|
||||
//std::cerr << "1State_=" << State_.Debug(1) << std::endl;
|
||||
//std::cerr << "mapping=" << mblas::Debug(*mapping) << std::endl;
|
||||
//mblas::MapMatrix(*(State_.cell), *sentencesMask, n - i - 1);
|
||||
mblas::MapMatrix(*(State_.output), *sentencesMask, n - i - 1);
|
||||
mblas::MapMatrix(*(State_.output), *sentenceLengths, n - i - 1);
|
||||
if (State_.cell->size()) {
|
||||
mblas::MapMatrix(*(State_.cell), *sentenceLengths, n - i - 1);
|
||||
}
|
||||
//std::cerr << "2State_=" << State_.Debug(1) << std::endl;
|
||||
|
||||
mblas::PasteRows(Context, *(State_.output), (n - i - 1), gru_->GetStateLength().output);
|
||||
@ -124,7 +128,7 @@ class Encoder {
|
||||
Encoder(const Weights& model, const YAML::Node& config);
|
||||
|
||||
void Encode(const Sentences& words, size_t tab, mblas::Matrix& context,
|
||||
mblas::IMatrix &sentencesMask);
|
||||
mblas::IMatrix &sentenceLengths);
|
||||
|
||||
private:
|
||||
std::unique_ptr<Cell> InitForwardCell(const Weights& model, const YAML::Node& config);
|
||||
|
@ -17,7 +17,7 @@ Matrix& Swap(Matrix& Out, Matrix& In) {
|
||||
|
||||
__global__ void gMean(MatrixWrapper<float> out,
|
||||
const MatrixWrapper<float> in,
|
||||
const MatrixWrapper<uint> mapping)
|
||||
const MatrixWrapper<uint> sentenceLengths)
|
||||
{
|
||||
// out = batches * states
|
||||
// in = max sentence length * states * 1 * batches
|
||||
@ -37,7 +37,7 @@ __global__ void gMean(MatrixWrapper<float> out,
|
||||
float sum = 0.0f;
|
||||
int counter = 0;
|
||||
for (size_t row = 0; row < in.dim(0); ++row) {
|
||||
int isWord = mapping(row, batch, 0, 0);
|
||||
bool isWord = row < sentenceLengths[batch];
|
||||
//printf("batch=%lu startMapInd=%lu mapOffset=%lu -> %d \n", batch, startMapInd, mapOffset, isWord);
|
||||
if (isWord) {
|
||||
sum += in(row, state, 0, batch);
|
||||
@ -50,13 +50,14 @@ __global__ void gMean(MatrixWrapper<float> out,
|
||||
}
|
||||
}
|
||||
|
||||
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
|
||||
void Mean(Matrix& Out,
|
||||
const Matrix& In,
|
||||
const mblas::IMatrix &sentenceLengths)
|
||||
{
|
||||
assert(Out.dim(2) == 1);
|
||||
assert(Out.dim(3) == 1);
|
||||
assert(Out.dim(0) == In.dim(3));
|
||||
assert(Out.dim(1) == In.dim(1));
|
||||
assert(In.dim(0) * In.dim(3) == sentencesMask.size());
|
||||
|
||||
// mean of each ROW
|
||||
size_t batchNum = Out.dim(0) * Out.dim(2) * Out.dim(3);
|
||||
@ -67,14 +68,14 @@ void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask)
|
||||
MatrixWrapper<float> inWrap(In);
|
||||
//cerr << "outWrap=" << outWrap.Debug() << endl;
|
||||
|
||||
MatrixWrapper<uint> mappingWrap(sentencesMask, false);
|
||||
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
|
||||
|
||||
uint size = outWrap.size();
|
||||
uint threads = std::min((uint)MAX_THREADS, size);
|
||||
uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
|
||||
|
||||
gMean<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
|
||||
(outWrap, inWrap, mappingWrap);
|
||||
(outWrap, inWrap, sentenceLengthsWrap);
|
||||
|
||||
}
|
||||
|
||||
@ -432,27 +433,27 @@ Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
|
||||
|
||||
__global__ void gSoftMax(MatrixWrapper<float> out,
|
||||
const MatrixWrapper<uint> batchIdsWrap,
|
||||
const MatrixWrapper<uint> sentencesMappingWrap,
|
||||
const MatrixWrapper<uint> sentenceLengthsWrap,
|
||||
uint shareSize)
|
||||
{
|
||||
extern __shared__ float _share[];
|
||||
|
||||
size_t numHypos = out.dim(0);
|
||||
size_t srcLen = out.dim(1);
|
||||
size_t maxLength = out.dim(1);
|
||||
|
||||
int hypoInd = blockIdx.x;
|
||||
int origSrcPos = threadIdx.x;
|
||||
|
||||
while (hypoInd < numHypos) {
|
||||
MatrixWrapper<float> _max(_share, shareSize);
|
||||
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
|
||||
_max[origSrcPos] = out(hypoInd, origSrcPos, 0, 0);
|
||||
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
|
||||
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
|
||||
int srcPos = tid + origSrcPos;
|
||||
if (srcPos < srcLen) {
|
||||
if (srcPos < maxLength) {
|
||||
float value = out(hypoInd, srcPos, 0, 0);
|
||||
|
||||
int batch = batchIdsWrap[hypoInd];
|
||||
value *= sentencesMappingWrap(srcPos, batch, 0, 0);
|
||||
value *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0;
|
||||
if (value > _max[origSrcPos]) {
|
||||
_max[origSrcPos] = value;
|
||||
}
|
||||
@ -475,16 +476,16 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
|
||||
__syncthreads();
|
||||
|
||||
//float* _sum = _share;// + blockDim.x;
|
||||
MatrixWrapper<float> _sum(_share, shareSize);
|
||||
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
|
||||
|
||||
_sum[origSrcPos] = 0.0f;
|
||||
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
|
||||
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
|
||||
int srcPos = tid + origSrcPos;
|
||||
if (srcPos < srcLen) {
|
||||
if (srcPos < maxLength) {
|
||||
out(hypoInd, srcPos, 0, 0) = __expf(out(hypoInd, srcPos, 0, 0) - max);
|
||||
|
||||
int batch = batchIdsWrap[hypoInd];
|
||||
out(hypoInd, srcPos, 0, 0) *= sentencesMappingWrap(srcPos, batch, 0, 0);
|
||||
out(hypoInd, srcPos, 0, 0) *= srcPos < sentenceLengthsWrap[batch] ? 1 : 0; // sentencesMappingWrap(srcPos, batch, 0, 0);
|
||||
_sum[origSrcPos] += out(hypoInd, srcPos, 0, 0);
|
||||
}
|
||||
}
|
||||
@ -504,9 +505,9 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int tid = 0; tid < srcLen; tid += blockDim.x) {
|
||||
for (int tid = 0; tid < maxLength; tid += blockDim.x) {
|
||||
int srcPos = tid + origSrcPos;
|
||||
if (srcPos < srcLen) {
|
||||
if (srcPos < maxLength) {
|
||||
out(hypoInd, srcPos, 0, 0) /= _sum[0];
|
||||
}
|
||||
}
|
||||
@ -515,20 +516,23 @@ __global__ void gSoftMax(MatrixWrapper<float> out,
|
||||
}
|
||||
}
|
||||
|
||||
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize)
|
||||
Matrix& Softmax(Matrix& Out,
|
||||
const DeviceVector<uint>& batchIds,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
size_t batchSize)
|
||||
{
|
||||
size_t srcSize = Out.dim(1);
|
||||
size_t maxLength = Out.dim(1);
|
||||
|
||||
MatrixWrapper<float> outWrap(Out);
|
||||
const MatrixWrapper<uint> batchIdsWrap(batchIds);
|
||||
const MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
|
||||
const MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths, false);
|
||||
|
||||
int blocks = batchSize;
|
||||
int threads = std::min(MAX_THREADS, (int)srcSize);
|
||||
int threads = std::min(MAX_THREADS, (int)maxLength);
|
||||
int shared = sizeof(float) * threads;
|
||||
|
||||
gSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
|
||||
(outWrap, batchIdsWrap, sentencesMappingWrap, threads);
|
||||
(outWrap, batchIdsWrap, sentenceLengthsWrap, threads);
|
||||
|
||||
return Out;
|
||||
}
|
||||
@ -544,7 +548,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
|
||||
|
||||
while (rowIdx < rows) {
|
||||
//float* _max = _share;
|
||||
MatrixWrapper<float> _max(_share, shareSize);
|
||||
MatrixWrapper<float> _max(_share, shareSize, 1, 1, 1);
|
||||
|
||||
_max[threadIdx.x] = out(rowIdx, threadIdx.x, 0, 0);
|
||||
for (int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
@ -573,7 +577,7 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, uint shareSize)
|
||||
__syncthreads();
|
||||
|
||||
//float* _sum = _share;// + blockDim.x;
|
||||
MatrixWrapper<float> _sum(_share, shareSize);
|
||||
MatrixWrapper<float> _sum(_share, shareSize, 1, 1, 1);
|
||||
|
||||
_sum[threadIdx.x] = 0.0f;
|
||||
for (int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
@ -675,8 +679,8 @@ void Fill(Matrix& In, float value) {
|
||||
|
||||
__global__
|
||||
void gMapMatrix(MatrixWrapper<float> in,
|
||||
const MatrixWrapper<uint> sentencesMappingWrap,
|
||||
int mappingCols, int i)
|
||||
const MatrixWrapper<uint> sentenceLengthsWrap,
|
||||
int i)
|
||||
{
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < in.size()) {
|
||||
@ -685,27 +689,28 @@ void gMapMatrix(MatrixWrapper<float> in,
|
||||
int col = tid % numCols;
|
||||
|
||||
//in[tid] *= mappingWrap(i, batchIdx, 0, 0);
|
||||
in(batchIdx, col, 0, 0) *= sentencesMappingWrap(i, batchIdx, 0, 0); // [mappingCols * batchIdx + i];
|
||||
in(batchIdx, col, 0, 0) *= (i < sentenceLengthsWrap[batchIdx] ? 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
void MapMatrix(Matrix& state, const mblas::IMatrix &sentencesMask, size_t i)
|
||||
void MapMatrix(Matrix& state,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
size_t i)
|
||||
{
|
||||
// blank out rows in the state matrix where the word position i does not exist
|
||||
// mapping is a concatenated array of 1 & 0 of each sentence in the batch to say whether word exists or not.
|
||||
|
||||
int batchSize = state.dim(0);
|
||||
int stateLength = state.dim(1);
|
||||
int sentenceLength = sentencesMask.size() / batchSize;
|
||||
|
||||
int numThreads = std::min((int)state.size(), MAX_THREADS);
|
||||
int numBlocks = (state.size() / numThreads) + ((state.size() % numThreads == 0) ? 0 : 1);
|
||||
|
||||
MatrixWrapper<float> stateWrap(state);
|
||||
MatrixWrapper<uint> sentencesMappingWrap(sentencesMask, false);
|
||||
MatrixWrapper<uint> sentenceLengthsWrap(sentenceLengths);
|
||||
|
||||
gMapMatrix<<<numBlocks, numThreads, 0, CudaStreamHandler::GetStream()>>>
|
||||
(stateWrap, sentencesMappingWrap, sentenceLength, i);
|
||||
(stateWrap, sentenceLengthsWrap, i);
|
||||
|
||||
/*
|
||||
cerr << "nBlocks=" << numBlocks << endl;
|
||||
@ -852,20 +857,569 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps
|
||||
Normalization(out, in, alpha, nullptr, eps);
|
||||
}
|
||||
|
||||
__global__ void gRandomizeMemory(int *data)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__global__
|
||||
void gBeamSizeInit(MatrixWrapper<uint> hypo2BeamSizeWrap,
|
||||
MatrixWrapper<uint> batch2HypoWrap,
|
||||
MatrixWrapper<uint> hypo2CandidateWrap,
|
||||
bool isFirst,
|
||||
uint beamSizeSum,
|
||||
const MatrixWrapper<uint> beamSizesWrap)
|
||||
{
|
||||
clock_t start = clock();
|
||||
uint hypoInd = 0;
|
||||
uint candidateInd = 0;
|
||||
|
||||
uint a = 0, b = 0;
|
||||
//printf("beamSizesWrap.size()=%u \n", beamSizesWrap.size());
|
||||
for (size_t batchInd = 0; batchInd < beamSizesWrap.size(); ++batchInd) {
|
||||
uint beamSize = beamSizesWrap[batchInd];
|
||||
/*
|
||||
printf("batchInd=%u ", batchInd);
|
||||
printf("beamSize=%u ", beamSize);
|
||||
printf("a=%u ", a);
|
||||
printf("b=%u \n", b);
|
||||
*/
|
||||
|
||||
if (beamSize) {
|
||||
if (isFirst) {
|
||||
assert(a < hypo2BeamSizeWrap.size());
|
||||
assert(a < hypo2CandidateWrap.size());
|
||||
hypo2BeamSizeWrap[a] = beamSize;
|
||||
hypo2CandidateWrap[a] = candidateInd;
|
||||
++a;
|
||||
|
||||
assert(b < batch2HypoWrap.size());
|
||||
batch2HypoWrap[b] = batchInd;
|
||||
++b;
|
||||
|
||||
candidateInd += beamSize;
|
||||
}
|
||||
else {
|
||||
for (size_t j = 0; j < beamSize; ++j) {
|
||||
assert(a < hypo2BeamSizeWrap.size());
|
||||
assert(a < hypo2CandidateWrap.size());
|
||||
hypo2BeamSizeWrap[a] = beamSize;
|
||||
hypo2CandidateWrap[a] = candidateInd;
|
||||
++a;
|
||||
|
||||
candidateInd += beamSize;
|
||||
}
|
||||
|
||||
assert(b < batch2HypoWrap.size());
|
||||
batch2HypoWrap[b] = hypoInd;
|
||||
++b;
|
||||
}
|
||||
|
||||
hypoInd += beamSize;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void RandomizeMemory()
|
||||
__device__
|
||||
float GetMaxScore(const MatrixWrapper<NthOutBatch> &nBestMatrix)
|
||||
{
|
||||
int *data;
|
||||
HANDLE_ERROR( cudaMalloc((void**)&data, 8 * 1024 ^ 3) );
|
||||
float ret = -1111111111111;
|
||||
for (uint i = 0; i < nBestMatrix.dim(1); ++i) {
|
||||
const NthOutBatch &curr = nBestMatrix[i];
|
||||
if (curr.score > ret) {
|
||||
ret = curr.score;
|
||||
}
|
||||
}
|
||||
|
||||
uint threads = 1024;
|
||||
uint blocks = 8 * 1024 ^ 3 / threads;
|
||||
gRandomizeMemory<<<blocks, threads>>>(data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__
|
||||
void AddElement(float &minScore,
|
||||
uint &i,
|
||||
NthOutBatch *arr,
|
||||
bool forbidUNK,
|
||||
uint vocabInd,
|
||||
const NthOutBatch &ele)
|
||||
{
|
||||
const float score = ele.score;
|
||||
|
||||
if (forbidUNK && vocabInd == UNK_ID) {
|
||||
arr[i].score = -1111111111111;
|
||||
minScore = -1111111111111;
|
||||
}
|
||||
else {
|
||||
arr[i] = ele;
|
||||
|
||||
if (score < minScore) {
|
||||
minScore = score;
|
||||
}
|
||||
|
||||
++i;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__device__
|
||||
void MergeElement(float &minScore,
|
||||
NthOutBatch *arr,
|
||||
uint arrSize,
|
||||
const NthOutBatch &ele)
|
||||
{
|
||||
float newMinScore = +1111111111;
|
||||
bool found = false;
|
||||
for (uint i = 0; i < arrSize; ++i) {
|
||||
NthOutBatch &currEle = arr[i];
|
||||
if (!found && minScore == currEle.score) {
|
||||
currEle = ele;
|
||||
found = true;
|
||||
}
|
||||
|
||||
// update min score
|
||||
if (currEle.score < newMinScore) {
|
||||
newMinScore = currEle.score;
|
||||
}
|
||||
}
|
||||
|
||||
minScore = newMinScore;
|
||||
}
|
||||
|
||||
__device__
|
||||
void MergeElement(float &minScore,
|
||||
NthOutBatch *arr,
|
||||
uint arrSize,
|
||||
const NthOutBatch &ele,
|
||||
bool forbidUNK,
|
||||
uint vocabInd)
|
||||
{
|
||||
if (forbidUNK && vocabInd == UNK_ID) {
|
||||
// do nothing
|
||||
}
|
||||
else if (ele.score > minScore) {
|
||||
// replace element with min score
|
||||
MergeElement(minScore, arr, arrSize, ele);
|
||||
|
||||
/*
|
||||
printf("arrInd=%d ind=%d vocabId=%d \n",
|
||||
arrInd,
|
||||
_max[threadIdx.x].ind,
|
||||
_max[threadIdx.x].vocabId);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
void NBestAndMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
|
||||
float &topScore,
|
||||
const MatrixWrapper<float> in,
|
||||
const MatrixWrapper<float> b4Wrap,
|
||||
uint hypoInd,
|
||||
uint maxBeamSize,
|
||||
bool forbidUNK,
|
||||
const MatrixWrapper<uint> hypo2BeamSizeWrap,
|
||||
const MatrixWrapper<uint> hypo2CandidateWrap)
|
||||
{
|
||||
extern __shared__ char _sharePtr[];
|
||||
|
||||
MatrixWrapper<float> maxMatrix((float*)_sharePtr, blockDim.x, 1, 1, 1);
|
||||
|
||||
void *ptrOffset = _sharePtr + sizeof(float) * blockDim.x;
|
||||
MatrixWrapper<NthOutBatch> nBestMatrix((NthOutBatch*)ptrOffset, blockDim.x, maxBeamSize, 1, 1);
|
||||
NthOutBatch *arr = &nBestMatrix(threadIdx.x, 0, 0, 0);
|
||||
|
||||
uint vocabSize = in.dim(1);
|
||||
|
||||
assert(hypoInd < hypo2BeamSizeWrap.size());
|
||||
uint beamSize = hypo2BeamSizeWrap[hypoInd];
|
||||
|
||||
float minScore = +1111111111;
|
||||
|
||||
// init
|
||||
uint vocabInd = threadIdx.x;
|
||||
uint i = 0;
|
||||
while (vocabInd < vocabSize && i < beamSize) {
|
||||
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
|
||||
|
||||
uint arrInd = hypoInd * vocabSize + vocabInd;
|
||||
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
|
||||
|
||||
AddElement(minScore, i, arr, forbidUNK, vocabInd, ele);
|
||||
|
||||
vocabInd += blockDim.x;
|
||||
}
|
||||
|
||||
// MAIN LOOP
|
||||
while (vocabInd < vocabSize) {
|
||||
const float score = in(hypoInd, vocabInd, 0, 0) + b4Wrap(0, vocabInd, 0, 0);
|
||||
uint arrInd = hypoInd * vocabSize + vocabInd;
|
||||
NthOutBatch ele(arrInd, score, hypoInd, vocabInd);
|
||||
|
||||
MergeElement(minScore, arr, beamSize, ele, forbidUNK, vocabInd);
|
||||
|
||||
vocabInd += blockDim.x;
|
||||
} // while (vocabInd < vocabSize) {
|
||||
|
||||
// merge nbest from different threads
|
||||
int len = blockDim.x;
|
||||
while (len != 1) {
|
||||
__syncthreads();
|
||||
int skip = (len + 1) >> 1;
|
||||
if (threadIdx.x < (len >> 1)) {
|
||||
NthOutBatch *dest = &nBestMatrix(threadIdx.x, 0, 0, 0);
|
||||
|
||||
for (uint i = 0; i < beamSize; ++i) {
|
||||
const NthOutBatch &ele = nBestMatrix(threadIdx.x + skip, i, 0, 0);
|
||||
if (ele.score > minScore) {
|
||||
MergeElement(minScore, dest, beamSize, ele);
|
||||
}
|
||||
}
|
||||
}
|
||||
len = (len + 1) >> 1;
|
||||
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
// copy to output array
|
||||
assert(hypoInd < hypo2CandidateWrap.size());
|
||||
uint candidateInd = hypo2CandidateWrap[hypoInd];
|
||||
for (uint i = 0; i < beamSize; ++i) {
|
||||
const NthOutBatch &curr = nBestMatrix(0, i, 0, 0);
|
||||
//printf("vocabInd=%u \n", best.vocabInd);
|
||||
|
||||
assert(candidateInd + i < nBestCandidatesWrap.size());
|
||||
nBestCandidatesWrap[candidateInd + i] = curr;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
topScore = GetMaxScore(nBestMatrix);
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__device__
|
||||
void SumAndLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
|
||||
const MatrixWrapper<float> in,
|
||||
const MatrixWrapper<float> b4Wrap,
|
||||
uint hypoInd,
|
||||
uint maxBeamSize,
|
||||
float topScore,
|
||||
const MatrixWrapper<uint> hypo2BeamSizeWrap,
|
||||
const MatrixWrapper<uint> hypo2CandidateWrap)
|
||||
{
|
||||
extern __shared__ float _share[];
|
||||
|
||||
size_t vocabSize = in.dim(1);
|
||||
//assert(nBestCandidatesWrap.dim(0) == rows);
|
||||
|
||||
//float* _sum = _share;// + blockDim.x;
|
||||
MatrixWrapper<float> _sum(_share, blockDim.x, 1, 1, 1);
|
||||
|
||||
// calc sum
|
||||
_sum[threadIdx.x] = 0.0f;
|
||||
for (int id = threadIdx.x; id < vocabSize; id += blockDim.x) {
|
||||
//row[id] = exp(row[id] - max);
|
||||
float val = in(hypoInd, id, 0, 0) + b4Wrap(0, id, 0, 0);
|
||||
val = __expf(val - topScore);
|
||||
_sum[threadIdx.x] += val;
|
||||
}
|
||||
|
||||
int len = blockDim.x;
|
||||
while (len != 1) {
|
||||
__syncthreads();
|
||||
|
||||
int skip = (len + 1) >> 1;
|
||||
if (threadIdx.x < (len >> 1)) {
|
||||
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
|
||||
}
|
||||
len = (len + 1) >> 1;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// apply partition and log to top
|
||||
if (threadIdx.x == 0) {
|
||||
//__syncthreads();
|
||||
//printf("val=%f %f \n", in(rowIdx, ele.vocabId, 0, 0), val);
|
||||
|
||||
// nbest
|
||||
uint beamSize = hypo2BeamSizeWrap[hypoInd];
|
||||
uint startPos = hypo2CandidateWrap[hypoInd];
|
||||
for (uint i = 0; i < beamSize; ++i) {
|
||||
//__syncthreads();
|
||||
NthOutBatch &ele = nBestCandidatesWrap[startPos + i];
|
||||
|
||||
float &val = ele.score;
|
||||
val = __expf(val - topScore);
|
||||
val = __logf(val /_sum[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__global__ void gLogSoftMax(MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
|
||||
const MatrixWrapper<float> in,
|
||||
const MatrixWrapper<float> b4Wrap,
|
||||
uint maxBeamSize,
|
||||
bool forbidUNK,
|
||||
const MatrixWrapper<uint> hypo2BeamSizeWrap,
|
||||
const MatrixWrapper<uint> hypo2CandidateWrap)
|
||||
{
|
||||
uint hypos = in.dim(0);
|
||||
uint vocabSize = in.dim(1);
|
||||
|
||||
uint hypoInd = blockIdx.x; // index of previous hypo
|
||||
while (hypoInd < hypos) {
|
||||
float topScore;
|
||||
|
||||
NBestAndMax(nBestCandidatesWrap,
|
||||
topScore,
|
||||
in,
|
||||
b4Wrap,
|
||||
hypoInd,
|
||||
maxBeamSize,
|
||||
forbidUNK,
|
||||
hypo2BeamSizeWrap,
|
||||
hypo2CandidateWrap);
|
||||
|
||||
SumAndLogSoftMax(nBestCandidatesWrap,
|
||||
in,
|
||||
b4Wrap,
|
||||
hypoInd,
|
||||
maxBeamSize,
|
||||
topScore,
|
||||
hypo2BeamSizeWrap,
|
||||
hypo2CandidateWrap);
|
||||
|
||||
|
||||
__syncthreads();
|
||||
hypoInd += gridDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel: merge the per-hypothesis candidate lists into one n-best list per
// batch sentence. Launched with one thread per block (see LogSoftmaxAndNBest,
// which sets threads = 1); blocks step through batch entries with a stride of
// gridDim.x.
//
// For each batch entry:
//   - the first hypothesis' beamSize candidates are copied into nBestWrap with
//     the previous path cost added, tracking the smallest score kept (minScore);
//   - on non-first steps, candidates of the remaining beamSize-1 hypotheses are
//     merged in via MergeElement whenever they beat the current minimum.
//
// Parameters:
//   nBestWrap           - [out] final n-best entries per batch sentence
//   nBestCandidatesWrap - per-hypothesis candidates (scores mutated in place
//                         when prevCost is added during the merge phase)
//   in                  - unused here; kept so the launch site mirrors gLogSoftMax
//   costsWrap           - accumulated path cost per batch (first step) or per
//                         hypothesis (later steps)
//   maxBeamSize         - unused here; actual beam size comes from hypo2BeamSizeWrap
//   forbidUNK           - unused here; UNK filtering happens in gLogSoftMax
//   isFirst             - true on the first decoding step (one hypothesis per batch)
//   hypo2BeamSizeWrap   - per-hypothesis beam size
//   batch2HypoWrap      - first hypothesis index of each batch sentence
//   hypo2CandidateWrap  - per-hypothesis start offset into the candidate list
__global__ void gNBestPerBatch(MatrixWrapper<NthOutBatch> nBestWrap,
                               MatrixWrapper<NthOutBatch> nBestCandidatesWrap,
                               const MatrixWrapper<float> in,
                               const MatrixWrapper<float> costsWrap,
                               uint maxBeamSize,
                               bool forbidUNK,
                               bool isFirst,
                               const MatrixWrapper<uint> hypo2BeamSizeWrap,
                               const MatrixWrapper<uint> batch2HypoWrap,
                               const MatrixWrapper<uint> hypo2CandidateWrap)
{
  //uint rows = in.dim(0);
  uint batchSize = batch2HypoWrap.dim(0);

  uint batchInd = blockIdx.x;
  while (batchInd < batchSize) {
    assert(batchInd < batch2HypoWrap.size());
    assert(batchInd < hypo2BeamSizeWrap.size());
    assert(batchInd < nBestWrap.size());

    uint hypoInd = batch2HypoWrap[batchInd];
    uint beamSize = hypo2BeamSizeWrap[hypoInd];
    assert(beamSize);

    // Where this batch's results start in nBestWrap. On the first step hypos
    // haven't fanned out yet, so the slot is derived from the batch index;
    // afterwards output rows line up with hypothesis indices.
    uint nextHypoInd;
    if (isFirst) {
      nextHypoInd = batchInd * beamSize;
    }
    else {
      nextHypoInd = hypoInd;
    }

    // Candidates from the first hypothesis: copy all of them, adding the
    // previous path cost, and remember the smallest score currently kept.
    float minScore = +999999;
    assert(hypoInd < hypo2CandidateWrap.size());
    uint candidateInd = hypo2CandidateWrap[hypoInd];
    for (uint i = 0; i < beamSize; ++i) {
      float prevCost;
      if (isFirst) {
        assert(batchInd < costsWrap.size());
        prevCost = costsWrap[batchInd];
      }
      else {
        //printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
        assert(hypoInd < costsWrap.size());
        prevCost = costsWrap[hypoInd];
      }

      assert((nextHypoInd + i) < nBestWrap.size());
      assert(candidateInd + i < nBestCandidatesWrap.size());
      nBestWrap[nextHypoInd + i] = nBestCandidatesWrap[candidateInd + i];

      float &score = nBestWrap[nextHypoInd + i].score;
      score += prevCost;

      if (score < minScore) {
        minScore = score;
      }
    }

    // Candidates from the other previous hypotheses: only on non-first steps,
    // merge any candidate that beats the current worst kept score.
    if (!isFirst) {
      for (uint hypoOffset = 1; hypoOffset < beamSize; ++hypoOffset) {
        //printf("hypoInd=%d \n", (hypoInd + hypoOffset));

        //printf("prevHypoInd=%, candidateInd=%d \n", prevHypoInd, candidateInd);
        assert((hypoInd + hypoOffset) < costsWrap.size());
        float prevCost = costsWrap[hypoInd + hypoOffset];

        assert((hypoInd + hypoOffset) < hypo2CandidateWrap.size());
        uint candidateInd = hypo2CandidateWrap[hypoInd + hypoOffset];

        for (uint candidateOffset = 0; candidateOffset < beamSize; ++candidateOffset) {
          assert((candidateInd + candidateOffset) < nBestCandidatesWrap.size());
          NthOutBatch &candidate = nBestCandidatesWrap[candidateInd + candidateOffset];
          candidate.score += prevCost;

          assert(nextHypoInd < nBestWrap.size());
          NthOutBatch *arr = &nBestWrap[nextHypoInd];

          // MergeElement replaces the current minimum and updates minScore.
          if (candidate.score > minScore) {
            MergeElement(minScore, arr, beamSize, candidate);
          }
        }
      }
    }

    batchInd += gridDim.x;
  }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////

// Host driver: fused log-softmax + n-best extraction over the decoder output
// scores. Runs three kernels on the shared CUDA stream:
//   1. gBeamSizeInit   (1 block, 1 thread)     - fills the hypo/batch index maps
//   2. gLogSoftMax     (blocks x threads, smem) - per-hypothesis n-best + log-softmax
//   3. gNBestPerBatch  (blocks x 1 thread)      - merges candidates per batch,
//                                                 adding previous path costs
//
// Parameters:
//   nBest       - [out] device buffer for the merged n-best entries
//   in          - raw scores; dim(0) = hypotheses, dim(1) = vocab size
//   b4          - bias added to every vocabulary score
//   costs       - accumulated path costs (per batch on the first step,
//                 per hypothesis afterwards - see gNBestPerBatch)
//   forbidUNK   - exclude UNK from the candidates
//   maxBeamSize - upper bound on per-sentence beam size
//   beamSizes   - requested beam size per batch sentence (0 = sentence finished)
//   beamSizeSum - total number of n-best entries across the batch
//   isFirst     - true on the first decoding step
//
// No explicit synchronisation at the end: all work is queued on
// CudaStreamHandler::GetStream() and consumed by later ops on the same stream.
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
                        const Matrix& in,
                        const Matrix& b4,
                        const DeviceVector<float> &costs,
                        bool forbidUNK,
                        uint maxBeamSize,
                        const std::vector<uint>& beamSizes,
                        uint beamSizeSum,
                        bool isFirst)
{
  //BEGIN_TIMER("LogSoftmax excl kernels");

  //cerr << "in=" << in.Debug(0) << endl;
  //cerr << "beamSizes=" << beamSizes.size() << endl;

  // create beam size vectors on GPU but exclude empty beams.
  // candidateInd counts the total candidate slots needed: beamSize per hypo on
  // the first step, beamSize candidates for each of beamSize hypos afterwards.
  uint batchSize = 0;
  uint candidateInd = 0;
  for (size_t batchInd = 0; batchInd < beamSizes.size(); ++batchInd) {
    uint beamSize = beamSizes[batchInd];
    //cerr << "(" << beamSize << "," << hypoInd << ") ";

    if (beamSize) {
      if (isFirst) {
        candidateInd += beamSize;
      }
      else {
        candidateInd += beamSize * beamSize;
      }

      ++batchSize;
    }
  }

  // Device-side index maps; their contents are filled by gBeamSizeInit below.
  DeviceVector<uint> d_beamSizes(beamSizes);
  DeviceVector<uint> hypo2BeamSize(in.dim(0));
  DeviceVector<uint> hypo2Candidate(in.dim(0));
  DeviceVector<uint> batch2Hypo(batchSize);
  DeviceVector<NthOutBatch> nBestCandidates(candidateInd);

  /*
  cerr << "in=" << in.Debug(0) << endl;
  cerr << "beamSizes=" << beamSizes.size() << endl;
  cerr << "beamSizeSum=" << beamSizeSum << endl;
  cerr << "batchSize=" << batchSize << endl;
  cerr << "candidateInd=" << candidateInd << endl;
  cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl;
  cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl;
  cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl;
  cerr << "nBest=" << Debug(nBest, 0) << endl;
  cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl;
  cerr << endl;
  */
  //DeviceVector<NthOutBatch> nBest(beamSizeSum);
  //cerr << "nBest=" << nBest.size() << endl;

  // Lightweight views passed by value into the kernels.
  MatrixWrapper<float> inWrap(in);
  MatrixWrapper<float> b4Wrap(b4);
  MatrixWrapper<uint> hypo2BeamSizeWrap(hypo2BeamSize);
  MatrixWrapper<uint> hypo2CandidateWrap(hypo2Candidate);
  MatrixWrapper<uint> batch2HypoWrap(batch2Hypo);
  MatrixWrapper<NthOutBatch> nBestWrap(nBest);
  MatrixWrapper<NthOutBatch> nBestCandidatesWrap(nBestCandidates);
  MatrixWrapper<float> costsWrap(costs);

  MatrixWrapper<uint> beamSizesWrap(d_beamSizes);

  //PAUSE_TIMER("LogSoftmax excl kernels");

  // One block per hypothesis row (capped), threads cover the vocab dimension.
  int blocks = std::min(MAX_BLOCKS, (int)in.dim(0));
  int threads = std::min(MAX_THREADS, (int)in.dim(1));
  // Shared memory for gLogSoftMax: per-thread candidate heaps plus one float
  // per thread for the softmax partition reduction.
  int shared = sizeof(NthOutBatch) * threads * maxBeamSize
             + sizeof(float) * threads;
  //cerr << "shared=" << shared << endl;

  //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
  //cerr << "step0" << endl;

  //BEGIN_TIMER("gBeamSizeInit");
  gBeamSizeInit<<<1, 1, 0, CudaStreamHandler::GetStream()>>>
    (hypo2BeamSizeWrap,
     batch2HypoWrap,
     hypo2CandidateWrap,
     isFirst,
     beamSizeSum,
     beamSizesWrap
    );
  //PAUSE_TIMER("gBeamSizeInit");

  /*
  cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 2) << endl;
  cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 2) << endl;
  cerr << "batch2Hypo=" << Debug(batch2Hypo, 2) << endl;
  cerr << endl;
  */
  //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
  //cerr << "step1" << endl;

  //BEGIN_TIMER("gLogSoftMax");
  gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
    (nBestCandidatesWrap,
     inWrap,
     b4Wrap,
     maxBeamSize,
     forbidUNK,
     hypo2BeamSizeWrap,
     hypo2CandidateWrap);
  //PAUSE_TIMER("gLogSoftMax");

  //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
  //cerr << "step2" << endl;

  // gNBestPerBatch is sequential per batch entry: one thread per block.
  threads = 1;

  //BEGIN_TIMER("gNBestPerBatch");
  gNBestPerBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
    (nBestWrap,
     nBestCandidatesWrap,
     inWrap,
     costsWrap,
     maxBeamSize,
     forbidUNK,
     isFirst,
     hypo2BeamSizeWrap,
     batch2HypoWrap,
     hypo2CandidateWrap);
  //PAUSE_TIMER("gNBestPerBatch");

  //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
  //cerr << "step3" << endl;
  //cerr << "3costs=" << Debug(costs, 0) << endl;
}
|
||||
|
||||
} // namespace mblas
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "gpu/mblas/matrix.h"
|
||||
#include "gpu/mblas/matrix_wrapper.h"
|
||||
#include "gpu/mblas/handles.h"
|
||||
#include "gpu/mblas/nth_element_kernels.h"
|
||||
|
||||
namespace amunmt {
|
||||
namespace GPU {
|
||||
@ -83,6 +84,26 @@ std::string Debug(const HostVector<T> &vec, size_t verbosity = 1)
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
// Render a short diagnostic string for a host-side std::vector.
// verbosity 0: size only; >0 (default 1): size plus element sum;
// exactly 2: size, sum, and every element.
template<typename T>
std::string Debug(const std::vector<T> &vec, size_t verbosity = 1)
{
  std::stringstream out;
  out << "size=" << vec.size();

  if (verbosity != 0) {
    const T total = Sum(vec.data(), vec.size());
    out << " sum=" << total;
  }

  if (verbosity == 2) {
    for (const T &elem : vec) {
      out << " " << elem;
    }
  }

  return out.str();
}
|
||||
|
||||
template<typename T>
|
||||
void copy(const T *in, size_t count, T *out, cudaMemcpyKind kind) {
|
||||
@ -93,7 +114,9 @@ void Fill(Matrix& In, float value=0.0f);
|
||||
|
||||
Matrix& Swap(Matrix& Out, Matrix& In);
|
||||
|
||||
void Mean(Matrix& Out, const Matrix& In, const IMatrix &sentencesMask);
|
||||
void Mean(Matrix& Out,
|
||||
const Matrix& In,
|
||||
const mblas::IMatrix &sentenceLengths);
|
||||
|
||||
void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const DeviceVector<uint>& mapping);
|
||||
|
||||
@ -117,7 +140,7 @@ Matrix& CopyRow(Matrix& Out,
|
||||
Matrix& Concat(Matrix& Out, const Matrix& In);
|
||||
|
||||
void MapMatrix(Matrix& state,
|
||||
const mblas::IMatrix &sentencesMask,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
size_t i);
|
||||
|
||||
Matrix& CopyRows(Matrix& Out,
|
||||
@ -135,7 +158,10 @@ Matrix& Slice(Matrix& Out,
|
||||
Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
|
||||
bool transA = false, bool transB = false);
|
||||
|
||||
Matrix& Softmax(Matrix& Out, const DeviceVector<uint>& batchIds, const mblas::IMatrix &sentencesMask, size_t batchSize);
|
||||
Matrix& Softmax(Matrix& Out,
|
||||
const DeviceVector<uint>& batchIds,
|
||||
const mblas::IMatrix &sentenceLengths,
|
||||
size_t batchSize);
|
||||
|
||||
Matrix& LogSoftmax(Matrix& Out);
|
||||
|
||||
@ -232,7 +258,7 @@ __global__ void gBroadcastVecColumn(Functor functor,
|
||||
size_t rows = outWrap.dim(0);
|
||||
size_t cols = outWrap.dim(1);
|
||||
|
||||
MatrixWrapper<float> sdata(sdataOrig, rows);
|
||||
MatrixWrapper<float> sdata(sdataOrig, rows, 1, 1, 1);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
for (int i = 0; i < rows; ++i)
|
||||
@ -422,7 +448,15 @@ void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, const Mat
|
||||
|
||||
void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps);
|
||||
|
||||
void RandomizeMemory();
|
||||
void LogSoftmaxAndNBest(DeviceVector<NthOutBatch> &nBest,
|
||||
const Matrix& in,
|
||||
const Matrix& b4,
|
||||
const DeviceVector<float> &costs,
|
||||
bool forbidUNK,
|
||||
uint maxBeamSize,
|
||||
const std::vector<uint>& beamSizes,
|
||||
uint beamSizeSum,
|
||||
bool isFirst);
|
||||
|
||||
} // namespace mblas
|
||||
} // namespace GPU
|
||||
|
@ -94,39 +94,14 @@ public:
|
||||
dataConst_ = data_;
|
||||
}
|
||||
|
||||
MatrixWrapper(DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
|
||||
{
|
||||
dim_[0] = a;
|
||||
dim_[1] = b;
|
||||
dim_[2] = c;
|
||||
dim_[3] = d;
|
||||
updateStridesRowMajor();
|
||||
|
||||
assert(size() == vec.size());
|
||||
|
||||
data_ = thrust::raw_pointer_cast(vec.data());
|
||||
dataConst_ = data_;
|
||||
}
|
||||
|
||||
MatrixWrapper(const DeviceVector<T> &vec, uint a, uint b, uint c, uint d)
|
||||
{
|
||||
dim_[0] = a;
|
||||
dim_[1] = b;
|
||||
dim_[2] = c;
|
||||
dim_[3] = d;
|
||||
updateStridesRowMajor();
|
||||
|
||||
assert(size() == vec.size());
|
||||
|
||||
data_ = nullptr;
|
||||
dataConst_ = thrust::raw_pointer_cast(vec.data());
|
||||
}
|
||||
|
||||
__device__
|
||||
MatrixWrapper(T *ptr, uint size)
|
||||
MatrixWrapper(T *ptr, uint a, uint b, uint c, uint d)
|
||||
{
|
||||
dim_[0] = size;
|
||||
size_ = size;
|
||||
dim_[0] = a;
|
||||
dim_[1] = b;
|
||||
dim_[2] = c;
|
||||
dim_[3] = d;
|
||||
updateStrides();
|
||||
|
||||
data_ = ptr;
|
||||
dataConst_ = ptr;
|
||||
@ -148,7 +123,7 @@ public:
|
||||
return stride_[i];
|
||||
}
|
||||
|
||||
__host__
|
||||
__device__ __host__
|
||||
void updateStrides()
|
||||
{
|
||||
stride_[0] = dim_[1];
|
||||
@ -159,7 +134,7 @@ public:
|
||||
size_ = stride_[3] * dim_[3];
|
||||
}
|
||||
|
||||
__host__
|
||||
__device__ __host__
|
||||
void updateStridesRowMajor()
|
||||
{
|
||||
stride_[0] = 1;
|
||||
|
@ -166,5 +166,7 @@ void NthElement::getValueByKey(std::vector<float>& out, const mblas::Matrix &d_i
|
||||
HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace GPU
|
||||
} // namespace amunmt
|
||||
|
@ -18,8 +18,11 @@ class NthElement {
|
||||
NthElement(uint maxBeamSize, uint maxBatchSize);
|
||||
virtual ~NthElement();
|
||||
|
||||
void getNBestList(const std::vector<uint>& beamSizes, mblas::Matrix& Probs,
|
||||
std::vector<float>& outCosts, std::vector<uint>& outKeys,
|
||||
// standard nth_element
|
||||
void getNBestList(const std::vector<uint>& beamSizes,
|
||||
mblas::Matrix& Probs,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<uint>& outKeys,
|
||||
const bool isFirst=false);
|
||||
|
||||
void GetPairs(uint number,
|
||||
@ -47,6 +50,7 @@ class NthElement {
|
||||
const HostVector<uint>& batchFirstElementIdxs,
|
||||
const HostVector<uint>& cummulatedBeamSizes);
|
||||
|
||||
|
||||
};
|
||||
|
||||
} // namespace GPU
|
||||
|
@ -1,3 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include "matrix_wrapper.h"
|
||||
|
||||
namespace amunmt {
|
||||
@ -32,6 +34,60 @@ struct NthOut
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// One n-best candidate in the batched beam search: a flat index ('ind') and
// its accumulated score. Usable on both host and device. The hypo/vocab
// split of 'ind' is commented out to keep the struct small.
struct NthOutBatch
{
  uint ind;     // flat candidate index
  float score;  // (log-)score of this candidate
  //uint hypoInd;
  //uint vocabInd;

  // Implicit conversion from float is intentional: templated reductions in
  // matrix.h (gSum) initialise their accumulator with 0.0f. Asserts that it
  // is only ever used for zero-initialisation.
  __device__ __host__
  NthOutBatch(const float& rhs)
  {
    // only to be used to init variable in matrix.h gSum
    assert(rhs == 0.0f);
    ind = rhs;
    score = rhs;
    //hypoInd = rhs;
    //vocabInd = rhs;
  }

  // Trivial default ctor: members left uninitialised on purpose (the struct
  // is bulk-allocated in device buffers and overwritten before use).
  __device__ __host__
  NthOutBatch() {}

  // vHypoInd/vVocabInd are currently ignored (see commented-out members).
  __device__ __host__
  NthOutBatch(uint vInd, float vScore, uint vHypoInd, uint vVocabInd)
  :ind(vInd)
  ,score(vScore)
  //,hypoInd(vHypoInd)
  //,vocabInd(vVocabInd)
  {}

  __device__ __host__
  NthOutBatch& operator=(const NthOutBatch& rhs)
  {
    ind = rhs.ind;
    score = rhs.score;
    //hypoInd = rhs.hypoInd;
    //vocabInd = rhs.vocabInd;
    return *this;
  }

  // Element-wise addition; required so the type can be used by the generic
  // sum reductions (adding 'ind' values is meaningless but harmless there).
  __device__ __host__
  NthOutBatch& operator+=(const NthOutBatch& rhs)
  {
    ind += rhs.ind;
    score += rhs.score;
    //hypoInd += rhs.hypoInd;
    //vocabInd += rhs.vocabInd;
    return *this;
  }

};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
|
||||
@ -40,6 +96,17 @@ inline std::ostream& operator<<(std::ostream &out, const NthOut &obj)
|
||||
return out;
|
||||
}
|
||||
|
||||
// Debug formatting for NthOutBatch: "(ind,score,)" - the trailing comma is a
// leftover from the commented-out hypoInd/vocabInd fields.
inline std::ostream& operator<<(std::ostream &out, const NthOutBatch &obj)
{
  out << "("
      << obj.ind << ","
      << obj.score << ","
      //<< obj.hypoInd << ","
      //<< obj.vocabInd
      << ")";
  return out;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void gMaxElement(mblas::MatrixWrapper<NthOut> out,
|
||||
|
Loading…
Reference in New Issue
Block a user