#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
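#
# Downloads the WMT'14 English-German data, tokenizes it with the Moses
# scripts, learns a joint BPE model with subword-nmt, and writes
# train/valid/test splits under wmt14_en_de/.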

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
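
# Moses preprocessing scripts, the subword-nmt checkout, and the BPE vocabulary size.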
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt
BPE_TOKENS=40000
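
# Parallel training corpora (Europarl v7, Common Crawl, News Commentary v12)
# plus the WMT dev and test archives.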
URLS=(
    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz"
    "http://data.statmt.org/wmt17/translation-task/dev.tgz"
    "http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
    "training-parallel-europarl-v7.tgz"
    "training-parallel-commoncrawl.tgz"
    "training-parallel-nc-v12.tgz"
    "dev.tgz"
    "test-full.tgz"
)
CORPORA=(
    "training/europarl-v7.de-en"
    "commoncrawl.de-en"
    "training/news-commentary-v12.de-en"
)

# This will make the dataset compatible with the one used in
# "Convolutional Sequence to Sequence Learning"
# https://arxiv.org/abs/1705.03122
if [ "$1" == "--icml17" ]; then
    URLS[2]="http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
    FILES[2]="training-parallel-nc-v9.tgz"
    CORPORA[2]="training/news-commentary-v9.de-en"
fi

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit 1
fi
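
# Language pair and output layout.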
src=en
tgt=de
lang=en-de
prep=wmt14_en_de
tmp=$prep/tmp
orig=orig
dev=dev/newstest2013

mkdir -p $orig $tmp $prep

cd $orig
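
# Download each archive unless it is already present, then unpack it.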
for ((i=0;i<${#URLS[@]};++i)); do
    file=${FILES[i]}
    if [ -f $file ]; then
        echo "$file already exists, skipping download"
    else
        url=${URLS[i]}
        wget "$url"
        if [ -f $file ]; then
            echo "$url successfully downloaded."
        else
            echo "$url not successfully downloaded."
            exit 1
        fi
        if [ ${file: -4} == ".tgz" ]; then
            tar zxvf $file
        elif [ ${file: -4} == ".tar" ]; then
            tar xvf $file
        fi
    fi
done
cd ..
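
# Normalize punctuation, strip non-printing characters, and tokenize each
# side of the training corpora into a single file per language.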
echo "pre-processing train data..."
|
||
for l in $src $tgt; do
|
||
rm $tmp/train.tags.$lang.tok.$l
|
||
for f in "${CORPORA[@]}"; do
|
||
cat $orig/$f.$l | \
|
||
perl $NORM_PUNC $l | \
|
||
perl $REM_NON_PRINT_CHAR | \
|
||
perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
|
||
done
|
||
done
|
||
|
||
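
# Extract plain text from the newstest2014 SGM files and tokenize it.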
echo "pre-processing test data..."
|
||
for l in $src $tgt; do
|
||
if [ "$l" == "$src" ]; then
|
||
t="src"
|
||
else
|
||
t="ref"
|
||
fi
|
||
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
|
||
sed -e 's/<seg id="[0-9]*">\s*//g' | \
|
||
sed -e 's/\s*<\/seg>\s*//g' | \
|
||
sed -e "s/\’/\'/g" | \
|
||
perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
|
||
echo ""
|
||
done
|
||
|
||
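
# Hold out every 100th sentence pair for validation; the rest is training data.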
echo "splitting train and valid..."
|
||
for l in $src $tgt; do
|
||
awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
|
||
awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
|
||
done
|
||
|
||
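
# Learn a joint BPE vocabulary on the concatenation of both languages.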
TRAIN=$tmp/train.de-en
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    cat $tmp/train.$l >> $TRAIN
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
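
# Apply the learned BPE codes to every split of both languages.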
for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
    done
done
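
# Drop train/valid pairs that are empty, longer than 250 tokens, or have a
# source/target length ratio above 1.5.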
perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
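
# The test set is copied through unfiltered.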
for L in $src $tgt; do
    cp $tmp/bpe.test.$L $prep/test.$L
done
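
# Next step (sketch): binarize the prepared data for fairseq training. The
# invocation below follows the fairseq README of this era; the exact flags of
# preprocess.py may differ in your checkout, so treat it as an illustration:
#
#   python preprocess.py --source-lang en --target-lang de \
#       --trainpref wmt14_en_de/train --validpref wmt14_en_de/valid \
#       --testpref wmt14_en_de/test --destdir data-bin/wmt14_en_de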