From a75c30923be7bda98592561050432211f9227009 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Thu, 28 Jun 2018 14:19:31 -0400
Subject: [PATCH] Fix preprocessing for WMT14 En-De to replicate Scaling NMT
 paper (#203)

---
 examples/translation/README.md             |  2 +-
 examples/translation/prepare-wmt14en2de.sh | 47 ++++++++++++++++++----
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/examples/translation/README.md b/examples/translation/README.md
index fd379088a..7d42bbc99 100644
--- a/examples/translation/README.md
+++ b/examples/translation/README.md
@@ -113,7 +113,7 @@ To replicate results from the paper [Scaling Neural Machine Translation (Ott et
 
 1. Prepare the WMT'14 En-De data with a BPE vocab of 32k:
 ```
-$ BPE_TOKENS=32764 bash prepare-wmt14en2de.sh
+$ bash prepare-wmt14en2de.sh --scaling18
 $ cd ../..
 ```
 2. Preprocess the dataset with a joined dictionary:
diff --git a/examples/translation/prepare-wmt14en2de.sh b/examples/translation/prepare-wmt14en2de.sh
index 780183974..e831ab210 100644
--- a/examples/translation/prepare-wmt14en2de.sh
+++ b/examples/translation/prepare-wmt14en2de.sh
@@ -13,18 +13,20 @@
 CLEAN=$SCRIPTS/training/clean-corpus-n.perl
 NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
 REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
 BPEROOT=subword-nmt
-BPE_TOKENS="${BPE_TOKENS:-40000}"
+BPE_TOKENS=40000
 URLS=(
     "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
     "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
     "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz"
+    "http://data.statmt.org/wmt17/translation-task/dev.tgz"
     "http://statmt.org/wmt14/test-full.tgz"
 )
 FILES=(
     "training-parallel-europarl-v7.tgz"
     "training-parallel-commoncrawl.tgz"
     "training-parallel-nc-v12.tgz"
+    "dev.tgz"
     "test-full.tgz"
 )
 CORPORA=(
@@ -41,6 +43,12 @@ if [ "$1" == "--icml17" ]; then
     CORPORA[2]="training/news-commentary-v9.de-en"
 fi
 
+# This will make the dataset comparable to the one used in "Scaling Neural Machine Translation"
+# https://arxiv.org/abs/1806.00187
+if [ "$1" == "--scaling18" ]; then
+    BPE_TOKENS=32764
+fi
+
 if [ ! -d "$SCRIPTS" ]; then
     echo "Please set SCRIPTS variable correctly to point to Moses scripts."
     exit
@@ -52,6 +60,7 @@
 lang=en-de
 prep=wmt14_en_de
 tmp=$prep/tmp
 orig=orig
+dev=dev/newstest2013
 
 mkdir -p $orig $tmp $prep
@@ -105,11 +114,26 @@ for l in $src $tgt; do
     echo ""
 done
 
-echo "splitting train and valid..."
-for l in $src $tgt; do
-    awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
-    awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
-done
+if [ "$1" == "--scaling18" ]; then
+    # apply length filtering before BPE for --scaling18
+    perl $CLEAN $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 80
+
+    # use newstest2013 for valid
+    echo "pre-processing valid data..."
+    for l in $src $tgt; do
+        rm $tmp/valid.$l
+        cat $orig/$dev.$l | \
+            perl $NORM_PUNC $l | \
+            perl $REM_NON_PRINT_CHAR | \
+            perl $TOKENIZER -threads 8 -a -l $l >> $tmp/valid.$l
+    done
+else
+    echo "splitting train and valid..."
+    for l in $src $tgt; do
+        awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
+        awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
+    done
+fi
 
 TRAIN=$tmp/train.de-en
 BPE_CODE=$prep/code
@@ -128,8 +152,15 @@ for L in $src $tgt; do
     done
 done
 
-perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
-perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
+if [ "$1" == "--scaling18" ]; then
+    for L in $src $tgt; do
+        cp $tmp/bpe.train.$L $prep/train.$L
+        cp $tmp/bpe.valid.$L $prep/valid.$L
+    done
+else
+    perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
+    perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
+fi
 
 for L in $src $tgt; do
     cp $tmp/bpe.test.$L $prep/test.$L