Mirror of https://github.com/facebookresearch/fairseq.git
Synced 2024-11-14 14:20:21 +03:00
116 lines · 2.9 KiB · Bash
#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# Setup phase: fetch the external tool repositories and define the paths and
# constants that the remainder of this script reads.

# Clone only when the checkout is missing, so re-running the script does not
# fail with "destination path already exists".
if [ ! -d mosesdecoder ]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [ ! -d subword-nmt ]; then
  echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
  git clone https://github.com/rsennrich/subword-nmt.git
fi

# Locations of the Perl helpers inside the Moses checkout.
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl

# Subword NMT checkout and the value handed to learn_bpe.py via -s.
BPEROOT=subword-nmt
BPE_TOKENS=10000

# Remote archive and the file name it is saved under.
URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
GZ=de-en.tgz

# Abort early (with a failing status) if the Moses scripts are unavailable,
# e.g. because the clone above did not succeed.
if [ ! -d "$SCRIPTS" ]; then
  echo "Please set SCRIPTS variable correctly to point to Moses scripts." >&2
  exit 1
fi

# Language pair and working directories.
#   prep - final output directory
#   tmp  - intermediate files, nested under $prep
#   orig - where the raw archive is downloaded and unpacked
src=de
tgt=en
lang=de-en
prep=iwslt14.tokenized.de-en
tmp=$prep/tmp
orig=orig

mkdir -p "$orig" "$tmp" "$prep" || exit 1

echo "Downloading data from ${URL}..."
# Stop if the directory cannot be entered; otherwise wget/tar would run in
# the wrong place.
cd "$orig" || exit 1

# wget's own status is deliberately not checked: the test below accepts an
# archive that is already present in $orig even when the download fails.
wget "$URL"

if [ -f "$GZ" ]; then
  echo "Data successfully downloaded."
else
  echo "Data not successfully downloaded." >&2
  exit 1
fi

# A truncated or corrupt archive must not let the pipeline continue.
tar zxvf "$GZ" || exit 1
cd .. || exit 1

echo "pre-processing train data..."
# Step 1: strip markup from the raw training files and tokenize each language.
for l in "$src" "$tgt"; do
  f=train.tags.$lang.$l
  tok=train.tags.$lang.tok.$l

  # Lines containing <url>, <talkid> or <keywords> are dropped entirely;
  # for <title> and <description> only the tags themselves are removed and
  # the enclosed text is kept.
  grep -v -e '<url>' -e '<talkid>' -e '<keywords>' "$orig/$lang/$f" \
    | sed -e 's/<title>//g' \
          -e 's/<\/title>//g' \
          -e 's/<description>//g' \
          -e 's/<\/description>//g' \
    | perl "$TOKENIZER" -threads 8 -l "$l" > "$tmp/$tok"
  echo ""
done

# Step 2: filter the tokenized corpus pair with clean-corpus-n.perl
# (-ratio 1.5, trailing arguments 1 and 175); the output is written as
# train.tags.$lang.clean.{src,tgt}.
perl "$CLEAN" -ratio 1.5 "$tmp/train.tags.$lang.tok" "$src" "$tgt" "$tmp/train.tags.$lang.clean" 1 175

# Step 3: lowercase the cleaned files.
for l in "$src" "$tgt"; do
  perl "$LC" < "$tmp/train.tags.$lang.clean.$l" > "$tmp/train.tags.$lang.$l"
done

echo "pre-processing valid/test data..."
# For every IWSLT14.TED*.<lang>.xml file, extract the <seg> contents,
# tokenize and lowercase them, and write the result to $tmp under the same
# file name minus its final extension.
for l in "$src" "$tgt"; do
  # Glob directly instead of parsing `ls` output.
  for o in "$orig/$lang"/IWSLT14.TED*."$l".xml; do
    # An unmatched glob stays literal; skip it rather than process a
    # non-existent file.
    [ -e "$o" ] || continue

    fname=${o##*/}       # strip the directory part
    f=$tmp/${fname%.*}   # strip the trailing extension
    echo "$o $f"

    # Keep only segment lines, remove the <seg id="N"> / </seg> wrappers and
    # surrounding whitespace, and normalise the typographic apostrophe.
    grep '<seg id' "$o" \
      | sed -e 's/<seg id="[0-9]*">\s*//g' \
            -e 's/\s*<\/seg>\s*//g' \
            -e "s/\’/\'/g" \
      | perl "$TOKENIZER" -threads 8 -l "$l" \
      | perl "$LC" > "$f"
    echo ""
  done
done

echo "creating train, valid, test..."
for l in "$src" "$tgt"; do
  # Every 23rd line of the preprocessed training data becomes validation
  # data; all remaining lines form the training set. Two passes are kept so
  # that both output files are always created, even for empty input.
  awk 'NR % 23 == 0' "$tmp/train.tags.$lang.$l" > "$tmp/valid.$l"
  awk 'NR % 23 != 0' "$tmp/train.tags.$lang.$l" > "$tmp/train.$l"

  # The test set is the concatenation of the dev/tst files, in this order.
  cat "$tmp/IWSLT14.TED.dev2010.$lang.$l" \
      "$tmp/IWSLT14.TEDX.dev2012.$lang.$l" \
      "$tmp/IWSLT14.TED.tst2010.$lang.$l" \
      "$tmp/IWSLT14.TED.tst2011.$lang.$l" \
      "$tmp/IWSLT14.TED.tst2012.$lang.$l" \
      > "$tmp/test.$l"
done

# Learn one BPE code table on the source and target training text together,
# then apply it to every train/valid/test split.
TRAIN=$tmp/train.en-de
BPE_CODE=$prep/code

# Source side first, then target; the truncating redirect ensures stale
# content from an earlier run is discarded.
cat "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

echo "learn_bpe.py on ${TRAIN}..."
# Stop here on failure: applying an empty or partial code file would
# silently produce unusable output.
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE" || exit 1

for L in "$src" "$tgt"; do
  for f in "train.$L" "valid.$L" "test.$L"; do
    echo "apply_bpe.py to ${f}..."
    python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
  done
done