add script for testing on Jenkins

This commit is contained in:
Filip Gralinski 2016-07-25 23:12:25 +02:00
parent 3b6a1734df
commit 3f7dd08d0b
4 changed files with 74 additions and 0 deletions

2
.gitignore vendored
View File

@ -36,3 +36,5 @@
build
scripts/docker
amunmt-distribution.tar.gz
# Everything inside the per-language-pair working directories that
# tests/wmt16/Makefile creates (downloads, intermediate and result files).
tests/wmt16/*/*

View File

@ -8,3 +8,6 @@ mkdir build
# Configure and compile from inside the build directory.
cd build
cmake ..
make
# Step back to the parent of build/ so the relative paths given to tar resolve.
cd ..
# Create a gzip-compressed archive (verbosely) holding the built binaries, the
# model download script and the files needed to run the WMT16 test.
tar zvcf amunmt-distribution.tar.gz build/bin/* scripts/download_models.py tests/wmt16/Makefile tests/wmt16/extract_segs.py

59
tests/wmt16/Makefile Normal file
View File

@ -0,0 +1,59 @@
# --- User-tunable settings (override on the command line, e.g. `make SRC=en TRG=de`) ---

# Test-set domain; part of the working directory name and of the archive
# member names extracted from test.tgz.
DOMAIN := news
# Source and target language codes.
SRC := de
TRG := en
# Root of the Moses script collection (tokenizer/, recaser/, generic/).
MOSES_SCRIPTS := /opt/moses/moses-scripts
# Directory holding the `amun` and `bpe` executables.
AMUNMT_BIN := ../../build/bin

# --- Derived values ---

# Translation direction as passed to the model download script.
DIRECTION := $(SRC)-$(TRG)
# Working directory receiving all downloads, intermediate files and results.
DIR := $(DOMAIN)-$(DIRECTION)

# Upper bound on the number of source lines fed to the translator.
NUMBER_OF_LINES := 99999
# Keep every file produced along the rule chain; without this make would
# delete the intermediate ones once the final target is built.
.SECONDARY:
# Recipes here write through `> $@`, so a failing command leaves a truncated
# target behind. Have make remove such a target so it is never mistaken for
# an up-to-date file on the next run.
.DELETE_ON_ERROR:
# Turn the BLEU report into a two-line file: the literal header "BLEU"
# followed by the numeric score.
# Only lines that actually contain "BLEU = <number>" produce output; printing
# unconditionally would emit a header with an empty value for every other line.
$(DIR)/bleu.points: $(DIR)/bleu.score
	perl -ne 'print "BLEU\n$$1\n" if /BLEU = (\d+\.\d+)/' < $< > $@
# Score the tokenized system output (second prerequisite, fed on stdin)
# against the tokenized reference (first prerequisite, given as argument).
$(DIR)/bleu.score: $(DIR)/test-$(SRC)$(TRG)-ref.$(TRG).tok $(DIR)/test-$(SRC)$(TRG)-out.$(TRG).tok
	$(MOSES_SCRIPTS)/generic/multi-bleu.perl $< < $(word 2,$^) > $@
# Tokenize a target-language plain-text file (used for both the reference
# and the system output) with the Moses tokenizer.
%.$(TRG).tok: %.$(TRG).txt
	$(MOSES_SCRIPTS)/tokenizer/tokenizer.perl -a -l $(TRG) < $< > $@
# Post-process raw translator output into plain text:
#   1. join BPE subword units by dropping the "@@ " continuation markers,
#   2. detruecase,
#   3. detokenize.
# The second sed expression also strips a marker left dangling at the very
# end of a line (an output that ends in an unfinished subword); the first
# expression alone would leave that "@@" in the text.
%-out.$(TRG).txt: %-out.$(TRG).bpe
	sed -e 's/@@ //g' -e 's/@@$$//' $< | \
	  $(MOSES_SCRIPTS)/recaser/detruecase.perl | \
	  $(MOSES_SCRIPTS)/tokenizer/detokenizer.perl -l $(TRG) > $@
# Translate: feed at most NUMBER_OF_LINES lines of the BPE-encoded source to
# amun, configured by the downloaded config.yml (second prerequisite).
%-out.$(TRG).bpe: %-src.$(SRC).bpe $(DIR)/config.yml
	head -n $(NUMBER_OF_LINES) < $< | $(AMUNMT_BIN)/amun -c $(word 2,$^) > $@
# Apply BPE segmentation to the preprocessed source text, using the
# downloaded BPE file (last prerequisite) as the argument to the bpe binary.
%.$(SRC).bpe: %.$(SRC).pre $(DIR)/$(SRC)$(TRG).bpe
	$(AMUNMT_BIN)/bpe $(lastword $^) < $< > $@
# Preprocess raw source text with the Moses scripts: normalize punctuation,
# tokenize, then truecase with the downloaded model (last prerequisite).
%.$(SRC).pre: %.$(SRC).txt $(DIR)/truecase-model.$(SRC)
	$(MOSES_SCRIPTS)/tokenizer/normalize-punctuation.perl -l $(SRC) < $< | \
	  $(MOSES_SCRIPTS)/tokenizer/tokenizer.perl -l $(SRC) -a | \
	  $(MOSES_SCRIPTS)/recaser/truecase.perl -model $(lastword $^) > $@
# Pull the segment text out of the reference and source SGML files.
# One static pattern rule covers both files: each .txt is produced from the
# .sgm of the same name.
$(DIR)/test-$(SRC)$(TRG)-ref.$(TRG).txt $(DIR)/test-$(SRC)$(TRG)-src.$(SRC).txt: $(DIR)/%.txt: $(DIR)/%.sgm
	./extract_segs.py < $< > $@
# Extract the reference and source SGML files from the downloaded archive.
# The stem ($*) is the "<src><trg>-ref.<lang>" / "<src><trg>-src.<lang>" part
# of the target name; the archive member is test/$(DOMAIN)test2016-<stem>.sgm.
# tar writes the member to stdout, which is redirected into the target.
$(DIR)/test-$(SRC)$(TRG)-ref.$(TRG).sgm $(DIR)/test-$(SRC)$(TRG)-src.$(SRC).sgm: $(DIR)/test-%.sgm: $(DIR)/test.tgz
	tar --to-stdout -zxvf $< test/$(DOMAIN)test2016-$*.sgm > $@
# Download the WMT16 test-set archive into the working directory.
# wget creates its -O output file even when the transfer fails or is cut
# short, so fetch into a temporary name and rename only after wget succeeded;
# a broken download can then never pass for a finished test.tgz.
$(DIR)/test.tgz:
	mkdir -p $(@D)
	wget 'http://data.statmt.org/wmt16/translation-task/test.tgz' -O $@.part
	mv -f $@.part $@
# Fetch the model, vocabularies, BPE file, truecase model and config for
# $(DIRECTION) with a single run of the download script.
#
# This is deliberately a pattern rule (the % stands for the working
# directory): make treats a pattern rule with several targets as ONE recipe
# that produces all of them. Listing the files as targets of an ordinary
# rule defines an independent rule per file, and `make -j` may then start
# the download script several times concurrently in the same directory.
%/model.npz %/vocab.$(SRC).json %/vocab.$(TRG).json %/$(SRC)$(TRG).bpe %/truecase-model.$(SRC) %/config.yml: ../../scripts/download_models.py
	mkdir -p $(@D)
	$< -w $(@D) -m $(DIRECTION)

10
tests/wmt16/extract_segs.py Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Print the content of every <seg id="N">...</seg> element found on stdin.

Reads standard input line by line; for each line containing a segment
element, writes the text between the opening and closing tag to standard
output followed by a newline. Lines without a segment are skipped.
"""
import re
import sys

# Plain raw-string pattern: the lines read from stdin are byte strings under
# Python 2, so a unicode (ur'') pattern adds nothing, and the ur prefix is a
# syntax error on Python 3. Compiled once instead of on every line.
SEG_PATTERN = re.compile(r'<seg id="\d+">(.*)</seg>')

for line in sys.stdin:
    m = SEG_PATTERN.search(line)
    if m:
        # Same output as the print statement, spelled so it parses everywhere.
        sys.stdout.write(m.group(1) + '\n')