mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-27 11:03:13 +03:00
backtranslation for Tatoeba data
This commit is contained in:
parent
f81a2ad638
commit
6537fdea13
@ -40,16 +40,20 @@ MODELHOME ?= ${MODELSDIR}/${LANGPAIR}
|
||||
## we need the UTF8 sort order
|
||||
## --> use bash sort and UTF8 locale
|
||||
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
ifneq (${wildcard ${MODELHOME}},)
|
||||
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
endif
|
||||
|
||||
ifeq (${MODELNAME},)
|
||||
## fall back to the models directory inside of WORKHOME
## (only if that directory exists for the selected language pair)
ifneq (${wildcard ${WORKHOME}/models/${LANGPAIR}},)
|
||||
MODELHOME = ../${notdir ${WORKHOME}}/models/${LANGPAIR}
|
||||
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
|
||||
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip | LANG=en_US.UTF-8 sort}}
|
||||
MODELZIP = ${lastword ${shell ls ${MODELHOME}/*-20*.zip 2>/dev/null | LANG=en_US.UTF-8 sort}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
endif
|
||||
endif
|
||||
|
||||
## set to 1 if the model for backtranslation is a multi-target model
|
||||
## --> need to use pre-processing script differently
|
||||
@ -263,7 +267,7 @@ all-wikis-all-langs: index.html
|
||||
done
|
||||
|
||||
|
||||
## aux function to pring the selected modelname
|
||||
## aux function to print the selected modelname
|
||||
.PHONY: print-modelname
|
||||
print-modelname:
|
||||
@echo ${MODELNAME}
|
||||
|
323
bt-tatoeba/Makefile
Normal file
323
bt-tatoeba/Makefile
Normal file
@ -0,0 +1,323 @@
|
||||
#
|
||||
# backtranslate wiki data with Tatoeba-MT challenge data
|
||||
#
|
||||
# only works with sentencepiece models!
|
||||
#
|
||||
|
||||
PWD := ${shell pwd}
|
||||
TOOLSDIR := ${PWD}/../tools
|
||||
|
||||
include ../lib/env.mk
|
||||
include ../lib/config.mk
|
||||
include ../lib/slurm.mk
|
||||
|
||||
SRC ?= fin
|
||||
TRG ?= eng
|
||||
|
||||
|
||||
|
||||
# TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge
|
||||
TATOEBA_STORAGE = https://object.pouta.csc.fi/Tatoeba-Challenge-WikiShuffled
|
||||
TATOEBA_GITRAW = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
|
||||
TATOEBA_RELEASED = ${TATOEBA_GITRAW}/models/released-model-results.txt
|
||||
TATOEBA_MODEL_STORAGE = https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
|
||||
|
||||
## various sources are available
|
||||
## can be general wikipedia, wikinews, wikibooks, ...
|
||||
# WIKISOURCE ?= wikipedia
|
||||
WIKISOURCE ?= wiki
|
||||
|
||||
## split size in nr-of-lines
|
||||
## default part to be selected = aa
|
||||
SPLIT_SIZE ?= 1000000
|
||||
|
||||
|
||||
## maximum input length (number of SentencePiece segments)
|
||||
## maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH ?= 100
|
||||
MAX_SENTENCES ?= ${SPLIT_SIZE}
|
||||
|
||||
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
|
||||
PWD := $(shell pwd)
|
||||
|
||||
|
||||
|
||||
# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
|
||||
MODELZIP := ${shell wget -qq -O - ${TATOEBA_GITRAW}/models/released-model-results.txt |\
|
||||
grep '^${LANGPAIR}' | head -1 | cut -f4}
|
||||
MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
MULTI_TARGET_MODEL := ${shell wget -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
|
||||
ifneq (${MULTI_TARGET_MODEL},0)
|
||||
TARGET_LANG_LABEL := ${shell wget -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<'}
|
||||
endif
|
||||
|
||||
RELEASED_WIKIS := $(patsubst %.tar,%,${shell wget -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
|
||||
grep -o 'WikiShuffled/...\.tar' | cut -f2 -d'/'})
|
||||
|
||||
|
||||
WIKI_DIR = ${PWD}/wiki
|
||||
LANGID = ${SRC}
|
||||
PART = aa
|
||||
OUTPUT_DIR = ${LANGPAIR}
|
||||
WIKI_TXT = ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.${PART}.gz
|
||||
WIKI_SRC = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_PRE = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
|
||||
WIKI_TRG = ${OUTPUT_DIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
|
||||
|
||||
WIKI_LATEST_SRC = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_LATEST_TRG = ${OUTPUT_DIR}/latest/${WIKISOURCE}.${PART}.${LANGPAIR}.${TRG}.gz
|
||||
|
||||
|
||||
## all parts of this wiki
|
||||
PARTS = ${sort ${patsubst ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,%,\
|
||||
${wildcard ${WIKI_DIR}/${LANGID}/${WIKISOURCE}.??.gz}}}
|
||||
|
||||
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
||||
WIKISOURCES = ${sort $(patsubst %.txt.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.txt.gz})) \
|
||||
$(patsubst %.${PART}.gz,%,$(notdir ${wildcard ${WIKI_DIR}/${LANGID}/*.${PART}.gz}))}
|
||||
|
||||
|
||||
## targets for all parts of the current wiki source
|
||||
|
||||
ALLWIKIPARTS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/${WIKISOURCE}.%.gz,${PARTS}}
|
||||
ALLWIKIPARTS_SRC = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${PARTS}}
|
||||
ALLWIKIPARTS_PRE = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${PARTS}}
|
||||
ALLWIKIPARTS_TRG = ${patsubst %,${OUTPUT_DIR}/${WIKISOURCE}.%_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${PARTS}}
|
||||
|
||||
ALLWIKIPARTS_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${SRC}.gz,${PARTS}}
|
||||
ALLWIKIPARTS_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/${WIKISOURCE}.%.${LANGPAIR}.${TRG}.gz,${PARTS}}
|
||||
|
||||
|
||||
## all wiki sources for the selected part
|
||||
|
||||
ALLWIKIS_TXT = ${patsubst %,${WIKI_DIR}/${LANGID}/%.${PART}.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_SRC = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_PRE = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz,${WIKISOURCES}}
|
||||
ALLWIKIS_TRG = ${patsubst %,${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz,${WIKISOURCES}}
|
||||
|
||||
|
||||
|
||||
## don't delete translated text if the process crashes
|
||||
.PRECIOUS: ${WIKI_TRG}
|
||||
|
||||
|
||||
ifdef LOCAL_SCRATCH
|
||||
TMPDIR = ${LOCAL_SCRATCH}
|
||||
endif
|
||||
|
||||
ifeq (${shell hostname --domain 2>/dev/null},bullx)
|
||||
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
|
||||
module load nlpl-udpipe nlpl-opus &&
|
||||
endif
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: translate
|
||||
|
||||
all-jobs: fetch
|
||||
${MAKE} prepare-allwikis
|
||||
${MAKE} translate-all-jobs
|
||||
|
||||
all2eng:
|
||||
for w in ${filter-out eng,${RELEASED_WIKIS}}; do \
|
||||
echo "make SRC=$$w TRG=eng all-jobs"; \
|
||||
done
|
||||
|
||||
|
||||
.PHONY: fetch
|
||||
fetch: ${WIKI_DIR}/${SRC}
|
||||
|
||||
.PHONY: prepare
|
||||
prepare: ${LANGPAIR}/${MODELNAME}/decoder.yml ${WIKI_TXT}
|
||||
|
||||
.PHONY: prepare-allwikis
|
||||
prepare-allwikis: ${LANGPAIR}/${MODELNAME}/decoder.yml ${ALLWIKIS_TXT}
|
||||
|
||||
.PHONY: translate
|
||||
translate: ${WIKI_LATEST_TRG}
|
||||
${MAKE} ${WIKI_LATEST_SRC}
|
||||
|
||||
## translate all parts
|
||||
.PHONY: translate-all-parts
|
||||
translate-all-parts: ${ALLWIKIPARTS_LATEST_TRG}
|
||||
${MAKE} ${ALLWIKIPARTS_LATEST_SRC}
|
||||
|
||||
## translate all wikis and all parts
|
||||
## run translate-all-parts once for every wiki source in WIKISOURCES
## --> WIKISOURCE has to be passed on to the sub-make,
##     otherwise each iteration would handle the default source again
.PHONY: translate-all
translate-all:
	for s in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$s translate-all-parts; \
	done
|
||||
|
||||
## create jobs for translating all parts
|
||||
.PHONY: translate-all-parts-jobs
|
||||
translate-all-parts-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for p in ${PARTS}; do \
|
||||
${MAKE} PART=$$p translate.submit; \
|
||||
done
|
||||
|
||||
## create jobs for translating all parts of all wikis
|
||||
.PHONY: translate-all-jobs
|
||||
translate-all-jobs: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
for s in ${WIKISOURCES}; do \
|
||||
${MAKE} WIKISOURCE=$$s translate-all-parts-jobs; \
|
||||
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## aux target: show the model that is selected for backtranslation
## (model name, zip file, info file, multi-target flag, target label)
.PHONY: print-modelinfo
print-modelinfo:
	@echo ${MODELNAME}; \
	echo ${MODELZIP}; \
	echo ${MODELINFO}; \
	echo "multi-target model: ${MULTI_TARGET_MODEL}"; \
	echo "target language label: ${TARGET_LANG_LABEL}"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## fetch the latest model
|
||||
## ---> TODO: should we fetch from ObjectStorage instead?
|
||||
|
||||
## download and unpack the model package into ${LANGPAIR}/${MODELNAME}
## (does nothing if no released model was found, i.e. MODELZIP is empty)
##
## the preprocess.sh script from the package is kept as preprocess-old.sh
## and a patched copy is created in which everything from 'perl -C -pe'
## to the end of the line is replaced by a perl call with a negative
## lookahead for newline characters

${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p $(@D)
	wget -O $(@D)/model.zip ${MODELZIP}
	cd $(@D) && unzip model.zip
	rm -f $(@D)/model.zip
	mv $(@D)/preprocess.sh $(@D)/preprocess-old.sh
	sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
		< $(@D)/preprocess-old.sh > $(@D)/preprocess.sh
	chmod +x $(@D)/preprocess.sh
endif
|
||||
|
||||
|
||||
## pre-process data
|
||||
|
||||
ifeq (${MULTI_TARGET_MODEL},1)
|
||||
PREPROCESS_ARGS = ${SRC} ${TRG} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
else
|
||||
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
${WIKI_DIR}/${SRC}/%.${PART}.gz: | ${WIKI_DIR}/${SRC}
|
||||
${GZCAT} ${@:.${PART}.gz=.txt.gz} |\
|
||||
split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
|
||||
gzip -f ${patsubst %${PART}.gz,%,$@}??
|
||||
|
||||
|
||||
${WIKI_DIR}/${SRC}/%.txt.gz: ${WIKI_DIR}/${SRC}
|
||||
echo "done!"
|
||||
|
||||
|
||||
${WIKI_DIR}/${SRC}:
|
||||
mkdir -p $@
|
||||
wget -O $@.tar ${TATOEBA_STORAGE}/${SRC}.tar
|
||||
tar -C ${dir $@} -xf $@.tar
|
||||
if [ -d ${WIKI_DIR}/data/${SRC} ]; then \
|
||||
mv ${WIKI_DIR}/data/${SRC}/*.txt.gz $@/;\
|
||||
rm -f ${WIKI_DIR}/data/${SRC}/*;\
|
||||
rmdir ${WIKI_DIR}/data/${SRC};\
|
||||
rmdir ${WIKI_DIR}/data;\
|
||||
fi
|
||||
if [ -e ${WIKI_DIR}/${SRC}/wiki.${SRC}.aa.gz ]; then \
|
||||
for f in `ls ${WIKI_DIR}/${SRC}/*.${SRC}.*`; do \
|
||||
mv $$f `echo $$f | sed 's/\.${SRC}\././'`; \
|
||||
done \
|
||||
fi
|
||||
rm -f $@.tar
|
||||
|
||||
${OUTPUT_DIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/${SRC}/%.${PART}.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${GZCAT} $< |\
|
||||
grep -v '[<>{}]' |\
|
||||
${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
gzip -f > $@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because we filter out some data from the original wiki text, see above)
|
||||
|
||||
${WIKI_SRC}: ${WIKI_PRE}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${GZCAT} $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
sed 's/^>>[a-z]*<< //' |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
## overwrite the file with the latest translations
|
||||
## --> this allows multiple translation iterations
|
||||
## without duplicating the data we want to use in MT training
|
||||
|
||||
## copy the source side of the current translation to the 'latest' directory
${WIKI_LATEST_SRC}: ${WIKI_SRC}
	mkdir -p $(@D)
	cp $< $@

## copy the translated (target) side to the 'latest' directory
${WIKI_LATEST_TRG}: ${WIKI_TRG}
	mkdir -p $(@D)
	cp $< $@
|
||||
|
||||
|
||||
|
||||
## translate
|
||||
|
||||
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN_DECODER} \
|
||||
-i ${PWD}/$< \
|
||||
-c decoder.yml \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > ${PWD}/$@
|
||||
#ifneq (${LANGPAIR},)
|
||||
#ifneq (${MODELNAME},)
|
||||
# rm -fr ${LANGPAIR}/${MODELNAME}
|
||||
#endif
|
||||
#endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
## sanity check: compare the number of lines of source and target files
## in all sub-directories that look like language pairs (xxx-yyy)
## and report the file pairs that differ in length
##
## NOTE: the sed expression needs double quotes; with single quotes the
##       shell does not expand the language IDs and the name of the
##       target file would be identical to the name of the source file

check-length:
	for d in `find . -maxdepth 1 -type d -name '*-*' -printf "%f "`; do \
	  s=`echo $$d | cut -f1 -d'-'`; \
	  t=`echo $$d | cut -f2 -d'-'`; \
	  echo "check $$d"; \
	  for S in `ls $$d/*.$$s.gz`; do \
	    T=`echo $$S | sed "s/\.$$s\.gz$$/.$$t.gz/"`; \
	    echo "$$S -- $$T"; \
	    ns=`${GZCAT} $$S | wc -l`; \
	    nt=`${GZCAT} $$T | wc -l`; \
	    echo $$ns; \
	    echo $$nt; \
	    if [ "$$ns" != "$$nt" ]; then \
	      echo "$$S != $$T"; \
	    fi \
	  done \
	done
|
||||
|
||||
|
@ -36,6 +36,46 @@ tatoeba-memad-bilingual:
|
||||
|
||||
|
||||
|
||||
tatoeba-memad-dist:
|
||||
${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
|
||||
MODELTYPE=transformer-align \
|
||||
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
|
||||
${MAKE} TRGLANGS="${MEMAD_LANGS3}" SRCLANGS="eng" \
|
||||
TATOEBA_RELEASEDIR=models-memad \
|
||||
TATOEBA_MODELSHOME=models-memad \
|
||||
MODELTYPE=transformer-align release-tatoeba-1m
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
|
||||
MODELTYPE=transformer-align \
|
||||
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="eng" \
|
||||
TATOEBA_RELEASEDIR=models-memad \
|
||||
TATOEBA_MODELSHOME=models-memad \
|
||||
MODELTYPE=transformer-align release-tatoeba-1m
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
|
||||
SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
|
||||
MODELTYPE=transformer-align \
|
||||
tatoeba-multilingual-eval-1m compare-tatoeba-1m eval-testsets-tatoeba-1m
|
||||
${MAKE} SRCLANGS="${MEMAD_LANGS3}" TRGLANGS="${MEMAD_LANGS3}" \
|
||||
SKIP_LANGPAIRS="deu-deu|eng-eng|fin-fin|fra-fra|nld-nld|swe-swe" \
|
||||
TATOEBA_RELEASEDIR=models-memad \
|
||||
TATOEBA_MODELSHOME=models-memad \
|
||||
MODELTYPE=transformer-align release-tatoeba-1m
|
||||
@for s in ${MEMAD_LANGS3}; do \
|
||||
for t in ${MEMAD_LANGS3}; do \
|
||||
if [ "$$s" != "$$t" ]; then \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
|
||||
MODELTYPE=transformer-align \
|
||||
tatoeba-multilingual-eval compare-tatoeba eval-testsets-tatoeba; \
|
||||
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
|
||||
TATOEBA_RELEASEDIR=models-memad \
|
||||
TATOEBA_MODELSHOME=models-memad \
|
||||
MODELTYPE=transformer-align release-tatoeba; \
|
||||
fi \
|
||||
done \
|
||||
done
|
||||
|
||||
|
||||
|
||||
#----------------------------------------------------------------
|
||||
# fine-tuning on YLE subtitle data
|
||||
#----------------------------------------------------------------
|
||||
@ -50,12 +90,22 @@ MEMAD_SUBTYPE = FIN-SWE
|
||||
MEMAD_LANGPAIR = fin2swe
|
||||
MEMAD_TUNETASK = tune
|
||||
|
||||
|
||||
tatoeba-yletune-all: tatoeba-yletune-finswe-all tatoeba-yletune-swefin-all
|
||||
tatoeba-yletune-finswe-all: tatoeba-yletune-finswe tatoeba-yletune-fihswe \
|
||||
tatoeba-yletune-finswh tatoeba-yletune-fihswh tatoeba-yletune-fisw
|
||||
tatoeba-yletune-swefin-all: tatoeba-yletune-swefin tatoeba-yletune-swefih \
|
||||
tatoeba-yletune-swhfin tatoeba-yletune-swhfih tatoeba-yletune-swfi
|
||||
|
||||
tatoeba-yleeval-all:
|
||||
${MAKE} MEMAD_TUNETASK=tuneeval tatoeba-yletune-all
|
||||
|
||||
tatoeba-yledist-all:
|
||||
${MAKE} MEMAD_TUNETASK=tunedist \
|
||||
TATOEBA_RELEASEDIR=models-memad-tuned \
|
||||
TATOEBA_MODELSHOME=models-memad-tuned \
|
||||
tatoeba-yletune-all
|
||||
|
||||
|
||||
tatoeba-yletune-finswe:
|
||||
${MAKE} MEMAD_SUBTYPE=FIN-SWE MEMAD_LANGPAIR=fin2swe tatoeba-yletune
|
||||
|
@ -116,7 +116,8 @@ TATOEBA_DEVSET = Tatoeba-dev
|
||||
TATOEBA_TESTSET = Tatoeba-test
|
||||
TATOEBA_DEVSET_NAME = Tatoeba-dev
|
||||
TATOEBA_TESTSET_NAME = Tatoeba-test
|
||||
|
||||
TATOEBA_RELEASEDIR = ${PWD}/models-tatoeba
|
||||
TATOEBA_MODELSHOME = ${PWD}/models-tatoeba
|
||||
|
||||
TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
|
||||
DEVSET=${TATOEBA_DEVSET} \
|
||||
@ -130,8 +131,8 @@ TATOEBA_PARAMS := TRAINSET=${TATOEBA_TRAINSET} \
|
||||
TESTSIZE=10000 \
|
||||
DEVMINSIZE=200 \
|
||||
WORKHOME=${TATOEBA_WORK} \
|
||||
MODELSHOME=${PWD}/models-tatoeba \
|
||||
RELEASEDIR=${PWD}/models-tatoeba \
|
||||
MODELSHOME=${TATOEBA_MODELSHOME} \
|
||||
RELEASEDIR=${TATOEBA_RELEASEDIR} \
|
||||
MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
|
||||
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
|
||||
ALT_MODEL_DIR=tatoeba \
|
||||
@ -407,6 +408,40 @@ tatoeba-wiki2eng-macro:
|
||||
tatoeba-print-missing-wiki:
|
||||
@echo $(filter-out ${WIKILANGS},${WIKIMACROLANGS})
|
||||
|
||||
tatoeba-wiki2eng-parent:
|
||||
for l in ${WIKIMACROLANGS}; do \
|
||||
if [ ! `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
|
||||
echo "check $$l-eng"; \
|
||||
if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | wc -l` -gt 0 ]; then \
|
||||
echo "check data size of $$l-eng"; \
|
||||
if [ `find work-tatoeba/$$l-eng/train -name 'opus.src.clean.spm*.gz' 2>/dev/null | xargs zcat | head -100000 | wc -l` -lt 100000 ]; then \
|
||||
p=`langgroup -p $$l`; \
|
||||
echo "${MAKE} SRCLANGS=$$p TRGLANGS=eng tatoeba-$${p}2eng-train-1m"; \
|
||||
fi \
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
## print the status of all xxx-eng models for the languages in WIKIMACROLANGS:
##  - released model (zip file in models-tatoeba)
##  - trained but not released (*.done file in work-tatoeba),
##    together with the BLEU score if an eval file exists,
##    or a hint about the command that has to be run otherwise

tatoeba-wiki2eng-done:
	for l in ${WIKIMACROLANGS}; do \
	  if [ `find models-tatoeba/$$l-eng -name '*.zip' 2>/dev/null | wc -l` -gt 0 ]; then \
	    echo "model available for $$l-eng"; \
	  elif [ `find work-tatoeba/$$l-eng -name '*.done' 2>/dev/null | wc -l` -gt 0 ]; then \
	    echo -n "model available for $$l-eng but not released"; \
	    if [ `find work-tatoeba/$$l-eng -name '*.eval' 2>/dev/null | wc -l` -gt 0 ]; then \
	      echo -n ", BLEU = "; \
	      grep BLEU work-tatoeba/$$l-eng/*eval | head -1 | cut -f3 -d' '; \
	    elif [ ! -e work-tatoeba/$$l-eng/test/Tatoeba-test.src ]; then \
	      echo ", missing eval file"; \
	      echo "make TATOEBA_WORK=work-tatoeba-tmp SRCLANGS=$$l TRGLANGS=eng data-tatoeba"; \
	    else \
	      echo ", run 'make tatoeba-$${l}2eng-evalall'"; \
	    fi \
	  fi \
	done
|
||||
|
||||
|
||||
|
||||
###########################################################################################
|
||||
# language groups
|
||||
###########################################################################################
|
||||
@ -1266,7 +1301,7 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
|
||||
nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
|
||||
syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
|
||||
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
|
||||
ang ara_Latn aze_Latn bul_Latn ell_Latn heb_Latn rus_Latn
|
||||
ang ara_Latn bul_Latn ell_Latn heb_Latn rus_Latn
|
||||
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
|
||||
|
||||
## modify language IDs in training data to adjust them to test sets
|
||||
@ -1279,7 +1314,7 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
|
||||
| sed 's/\_[A-Z][A-Z]//g' \
|
||||
| sed 's/\-[a-z]*//g' \
|
||||
| sed 's/jpn_[A-Za-z]*/jpn/g' \
|
||||
| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;s/apc_Latn/apc/' \
|
||||
| sed 's/ara_Latn/ara/;s/arq_Latn/arq/;' \
|
||||
| sed 's/kor_[A-Za-z]*/kor/g' \
|
||||
| sed 's/nor_Latn/nor/g' \
|
||||
| sed 's/nor/nob/g' \
|
||||
@ -1336,6 +1371,7 @@ ${TATOEBA_MONO}/%.labels:
|
||||
-tar -C $@.d -xf $@.d/train.tar
|
||||
rm -f $@.d/train.tar
|
||||
if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \
|
||||
echo "........ move test files to ${dir $@}Tatoeba-test.${LANGPAIR}.clean.*"; \
|
||||
mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
cat $@.d/data/${LANGPAIR}/test.id $(FIXLANGIDS) > ${dir $@}Tatoeba-test.${LANGPAIR}.clean.id; \
|
||||
@ -1400,11 +1436,13 @@ ${TATOEBA_MONO}/%.labels:
|
||||
echo "extract $$s-$$t data"; \
|
||||
for d in dev test train; do \
|
||||
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
|
||||
echo "........ make ${dir $@}Tatoeba-$$d.$$s-$$t"; \
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$s-$$t; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$s-$$t ]; then \
|
||||
echo "........ compress to ${dir $@}Tatoeba-$$d.$$s-$$t.clean.*.gz"; \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$s.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$s-$$t | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$s-$$t.clean.$$t.gz; \
|
||||
fi; \
|
||||
@ -1415,11 +1453,13 @@ ${TATOEBA_MONO}/%.labels:
|
||||
echo "extract $$t-$$s data"; \
|
||||
for d in dev test train; do \
|
||||
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id ]; then \
|
||||
echo "........ make ${dir $@}Tatoeba-$$d.$$t-$$s"; \
|
||||
paste ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.id \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} \
|
||||
${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} |\
|
||||
grep -P "$$s\t$$t\t" > ${dir $@}Tatoeba-$$d.$$t-$$s; \
|
||||
if [ -s ${dir $@}Tatoeba-$$d.$$t-$$s ]; then \
|
||||
echo "........ compress to ${dir $@}Tatoeba-$$d.$$t-$$s.clean.*.gz"; \
|
||||
cut -f3 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$t.gz; \
|
||||
cut -f4 ${dir $@}Tatoeba-$$d.$$t-$$s | ${GZIP} -c > ${dir $@}Tatoeba-$$d.$$t-$$s.clean.$$s.gz; \
|
||||
fi; \
|
||||
@ -1439,6 +1479,7 @@ ${TATOEBA_MONO}/%.labels:
|
||||
for d in dev test train; do \
|
||||
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT} ]; then \
|
||||
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
|
||||
echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}"; \
|
||||
${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
else \
|
||||
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${SRCEXT}; \
|
||||
@ -1446,6 +1487,7 @@ ${TATOEBA_MONO}/%.labels:
|
||||
fi; \
|
||||
if [ -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT} ]; then \
|
||||
if [ ! -e ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}.gz ]; then \
|
||||
echo "........... compress ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}"; \
|
||||
${GZIP} ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
else \
|
||||
rm -f ${dir $@}Tatoeba-$$d.${LANGPAIR}.clean.${TRGEXT}; \
|
||||
@ -1482,6 +1524,7 @@ ${TATOEBA_MONO}/%.labels:
|
||||
|
||||
|
||||
|
||||
|
||||
test-tune-data:
|
||||
make SRCEXT=bre TRGEXT=eng LANGPAIR=bre-eng \
|
||||
work-tatoeba-test/data/simple/Tatoeba-OpenSubtitles-train.bre-eng.clean.bre.gz
|
||||
@ -1964,3 +2007,30 @@ fixlabels.sh:
|
||||
fi \
|
||||
fi \
|
||||
done
|
||||
|
||||
|
||||
## re-create missing test files for language pairs that have a model
## in work-tatoeba/xxx-yyy/train but no test/Tatoeba-test.src:
##  - copy the model files to a temporary work directory (work-tatoeba-tmp)
##  - run the data target there with small FIT_DATA_SIZE values
##  - copy the resulting test files back and remove the temporary files
##
## the sub-make is called via ${MAKE} to pass on flags and the jobserver
## and the test directory is created before copying files into it

tatoeba-missing-test:
	for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
	  if [ ! -e work-tatoeba/$$d/test/Tatoeba-test.src ]; then \
	    if [ `find work-tatoeba/$$d/train -name '*-model' | wc -l` -gt 0 ]; then \
	      p=`echo $$d | sed 's/-/2/'`; \
	      echo "missing eval file for $$d"; \
	      mkdir -p work-tatoeba-tmp/$$d/train; \
	      rsync -av work-tatoeba/$$d/train/*model* work-tatoeba-tmp/$$d/train/; \
	      ${MAKE} FIT_DATA_SIZE=1000 LANGGROUP_FIT_DATA_SIZE=1000 TATOEBA_WORK=work-tatoeba-tmp tatoeba-$$p-data; \
	      mkdir -p work-tatoeba/$$d/test; \
	      cp work-tatoeba-tmp/$$d/test/Tatoeba-test.* work-tatoeba/$$d/test/; \
	      rm -fr work-tatoeba-tmp/$$d; \
	    fi \
	  fi \
	done
|
||||
|
||||
|
||||
## set the time stamps of existing test files (Tatoeba-test.src* and
## Tatoeba-test.trg*) to the time stamp of val/Tatoeba-dev.src
## for all language pair directories (xxx-yyy) in work-tatoeba

tatoeba-touch-test:
	for d in `find work-tatoeba/ -maxdepth 1 -type d -name '???-???' | cut -f2 -d/`; do \
	  ref=work-tatoeba/$$d/val/Tatoeba-dev.src; \
	  tst=work-tatoeba/$$d/test/Tatoeba-test; \
	  if [ -e $$tst.src ] && [ -e $$ref ]; then \
	    touch -r $$ref $$tst.src*; \
	    touch -r $$ref $$tst.trg*; \
	  fi \
	done
|
||||
|
@ -61,9 +61,9 @@ endif
|
||||
%.submitcpu:
|
||||
mkdir -p ${WORKDIR}
|
||||
echo '#!/bin/bash -l' > $@
|
||||
echo '#SBATCH -J "${LANGPAIRSTR}${@:.submitcpu=}"' >>$@
|
||||
echo '#SBATCH -o ${LANGPAIRSTR}${@:.submitcpu=}.out.%j' >> $@
|
||||
echo '#SBATCH -e ${LANGPAIRSTR}${@:.submitcpu=}.err.%j' >> $@
|
||||
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submitcpu=}"' >>$@
|
||||
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.out.%j' >> $@
|
||||
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.err.%j' >> $@
|
||||
echo '#SBATCH --mem=${HPC_MEM}' >> $@
|
||||
ifdef EMAIL
|
||||
echo '#SBATCH --mail-type=END' >> $@
|
||||
|
@ -2,8 +2,15 @@
|
||||
#
|
||||
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
|
||||
#
|
||||
#
|
||||
# replace SPMENCODE with your own setup!
|
||||
#
|
||||
# CHANGES
|
||||
#
|
||||
# * issue with perl code that removes control characters
|
||||
# (unicode property Other = \p{C}) seems to remove
|
||||
# newline characters as well --> add negative lookahead
|
||||
# to avoid removing newline characters!
|
||||
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
@ -90,7 +97,7 @@ else
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/\p{C}/ /g;' |
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;'
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $3 |
|
||||
sed "s/^/>>$2<< /"
|
||||
|
@ -3,6 +3,14 @@
|
||||
# USAGE preprocess.sh langid spmodel < input > output
|
||||
#
|
||||
# replace SPMENCODE with your own setup!
|
||||
#
|
||||
# CHANGES
|
||||
#
|
||||
# * issue with perl code that removes control characters
|
||||
# (unicode property Other = \p{C}) seems to remove
|
||||
# newline characters as well --> add negative lookahead
|
||||
# to avoid removing newline characters!
|
||||
#
|
||||
|
||||
if [ `hostname -d` == "bullx" ]; then
|
||||
APPLHOME=/projappl/project_2001569
|
||||
@ -49,7 +57,7 @@ sed -e 's/,/,/g' \
|
||||
-e 's/【/\[/g' \
|
||||
-e 's/】/\]/g' \
|
||||
-e 's/%/\%/g' |
|
||||
perl -C -pe 's/\p{C}/ /g;' |
|
||||
perl -C -pe 's/(?!\n)\p{C}/ /g;'
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $2
|
||||
|
||||
|
@ -30,6 +30,18 @@ ${TICO19_TEST}: %/tico19-test.en.gz: tico19-testset/test/test.%.tsv
|
||||
cut -f4 $< | tail -n +2 | sed 's/^ *//;s/ *$$//' | gzip -c > ${@:en.gz=${patsubst en-%/,%,$(dir $@)}}.gz
|
||||
|
||||
|
||||
TICODATADIRS = $(sort $(subst /,,${dir ${wildcard */tico19-test.*}}))
|
||||
## create test set directories for the reverse translation direction:
## for each directory xx-yy in TICODATADIRS make a directory yy-xx
## with symbolic links to the tico19 files in xx-yy
##
## the links are created in a sub-shell to make sure that a failing
## 'cd' cannot change the working directory of the remaining iterations;
## errors are ignored (leading '-')

crosslink-tico:
	-for d in ${TICODATADIRS}; do \
	  s=`echo "$$d" | cut -f1 -d'-'`; \
	  t=`echo "$$d" | cut -f2 -d'-'`; \
	  mkdir -p $$t-$$s && \
	  ( cd $$t-$$s && ln -s ../$$d/tico19* . ); \
	done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
1
testsets/am-en/tico19-test.am.gz
Symbolic link
1
testsets/am-en/tico19-test.am.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-am/tico19-test.am.gz
|
1
testsets/am-en/tico19-test.amh.gz
Symbolic link
1
testsets/am-en/tico19-test.amh.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-am/tico19-test.amh.gz
|
1
testsets/am-en/tico19-test.en.gz
Symbolic link
1
testsets/am-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-am/tico19-test.en.gz
|
1
testsets/am-en/tico19-test.eng.gz
Symbolic link
1
testsets/am-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-am/tico19-test.eng.gz
|
1
testsets/amh-eng/tico19-test.am.gz
Symbolic link
1
testsets/amh-eng/tico19-test.am.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-amh/tico19-test.am.gz
|
1
testsets/amh-eng/tico19-test.amh.gz
Symbolic link
1
testsets/amh-eng/tico19-test.amh.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-amh/tico19-test.amh.gz
|
1
testsets/amh-eng/tico19-test.en.gz
Symbolic link
1
testsets/amh-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-amh/tico19-test.en.gz
|
1
testsets/amh-eng/tico19-test.eng.gz
Symbolic link
1
testsets/amh-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-amh/tico19-test.eng.gz
|
1
testsets/ar-en/tico19-test.ar.gz
Symbolic link
1
testsets/ar-en/tico19-test.ar.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ar/tico19-test.ar.gz
|
1
testsets/ar-en/tico19-test.ara.gz
Symbolic link
1
testsets/ar-en/tico19-test.ara.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ar/tico19-test.ara.gz
|
1
testsets/ar-en/tico19-test.en.gz
Symbolic link
1
testsets/ar-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ar/tico19-test.en.gz
|
1
testsets/ar-en/tico19-test.eng.gz
Symbolic link
1
testsets/ar-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ar/tico19-test.eng.gz
|
1
testsets/ara-eng/tico19-test.ar.gz
Symbolic link
1
testsets/ara-eng/tico19-test.ar.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ara/tico19-test.ar.gz
|
1
testsets/ara-eng/tico19-test.ara.gz
Symbolic link
1
testsets/ara-eng/tico19-test.ara.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ara/tico19-test.ara.gz
|
1
testsets/ara-eng/tico19-test.en.gz
Symbolic link
1
testsets/ara-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ara/tico19-test.en.gz
|
1
testsets/ara-eng/tico19-test.eng.gz
Symbolic link
1
testsets/ara-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ara/tico19-test.eng.gz
|
1
testsets/ben-eng/tico19-test.ben.gz
Symbolic link
1
testsets/ben-eng/tico19-test.ben.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ben/tico19-test.ben.gz
|
1
testsets/ben-eng/tico19-test.bn.gz
Symbolic link
1
testsets/ben-eng/tico19-test.bn.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ben/tico19-test.bn.gz
|
1
testsets/ben-eng/tico19-test.en.gz
Symbolic link
1
testsets/ben-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ben/tico19-test.en.gz
|
1
testsets/ben-eng/tico19-test.eng.gz
Symbolic link
1
testsets/ben-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-ben/tico19-test.eng.gz
|
1
testsets/bn-en/tico19-test.ben.gz
Symbolic link
1
testsets/bn-en/tico19-test.ben.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-bn/tico19-test.ben.gz
|
1
testsets/bn-en/tico19-test.bn.gz
Symbolic link
1
testsets/bn-en/tico19-test.bn.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-bn/tico19-test.bn.gz
|
1
testsets/bn-en/tico19-test.en.gz
Symbolic link
1
testsets/bn-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-bn/tico19-test.en.gz
|
1
testsets/bn-en/tico19-test.eng.gz
Symbolic link
1
testsets/bn-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-bn/tico19-test.eng.gz
|
1
testsets/en-es/tico19-test.en.gz
Symbolic link
1
testsets/en-es/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.en.gz
|
1
testsets/en-es/tico19-test.eng.gz
Symbolic link
1
testsets/en-es/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.eng.gz
|
1
testsets/en-es/tico19-test.es.gz
Symbolic link
1
testsets/en-es/tico19-test.es.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.es-LA.gz
|
1
testsets/en-es/tico19-test.spa.gz
Symbolic link
1
testsets/en-es/tico19-test.spa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.spa.gz
|
1
testsets/en-es_LA/tico19-test.eng.gz
Symbolic link
1
testsets/en-es_LA/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en.gz
|
1
testsets/en-es_LA/tico19-test.spa.gz
Symbolic link
1
testsets/en-es_LA/tico19-test.spa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.es-LA.gz
|
1
testsets/en-pt/tico19-test.en.gz
Symbolic link
1
testsets/en-pt/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-pt_BR/tico19-test.en.gz
|
1
testsets/en-pt/tico19-test.eng.gz
Symbolic link
1
testsets/en-pt/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-pt_BR/tico19-test.en.gz
|
1
testsets/en-pt/tico19-test.por.gz
Symbolic link
1
testsets/en-pt/tico19-test.por.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-pt_BR/tico19-test.pt-BR.gz
|
1
testsets/en-pt/tico19-test.pt.gz
Symbolic link
1
testsets/en-pt/tico19-test.pt.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-pt_BR/tico19-test.pt-BR.gz
|
1
testsets/en-pt_BR/tico19-test.eng.gz
Symbolic link
1
testsets/en-pt_BR/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en.gz
|
1
testsets/en-pt_BR/tico19-test.pob.gz
Symbolic link
1
testsets/en-pt_BR/tico19-test.pob.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.pt-BR.gz
|
1
testsets/en-ti/tico19-test.en-ti_ER.en.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ER.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ti_ER/tico19-test.en.gz
|
1
testsets/en-ti/tico19-test.en-ti_ER.eng.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ER.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en-ti_ER.en.gz
|
1
testsets/en-ti/tico19-test.en-ti_ER.ti.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ER.ti.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ti_ER/tico19-test.ti_ER.gz
|
1
testsets/en-ti/tico19-test.en-ti_ER.tir.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ER.tir.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en-ti_ER.ti.gz
|
1
testsets/en-ti/tico19-test.en-ti_ET.en.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ET.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ti_ET/tico19-test.en.gz
|
1
testsets/en-ti/tico19-test.en-ti_ET.eng.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ET.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en-ti_ET.en.gz
|
1
testsets/en-ti/tico19-test.en-ti_ET.ti.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ET.ti.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ti_ET/tico19-test.ti_ET.gz
|
1
testsets/en-ti/tico19-test.en-ti_ET.tir.gz
Symbolic link
1
testsets/en-ti/tico19-test.en-ti_ET.tir.gz
Symbolic link
@ -0,0 +1 @@
|
||||
tico19-test.en-ti_ET.ti.gz
|
1
testsets/eng-pob
Symbolic link
1
testsets/eng-pob
Symbolic link
@ -0,0 +1 @@
|
||||
en-pt_BR
|
1
testsets/eng-por
Symbolic link
1
testsets/eng-por
Symbolic link
@ -0,0 +1 @@
|
||||
en-pt
|
1
testsets/es-en/tico19-test.en.gz
Symbolic link
1
testsets/es-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es/tico19-test.en.gz
|
1
testsets/es-en/tico19-test.eng.gz
Symbolic link
1
testsets/es-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es/tico19-test.eng.gz
|
1
testsets/es-en/tico19-test.es.gz
Symbolic link
1
testsets/es-en/tico19-test.es.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es/tico19-test.es.gz
|
1
testsets/es-en/tico19-test.spa.gz
Symbolic link
1
testsets/es-en/tico19-test.spa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es/tico19-test.spa.gz
|
1
testsets/es_LA-en/tico19-test.en.gz
Symbolic link
1
testsets/es_LA-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.en.gz
|
1
testsets/es_LA-en/tico19-test.eng.gz
Symbolic link
1
testsets/es_LA-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.eng.gz
|
1
testsets/es_LA-en/tico19-test.es-LA.gz
Symbolic link
1
testsets/es_LA-en/tico19-test.es-LA.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.es-LA.gz
|
1
testsets/es_LA-en/tico19-test.spa.gz
Symbolic link
1
testsets/es_LA-en/tico19-test.spa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-es_LA/tico19-test.spa.gz
|
1
testsets/fa-en/tico19-test.en.gz
Symbolic link
1
testsets/fa-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fa/tico19-test.en.gz
|
1
testsets/fa-en/tico19-test.eng.gz
Symbolic link
1
testsets/fa-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fa/tico19-test.eng.gz
|
1
testsets/fa-en/tico19-test.fa.gz
Symbolic link
1
testsets/fa-en/tico19-test.fa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fa/tico19-test.fa.gz
|
1
testsets/fa-en/tico19-test.fas.gz
Symbolic link
1
testsets/fa-en/tico19-test.fas.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fa/tico19-test.fas.gz
|
1
testsets/fas-eng/tico19-test.en.gz
Symbolic link
1
testsets/fas-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-fas/tico19-test.en.gz
|
1
testsets/fas-eng/tico19-test.eng.gz
Symbolic link
1
testsets/fas-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-fas/tico19-test.eng.gz
|
1
testsets/fas-eng/tico19-test.fa.gz
Symbolic link
1
testsets/fas-eng/tico19-test.fa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-fas/tico19-test.fa.gz
|
1
testsets/fas-eng/tico19-test.fas.gz
Symbolic link
1
testsets/fas-eng/tico19-test.fas.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-fas/tico19-test.fas.gz
|
1
testsets/fr-en/tico19-test.en.gz
Symbolic link
1
testsets/fr-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fr/tico19-test.en.gz
|
1
testsets/fr-en/tico19-test.eng.gz
Symbolic link
1
testsets/fr-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fr/tico19-test.eng.gz
|
1
testsets/fr-en/tico19-test.fr.gz
Symbolic link
1
testsets/fr-en/tico19-test.fr.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fr/tico19-test.fr.gz
|
1
testsets/fr-en/tico19-test.fra.gz
Symbolic link
1
testsets/fr-en/tico19-test.fra.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-fr/tico19-test.fra.gz
|
1
testsets/ha-en/tico19-test.en.gz
Symbolic link
1
testsets/ha-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ha/tico19-test.en.gz
|
1
testsets/ha-en/tico19-test.eng.gz
Symbolic link
1
testsets/ha-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ha/tico19-test.eng.gz
|
1
testsets/ha-en/tico19-test.ha.gz
Symbolic link
1
testsets/ha-en/tico19-test.ha.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ha/tico19-test.ha.gz
|
1
testsets/ha-en/tico19-test.hau.gz
Symbolic link
1
testsets/ha-en/tico19-test.hau.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-ha/tico19-test.hau.gz
|
1
testsets/hau-eng/tico19-test.en.gz
Symbolic link
1
testsets/hau-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-hau/tico19-test.en.gz
|
1
testsets/hau-eng/tico19-test.eng.gz
Symbolic link
1
testsets/hau-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-hau/tico19-test.eng.gz
|
1
testsets/hau-eng/tico19-test.ha.gz
Symbolic link
1
testsets/hau-eng/tico19-test.ha.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-hau/tico19-test.ha.gz
|
1
testsets/hau-eng/tico19-test.hau.gz
Symbolic link
1
testsets/hau-eng/tico19-test.hau.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-hau/tico19-test.hau.gz
|
1
testsets/hi-en/tico19-test.en.gz
Symbolic link
1
testsets/hi-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-hi/tico19-test.en.gz
|
1
testsets/hi-en/tico19-test.eng.gz
Symbolic link
1
testsets/hi-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-hi/tico19-test.eng.gz
|
1
testsets/hi-en/tico19-test.hi.gz
Symbolic link
1
testsets/hi-en/tico19-test.hi.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-hi/tico19-test.hi.gz
|
1
testsets/hi-en/tico19-test.hin.gz
Symbolic link
1
testsets/hi-en/tico19-test.hin.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-hi/tico19-test.hin.gz
|
1
testsets/id-en/tico19-test.en.gz
Symbolic link
1
testsets/id-en/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-id/tico19-test.en.gz
|
1
testsets/id-en/tico19-test.eng.gz
Symbolic link
1
testsets/id-en/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-id/tico19-test.eng.gz
|
1
testsets/id-en/tico19-test.id.gz
Symbolic link
1
testsets/id-en/tico19-test.id.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-id/tico19-test.id.gz
|
1
testsets/id-en/tico19-test.msa.gz
Symbolic link
1
testsets/id-en/tico19-test.msa.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../en-id/tico19-test.msa.gz
|
1
testsets/kau-eng/tico19-test.en.gz
Symbolic link
1
testsets/kau-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-kau/tico19-test.en.gz
|
1
testsets/kau-eng/tico19-test.eng.gz
Symbolic link
1
testsets/kau-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-kau/tico19-test.eng.gz
|
1
testsets/kau-eng/tico19-test.kau.gz
Symbolic link
1
testsets/kau-eng/tico19-test.kau.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-kau/tico19-test.kau.gz
|
1
testsets/kau-eng/tico19-test.kr.gz
Symbolic link
1
testsets/kau-eng/tico19-test.kr.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-kau/tico19-test.kr.gz
|
1
testsets/khm-eng/tico19-test.en.gz
Symbolic link
1
testsets/khm-eng/tico19-test.en.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-khm/tico19-test.en.gz
|
1
testsets/khm-eng/tico19-test.eng.gz
Symbolic link
1
testsets/khm-eng/tico19-test.eng.gz
Symbolic link
@ -0,0 +1 @@
|
||||
../eng-khm/tico19-test.eng.gz
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user