2020-01-11 01:29:06 +03:00
|
|
|
#
|
|
|
|
# backtranslate wiki data
|
|
|
|
#
|
|
|
|
# only works with sentencepiece models!
|
|
|
|
#
|
|
|
|
|
|
|
|
include ../Makefile.env
|
|
|
|
include ../Makefile.config
|
|
|
|
include ../Makefile.slurm
|
|
|
|
|
|
|
|
## Default language pair: SRC is the language of the wiki text that gets
## translated, TRG is the language it is translated into.
SRC = af
TRG = en

## Wikimedia project to take the text from. Various sources are available:
## the general wikipedia (wiki), wikinews, wikibooks, ...
WIKISOURCE = wiki

## maximum input length (number of SentencePiece segments per sentence)
MAX_LENGTH = 100

## maximum number of sentences to be translated (top N lines);
## also the number of lines per part when the wiki text is split
MAX_SENTENCES = 1000000

## part of the split wiki text to work on (suffix in the file names)
PART = aa

LANGPAIR = $(SRC)-$(TRG)

## directory that is searched for model packages of this language pair
MODELHOME = ../models/$(LANGPAIR)
|
2020-01-12 02:10:53 +03:00
|
|
|
## Model package to use: the last name in sorted order among the packages
## in MODELHOME that match *-20*.zip (presumably date-stamped names, so
## this should be the most recent one). MODELNAME is the package name
## without directory and without the .zip extension.
MODELZIP  = $(lastword $(sort $(wildcard $(MODELHOME)/*-20*.zip)))
MODELNAME = $(notdir $(MODELZIP:.zip=))

## Nothing found there: fall back to ../work-langid/models, where only
## packages named opus-20*.zip are considered (instead of any *-20*.zip).
ifeq ($(MODELNAME),)
  MODELHOME = ../work-langid/models/$(LANGPAIR)
  MODELZIP  = $(lastword $(sort $(wildcard $(MODELHOME)/opus-20*.zip)))
  MODELNAME = $(notdir $(MODELZIP:.zip=))
endif
|
|
|
|
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
## Shell prefix that loads the nlpl-udpipe and nlpl-opus modules.
## The value ends in '&&' on purpose: the command that needs the modules
## is written directly after $(LOAD_MODULES).
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
               module load nlpl-udpipe nlpl-opus &&
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## Language IDs that have a dump of the selected WIKISOURCE in index.html:
## first all two-character IDs, then all three-character IDs (each group
## sorted and without duplicates). One grep per ID length; the leading '>'
## and the trailing "$(WIKISOURCE)-" of every match are stripped off.
WIKILANGS = $(foreach dots,.. ...,$(sort $(patsubst >%$(WIKISOURCE)-,%,$(shell grep -o '>$(dots)$(WIKISOURCE)-' index.html))))
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
|
2020-01-11 01:29:06 +03:00
|
|
|
## language ID used in the names of the wiki dumps and text files;
## the same as the source language unless it is overridden
LANGID   = $(SRC)

## extracted wiki text (one file per part)
WIKI_DIR = wiki/$(LANGID)
WIKI_TXT = $(WIKI_DIR)/$(WIKISOURCE).$(LANGID).$(PART).gz

## files created for the selected part, model and language pair:
##   WIKI_SRC - source sentences (restored from the segmented input)
##   WIKI_PRE - pre-processed (SentencePiece-segmented) decoder input
##   WIKI_TRG - translations
WIKI_SRC = $(LANGPAIR)/$(WIKISOURCE).$(PART)_$(MODELNAME).$(LANGPAIR).$(SRC).gz
WIKI_PRE = $(LANGPAIR)/$(WIKISOURCE).$(PART)_$(MODELNAME).$(LANGPAIR).$(SRC).spm.gz
WIKI_TRG = $(LANGPAIR)/$(WIKISOURCE).$(PART)_$(MODELNAME).$(LANGPAIR).$(TRG).gz
|
2020-01-12 02:10:53 +03:00
|
|
|
|
2020-01-29 22:46:18 +03:00
|
|
|
## all parts of this wiki: the part suffixes of the text files that
## already exist in WIKI_DIR for this source and language
PARTS = $(sort $(patsubst $(WIKI_DIR)/$(WIKISOURCE).$(LANGID).%.gz,%,\
          $(wildcard $(WIKI_DIR)/$(WIKISOURCE).$(LANGID).*.gz)))
|
|
|
|
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## keep the translated text even if the translation process crashes
.PRECIOUS: $(WIKI_TRG)

## find wiki downloads: name of the content dump for this language and
## source as listed in index.html (first match only)
WIKI_JSON = $(shell grep -o '$(LANGID)$(WIKISOURCE)-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)

## neither the json dump nor the pre-processed input need to be kept
.INTERMEDIATE: $(WIKI_JSON) $(WIKI_PRE)
|
2020-01-12 02:10:53 +03:00
|
|
|
|
2020-01-10 17:45:42 +03:00
|
|
|
|
2020-01-11 01:29:06 +03:00
|
|
|
## find UDPipe model
ifndef UDPIPE_MODELS
  UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif

## Language name for LANGID as printed by opus-iso639: first ';'-separated
## field, blanks replaced by '-', lower-cased.
## Simply expanded (:=) so that the module setup and the lookup run once
## per make invocation instead of on every reference.
LANGNAME := $(shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
		cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]')

## File name (without directory) of the first model found in UDPIPE_MODELS
## whose name starts with LANGNAME.
## Left empty when no language name is available: an empty prefix would
## turn the pattern into "*.udpipe" and pick an unrelated model.
UDPIPE_MODEL := $(if $(strip ${LANGNAME}),$(notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -n 1)))
|
|
|
|
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
|
|
|
|
## Make sure the download index exists, then build the source and the
## translation file in a sub-make (WIKI_JSON is read from index.html when
## the makefile is parsed, so presumably a fresh make run is needed after
## the index has been fetched).
## Declared phony so that a file named "all" cannot disable the goal.
.PHONY: all
all: index.html
	${MAKE} ${WIKI_SRC} ${WIKI_TRG}
|
|
|
|
|
2020-01-21 00:37:40 +03:00
|
|
|
|
2020-02-12 00:20:11 +03:00
|
|
|
## Wikimedia projects handled by all-wikis and all-wikitext
## (wiktionary is left out)
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource

## For every project in WIKISOURCES: extract the text and, if a text file
## for the current PART exists afterwards, run the translate.submit goal
## for it (presumably provided by ../Makefile.slurm to create a batch job).
## The echoed command now shows the same arguments as the command that is
## actually run (HPC_MEM=4g was missing in the message).
.PHONY: all-wikis
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
	  if [ `find ${WIKI_DIR} -name "$$w.${LANGID}.${PART}.gz" | wc -l` -gt 0 ]; then \
	    echo "${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit"; \
	    ${MAKE} WIKISOURCE=$$w HPC_MEM=4g HPC_CORES=1 WALLTIME=72 translate.submit; \
	  fi; \
	done
|
|
|
|
|
2020-03-20 16:32:29 +03:00
|
|
|
## extract the text of every project in WIKISOURCES (no translation)
## Declared phony: the goal never creates a file of that name.
.PHONY: all-wikitext
all-wikitext:
	for w in ${WIKISOURCES}; do \
	  ${MAKE} WIKISOURCE=$$w extract-text; \
	done
|
2020-01-21 00:37:40 +03:00
|
|
|
|
|
|
|
## extract the text of the selected WIKISOURCE for every language ID
## found in the download index (see WIKILANGS)
## Declared phony: the goal never creates a file of that name.
.PHONY: all-wikilangs
all-wikilangs: index.html
	for l in ${WIKILANGS}; do \
	  ${MAKE} LANGID=$$l extract-text; \
	done
|
|
|
|
|
2020-01-11 01:29:06 +03:00
|
|
|
|
2020-03-25 00:47:57 +03:00
|
|
|
|
|
|
|
## for Breton: use the multilingual celtic model to backtranslate
## NOTE(review): the model directory name contains "kv", which is the
## ISO 639-1 code for Komi; Cornish would be "kw" - confirm against the
## actual directory in ../models.
## Declared phony: the goal never creates a file of that name.
.PHONY: breton
breton:
	${MAKE} SRC=br TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis
|
|
|
|
|
|
|
|
## run all-wikis with the multilingual celtic model for every language
## code in the model name
## NOTE(review): "kv" is the ISO 639-1 code for Komi; Cornish would be
## "kw" - confirm the language list and the model directory name.
## Declared phony: the goal never creates a file of that name.
.PHONY: celtic
celtic:
	for l in ga cy br gd kv gv; do \
	  ${MAKE} SRC=$$l TRG=en MODELHOME=../models/ga+cy+br+gd+kv+gv-en all-wikis; \
	done
|
|
|
|
|
|
|
|
|
|
|
|
|
2020-01-22 14:33:28 +03:00
|
|
|
## run all-wikis (into English) for a fixed set of languages:
## Tagalog (tl, tgl), Central Bikol (bcl), Malayalam (ml, mal),
## Bengali (bn, ben), and Mongolian (mn, mon)
## Declared phony: the goal never creates a file of that name.
.PHONY: focus-wikis
focus-wikis:
	for l in tl bcl ml bn mn; do \
	  ${MAKE} SRC=$$l TRG=en all-wikis; \
	done
|
|
|
|
|
2020-02-08 01:19:21 +03:00
|
|
|
## Short names for the individual processing steps. They only group the
## files of a step and have no recipe of their own, so they are declared
## phony (no file of that name is ever created, and no implicit rule
## should be searched for them).
.PHONY: get-data extract-text prepare-model prepare-data translate
get-data:      ${WIKI_JSON}
extract-text:  ${WIKI_TXT}
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data:  ${WIKI_PRE}
translate:     ${WIKI_SRC} ${WIKI_TRG}
|
2020-01-10 17:45:42 +03:00
|
|
|
|
2020-01-29 22:46:18 +03:00
|
|
|
## translate all parts: one sub-make per part listed in PARTS, run one
## after the other
## Declared phony: the goal never creates a file of that name.
.PHONY: translate-all-parts
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate; \
	done
|
|
|
|
|
|
|
|
## create jobs for translating all parts: runs the translate.submit goal
## once per part listed in PARTS
## Declared phony: the goal never creates a file of that name.
.PHONY: submit-translate-all-parts
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
	for p in ${PARTS}; do \
	  ${MAKE} PART=$$p translate.submit; \
	done
|
|
|
|
|
|
|
|
|
|
|
|
|
2020-01-10 17:45:42 +03:00
|
|
|
## debugging aid: print the language name, the UDPipe model, the wiki dump
## and the translation model that were found for the current settings
## Declared phony: the goal never creates a file of that name.
.PHONY: print-names
print-names:
	echo ${LANGNAME}
	echo ${UDPIPE_MODEL}
	echo ${WIKI_JSON}
	echo ${MODELNAME}
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
##
## Copies the model package into a directory of its own and unpacks it
## there. Only the copied package is unpacked (no glob), and -o overwrites
## left-overs of an earlier, incomplete extraction instead of prompting,
## which would block a non-interactive job.
## Without a model package (empty MODELZIP) the rule has no recipe.
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	cp ${MODELZIP} ${dir $@}
	cd ${dir $@} && unzip -o ${notdir ${MODELZIP}}
endif
|
|
|
|
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## pre-process data
## ---> TODO: does that work for multilingual data that need prefix?
##
## Creates the decoder input for one part of the wiki text:
##  - lines that contain one of < > { } are dropped
##  - the preprocess.sh script of the model is applied with the model's
##    source.spm
##  - lines with more than MAX_LENGTH blank-separated segments are dropped
##  - only the first MAX_SENTENCES remaining lines are kept
## The prerequisite is named with LANGID, like WIKI_TXT, so that the rule
## also applies when LANGID is set to something else than SRC.
## The model is unpacked by a sub-make instead of being a prerequisite
## (the translation recipe removes the model directory again).
${LANGPAIR}/%.${PART}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: ${WIKI_DIR}/%.${LANGID}.${PART}.gz
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	zcat $< |\
	grep -v '[<>{}]' |\
	${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\
	perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
	head -n ${MAX_SENTENCES} |\
	gzip -c > $@
endif
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
|
2020-01-24 14:39:21 +03:00
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see
## above, so the source side is restored from the filtered decoder input)
##
## All blanks are removed, every '▁' becomes a blank, and leading and
## trailing blanks are stripped.
$(WIKI_SRC): $(WIKI_PRE)
ifneq ($(MODELZIP),)
	mkdir -p $(@D)
	zcat $< | \
	sed -e 's/ //g;s/▁/ /g' -e 's/^ *//;s/ *$$//' | \
	gzip -c > $@
endif
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
|
|
|
|
## translate
##
## Runs marian-decoder inside the unpacked model directory on the
## pre-processed input, merges the SentencePiece segments of the output
## (blanks removed, '▁' turned into a blank, outer blanks stripped) and
## compresses the result. Input and output are addressed via $(PWD)
## because the decoder is started after changing into the model directory.
## The unpacked model is removed afterwards; the nested conditionals make
## sure that rm is never called with an empty path component.
%.$(LANGPAIR).$(TRG).gz: %.$(LANGPAIR).$(SRC).spm.gz
ifneq ($(MODELZIP),)
	mkdir -p $(@D)
	$(MAKE) $(LANGPAIR)/$(MODELNAME)/decoder.yml
	$(LOADMODS) && cd $(LANGPAIR)/$(MODELNAME) && \
	$(MARIAN)/marian-decoder \
		-i $(PWD)/$< \
		-c decoder.yml \
		-d $(MARIAN_GPUS) \
		$(MARIAN_DECODER_FLAGS) | \
	sed -e 's/ //g;s/▁/ /g' -e 's/^ *//;s/ *$$//' | \
	gzip -c > $(PWD)/$@
ifneq ($(LANGPAIR),)
ifneq ($(MODELNAME),)
	rm -fr $(LANGPAIR)/$(MODELNAME)
endif
endif
endif
|
|
|
|
|
|
|
|
|
|
|
|
## index of all downloadable files
##
## Downloaded under a temporary name and renamed on success: wget -O
## creates the output file even when the download fails, which would
## leave an empty index.html that make considers up to date.
index.html:
	wget -nv -O $@.tmp https://dumps.wikimedia.org/other/cirrussearch/current || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
|
2020-01-11 01:29:06 +03:00
|
|
|
|
|
|
|
## wiki in json format
##
## Downloaded under a temporary name and renamed on success, so that a
## failed download cannot leave a truncated dump that looks complete
## (the rule has no prerequisites, so an existing file is never redone).
${WIKI_JSON}:
	wget -nv -O $@.tmp https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
|
2020-01-12 02:10:53 +03:00
|
|
|
## Sentence splitter used when extracting the wiki text:
## without a UDPipe model for the language, back off to the Moses
## split-sentences script; otherwise run the UDPipe tokenizer and keep
## only the content of its "# text = ..." lines.
ifeq ($(UDPIPE_MODEL),)
  SENTSPLITTER = $(MOSESSCRIPTS)/ems/support/split-sentences.perl -l $(LANGID)
else
  SENTSPLITTER = udpipe --input=horizontal --tokenize \
                   $(UDPIPE_MODELS)/$(UDPIPE_MODEL) |\
                 grep '^\# *text *= *' |\
                 sed 's/^\# *text *= *//'
endif
|
|
|
|
|
|
|
|
## extract sentences and normalize
## - requires jq, udpipe, and moses-scripts
##
## Steps: take the .text field of every json record, drop the "null"
## entries of records without text and all lines that contain one of
## < > { }, split into sentences, normalize punctuation and blanks, filter
## with mono-match-lang.py, and split the result into parts of
## MAX_SENTENCES lines that are compressed afterwards.
## One run therefore creates all parts, not only the target itself.
##  - grep -x: only lines that are exactly "null" are removed; without it
##    every sentence containing the substring "null" was lost
##  - the first sed expression needs two blanks ('  *' = one or more
##    blanks); with a single blank the pattern matches the empty string
##    and a blank would be inserted between all characters
${WIKI_TXT}: ${WIKI_JSON}
	mkdir -p ${dir $@}
	${LOAD_MODULES} \
	zcat $< | jq -r '.text' | \
	grep -v -x 'null' |\
	grep -v '[<>{}]' |\
	${SENTSPLITTER} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
	python3 ../scripts/filter/mono-match-lang.py -l ${LANGID} |\
	split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
	gzip -f ${patsubst %${PART}.gz,%,$@}*
|
|
|
|
|
|
|
|
|
2020-02-08 01:19:21 +03:00
|
|
|
# $(TOKENIZER)/normalize-punctuation.perl |\
|
|
|
|
|
|
|
|
|
2020-01-19 20:00:13 +03:00
|
|
|
|
|
|
|
## OLD: without splitting into parts
|
|
|
|
#
|
|
|
|
# ${WIKI_TXT}: ${WIKI_JSON}
|
|
|
|
# ${LOAD_MODULES} \
|
|
|
|
# zcat $< | jq -r '.text' | \
|
|
|
|
# grep -v 'null' |\
|
|
|
|
# ${SENTSPLITTER} |\
|
|
|
|
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
|
|
|
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
|
|
|
# $(TOKENIZER)/normalize-punctuation.perl |\
|
|
|
|
# sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
|
|
|
# gzip -c > $@
|
2020-01-10 17:45:42 +03:00
|
|
|
|
2020-01-11 01:29:06 +03:00
|
|
|
|
2020-01-10 17:45:42 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# afrikaans-afribooms-ud-2.4-190531.udpipe af
|
|
|
|
# ancient_greek-perseus-ud-2.4-190531.udpipe
|
|
|
|
# ancient_greek-proiel-ud-2.4-190531.udpipe
|
|
|
|
# arabic-padt-ud-2.4-190531.udpipe ar
|
|
|
|
# armenian-armtdp-ud-2.4-190531.udpipe
|
|
|
|
# basque-bdt-ud-2.4-190531.udpipe eu
|
|
|
|
# belarusian-hse-ud-2.4-190531.udpipe
|
|
|
|
# bulgarian-btb-ud-2.4-190531.udpipe bg
|
|
|
|
# catalan-ancora-ud-2.4-190531.udpipe ca
|
|
|
|
# chinese-gsd-ud-2.4-190531.udpipe zh
|
|
|
|
# classical_chinese-kyoto-ud-2.4-190531.udpipe zh_tw
|
|
|
|
# coptic-scriptorium-ud-2.4-190531.udpipe
|
|
|
|
# croatian-set-ud-2.4-190531.udpipe hr
|
|
|
|
# czech-cac-ud-2.4-190531.udpipe cs
|
|
|
|
# czech-cltt-ud-2.4-190531.udpipe cs
|
|
|
|
# czech-fictree-ud-2.4-190531.udpipe cs
|
|
|
|
# czech-pdt-ud-2.4-190531.udpipe cs
|
|
|
|
# danish-ddt-ud-2.4-190531.udpipe da
|
|
|
|
# dutch-alpino-ud-2.4-190531.udpipe nl
|
|
|
|
# dutch-lassysmall-ud-2.4-190531.udpipe nl
|
|
|
|
# english-ewt-ud-2.4-190531.udpipe en
|
|
|
|
# english-gum-ud-2.4-190531.udpipe en
|
|
|
|
# english-lines-ud-2.4-190531.udpipe en
|
|
|
|
# english-partut-ud-2.4-190531.udpipe en
|
|
|
|
# estonian-edt-ud-2.4-190531.udpipe et
|
|
|
|
# estonian-ewt-ud-2.4-190531.udpipe et
|
|
|
|
# finnish-ftb-ud-2.4-190531.udpipe fi
|
|
|
|
# finnish-tdt-ud-2.4-190531.udpipe fi
|
|
|
|
# french-gsd-ud-2.4-190531.udpipe fr
|
|
|
|
# french-partut-ud-2.4-190531.udpipe fr
|
|
|
|
# french-sequoia-ud-2.4-190531.udpipe fr
|
|
|
|
# french-spoken-ud-2.4-190531.udpipe fr
|
|
|
|
# galician-ctg-ud-2.4-190531.udpipe gl
|
|
|
|
# galician-treegal-ud-2.4-190531.udpipe gl
|
|
|
|
# german-gsd-ud-2.4-190531.udpipe de
|
|
|
|
# gothic-proiel-ud-2.4-190531.udpipe
|
|
|
|
# greek-gdt-ud-2.4-190531.udpipe el
|
|
|
|
# hebrew-htb-ud-2.4-190531.udpipe he
|
|
|
|
# hindi-hdtb-ud-2.4-190531.udpipe hi
|
|
|
|
# hungarian-szeged-ud-2.4-190531.udpipe hu
|
|
|
|
# indonesian-gsd-ud-2.4-190531.udpipe id
|
|
|
|
# irish-idt-ud-2.4-190531.udpipe ga
|
|
|
|
# italian-isdt-ud-2.4-190531.udpipe it
|
|
|
|
# italian-partut-ud-2.4-190531.udpipe it
|
|
|
|
# italian-postwita-ud-2.4-190531.udpipe it
|
|
|
|
# italian-vit-ud-2.4-190531.udpipe it
|
|
|
|
# japanese-gsd-ud-2.4-190531.udpipe ja
|
|
|
|
# korean-gsd-ud-2.4-190531.udpipe ko
|
|
|
|
# korean-kaist-ud-2.4-190531.udpipe ko
|
|
|
|
# latin-ittb-ud-2.4-190531.udpipe la
|
|
|
|
# latin-perseus-ud-2.4-190531.udpipe la
|
|
|
|
# latin-proiel-ud-2.4-190531.udpipe la
|
|
|
|
# latvian-lvtb-ud-2.4-190531.udpipe lv
|
|
|
|
# lithuanian-alksnis-ud-2.4-190531.udpipe lt
|
|
|
|
# lithuanian-hse-ud-2.4-190531.udpipe lt
|
|
|
|
# maltese-mudt-ud-2.4-190531.udpipe mt
|
|
|
|
# marathi-ufal-ud-2.4-190531.udpipe
|
|
|
|
# north_sami-giella-ud-2.4-190531.udpipe
|
|
|
|
# norwegian-bokmaal-ud-2.4-190531.udpipe nb
|
|
|
|
# norwegian-nynorsklia-ud-2.4-190531.udpipe nn
|
|
|
|
# norwegian-nynorsk-ud-2.4-190531.udpipe nn
|
|
|
|
# old_church_slavonic-proiel-ud-2.4-190531.udpipe
|
|
|
|
# old_french-srcmf-ud-2.4-190531.udpipe
|
|
|
|
# old_russian-torot-ud-2.4-190531.udpipe
|
|
|
|
# persian-seraji-ud-2.4-190531.udpipe fa
|
|
|
|
# polish-lfg-ud-2.4-190531.udpipe pl
|
|
|
|
# polish-pdb-ud-2.4-190531.udpipe pl
|
|
|
|
# portuguese-bosque-ud-2.4-190531.udpipe pt
|
|
|
|
# portuguese-gsd-ud-2.4-190531.udpipe pt
|
|
|
|
# romanian-nonstandard-ud-2.4-190531.udpipe ro
|
|
|
|
# romanian-rrt-ud-2.4-190531.udpipe ro
|
|
|
|
# russian-gsd-ud-2.4-190531.udpipe ru
|
|
|
|
# russian-syntagrus-ud-2.4-190531.udpipe ru
|
|
|
|
# russian-taiga-ud-2.4-190531.udpipe ru
|
|
|
|
# serbian-set-ud-2.4-190531.udpipe sr
|
|
|
|
# slovak-snk-ud-2.4-190531.udpipe sk
|
|
|
|
# slovenian-ssj-ud-2.4-190531.udpipe sl
|
|
|
|
# slovenian-sst-ud-2.4-190531.udpipe sl
|
|
|
|
# spanish-ancora-ud-2.4-190531.udpipe es
|
|
|
|
# spanish-gsd-ud-2.4-190531.udpipe es
|
|
|
|
# swedish-lines-ud-2.4-190531.udpipe sv
|
|
|
|
# swedish-talbanken-ud-2.4-190531.udpipe sv
|
|
|
|
# tamil-ttb-ud-2.4-190531.udpipe
|
|
|
|
# telugu-mtg-ud-2.4-190531.udpipe
|
|
|
|
# turkish-imst-ud-2.4-190531.udpipe tr
|
|
|
|
# ukrainian-iu-ud-2.4-190531.udpipe
|
|
|
|
# urdu-udtb-ud-2.4-190531.udpipe
|
|
|
|
# uyghur-udt-ud-2.4-190531.udpipe
|
|
|
|
# vietnamese-vtb-ud-2.4-190531.udpipe vi
|
|
|
|
# wolof-wtb-ud-2.4-190531.udpipe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-content.json.gz
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiki-20191209-cirrussearch-general.json.gz
|
|
|
|
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-content.json.gz
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikibooks-20191209-cirrussearch-general.json.gz
|
|
|
|
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-content.json.gz
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwikiquote-20191209-cirrussearch-general.json.gz
|
|
|
|
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-content.json.gz
|
|
|
|
# https://dumps.wikimedia.org/other/cirrussearch/current/afwiktionary-20191209-cirrussearch-general.json.gz
|
|
|
|
|
|
|
|
|
|
|
|
# enwiki-20191209-cirrussearch-content.json.gz 10-Dec-2019 11:04 22622822308
|
|
|
|
# enwiki-20191209-cirrussearch-content.json.gz.tmp 10-Dec-2019 10:57 21460369408
|
|
|
|
# enwiki-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:22 50406322974
|
|
|
|
# enwiki-20191209-cirrussearch-general.json.gz.tmp 10-Dec-2019 15:50 44720914432
|
|
|
|
# enwikibooks-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:24 319454731
|
|
|
|
# enwikibooks-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:25 97206925
|
|
|
|
# enwikinews-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:26 53746769
|
|
|
|
# enwikinews-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:36 364098656
|
|
|
|
# enwikiquote-20191209-cirrussearch-content.json.gz 10-Dec-2019 16:38 234637326
|
|
|
|
# enwikiquote-20191209-cirrussearch-general.json.gz 10-Dec-2019 16:38 66848855
|
|
|
|
# enwikisource-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:09 5236203374
|
|
|
|
# enwikisource-20191209-cirrussearch-content.json..> 10-Dec-2019 17:06 4597481472
|
|
|
|
# enwikisource-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:11 152492247
|
|
|
|
# enwikiversity-20191209-cirrussearch-content.jso..> 10-Dec-2019 17:12 145288148
|
|
|
|
# enwikiversity-20191209-cirrussearch-general.jso..> 10-Dec-2019 17:13 193051475
|
|
|
|
# enwikivoyage-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:14 179134384
|
|
|
|
# enwikivoyage-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:15 99357806
|
|
|
|
# enwiktionary-20191209-cirrussearch-content.json.gz 10-Dec-2019 17:36 2319801836
|
|
|
|
# enwiktionary-20191209-cirrussearch-content.json..> 10-Dec-2019 17:23 918503424
|
|
|
|
# enwiktionary-20191209-cirrussearch-general.json.gz 10-Dec-2019 17:42 848846623
|
|
|
|
# enwiktionary-20191209-cirrussearch-general.json..> 10-Dec-2019 17:40 661585920
|