more data for cy-en

This commit is contained in:
Joerg Tiedemann 2020-03-25 20:40:29 +02:00
parent 93f03a1fe7
commit 14f6ef808a
2 changed files with 66 additions and 0 deletions

View File

@ -330,12 +330,77 @@ fiskmo-svfi-%:
## a batch of interesting models ....
## Download one Moses-formatted corpus from techiaith.cymru and convert it
## into gzipped Welsh ($@) and English (same name with .en.gz) files.
##
## The pattern stem ($*) is the corpus name, e.g. CofnodYCynulliad.
## On the server each tarball lives in a sub-directory named after the
## corpus (.../Moses/<name>/<name>.tar.gz), so the stem appears twice in
## the URL.  The tarball is expected to unpack <name>.cy and <name>.en into
## the current directory -- TODO confirm for corpora other than the three
## used by the explicit dic.cy-en rule.
${DATADIR}/${PRE}/%.cy-en.clean.cy.gz:
	wget http://techiaith.cymru/corpws/Moses/$*/$*.tar.gz
	tar -xzf $*.tar.gz
	$(TOKENIZER)/detokenizer.perl -l cy < $*.cy |\
	$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > $@
	$(TOKENIZER)/detokenizer.perl -l en < $*.en |\
	$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${@:.cy.gz=.en.gz}
	rm -f $*.tar.gz
## Train two multilingual models via the train-dynamic target:
##   1. the listed languages -> English
##   2. English -> the listed languages (SRC/TRG pinned to en/ga)
## Both runs pass HELDOUTSIZE=0.
## NOTE(review): `kv` is the ISO 639-1 code for Komi; Cornish is `kw`.
## Confirm which language is intended in this list.
.PHONY: celtic-english
celtic-english:
	${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
	${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
## Same two training runs as celtic-english, but with more data for cy-en:
## the techiaith resources are added through EXTRA_TRAINSET and the data set
## is labelled opus+techiaith.
## The prerequisite is the dictionary file; its rule downloads and converts
## every resource named in EXTRA_TRAINSET, so all of them exist before
## training starts.
.PHONY: celtic-english-extra
celtic-english-extra: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
## English -> Celtic only, with the extra techiaith data and also including
## back-translations (train-dynamic-bt instead of train-dynamic).
## The dictionary prerequisite makes sure the techiaith files named in
## EXTRA_TRAINSET have been downloaded before training starts.
.PHONY: english-celtic-extra
english-celtic-extra: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic-bt
## Extra Welsh-English data from http://techiaith.cymru
##
## Sources:
##   http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
##   http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
##   http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
##   http://techiaith.cymru/alinio/rhestr_geiriau.tsv
##   http://techiaith.cymru/alinio/hunalign/cy-en.dic
##
## welsh-data is a convenience alias: it only requests the dictionary file,
## whose rule fetches and converts all of the sources above.
.PHONY: welsh-data
welsh-data: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
## Fetch every techiaith resource and store each one as a pair of gzipped
## files <name>.cy-en.clean.cy.gz / <name>.cy-en.clean.en.gz in
## ${DATADIR}/${PRE}.  The rule target (Welsh side of the dictionary) is the
## last file written, so it only exists once all earlier steps have run.
## Downloads and unpacked files land in the current directory.
${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
# 1) Moses corpora: detokenize and detruecase both language sides.
	for c in CofnodYCynulliad Deddfwriaeth Meddalwedd; do \
	  wget http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \
	  tar -xzf $$c.tar.gz; \
	  $(TOKENIZER)/detokenizer.perl -l cy < $$c.cy |\
	  $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.cy.gz; \
	  $(TOKENIZER)/detokenizer.perl -l en < $$c.en |\
	  $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.en.gz; \
	  rm -f $$c.tar.gz; \
	done
# 2) Word list (TSV): the first 15 lines are skipped (presumably a header --
#    TODO confirm); column 1 is stored as English, column 2 as Welsh.
	wget http://techiaith.cymru/alinio/rhestr_geiriau.tsv
	tail -n +16 rhestr_geiriau.tsv | cut -f1 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.en.gz
	tail -n +16 rhestr_geiriau.tsv | cut -f2 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.cy.gz
	rm -f rhestr_geiriau.tsv
# 3) hunalign dictionary, lines of the form "<field1> @ <field2>": field 1
#    is stored as English, field 2 as Welsh.  Blanks next to the '@' are
#    stripped: trailing ones from field 1 (the shell sees 's/ *$//'), leading
#    ones from field 2.  Spaces inside multi-word entries are kept.
	wget http://techiaith.cymru/alinio/hunalign/cy-en.dic
	cut -f1 -d '@' < cy-en.dic | sed 's/ *$$//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.en.gz
	cut -f2 -d '@' < cy-en.dic | sed 's/^ *//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
# Remove the download so a later run does not reuse a stale copy
# (wget would save the fresh file as cy-en.dic.1).
	rm -f cy-en.dic
## Regional variant codes (language_REGION) for French, Spanish and Portuguese.
LANGS_FR_VARIANTS = fr_BE fr_CA fr_FR
LANGS_ES_VARIANTS = es_AR es_CL es_CO es_CR es_DO es_EC \
                    es_ES es_GT es_HN es_MX es_NI es_PA \
                    es_PE es_PR es_SV es_UY es_VE
LANGS_PT_VARIANTS = pt_br pt_BR pt_PT

View File

@ -7,6 +7,7 @@ http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
http://techiaith.cymru/alinio/rhestr_geiriau.tsv
http://techiaith.cymru/alinio/hunalign/cy-en.dic
(see work/data/cy-en)