mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
more data for cy-en
This commit is contained in:
parent
93f03a1fe7
commit
14f6ef808a
@ -330,12 +330,77 @@ fiskmo-svfi-%:
|
||||
## a batch of interesting models ....
|
||||
|
||||
|
||||
## Fetch one Moses corpus from techiaith.cymru and convert it into gzipped
## cy/en files. The pattern stem is the corpus name; the corpus URLs listed
## in this file have the form .../corpws/Moses/<name>/<name>.tar.gz, so the
## stem is used both as the sub-directory and as the archive base name.
## The recipe assumes the archive unpacks <name>.cy and <name>.en into the
## current directory (as the original recipe did) -- TODO confirm.
## The .en.gz side file is written first and the rule's own target last, so
## a failure half-way through does not leave a target that looks up to date.
${DATADIR}/${PRE}/%.cy-en.clean.cy.gz:
	wget http://techiaith.cymru/corpws/Moses/$*/$*.tar.gz
	tar -xzf $*.tar.gz
	$(TOKENIZER)/detokenizer.perl -l en < $*.en |\
	$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${@:.cy.gz=.en.gz}
	$(TOKENIZER)/detokenizer.perl -l cy < $*.cy |\
	$(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > $@
	rm -f $*.tar.gz
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Run the train-dynamic target twice: first with "ga cy br gd kv gv" as
## source languages and en as target, then in the opposite direction
## (with TRG=ga SRC=en set explicitly). Both runs pass HELDOUTSIZE=0.
## Declared phony because it names a command, not a file to be built.
.PHONY: celtic-english
celtic-english:
	${MAKE} HELDOUTSIZE=0 SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
	${MAKE} HELDOUTSIZE=0 TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
|
||||
|
||||
|
||||
# more data for cy-en
|
||||
|
||||
## Same two train-dynamic runs as celtic-english, but with
## DATASET=opus+techiaith and the techiaith.cymru corpora passed in
## EXTRA_TRAINSET. Depends on the dic.cy-en file so that the extra Welsh
## data is downloaded first.
## Declared phony because it names a command, not a file to be built.
.PHONY: celtic-english-extra
celtic-english-extra: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		SRCLANGS="ga cy br gd kv gv" TRGLANGS=en train-dynamic
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic
|
||||
|
||||
|
||||
## also include backtranslations!
|
||||
|
||||
## English -> "ga cy br gd kv gv" only, using the opus+techiaith dataset and
## the extra techiaith.cymru corpora, via the train-dynamic-bt target
## (presumably the variant that adds back-translations -- confirm against
## the definition of train-dynamic-bt).
## Declared phony because it names a command, not a file to be built.
.PHONY: english-celtic-extra
english-celtic-extra: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
	${MAKE} DATASET=opus+techiaith HELDOUTSIZE=0 \
		EXTRA_TRAINSET="CofnodYCynulliad Deddfwriaeth Meddalwedd rhestr_geiriau dic" \
		TRGLANGS="ga cy br gd kv gv" SRCLANGS=en TRG=ga SRC=en train-dynamic-bt
|
||||
|
||||
|
||||
|
||||
## extra data from http://techiaith.cymru
|
||||
|
||||
# http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
|
||||
# http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
|
||||
# http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
|
||||
# http://techiaith.cymru/alinio/rhestr_geiriau.tsv
|
||||
# http://techiaith.cymru/alinio/hunalign/cy-en.dic
|
||||
|
||||
## Convenience alias: building it downloads and converts the extra Welsh data
## through the dic.cy-en prerequisite. Declared phony because it is not a file.
.PHONY: welsh-data
welsh-data: ${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz
|
||||
|
||||
## Download all extra Welsh-English resources from techiaith.cymru and write
## them as <name>.cy-en.clean.{cy,en}.gz into ${DATADIR}/${PRE}:
##   - the three Moses corpora (detokenized and detruecased),
##   - rhestr_geiriau.tsv: column 1 -> en, column 2 -> cy, starting at line 16
##     (the first 15 lines are skipped, presumably a header -- confirm),
##   - the hunalign dictionary cy-en.dic: text before '@' -> en, after -> cy.
## The dictionary .cy.gz file is this rule's target and is written last.
## Whitespace next to the '@' separator is stripped: trailing blanks on the
## en side (sed 's/ *$//' once make has reduced $$ to $) and leading blanks on
## the cy side. Downloaded files are removed afterwards; otherwise a re-run
## would let wget save to a ".1" name and the recipe would read the stale copy.
${DATADIR}/${PRE}/dic.cy-en.clean.cy.gz:
	for c in CofnodYCynulliad Deddfwriaeth Meddalwedd; do \
	  wget http://techiaith.cymru/corpws/Moses/$$c/$$c.tar.gz; \
	  tar -xzf $$c.tar.gz; \
	  $(TOKENIZER)/detokenizer.perl -l cy < $$c.cy |\
	  $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.cy.gz; \
	  $(TOKENIZER)/detokenizer.perl -l en < $$c.en |\
	  $(MOSESSCRIPTS)/recaser/detruecase.perl | gzip -c > ${DATADIR}/${PRE}/$$c.cy-en.clean.en.gz; \
	  rm -f $$c.tar.gz; \
	done
	wget http://techiaith.cymru/alinio/rhestr_geiriau.tsv
	tail -n +16 rhestr_geiriau.tsv | cut -f1 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.en.gz
	tail -n +16 rhestr_geiriau.tsv | cut -f2 | gzip -c > ${DATADIR}/${PRE}/rhestr_geiriau.cy-en.clean.cy.gz
	rm -f rhestr_geiriau.tsv
	wget http://techiaith.cymru/alinio/hunalign/cy-en.dic
	cut -f1 -d '@' < cy-en.dic | sed 's/ *$$//' | gzip -c > ${DATADIR}/${PRE}/dic.cy-en.clean.en.gz
	cut -f2 -d '@' < cy-en.dic | sed 's/^ *//' | gzip -c > $@
	rm -f cy-en.dic
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
LANGS_FR_VARIANTS = fr_BE fr_CA fr_FR
|
||||
LANGS_ES_VARIANTS = es_AR es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE
|
||||
LANGS_PT_VARIANTS = pt_br pt_BR pt_PT
|
||||
|
1
NOTES.md
1
NOTES.md
@ -7,6 +7,7 @@ http://techiaith.cymru/corpws/Moses/CofnodYCynulliad/CofnodYCynulliad.tar.gz
|
||||
http://techiaith.cymru/corpws/Moses/Deddfwriaeth/Deddfwriaeth.tar.gz
|
||||
http://techiaith.cymru/corpws/Moses/Meddalwedd/Meddalwedd.tar.gz
|
||||
http://techiaith.cymru/alinio/rhestr_geiriau.tsv
|
||||
http://techiaith.cymru/alinio/hunalign/cy-en.dic
|
||||
|
||||
(see work/data/cy-en)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user