Major revision of m4m. May not be backwards-compatible.

This commit is contained in:
Ulrich Germann 2014-05-18 21:53:19 +01:00
parent ff6a27b56e
commit 9cda90afdf
14 changed files with 234 additions and 147 deletions

View File

@ -64,6 +64,6 @@ LMODELS :=
LMODEL_ENTRIES :=
endef
clear-locks:
@rm -rf `find -L -type d -name '*.lock'`
clear-locks: | $(shell find -L -type d -name '*.lock')
rm -rf $|

View File

@ -2,7 +2,7 @@
moses.threads ?= 4
moses.flags += -threads ${moses.threads}
moses.flags += -v 0 -t -text-type "test"
moses.flags += -v 0 -t -text-type "test" -fd '${FACTORSEP}'
%.multi-bleu: | %.cleaned
$(lock)
@ -35,19 +35,16 @@ moses.ini ?=
# $1: output base name
# $2: system to be evaluated
# $3: evaluation input
# $4: evaluation input type
# $5: evaluation reference
define bleu_eval
EVALUATIONS += $1
.INTERMEDIATE: $3 $5
$1: moses.ini := $2
$1: moses.input := $3
$1: moses.inputtype := $4
$1: bleu.ref := $5
$1: moses.inputtype := $(call guess-inputtype,$3)
$1: bleu.ref := $$(shell echo $(patsubst %.${L1},%.${L2},$3) | perl -pe 's?/cfn[^/]+/?/cased/?')
$1.moses-out: | $2 $3
$1.multi-bleu: | $5
$1.multi-bleu: | $(call reffiles,$3,$(dir $(patsubst %/,%,$(dir $3))))
$1: | $1.multi-bleu
endef
@ -63,7 +60,8 @@ $(foreach system,${SYSTEMS},\
$(foreach tuneset,${tune.sets},\
$(foreach evalset,${eval.sets},\
$(foreach run,$(shell seq ${tune.runs}),\
$(eval $(call bleu_eval,${system}/eval/$(notdir ${tuneset})/${run}/$(notdir ${evalset}),\
$(eval $(call bleu_eval,\
${system}/eval/$(notdir ${tuneset})/${run}/$(notdir ${evalset}),\
${system}/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
${evalset}.${L1},${moses.inputtype.plaintext},${evalset}.${L2}))))))

View File

@ -18,6 +18,7 @@ TUNED_SYSTEMS :=
DTABLES :=
PTABLES :=
LMODELS :=
INPUT_FEATURES ?=
export MY_EXPERIMENT :=

View File

@ -3,7 +3,7 @@
# default parameters
kenlm.order ?= 5
kenlm.memory ?= 30%
kenlm.memory ?= 10%
kenlm.type ?= probing
kenlm.lazy ?= 1
kenlm.factor ?= 0

View File

@ -5,10 +5,10 @@
m4mdir := $(patsubst %modules/,%,\
$(dir $(word $(words $(MAKEFILE_LIST)),\
$(MAKEFILE_LIST))))
# $(info M4MDIR is ${m4mdir})
# m4m modules to be included
M4M_MODULES := aux init
#M4M_MODULES += directory-structure
M4M_MODULES += tools moses-parameters prepare-corpus
M4M_MODULES += mgiza fastalign mmbitext phrase-table moses-ini
M4M_MODULES += tune-moses eval-system kenlm

View File

@ -16,7 +16,6 @@ mkcls_args = -n10 -c50
%/mgiza.cfg: m4=3
%/mgiza.cfg: nodumps=0
%/mgiza.cfg: onlyaldumps=0
%/mgiza.cfg: model1dumpfrequency=0
%/mgiza.cfg: model4smoothfactor=0.4
%/mgiza.cfg: nsmooth=4
%/mgiza.cfg: NCPUS=8
@ -62,7 +61,7 @@ $(gizout)/${L1}-${L2}.symal.gz: | $(gizout)/${L1}-${L2}.A3.final.gz
$(gizout)/${L1}-${L2}.symal.gz: | $(gizout)/${L2}-${L1}.A3.final.gz $(giza2bal.pl)
$(lock)
$(giza2bal.pl) -d 'gunzip -c ${A3fwd}' -i 'gunzip -c ${A3bwd}' \
| $(symal) $(symal_args) | gzip > $@_ && mv $@_ $@
| $(symal) $(symal_args) | perl -pe 's/^.*{##}\s+//' | gzip > $@_ && mv $@_ $@
$(unlock)
# merge alignments produced by mgiza
@ -93,8 +92,8 @@ mkcls_cmd = $(call stream,$(1),$(2)) | ${mkcls} $(mkcls_args) -p/dev/stdin -V$(3
# mkcls: -p: input data
# mkcls: -V: word classes (output)
$(giztmp)/%.vcb.classes: | $(gizaln.in)
$(giztmp)/%.vcb.classes: ${mkcls}
$(giztmp)/%.vcb.classes: | $(gizaln.in)
@echo CREATING $@
$(lock)
mkdir -p $(@D)
@ -107,9 +106,12 @@ $(giztmp)/${L1}.vcb: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L1}.vcb: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}.vcb: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}.snt: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L1}-${L2}.snt: L1files = $(addprefix $(pll-clean), .${L1}.gz)
$(giztmp)/${L1}-${L2}.snt: L2files = $(addprefix $(pll-clean), .${L2}.gz)
$(giztmp)/${L1}-${L2}.snt: | $(giztmp) $(addprefix $(pll-clean), .${L1}.gz .${L2}.gz)
#$(info $(addprefix $(pll-clean), .${L1}.gz))
$(giztmp)/${L1}-${L2}.snt: L1files = $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: L2files = $(addsuffix .${L2}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(giztmp)
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L2}.gz, $(pll-clean))
$(lock)
$(plain2snt) \
<(ls $(L1files) | xargs zcat -f) \
@ -167,8 +169,12 @@ $(giztmp)/${L2}-${L1}/mgiza.cfg: | \
@echo "mh ${mh}" >> $@
@echo "m3 ${m3}" >> $@
@echo "m4 ${m4}" >> $@
@echo "t1 ${m1}" >> $@
@echo "t2 ${m2}" >> $@
@echo "th ${mh}" >> $@
@echo "t3 ${m3}" >> $@
@echo "t4 ${m4}" >> $@
@echo "o ${ODIR}" >> $@
@echo "model1dumpfrequency ${model1dumpfrequency}" >> $@
@echo "model4smoothfactor ${model4smoothfactor}" >> $@
@echo "onlyaldumps ${onlyaldumps}" >> $@
@echo "nodumps ${nodumps}" >> $@

View File

@ -5,12 +5,12 @@
define mmap_ttrack
.INTERMEDIATE += $(strip $1).txt.gz
$2/$(notdir $1).mct: | $2/$(notdir $1).sfa
$2/$(notdir $1).mct: | $2/$(notdir $1).tdx
$2/$(notdir $1).tdx: | $2/$(notdir $1).sfa
$2/$(notdir $1).sfa: | $(strip $1).txt.gz
$$(lock)
zcat -f $$< | ${MOSES_BIN}/mtt-build -i -o $$@.lock/$$(basename $${@F})
zcat -f $(strip $1).txt.gz \
| ${MOSES_BIN}/mtt-build -i -o $$@.lock/$$(basename $${@F})
mv $$@.lock/$$(basename $${@F}).tdx $${@D}
mv $$@.lock/$$(basename $${@F}).sfa $${@D}
mv $$@.lock/$$(basename $${@F}).mct $${@D}
@ -24,12 +24,21 @@ define mmap_bitext
$(call mmap_ttrack,$1${L1},$2)
$(call mmap_ttrack,$1${L2},$2)
$2/$(notdir $1)${L1}-${L2}.mam: SYMAL = $(strip $1)${L1}-${L2}.symal.gz
$2/$(notdir $1)${L1}-${L2}.mam: | $(strip $1)${L1}-${L2}.symal.gz
$$(lock)
zcat -f $$< | ${MOSES_BIN}/symal2mam $$@_ && mv $$@_ $$@
zcat -f $${SYMAL} | ${MOSES_BIN}/symal2mam $$@_ && mv $$@_ $$@
$$(unlock)
.INTERMEDIATE += $(strip $1)${L1}-${L2}.symal.gz
$2/$(notdir $1)${L1}-${L2}.lex: | $2/$(notdir $1)${L1}.mct
$2/$(notdir $1)${L1}-${L2}.lex: | $2/$(notdir $1)${L2}.mct
$2/$(notdir $1)${L1}-${L2}.lex: | $2/$(notdir $1)${L1}-${L2}.mam
$$(lock)
${MOSES_BIN}/mmlex-build $2/$(notdir $1) ${L1} ${L2} \
-o $$@.lock/$${@F} -c $$@.lock/$$(basename $${@F}).coc
mv $$@.lock/$${@F} $${@D}
mv $$@.lock/$$(basename $${@F}).coc $${@D}
$$(unlock)
endef

View File

@ -13,7 +13,6 @@ moses.ini_ttable-limit = 20
moses.ini_distortion-limit = 6
moses.ini_v = 0
weight_vector = perl -ne \
'm/name=([^; ]+)/;\
print "$$1=";\
@ -24,35 +23,37 @@ define create_moses_ini
$(strip $1)/moses.ini.0: ${PTABLES} ${DTABLES} ${LMODELS} ${MOSES_INI_PREREQ}
$$(lock)
echo '[input-factors]' > $$@_
echo '[input-factors]' > $$@_
echo '$${moses.ini_input-factors}' >> $$@_
echo >> $$@_
echo '[search-algorithm]' >> $$@_
echo >> $$@_
echo '[search-algorithm]' >> $$@_
echo '$${moses.ini_search-algorithm}' >> $$@_
echo >> $$@_
echo '[stack]' >> $$@_
echo >> $$@_
echo '[stack]' >> $$@_
echo '$${moses.ini_stack}' >> $$@_
echo >> $$@_
echo '[cube-pruning-pop-limit]' >> $$@_
echo >> $$@_
echo '[cube-pruning-pop-limit]' >> $$@_
echo '$${moses.ini_cube-pruning-pop-limit}' >> $$@_
echo >> $$@_
echo '[mapping]' >> $$@_
echo >> $$@_
echo '[mapping]' >> $$@_
echo '$${moses.ini_mapping}' >> $$@_
echo >> $$@_
echo '[distortion-limit]' >> $$@_
echo >> $$@_
echo '[distortion-limit]' >> $$@_
echo '$${moses.ini_distortion-limit}' >> $$@_
echo >> $$@_
echo '[v]' >> $$@_
echo >> $$@_
echo '[v]' >> $$@_
echo '$${moses.ini_v}' >> $$@_
echo >> $$@_
echo '[feature]' >> $$@_
$$(foreach f,${STANDARD_FEATURES},echo $$f >> $$@_;)
$$(foreach pt,${PTABLE_ENTRIES},echo "$$(subst ;, ,$${pt})" >> $$@_;)
$$(foreach dt,${DTABLE_ENTRIES},echo "$$(subst ;, ,$${dt})" >> $$@_;)
$$(foreach lm,${LMODEL_ENTRIES},echo "$$(subst ;, ,$${lm})" >> $$@_;)
echo >> $$@_
echo '[weight]' >> $$@_
$$(foreach x,$(STANDARD_FEATURES),echo "$$x0= 1.0" >> $$@_;)
echo >> $$@_
echo '[feature]' >> $$@_
$$(foreach f,${STANDARD_FEATURES},echo $$f >> $$@_;)
$$(foreach i,${INPUT_FEATURES},echo "$$(subst ;, ,$${i})" >> $$@_;)
$$(foreach pt,${PTABLE_ENTRIES},echo "$$(subst ;, ,$${pt})" >> $$@_;)
$$(foreach dt,${DTABLE_ENTRIES},echo "$$(subst ;, ,$${dt})" >> $$@_;)
$$(foreach lm,${LMODEL_ENTRIES},echo "$$(subst ;, ,$${lm})" >> $$@_;)
echo >> $$@_
echo '[weight]' >> $$@_
$$(foreach x,$(STANDARD_FEATURES),echo "$$x0= 1.0" >> $$@_;)
$$(foreach i,${INPUT_FEATURES},echo '$$i' | $${weight_vector} >> $$@_;)
$$(foreach x,${PTABLE_ENTRIES},echo '$$x' | $${weight_vector} >> $$@_;)
$$(foreach x,${DTABLE_ENTRIES},echo '$$x' | $${weight_vector} >> $$@_;)
$$(foreach x,${LMODEL_ENTRIES},echo '$$x' | $${weight_vector} >> $$@_;)

View File

@ -1,7 +1,7 @@
# -*- makefile -*-
casing1 = truecase
casing2 = truecase
word-alignment = fast
casing1 ?= truecase
casing2 ?= truecase
word-alignment ?= fast
moses.threads = $(shell parallel --number-of-cores)
# numerical constants for moses
@ -17,12 +17,12 @@ lmodel = model/lm/${L2}/kenlm
lexdm_specs = wbe-mslr-bidirectional-fe-allff
lexdm = model/dm/bin/${L1}-${L2}/${dflt_lexdmodel_specs}
ptable.max-phrase-length = 7
ptable.smoothing = --GoodTuring
ptable.source-factors = 0
ptable.target-factors = 0
ptable.num-features = 5
ptable.implemetation = 1
ptable.max-phrase-length ?= 7
ptable.smoothing ?= --GoodTuring
ptable.source-factors ?= 0
ptable.target-factors ?= 0
ptable.num-features ?= 5
ptable.implemetation ?= 1
# reminder: implementation types:
# 0 - text
# 1 - binary via processPhraseTable
@ -50,6 +50,8 @@ dmodel.description = $(addprefix ${dmodel.type}-${dmodel.orientation}-,\
distortion-limit = 6
# DEFAULT TUNING PARAMETERS
mert.nbest = 100
mert.extra-flags = --no-filter-phrase-table
mert.decoder-flags = -threads ${moses.threads}
FACTORSEP ?= \n
mert.nbest = 100
mert.extra-flags ?=
mert.extra-flags += --no-filter-phrase-table
mert.decoder-flags = -threads ${moses.threads} -fd '${FACTORSEP}'

View File

@ -23,37 +23,15 @@ ${moses.extract-phrases} ${moses.extract} $(1:.aln.gz=) ${L1} ${L2} \
endef
#################################################################################
# create_phrase_table: add rules to create a standard phrase table
# ADD RULES TO CREATE A STANDARD PHRASE TABLE FROM
# $(pll.txt1),$(pll.txt2),$(pll.aln) that are specified as target-specific
# variables like this:
# $1.txt.gz: pll.txt1 = ...
# $1.txt.gz: pll.txt2 = ...
# $1.txt.gz: pll.aln = ...
# This function is normally called indirectly via $(eval $(call add_bin_pt,...))
#
# Note: this section should be improved:
# - split into shards
# - create bash file with jobs
# - run batch file in parallel
#--------------------------------------------------------------------------------
define create_phrase_table
# $1: stem of phrase extractions
# $2: L1 text
# $3: L2 text
# $4: symal file
# normally, $2 ... $4 are default values ${pll.txt1} ${pll.txt2} ${pll.aln}
define extract_phrases
SHARDS = $$(foreach x, $${L1} $${L2} aln, $1.shards/$$x-DONE)
.SECONDARY: $1.txt.gz
.SECONDARY: $1.${L2}-given-${L1}.lex.gz
.SECONDARY: $1.${L1}-given-${L2}.lex.gz
.INTERMEDIATE: $1.txt.gz
.INTERMEDIATE: $$(SHARDS)
.INTERMEDIATE: $1.tmp/fwd.scored.gz
.INTERMEDIATE: $1.tmp/bwd/scoring.done
.INTERMEDIATE: $1.${L2}-given-${L1}.lex.gz
.INTERMEDIATE: $1.${L1}-given-${L2}.lex.gz
.INTERMEDIATE: $1.shards/${L1}-DONE
.INTERMEDIATE: $1.shards/${L2}-DONE
.INTERMEDIATE: $1.shards/aln-DONE
.INTERMEDIATE: $1.shards/extract.batch
.INTERMEDIATE: $1.shards/extract.done
$1.shards/${L1}-DONE: $(if $2,$2,$$(pll.txt1))
$$(lock)
@ -87,10 +65,32 @@ $1.shards/extract.batch: $$(SHARDS)
$1.shards/extract.done: $1.shards/extract.batch
$$(lock)
${parallel} -j$(shell echo $$((${NUMCORES}/1))) < $1.shards/extract.batch
${parallel} -j$(shell echo $$((${NUMCORES}/4))) < $1.shards/extract.batch
touch $$@
$$(unlock)
endef
#################################################################################
# create_phrase_table: add rules to create a standard phrase table
# ADD RULES TO CREATE A STANDARD PHRASE TABLE FROM
# $(pll.txt1),$(pll.txt2),$(pll.aln) that are specified as target-specific
# variables like this:
# $1.txt.gz: pll.txt1 = ...
# $1.txt.gz: pll.txt2 = ...
# $1.txt.gz: pll.aln = ...
# This function is normally called indirectly via $(eval $(call add_bin_pt,...))
#
# Note: this section should be improved:
# - split into shards
# - create bash file with jobs
# - run batch file in parallel
#--------------------------------------------------------------------------------
define create_phrase_table
$(call extract_phrases,$1,$${pll.txt1},$${pll.txt2},$${pll.aln})
ptable: $1.txt.gz
$1.txt.gz: | ${merge-sorted}
$1.txt.gz: | ${MOSES_BIN}/consolidate
@ -99,10 +99,10 @@ $1.txt.gz: | $1.tmp/bwd/scoring.done
$$(lock)
${MOSES_BIN}/consolidate \
<(zcat -f $1.tmp/fwd.scored.gz) \
<(${merge-sorted} $1.tmp/bwd/scored.*.gz) \
/dev/stdout \
<(${merge-sorted} $1.tmp/bwd/scored.*.gz) /dev/stdout \
$(if $(ptable.smoothing), $(ptable.smoothing) $1.tmp/fwd.coc) \
| gzip > $$@_ && mv $$@_ $$@
| gzip > $$@_
mv $$@_ $$@
$$(unlock)
$1.tmp/fwd.scored.gz: | $(merge-sorted)
@ -119,7 +119,7 @@ $1.tmp/bwd/scoring.done: | $1.${L2}-given-${L1}.lex.gz
$$(lock)
$(merge-sorted) $1.shards/*.bwd.gz \
| ${moses.score-phrases} ${MOSES_BIN}/score - $1.${L2}-given-${L1}.lex.gz \
$${@D}/scored "$(ptable.smoothing)" --Inverse && touch $$@
$${@D}/scored $(ptable.smoothing) --Inverse && touch $$@
$$(unlock)
# reminder: $2,$3,$4 = L1text, L2text, alignment
@ -206,7 +206,6 @@ endef
define create_lexical_reordering_table
mystem := $(strip $1).$(strip $2)
.INTERMEDIATE: $${mystem}.gz
$${mystem}.gz: dmshards = $$(shell ls $3.shards/*.dst.gz 2>/dev/null)
$${mystem}.gz: dm.type=$(word 1,$(subst -, ,$2))
$${mystem}.gz: dm.orient=$(word 2,$(subst -, ,$2))
@ -271,6 +270,12 @@ PTABLES += $(strip $4) $(strip $5) $(strip $6)
endef
#################################################################################
# $1: input factor
# $2: output factor
# $3: num-features
# $4: path to mmapped data
# $5: path to text data
# $6: path and basename of dynamic data
define add_mmsapt
$(call mmap_bitext,$(strip $5),$(strip $4))
@ -282,8 +287,10 @@ MY_ENTRY += output-factor=$(strip $2)
MY_ENTRY += num-features=$(strip $3)
MY_ENTRY += base=$(abspath $4)/ L1=${L1} L2=${L2}
PTABLE_ENTRIES += $$(subst $$(space),;,$${MY_ENTRY})
MOSES_INI_PREREQ += $(addprefix $(strip $4),${L1}.mct ${L1}.tdx ${L1}.sfa)
MOSES_INI_PREREQ += $(addprefix $(strip $4),${L2}.mct ${L2}.tdx ${L2}.sfa)
MOSES_INI_PREREQ += $(strip $4)${L1}-${L2}.mam
MOSES_INI_PREREQ += $(addprefix $(strip $4)/${L1},.mct .tdx .sfa)
MOSES_INI_PREREQ += $(addprefix $(strip $4)/${L2},.mct .tdx .sfa)
MOSES_INI_PREREQ += $(strip $4)/${L1}-${L2}.mam
MOSES_INI_PREREQ += $(strip $4)/${L1}-${L2}.lex
endef

View File

@ -9,17 +9,26 @@
max-sentence-length ?= 80
casing.${L1} ?= truecase
casing.${L2} ?= truecase
MAX_NUM_REFS ?= 4
# tok-mno: monolingual resources
# tok-pll: parallel resources
trn.tok-mno = $(addprefix ${WDIR}/crp/trn/mno/tok/, $(notdir $(wildcard ${WDIR}/crp/trn/mno/raw/*.$1.gz)))
trn.tok-pll = $(addprefix ${WDIR}/crp/trn/pll/tok/, $(notdir $(wildcard ${WDIR}/crp/trn/pll/raw/*.$1.gz)))
# The helpers below are parameterized: use them as $(call <name>,<suffix>),
# where $1 is the file-name suffix that precedes .gz (this file calls them
# with ${L1} / ${L2}). They are recursively expanded, so the $(wildcard ...)
# lookup happens each time a helper is called.
# trn.raw-mno: file names (directory stripped) matching
#              ${WDIR}/crp/trn/mno/raw/*.$1.gz
trn.raw-mno = $(notdir $(wildcard ${WDIR}/crp/trn/mno/raw/*.$1.gz))
# trn.tok-mno: the same file names, placed under ${WDIR}/crp/trn/mno/tok/
trn.tok-mno = $(addprefix ${WDIR}/crp/trn/mno/tok/, $(call trn.raw-mno,$1))
# trn.cased-mno: the same file names, placed under ${WDIR}/crp/trn/mno/cased/
trn.cased-mno = $(addprefix ${WDIR}/crp/trn/mno/cased/, $(call trn.raw-mno,$1))
# trn.raw-pll: file names (directory stripped) matching
#              ${WDIR}/crp/trn/pll/raw/*.$1.gz
trn.raw-pll = $(notdir $(wildcard ${WDIR}/crp/trn/pll/raw/*.$1.gz))
# trn.tok-pll: the same file names, placed under ${WDIR}/crp/trn/pll/tok/
trn.tok-pll = $(addprefix ${WDIR}/crp/trn/pll/tok/, $(call trn.raw-pll,$1))
# trn.cased-pll: the same file names, placed under ${WDIR}/crp/trn/pll/cased/
trn.cased-pll = $(addprefix ${WDIR}/crp/trn/pll/cased/, $(call trn.raw-pll,$1))
define tokenize
$1/tok/%.$2.gz: $1/raw/%.$2.gz
$2/tok/%.$3.gz: | $2/raw/%.$3.gz
$$(lock)
zcat $$< | ${parallel} --pipe -k ${tokenize.$2} | gzip > $$@_
zcat $$(word 1,$$|) | ${pre-tokenize.$1} \
| ${parallel} -j4 --pipe -k ${tokenize.$1} \
| gzip > $$@_
mv $$@_ $$@
$$(unlock)
@ -30,36 +39,36 @@ endef
###########################################################################
define truecase
$1/cased/%.$2.gz: caser = ${run-truecaser}
$1/cased/%.$2.gz: caser += -model ${WDIR}/aux/truecasing-model.$2
$1/cased/%.$2.gz: $1/tok/%.$2.gz ${WDIR}/aux/truecasing-model.$2
$2/cased/%.$3.gz: caser = ${run-truecaser}
$2/cased/%.$3.gz: caser += -model ${WDIR}/aux/truecasing-model.$1
$2/cased/%.$3.gz: | $2/tok/%.$3.gz ${WDIR}/aux/truecasing-model.$1
$$(lock)
zcat $$< | ${parallel} --pipe -k $${caser} | gzip > $$@_
zcat $$(word 1, $$|) | ${parallel} --pipe -k $${caser} | gzip > $$@_
mv $$@_ $$@
$$(unlock)
$1/cased/%.$2: caser = ${run-truecaser}
$1/cased/%.$2: caser += -model ${WDIR}/aux/truecasing-model.$2
$1/cased/%.$2: $1/tok/%.$2.gz ${WDIR}/aux/truecasing-model.$2
$2/cased/%.$3: | $2/cased/%.$3.gz
$$(lock)
zcat $$< | ${parallel} --pipe -k $${caser} > $$@_
gzip -d < $$(word 1, $$|) > $$@_
mv $$@_ $$@
$$(unlock)
endef
define lowercase
$1/cased/%.$2.gz: caser = ${run-lowercaser}
$1/cased/%.$2.gz: | $1/tok/%.$2.gz
$2/cased/%.$3.gz: caser = ${run-lowercaser}
$2/cased/%.$3.gz: | $2/tok/%.$3.gz
$$(lock)
zcat $$| | ${parallel} --pipe -k $${caser} | gzip > $$@_
zcat $$| | ${parallel} -j4 --pipe -k $${caser} | gzip > $$@_
mv $$@_ $$@
$$(unlock)
$1/cased/%.$2: caser = ${run-lowercaser}
$1/cased/%.$2: | $1/tok/%.$2.gz
$2/cased/%.$3: | $2/cased/%.$3.gz
$$(lock)
zcat $$| | ${parallel} --pipe -k $${caser} > $$@_
gzip -d < $$(word 1, $$|) > $$@_
mv $$@_ $$@
$$(unlock)
endef
define skipcasing
@ -83,9 +92,12 @@ pll-ready: $(foreach l,${L1} ${L2}, $(addsuffix .$l.gz,${pll-clean}))
define clean_corpus
.INTERMEDIATE: $1/clean/$2.${L1}.gz
.INTERMEDIATE: $1/clean/$2.${L2}.gz
.INTERMEDIATE: $1/clean/$2.clean.log
# .INTERMEDIATE: $1/clean/$2.${L1}.gz
# .INTERMEDIATE: $1/clean/$2.${L2}.gz
# .INTERMEDIATE: $1/clean/$2.clean.log
# .SECONDARY: $1/clean/$2.${L1}.gz
# .SECONDARY: $1/clean/$2.${L2}.gz
# .SECONDARY: $1/clean/$2.clean.log
$1/clean/$2.${L2}.gz: | $1/clean/$2.clean.log
$$(lock)
gzip < $$(@D)/_$2.${L2} > $$@_ && rm $$(@D)/_$2.${L2}
@ -110,17 +122,26 @@ endef
############################################################################
# Truecasing models #
############################################################################
.INTERMEDIATE: $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1})
.INTERMEDIATE: $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2})
${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1})
# .INTERMEDIATE: $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1})
# .INTERMEDIATE: $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2})
# .SECONDARY: $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1})
# .SECONDARY: $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2})
#${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1})
${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1})
$(lock)
$(if $|,,$(error Can't find training data for $@!))#'
${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f)
test -s $@_ || (echo "Truecasing model $@ is empty!" && exit 1)
mv $@_ $@
$(unlock)
${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2})
#${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2})
${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2})
$(lock)
$(if $|,,$(error Can't find training data for $@!))#'
${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f)
test -s $@_ || (echo "Truecasing model $@ is empty!" && exit 1)
mv $@_ $@
$(unlock)
@ -129,18 +150,24 @@ ${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-p
# Generate rules #
############################################################################
all_data_dirs := $(addprefix ${WDIR}/crp/,trn/mno trn/pll dev tst)
all_data_dirs := $(addprefix ${WDIR}/crp/,trn/mno trn/pll dev tst dev+tst)
# add rules for tokenization and casing
snippet := $(foreach d,$(all_data_dirs),$(foreach l,${L1} ${L2},\
$(call tokenize,$d,$l)$(call ${casing.$l},$d,$l)))
snippet := $(foreach d,$(all_data_dirs),\
$(call tokenize,${L1},$d,${L1})$(call ${casing.${L1}},${L1},$d,${L1}))
snippet += $(foreach d,$(all_data_dirs),\
$(foreach l,${L2} $(addprefix ${L2},$(shell seq 0 ${MAX_NUM_REFS})),\
$(call tokenize,${L2},$d,$l)$(call ${casing.${L2}},${L2},$d,$l)))
MY_EXPERIMENT += $(snippet)
#$(info $(snippet))
$(eval $(snippet))
# add rules for cleaning parallel data prior to word alignment
snippet := $(foreach s,${pllshards},$(call clean_corpus,${WDIR}/crp/trn/pll,$s))
MY_EXPERIMENT += $(snippet)
#$(info $(snippet))
$(eval $(snippet))

View File

@ -5,9 +5,9 @@
# MOSES_ROOT: root directory of the distribution
# MOSES_BIN: where compiled binaries are kept
# MGIZA_ROOT: root directory of the mgiza installation
MOSES_ROOT ?= ${HOME}/code/moses/master/mosesdecoder
MOSES_BIN ?= ${HOME}/bin
MGIZA_ROOT ?= ${HOME}/tools/mgiza
MOSES_ROOT ?= ${HOME}/accept/exp/journal-paper/moses
MOSES_BIN ?= ${MOSES_ROOT}/bin
MGIZA_ROOT ?= ${MOSES_ROOT}
# default location (unless specified otherwise above)
MOSES_BIN ?= ${MOSES_ROOT}/bin
@ -19,12 +19,15 @@ M4M_SCRIPTS ?= ${m4mdir}scripts
# default locations of scripts and executables
# utilities
parallel ?= $(shell which parallel)
parallel ?= $(shell which parallel) --gnu
$(if ${parallel},,$(error GNU parallel utility not found!))
# corpus preprocessing
tokenize.${L1} ?= ${MOSES_SCRIPTS}/tokenizer/tokenizer.perl -q -a -l ${L1}
tokenize.${L2} ?= ${MOSES_SCRIPTS}/tokenizer/tokenizer.perl -q -a -l ${L2}
pre-tokenize.${L1} ?= ${MOSES_SCRIPTS}/tokenizer/pre-tokenizer.perl -l ${L1}
pre-tokenize.${L2} ?= ${MOSES_SCRIPTS}/tokenizer/pre-tokenizer.perl -l ${L2}
tokenize.${L1} ?= ${MOSES_SCRIPTS}/tokenizer/tokenizer.perl -q -a -l ${L1} -no-escape
tokenize.${L2} ?= ${MOSES_SCRIPTS}/tokenizer/tokenizer.perl -q -a -l ${L2} -no-escape
train-truecaser ?= ${MOSES_SCRIPTS}/recaser/train-truecaser.perl
run-truecaser ?= ${MOSES_SCRIPTS}/recaser/truecase.perl
run-detruecaser ?= ${MOSES_SCRIPTS}/recaser/detruecase.perl

View File

@ -6,29 +6,62 @@
untuned_model ?= model/moses.ini.0
tune.dir ?= ${basedir}/tune
# FUNCTIONS FOR COMPUTING REFERENCE FILE DEPENDENCIES
# AND INPUT TYPE FROM INPUT FILE PATH FOR TUNING AND EVAL
# get basenames (with path) of all files belonging
# to a particular set (e.g. dev / tst)
# $1: path of the set; a trailing slash is tolerated. Files are searched
#     (recursively, following symlinks) below $(dir $1), and every result
#     is prefixed with $1 (without its trailing slash) plus "/".
# ->: one entry per distinct basename of files named <basename>.${L1}
#     or <basename>.${L1}.gz
# The dots in both patterns are escaped so that only a literal ".${L1}"
# (optionally followed by a literal ".gz") counts as the suffix; with
# unescaped dots a name such as "when" would match for L1=en and yield
# the bogus basename "w".
get_set = $(addprefix $(patsubst %/,%,$1)/,\
$(shell find -L $(patsubst %/,%,$(dir $1)) -regex '.*\.${L1}\(\.gz\)?'\
| perl -pe 's/.*\/(.*?)\.${L1}(\.gz)?$$/$$1/' | sort -u))
# $1: moses input file
# ->: base name of corresponding reference files:
#     a trailing .gz is dropped, a trailing .${L1} is replaced by .${L2},
#     and the directory part is removed.
# patsubst takes exactly three arguments (pattern, replacement, text); an
# extra "%," before the text would become part of the text itself and leak
# into the result for inputs that have no directory part.
refbase = $(notdir $(patsubst %.${L1},%.${L2},$(patsubst %.gz,%,$1)))
# $1: moses input file
# $2: root of directory tree for search
# ->: list of full paths to reference files, each of the form
#     <$2 without trailing slash>/cased/<name>, where <name> is the
#     reference base name of $1 optionally followed by digits; a .gz
#     suffix on the files found is stripped and duplicates are removed.
# The search (find -L, recursive) covers all of $2, not just $2/cased.
# Dots in the reference base name and in the optional ".gz" suffix are
# escaped so that they match literally in the find regex.
reffiles = $(addprefix $(patsubst %/,%,$2)/cased/,\
$(shell find -L $2 -regex '.*$(subst .,\.,$(call refbase,$1))[0-9]*\(\.gz\)?'\
| perl -pe 's/.*\/(.*?)(\.gz)?$$/$$1/' | sort -u))
# $1: moses input file
# ->: 1 (confusion network) if the path contains "/cfn",
#     0 (plain text) otherwise
# $(findstring) yields either the single word "/cfn" or nothing, so counting
# its words gives exactly 1 or 0.
guess-inputtype = $(words $(findstring /cfn,$1))
############################################################################
# TUNE SYSTEM
#
# $1: untuned moses.ini
# $2: tuned moses.ini
# $3: moses input
# $4: reference
# $5: input type
# $2: tuned moses.ini
# $3: moses input (ref files and input type are computed automatically)
# ->: Makefile snippet for tuning system on input file given
#
define tune_system
TUNED_SYSTEMS += $(strip $2)
.INTERMEDIATE: $1
tune.reffiles = $$(call reffiles,$3,$(dir $(patsubst %/,%,$(dir $3))))
#.INTERMEDIATE: $1
$(strip $2): $${PTABLES} $${DTABLES} $${LMODELS} $${MOSES_INI_PREREQ}
$(strip $2): mert.wdir = $(dir $(abspath $2))tmp
$(strip $2): tune.src = $3
$(strip $2): tune.ref = $4
$(strip $2): | $1 $3 $4
$(strip $2): mert.wdir = $(dir $(abspath $2))tmp
$(strip $2): tune.src = $3
$(strip $2): tune.ref = $$(shell echo $(patsubst %.${L1},%.${L2},$3) | perl -pe 's?/cfn[^/]+/?/cased/?')
$(strip $2): tune.itype = $$(call guess-inputtype,$3)
$(strip $2): | $1 $3 $${tune.reffiles}
$(strip $2):
$$(lock)
$$(info REFFILES = $${tune.reffiles})
mkdir -p $${mert.wdir}
rm -f $${mert.wdir}/*
${mert} ${mert.extra-flags} --nbest ${mert.nbest} --mertdir ${MOSES_BIN} \
--rootdir ${MOSES_ROOT}/scripts --working-dir $${mert.wdir} \
--decoder-flags "${mert.decoder-flags} -inputtype $5" \
$${tune.src} $${tune.ref} ${moses} $1
$(if $(findstring -continue,${mert.extra-flags}),,rm -f $${mert.wdir}/*)
${mert} ${mert.extra-flags} \
--nbest ${mert.nbest} \
--mertdir ${MOSES_BIN} \
--rootdir ${MOSES_SCRIPTS} \
--working-dir $${mert.wdir} \
--decoder-flags "$${mert.decoder-flags}" \
--inputtype $${tune.itype} \
$${tune.src} $${tune.ref} $${moses} $1
${apply-weights} $1 $${mert.wdir}/moses.ini $$@_ && mv $$@_ $$@
$$(unlock)

View File

@ -25,7 +25,7 @@ trap 'cleanup' 0
export LC_ALL=C
if [[ "$inv" == "--Inverse" ]] ; then
parallel < $obase.$$ -j10 --pipe --blocksize 250M "sort -S 10G | gzip > $obase.{#}.gz" &
parallel --gnu < $obase.$$ -j10 --pipe --blocksize 250M "sort -S 10G | gzip > $obase.{#}.gz" &
else
gzip < $obase.$$ > $obase.scored.gz_ &
fi