# -*- makefile -*-
#
# Makefile module for GIZA++ alignment (with mgiza)
# (c) 2011-2012 Ulrich Germann

# --- Training Parameters ------------------------------------------------------
#
# Options handed to mkcls: -n is the number of iterations, -c the number of
# classes. The value must be attached directly to the flag (no space after
# -n / -c).
mkcls_args = -n10 -c50

# GIZA training settings.
# These are pattern-specific variables: they are visible while building any
# target that matches %/mgiza.cfg, whose recipe writes them into the
# generated configuration file.
# NOTE(review): m1/m2/mh/m3/m4 look like per-model iteration counts --
# confirm against the mgiza documentation.
%/mgiza.cfg: m1 = 5
%/mgiza.cfg: m2 = 0
%/mgiza.cfg: mh = 5
%/mgiza.cfg: m3 = 3
%/mgiza.cfg: m4 = 3
%/mgiza.cfg: nodumps = 0
%/mgiza.cfg: onlyaldumps = 0
%/mgiza.cfg: model4smoothfactor = 0.4
%/mgiza.cfg: nsmooth = 4
%/mgiza.cfg: NCPUS = 8

# Options for symal (alignment symmetrization). The only preset defined here
# is the one named "grow-diag-final-and"; it is also the one selected.
symal_grow_diag_final_and = -a=g -d=yes -f=yes -b=yes
symal_args = $(symal_grow_diag_final_and)

# ------------------------------------------------------------------------------
# You should not have to edit anything below this line
# ------------------------------------------------------------------------------

# Directory layout, all relative to $(WDIR):
#   gizaln     base directory for the alignment
#   giztmp     scratch directory for intermediate files
#   gizout     directory for the final results (same as gizaln)
#   gizaln.in  directory holding the training corpus that is read
# The assignments are recursive (=) so that WDIR is looked up at use time.
gizaln    = $(WDIR)/crp/trn/aln/giza
giztmp    = $(gizaln)/tmp
gizout    = $(gizaln)
gizaln.in = $(WDIR)/crp/trn/pll/clean/

# Top-level goal. All prerequisites are order-only: they have to exist, but
# their timestamps never force anything to be redone.
# (giza-prep is declared phony here; no rule for it appears in this part of
# the file.)
.PHONY: giza giza-prep
giza: | $(gizout)/$(L1).txt.gz \
        $(gizout)/$(L2).txt.gz \
        $(gizout)/$(L1)-$(L2).symal.gz
	@echo "GIZA WORD ALIGNMENT COMPLETE!"
# Language-pair helpers; $(1) is a language tag.
#   other  expands to $(L2) if $(1) occurs as a substring of $(L1),
#          otherwise to $(L1)
#   fwd    "$(1)-<other>"
#   bwd    "<other>-$(1)"
other = $(if $(findstring $(1),$(L1)),$(L2),$(L1))
fwd   = $(1)-$(call other,$(1))
bwd   = $(call other,$(1))-$(1)

# Canned recipe shared by the two *.txt.gz rules below.
# Reads $(a3file) (a target-specific variable), keeps lines 2, 5, 8, ...
# (the second line of every group of three), gzips the result into the
# temporary file $@_ and renames it to $@ on success.
define giza_a3_to_text
$(lock)
(zcat $(a3file) | perl -ne 'print if ++$$ctr%3 == 2;' | gzip > $@_) && mv $@_ $@
$(unlock)
endef

# Text of $(L1), taken from the $(L2)-$(L1) alignment file.
$(gizout)/$(L1).txt.gz: a3file = $(gizout)/$(L2)-$(L1).A3.final.gz
$(gizout)/$(L1).txt.gz: | $(gizout)/$(L2)-$(L1).A3.final.gz
	$(giza_a3_to_text)

# Text of $(L2), taken from the $(L1)-$(L2) alignment file.
$(gizout)/$(L2).txt.gz: a3file = $(gizout)/$(L1)-$(L2).A3.final.gz
$(gizout)/$(L2).txt.gz: | $(gizout)/$(L1)-$(L2).A3.final.gz
	$(giza_a3_to_text)

# Symmetrized alignment: giza2bal.pl combines both directional A3 files,
# symal symmetrizes them, and perl strips everything up to and including the
# "{##}" marker (plus following whitespace) from each line.
$(gizout)/$(L1)-$(L2).symal.gz: A3fwd = $(gizout)/$(L1)-$(L2).A3.final.gz
$(gizout)/$(L1)-$(L2).symal.gz: A3bwd = $(gizout)/$(L2)-$(L1).A3.final.gz
$(gizout)/$(L1)-$(L2).symal.gz: | $(gizout)/$(L1)-$(L2).A3.final.gz \
                                  $(gizout)/$(L2)-$(L1).A3.final.gz \
                                  $(giza2bal.pl)
	$(lock)
	$(giza2bal.pl) -d 'gunzip -c $(A3fwd)' -i 'gunzip -c $(A3bwd)' \
	  | $(symal) $(symal_args) \
	  | perl -pe 's/^.*{##}\s+//' \
	  | gzip > $@_ && mv $@_ $@
	$(unlock)

# Merge the partial alignment files written by mgiza into a single gzipped
# file. The part files are listed with `ls` when the recipe is expanded;
# errors from `ls` are discarded, so an empty list is passed on if nothing
# matches.
$(gizout)/%.A3.final.gz: | $(giztmp)/%/mgiza.DONE
	mkdir -p $(@D)
	$(lock)
	$(mgiza.merge) $(shell ls $(giztmp)/$*/$*.A3.final.part* 2>/dev/null) | gzip > $@_
	mv $@_ $@
	$(unlock)

# Run mgiza on its configuration file ($| is the order-only prerequisite,
# i.e. the mgiza.cfg next to the stamp) and leave a stamp file on success.
%/mgiza.DONE: | %/mgiza.cfg
	$(lock)
	$(mgiza) $| && touch $@
	$(unlock)

# Working directories.
$(giztmp) $(gizout):
	mkdir -p $@

# --- run mkcls (to get word classes) ------------------------------------------
# ATTENTION: mkcls does not accept spaces between option markers and
# option arguments: "mkcls -n 10 -c 50 ..." will make it fail!
# mkcls_classes and mkcls_iterations should be set in parameters.mak

# stream: concatenate, in sorted order, every regular file below directory
#   $(1) whose name ends in ".$(2)" or ".$(2).gz"; zcat -f passes plain files
#   through unchanged and decompresses the gzipped ones.
#   The two -name tests are grouped with \( ... \) so that -type f applies to
#   both of them (without the grouping it only binds to the first test).
# mkcls_cmd: feed that stream to mkcls and write the word classes to $(3).
stream = find -L $(1) -type f \( -name "*.$(2)" -or -name "*.$(2).gz" \) | sort | xargs zcat -f
mkcls_cmd = $(call stream,$(1),$(2)) | ${mkcls} $(mkcls_args) -p/dev/stdin -V$(3) opt

# mkcls: -n: iterations
# mkcls: -c: classes
# mkcls: -p: input data
# mkcls: -V: word classes (output)
#
# Word classes for one language (the stem $* is the language tag). mkcls
# writes to the temporary name $@_, which is renamed on success; the
# companion file $@_.cats is renamed to $@.cats afterwards.
$(giztmp)/%.vcb.classes: ${mkcls}
$(giztmp)/%.vcb.classes: | $(gizaln.in)
	@echo CREATING $@
	$(lock)
	mkdir -p $(@D)
	@$(call mkcls_cmd,$|,$*,$@_) && mv $@_ $@
	@mv $@_.cats $@.cats
	$(unlock)

# NUMBERIZED CORPUS FOR GIZA
# A single plain2snt run produces both vocabulary files and both .snt files.
# Only $(L1)-$(L2).snt carries the recipe; the other three outputs are
# declared as depending (order-only) on it.
$(giztmp)/${L1}.vcb:        | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}.vcb:        | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}.snt:  | $(giztmp)/${L1}-${L2}.snt

# debugging aid (disabled):
#$(info $(addprefix $(pll-clean), .${L1}.gz))

# The recipe uses process substitution, <( ... ), which is a bash feature.
# Request bash here explicitly so the rule also works when this target is
# built directly and not as a prerequisite of mgiza.cfg (which sets
# SHELL=bash and passes it down to its prerequisites).
$(giztmp)/${L1}-${L2}.snt: SHELL = bash
$(giztmp)/${L1}-${L2}.snt: L1files = $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: L2files = $(addsuffix .${L2}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(giztmp)
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L2}.gz, $(pll-clean))
	$(lock)
	$(plain2snt) \
	  <(ls $(L1files) | xargs zcat -f) \
	  <(ls $(L2files) | xargs zcat -f) \
	  -vcb1 $(giztmp)/${L1}.vcb_ -vcb2 $(giztmp)/${L2}.vcb_ \
	  -snt1 $(giztmp)/${L1}-${L2}.snt_ -snt2 $(giztmp)/${L2}-${L1}.snt_
	mv $(giztmp)/${L1}.vcb_ $(giztmp)/${L1}.vcb
	mv $(giztmp)/${L2}.vcb_ $(giztmp)/${L2}.vcb
	mv $(giztmp)/${L2}-${L1}.snt_ $(giztmp)/${L2}-${L1}.snt
	@# The declared target is moved into place last, so its existence
	@# implies that the three side outputs above are already complete.
	mv $(giztmp)/${L1}-${L2}.snt_ $(giztmp)/${L1}-${L2}.snt
	$(unlock)

# .cooc files
# V1/V2 name the two vocabulary files in the order matching the direction of
# the .cooc file; $| is the matching .snt file.
$(giztmp)/${L1}-${L2}.cooc: V1 = $(giztmp)/${L1}.vcb
$(giztmp)/${L1}-${L2}.cooc: V2 = $(giztmp)/${L2}.vcb
$(giztmp)/${L2}-${L1}.cooc: V1 = $(giztmp)/${L2}.vcb
$(giztmp)/${L2}-${L1}.cooc: V2 = $(giztmp)/${L1}.vcb
$(giztmp)/%.cooc: | $(giztmp)/%.snt
	@echo CREATING $@
	$(lock)
	$(snt2cooc) $@_ ${V1} ${V2} $| && mv $@_ $@
	$(unlock)

################################################################################
# MGIZA CONFIG
# FILE:
#
# Generates one mgiza configuration file per alignment direction.
# Recipes for these targets (and for their prerequisites) run under bash.
$(giztmp)/%/mgiza.cfg: SHELL=bash

# --- CORPUS RESOURCES ---------------------------------------------------------
# All paths are derived from FROM and TO, which are set per direction below.
$(giztmp)/%/mgiza.cfg: V1   = $(giztmp)/${FROM}.vcb
$(giztmp)/%/mgiza.cfg: V2   = $(giztmp)/${TO}.vcb
$(giztmp)/%/mgiza.cfg: SNT  = $(giztmp)/${FROM}-${TO}.snt
$(giztmp)/%/mgiza.cfg: COOC = $(giztmp)/${FROM}-${TO}.cooc
$(giztmp)/%/mgiza.cfg: ODIR = $(giztmp)/${FROM}-${TO}/${FROM}-${TO}
# Disabled generic form; the same prerequisites are listed explicitly for
# each direction below.
#$(giztmp)/%/mgiza.cfg: | $(giztmp)/%.cooc $(giztmp)/%.snt

$(giztmp)/${L1}-${L2}/mgiza.cfg: FROM = ${L1}
$(giztmp)/${L1}-${L2}/mgiza.cfg: TO   = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: FROM = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: TO   = ${L1}

$(giztmp)/${L1}-${L2}/mgiza.cfg: | $(giztmp)/${L1}-${L2}.cooc
$(giztmp)/${L1}-${L2}/mgiza.cfg: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}/mgiza.cfg: | $(giztmp)/${L2}-${L1}.cooc
$(giztmp)/${L2}-${L1}/mgiza.cfg: | $(giztmp)/${L2}-${L1}.snt

# The file is assembled under the temporary name $@_ and renamed at the end.
# The first line uses '>' so that leftovers from an interrupted run are
# discarded; a failed or repeated run can therefore never leave a truncated
# config or one with duplicated keys at $@.
# Keys t1/t2/th/t3/t4 are written with the same values as m1/m2/mh/m3/m4.
$(giztmp)/${L1}-${L2}/mgiza.cfg \
$(giztmp)/${L2}-${L1}/mgiza.cfg: | \
    $(giztmp)/${L1}.vcb $(giztmp)/${L1}.vcb.classes \
    $(giztmp)/${L2}.vcb $(giztmp)/${L2}.vcb.classes
	mkdir -p $(dir $@)
	@echo "s ${V1}" > $@_
	@echo "t ${V2}" >> $@_
	@echo "c ${SNT}" >> $@_
	@echo "cooc ${COOC}" >> $@_
	@echo "m1 ${m1}" >> $@_
	@echo "m2 ${m2}" >> $@_
	@echo "mh ${mh}" >> $@_
	@echo "m3 ${m3}" >> $@_
	@echo "m4 ${m4}" >> $@_
	@echo "t1 ${m1}" >> $@_
	@echo "t2 ${m2}" >> $@_
	@echo "th ${mh}" >> $@_
	@echo "t3 ${m3}" >> $@_
	@echo "t4 ${m4}" >> $@_
	@echo "o ${ODIR}" >> $@_
	@echo "model4smoothfactor ${model4smoothfactor}" >> $@_
	@echo "onlyaldumps ${onlyaldumps}" >> $@_
	@echo "nodumps ${nodumps}" >> $@_
	@echo "nsmooth ${nsmooth}" >> $@_
	@echo "NCPUS ${NCPUS}" >> $@_
	mv $@_ $@

#
# ------------------------------------------------------------------------------
# sanity checks
# Evaluated while the makefile is read; each emits a warning (not an error)
# when the corresponding variable expands to nothing.
ifeq ($(gizaln),)
$(warning Giza base directory not defined)
endif

ifeq ($(gizaln.in),)
$(warning No directory for Giza++ training data specified!)
endif