mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
191 lines
6.7 KiB
Makefile
191 lines
6.7 KiB
Makefile
# -*- makefile -*-
|
|
#
|
|
# Makefile module for GIZA++ alignment (with mgiza)
|
|
# (c) 2011-2012 Ulrich Germann
|
|
|
|
# --- Training Parameters ------------------------------------------------------
#
# Options handed verbatim to ${mkcls}:
#   -n: iterations
#   -c: classes
# No space is allowed between -n/-c and their values!
mkcls_args = -n10 -c50

# giza training
# These values are pattern-specific to every */mgiza.cfg target and are written
# into the generated config file. m1, m2, mh, m3 and m4 are each emitted twice
# there: once under the key "m<x>" and once under the key "t<x>".
# NOTE(review): presumably per-model iteration counts that double as dump
# frequencies -- confirm against the mgiza documentation.
%/mgiza.cfg: m1 = 5
%/mgiza.cfg: m2 = 0
%/mgiza.cfg: mh = 5
%/mgiza.cfg: m3 = 3
%/mgiza.cfg: m4 = 3
%/mgiza.cfg: nodumps = 0
%/mgiza.cfg: onlyaldumps = 0
%/mgiza.cfg: model4smoothfactor = 0.4
%/mgiza.cfg: nsmooth = 4
%/mgiza.cfg: NCPUS = 8

# symal
# symal_args is what the symmetrization recipe passes to $(symal); it defaults
# to the option set named after the grow-diag-final-and heuristic.
symal_grow_diag_final_and = -a=g -d=yes -f=yes -b=yes
symal_args = $(symal_grow_diag_final_and)
|
|
|
|
# ------------------------------------------------------------------------------
|
|
# You should not have to edit anything below this line
|
|
# ------------------------------------------------------------------------------
|
|
# Directory layout, rooted at ${WDIR}. The assignments stay recursive ('=') so
# that ${WDIR} is resolved when the paths are used, not when this file is read.
#   gizaln    - base directory of the GIZA alignment
#   giztmp    - scratch area for intermediate files
#   gizout    - location of the final outputs (same directory as gizaln)
#   gizaln.in - directory holding the training input
gizaln    = $(WDIR)/crp/trn/aln/giza
giztmp    = ${gizaln}/tmp
gizout    = ${gizaln}
gizaln.in = $(WDIR)/crp/trn/pll/clean/

.PHONY: giza giza-prep
|
|
|
|
# Entry point for the whole alignment step. All three outputs are order-only
# prerequisites: they have to exist, but their timestamps are not compared.
giza: | $(gizout)/${L1}.txt.gz \
        $(gizout)/${L2}.txt.gz \
        $(gizout)/${L1}-${L2}.symal.gz
	@echo "GIZA WORD ALIGNMENT COMPLETE!"
|
|
|
|
# $(call other,X): the language of the pair that is not X, i.e. ${L2} when X
# is ${L1} and ${L1} in every other case.
# $(filter) compares whole words, so a code that is only a substring of ${L1}
# (e.g. "de" when L1 is "deu") does not count as a match for ${L1}.
other = $(if $(filter $(1),${L1}),${L2},${L1})

# $(call fwd,X) expands to "X-<other>", $(call bwd,X) to "<other>-X".
fwd = $(1)-$(call other,$(1))
bwd = $(call other,$(1))-$(1)
|
|
|
|
# ${L1} text: keep the second line of every group of three lines in the
# ${L2}-${L1} A3 file. The result is written to $@_ first and renamed into
# place only if the pipeline's last stage succeeded.
$(gizout)/${L1}.txt.gz: a3file = $(gizout)/${L2}-${L1}.A3.final.gz
$(gizout)/${L1}.txt.gz: | $(gizout)/${L2}-${L1}.A3.final.gz
	$(lock)
	zcat ${a3file} \
	| perl -ne 'print if ++$$ctr%3 == 2;' \
	| gzip > $@_ \
	&& mv $@_ $@
	$(unlock)
|
|
|
|
# ${L2} text: keep the second line of every group of three lines in the
# ${L1}-${L2} A3 file. The result is written to $@_ first and renamed into
# place only if the pipeline's last stage succeeded.
$(gizout)/${L2}.txt.gz: a3file = $(gizout)/${L1}-${L2}.A3.final.gz
$(gizout)/${L2}.txt.gz: | $(gizout)/${L1}-${L2}.A3.final.gz
	$(lock)
	zcat ${a3file} \
	| perl -ne 'print if ++$$ctr%3 == 2;' \
	| gzip > $@_ \
	&& mv $@_ $@
	$(unlock)
|
|
|
|
# Symmetrized alignment: $(giza2bal.pl) reads both directional A3 files
# (through 'gunzip -c'), its output is piped through $(symal), the leading
# text up to and including "{##}" plus following whitespace is stripped from
# each line, and the result is gzipped to $@_ before being renamed to $@.
$(gizout)/${L1}-${L2}.symal.gz: A3fwd = $(gizout)/${L1}-${L2}.A3.final.gz
$(gizout)/${L1}-${L2}.symal.gz: A3bwd = $(gizout)/${L2}-${L1}.A3.final.gz
$(gizout)/${L1}-${L2}.symal.gz: | $(gizout)/${L1}-${L2}.A3.final.gz \
                                  $(gizout)/${L2}-${L1}.A3.final.gz \
                                  $(giza2bal.pl)
	$(lock)
	$(giza2bal.pl) -d 'gunzip -c ${A3fwd}' -i 'gunzip -c ${A3bwd}' \
	| $(symal) $(symal_args) \
	| perl -pe 's/^.*{##}\s+//' \
	| gzip > $@_ \
	&& mv $@_ $@
	$(unlock)
|
|
|
|
# merge alignments produced by mgiza
#
# The part files $(giztmp)/<stem>/<stem>.A3.final.part* are handed to
# $(mgiza.merge) and its output is gzipped into the target.
#
# The part files are checked first, before the lock is taken: with nothing to
# merge, the pipeline would still exit with gzip's (successful) status and an
# empty archive would be installed as the target. The shell expands the glob
# at run time, i.e. after the mgiza.DONE prerequisite has been completed.
$(gizout)/%.A3.final.gz: | $(giztmp)/%/mgiza.DONE
	mkdir -p ${@D}
	@ls $(giztmp)/$*/$*.A3.final.part* > /dev/null 2>&1 \
	|| { echo "$@: no $(giztmp)/$*/$*.A3.final.part* files to merge" >&2; exit 1; }
	$(lock)
	$(mgiza.merge) $(giztmp)/$*/$*.A3.final.part* | gzip > $@_
	mv $@_ $@
	$(unlock)
|
|
|
|
# run mgiza:
# $| is the order-only prerequisite, i.e. the config file next to the marker.
# The .DONE marker is only touched when ${mgiza} exits successfully.
%/mgiza.DONE: | %/mgiza.cfg
	$(lock)
	${mgiza} $| \
	&& touch $@
	$(unlock)
|
|
|
|
# Create the scratch and output directories on demand (including parents).
${giztmp} ${gizout}:
	mkdir -p $@
|
|
|
|
# --- run mkcls (to get word classes) ------------------------------------------
|
|
# ATTENTION: mkcls does not accept spaces between option markers and
# option arguments: "mkcls -n 10 -c 50 ..." will make it fail!
# The iteration and class options are passed through $(mkcls_args).

# $(call stream,DIR,EXT): print the concatenated contents of all regular files
# below DIR (symlinks followed) named *.EXT or *.EXT.gz, in sorted order.
# 'zcat -f' passes uncompressed files through unchanged.
# The two -name tests are grouped explicitly: without the parentheses
# "-type f" would bind to the first test only, and anything named *.EXT.gz
# (directories included) would be selected as well.
stream = find -L $(1) -type f \( -name "*.$(2)" -o -name "*.$(2).gz" \) | sort | xargs zcat -f

# $(call mkcls_cmd,DIR,EXT,OUT): feed the stream for DIR/EXT to ${mkcls} on
# stdin and have it write the word classes to OUT.
# mkcls: -n: iterations
# mkcls: -c: classes
# mkcls: -p: input data
# mkcls: -V: word classes (output)
mkcls_cmd = $(call stream,$(1),$(2)) | ${mkcls} $(mkcls_args) -p/dev/stdin -V$(3) opt
|
|
|
|
# Word classes for one language (the stem is used as the file extension to
# look for below $(gizaln.in)). mkcls writes to $@_ and $@_.cats; both are
# renamed to their final names once mkcls has succeeded.
#
# NOTE(review): GNU make does not merge prerequisites across pattern-rule
# lines, and a pattern rule without a recipe is not used for building. The
# first line below therefore does not appear to make the classes depend on
# ${mkcls} -- confirm the intent before relying on it.
$(giztmp)/%.vcb.classes: ${mkcls}
$(giztmp)/%.vcb.classes: | $(gizaln.in)
	@echo CREATING $@
	$(lock)
	mkdir -p ${@D}
	@$(call mkcls_cmd,$|,$*,$@_) \
	&& mv $@_ $@ \
	&& mv $@_.cats $@.cats
	$(unlock)
|
|
|
|
# NUMBERIZED CORPUS FOR GIZA
#
# A single $(plain2snt) run writes both vocabularies and both .snt files.
# Only ${L1}-${L2}.snt carries the recipe; the other three outputs are tied to
# it through order-only prerequisites.
$(giztmp)/${L1}.vcb:       | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}.vcb:       | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}.snt: | $(giztmp)/${L1}-${L2}.snt

# The recipe relies on bash process substitution, <(...). Request bash for
# this target explicitly so that it also works when the target is built on its
# own rather than as a prerequisite of a mgiza.cfg target.
$(giztmp)/${L1}-${L2}.snt: SHELL = bash
$(giztmp)/${L1}-${L2}.snt: L1files = $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: L2files = $(addsuffix .${L2}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(giztmp)
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L2}.gz, $(pll-clean))
	$(lock)
	$(plain2snt) \
	  <(ls $(L1files) | xargs zcat -f) \
	  <(ls $(L2files) | xargs zcat -f) \
	  -vcb1 $(giztmp)/${L1}.vcb_ -vcb2 $(giztmp)/${L2}.vcb_ \
	  -snt1 $(giztmp)/${L1}-${L2}.snt_ -snt2 $(giztmp)/${L2}-${L1}.snt_
	mv $(giztmp)/${L1}.vcb_ $(giztmp)/${L1}.vcb
	mv $(giztmp)/${L2}.vcb_ $(giztmp)/${L2}.vcb
	mv $(giztmp)/${L2}-${L1}.snt_ $(giztmp)/${L2}-${L1}.snt
	# The rule's own target is renamed last: once it exists, all other
	# outputs of this run are already in place.
	mv $(giztmp)/${L1}-${L2}.snt_ $(giztmp)/${L1}-${L2}.snt
	$(unlock)
|
|
|
|
# .cooc files
# V1/V2 select the two vocabulary files for each direction; the order-only
# prerequisite ($|) is the matching .snt file. $(snt2cooc) is given the
# temporary output name $@_ first, which is renamed to $@ on success.
$(giztmp)/${L1}-${L2}.cooc: V1 = $(giztmp)/${L1}.vcb
$(giztmp)/${L1}-${L2}.cooc: V2 = $(giztmp)/${L2}.vcb

$(giztmp)/${L2}-${L1}.cooc: V1 = $(giztmp)/${L2}.vcb
$(giztmp)/${L2}-${L1}.cooc: V2 = $(giztmp)/${L1}.vcb

$(giztmp)/%.cooc: | $(giztmp)/%.snt
	@echo CREATING $@
	$(lock)
	$(snt2cooc) $@_ $(V1) $(V2) $| \
	&& mv $@_ $@
	$(unlock)
|
|
|
|
################################################################################
|
|
# MGIZA CONFIG FILE:
|
|
#
|
|
# Recipes of the config targets run under bash. Like every target-specific
# variable, this setting is also inherited by the targets' prerequisites.
$(giztmp)/%/mgiza.cfg: SHELL = bash

# Alignment direction of each of the two config files.
$(giztmp)/${L1}-${L2}/mgiza.cfg: FROM = ${L1}
$(giztmp)/${L1}-${L2}/mgiza.cfg: TO   = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: FROM = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: TO   = ${L1}

# --- CORPUS RESOURCES ---------------------------------------------------------
# Derived from FROM/TO above; recursive assignment, so they are resolved per
# target when the recipe is expanded.
$(giztmp)/%/mgiza.cfg: V1   = $(giztmp)/${FROM}.vcb
$(giztmp)/%/mgiza.cfg: V2   = $(giztmp)/${TO}.vcb
$(giztmp)/%/mgiza.cfg: SNT  = $(giztmp)/${FROM}-${TO}.snt
$(giztmp)/%/mgiza.cfg: COOC = $(giztmp)/${FROM}-${TO}.cooc
$(giztmp)/%/mgiza.cfg: ODIR = $(giztmp)/${FROM}-${TO}/${FROM}-${TO}

# Each config needs the .cooc and .snt file of its own direction (order-only).
$(giztmp)/${L1}-${L2}/mgiza.cfg: | $(giztmp)/${L1}-${L2}.cooc $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}/mgiza.cfg: | $(giztmp)/${L2}-${L1}.cooc $(giztmp)/${L2}-${L1}.snt
|
|
# Write the mgiza configuration file for one alignment direction.
#
# The file is assembled under the temporary name $@_ and renamed into place at
# the end. A repeated run therefore replaces the configuration instead of
# appending a second copy of every setting, and an interrupted run cannot
# leave a truncated file that make would treat as finished.
# m1, m2, mh, m3 and m4 are each written twice, under "m<x>" and "t<x>".
$(giztmp)/${L1}-${L2}/mgiza.cfg \
$(giztmp)/${L2}-${L1}/mgiza.cfg: | \
    $(giztmp)/${L1}.vcb $(giztmp)/${L1}.vcb.classes \
    $(giztmp)/${L2}.vcb $(giztmp)/${L2}.vcb.classes
	mkdir -p $(dir $@)
	@{ echo "s ${V1}"; \
	   echo "t ${V2}"; \
	   echo "c ${SNT}"; \
	   echo "cooc ${COOC}"; \
	   echo "m1 ${m1}"; \
	   echo "m2 ${m2}"; \
	   echo "mh ${mh}"; \
	   echo "m3 ${m3}"; \
	   echo "m4 ${m4}"; \
	   echo "t1 ${m1}"; \
	   echo "t2 ${m2}"; \
	   echo "th ${mh}"; \
	   echo "t3 ${m3}"; \
	   echo "t4 ${m4}"; \
	   echo "o ${ODIR}"; \
	   echo "model4smoothfactor ${model4smoothfactor}"; \
	   echo "onlyaldumps ${onlyaldumps}"; \
	   echo "nodumps ${nodumps}"; \
	   echo "nsmooth ${nsmooth}"; \
	   echo "NCPUS ${NCPUS}"; \
	 } > $@_
	mv $@_ $@
|
|
# ------------------------------------------------------------------------------
# sanity checks (evaluated while the makefile is read)
#
# NOTE(review): both variables are assigned in this file with a literal path
# suffix, so they only expand to nothing when overridden with an empty value;
# an unset ${WDIR} is not detected here -- confirm whether that is intended.
ifeq "$(gizaln)" ""
$(warning Giza base directory not defined)
endif

ifeq "$(gizaln.in)" ""
$(warning No directory for Giza++ training data specified!)
endif
|