Merge github.com:moses-smt/mosesdecoder into weight-new

This commit is contained in:
Hieu Hoang 2013-03-13 17:54:03 +00:00
commit 2f78fe5fe5
13 changed files with 92 additions and 44 deletions

View File

@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user

View File

@ -15,7 +15,7 @@ Moses is a statistical machine translation system that allows you to automatical
%prep
%setup -q
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
@ -33,9 +33,9 @@ make install
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
@ -62,4 +62,4 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v2/*
/opt/moses/giza++-v1.0.7/*

View File

@ -271,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
// Print the word alignment for a complete translation: walk the
// back-pointer chain from the final hypothesis to the start of the
// derivation, collect every hypothesis on the path, and delegate the
// actual formatting to the edge-vector overload of OutputAlignment.
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
  std::vector<const Hypothesis *> chain;
  for (const Hypothesis *h = hypo; h != NULL; h = h->GetPrevHypo()) {
    chain.push_back(h);
  }
  OutputAlignment(out, chain);
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;

View File

@ -142,7 +142,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
void OutputNBest(std::ostream& out
, const Moses::TrellisPathList &nBestList

View File

@ -197,7 +197,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR())
{
{
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@ -214,13 +214,18 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
}
}
out << endl;
}
}
// MBR decoding (n-best MBR, lattice MBR, consensus)
else

View File

@ -816,8 +816,11 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
}
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();
@ -860,11 +863,14 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
}
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(hypergraphHypothesisID,arcNumber));
// Get an id number for this hypothesis
int mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
@ -878,6 +884,10 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
// Unique end node
@ -904,7 +914,12 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
int lineNumber = (*it).second;
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
int mosesHypothesisID = thisHypo->GetId();
int mosesHypothesisID;// = thisHypo->GetId();
if (searchGraph[lineNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
}
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),

View File

@ -158,6 +158,7 @@ Parameter::Parameter()
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("alignment-output-file", "print output word alignments into given file");

View File

@ -169,10 +169,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -182,6 +178,16 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;

View File

@ -142,6 +142,7 @@ protected:
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
bool m_onlyDistinctNBest;
bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
@ -653,6 +654,9 @@ public:
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
bool PrintAlignmentInfo() const {
return m_PrintAlignmentInfo;
}
bool PrintAlignmentInfoInNbest() const {
return m_PrintAlignmentInfoNbest;
}

View File

@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
else if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct

View File

@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
}
}
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";

View File

@ -16,15 +16,15 @@ $HELP = 1
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'filler=s' => \$FILLER,
'factored' => \$FACTORED,
'factored' => \$FACTORED,
'min-size=i' => \$MIN_SIZE,
'min-count=i' => \$MIN_COUNT,
'max-count=i' => \$MAX_COUNT,
'help' => \$HELP,
'verbose' => \$VERBOSE,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'train' => \$TRAIN);
if ($HELP ||
@ -155,34 +155,37 @@ sub apply {
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
$COUNT{$lc} = $count;
$TRUECASE{$lc} = $factored_word;
$LABEL{$lc} = $label if $SYNTAX;
$LABEL{$lc} = $label if $SYNTAX;
}
close(MODEL);
while(<STDIN>) {
my $first = 1;
chop; s/\s+/ /g; s/^ //; s/ $//;
my @BUFFER; # for xml tags
my @BUFFER; # for xml tags
foreach my $factored_word (split) {
print " " unless $first;
$first = 0;
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
print STDERR "considering $word ($lc)...\n" if $VERBOSE;
# don't split frequent words
if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
$lc !~ /[a-zA-Z]/) {; # has to have at least one letter
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
print $factored_word;
print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
next;
}

View File

@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
sub extract_sgml_tag_attribute
{
my ($name, $data) = @_;
($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}
#################################