Merge github.com:moses-smt/mosesdecoder into weight-new

This commit is contained in:
Hieu Hoang 2013-03-13 17:54:03 +00:00
commit 2f78fe5fe5
13 changed files with 92 additions and 44 deletions

View File

@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user

View File

@ -15,7 +15,7 @@ Moses is a statistical machine translation system that allows you to automatical
%prep
%setup -q
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
@ -33,9 +33,9 @@ make install
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
@ -62,4 +62,4 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v2/*
/opt/moses/giza++-v1.0.7/*

View File

@ -271,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
// Print the word alignment for a complete translation: walk the
// back-pointer chain from the final hypothesis to the start of the
// derivation, collect every hypothesis on the path, and delegate the
// actual formatting to the edge-vector overload of OutputAlignment.
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
  std::vector<const Hypothesis *> chain;
  for (const Hypothesis *h = hypo; h != NULL; h = h->GetPrevHypo()) {
    chain.push_back(h);
  }
  OutputAlignment(out, chain);
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;

View File

@ -142,7 +142,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
void OutputNBest(std::ostream& out
, const Moses::TrellisPathList &nBestList

View File

@ -197,7 +197,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR())
{
{
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@ -214,13 +214,18 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
}
}
out << endl;
}
}
// MBR decoding (n-best MBR, lattice MBR, consensus)
else

View File

@ -816,8 +816,11 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
}
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();
@ -860,11 +863,14 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
}
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(hypergraphHypothesisID,arcNumber));
// Get an id number for this hypothesis
int mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
@ -878,6 +884,10 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
// Unique end node
@ -904,7 +914,12 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
int lineNumber = (*it).second;
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
int mosesHypothesisID = thisHypo->GetId();
int mosesHypothesisID;// = thisHypo->GetId();
if (searchGraph[lineNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
}
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),

View File

@ -158,6 +158,7 @@ Parameter::Parameter()
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("alignment-output-file", "print output word alignments into given file");

View File

@ -169,10 +169,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -182,6 +178,16 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;

View File

@ -142,6 +142,7 @@ protected:
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
bool m_onlyDistinctNBest;
bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
@ -653,6 +654,9 @@ public:
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
bool PrintAlignmentInfo() const {
return m_PrintAlignmentInfo;
}
bool PrintAlignmentInfoInNbest() const {
return m_PrintAlignmentInfoNbest;
}

View File

@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
else if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct

View File

@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
}
}
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";

View File

@ -16,15 +16,15 @@ $HELP = 1
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'filler=s' => \$FILLER,
'factored' => \$FACTORED,
'factored' => \$FACTORED,
'min-size=i' => \$MIN_SIZE,
'min-count=i' => \$MIN_COUNT,
'max-count=i' => \$MAX_COUNT,
'help' => \$HELP,
'verbose' => \$VERBOSE,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'train' => \$TRAIN);
if ($HELP ||
@ -155,34 +155,37 @@ sub apply {
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
$COUNT{$lc} = $count;
$TRUECASE{$lc} = $factored_word;
$LABEL{$lc} = $label if $SYNTAX;
$LABEL{$lc} = $label if $SYNTAX;
}
close(MODEL);
while(<STDIN>) {
my $first = 1;
chop; s/\s+/ /g; s/^ //; s/ $//;
my @BUFFER; # for xml tags
my @BUFFER; # for xml tags
foreach my $factored_word (split) {
print " " unless $first;
$first = 0;
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
print STDERR "considering $word ($lc)...\n" if $VERBOSE;
# don't split frequent words
if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
$lc !~ /[a-zA-Z]/) {; # has to have at least one letter
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
print $factored_word;
print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
next;
}

View File

@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
sub extract_sgml_tag_attribute
{
my ($name, $data) = @_;
($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}
#################################