Merge branch 'master' into weight-new

2024-12-28 22:45:50 +03:00 · 2012-12-24 23:28:48 +00:00 · 2012-12-24 23:28:48 +00:00 · b9c76342e9
commit b9c76342e9
parent 1798957647 861792bfc5
7 changed files with 148 additions and 9 deletions
--- a/moses/PhraseLengthFeature.cpp
+++ b/moses/PhraseLengthFeature.cpp
@ -13,8 +13,8 @@ void PhraseLengthFeature::Evaluate(
              ScoreComponentCollection* accumulator) const
 {
  // get length of source and target phrase
-  size_t sourceLength = context.GetTargetPhrase().GetSize();
-  size_t targetLength = context.GetTranslationOption().GetSourcePhrase()->GetSize();
+  size_t targetLength = context.GetTargetPhrase().GetSize();
+  size_t sourceLength = context.GetTranslationOption().GetSourceWordsRange().GetNumWordsCovered();

  // create feature names
  stringstream nameSource;
--- a/moses/PhraseLengthFeatureTest.cpp
+++ b/moses/PhraseLengthFeatureTest.cpp
@ -0,0 +1,102 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2010 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <boost/test/unit_test.hpp>
+
+#include "FactorCollection.h"
+#include "PhraseLengthFeature.h"
+#include "Sentence.h"
+#include "TargetPhrase.h"
+#include "TranslationOption.h"
+
+using namespace Moses;
+using namespace std;
+
+BOOST_AUTO_TEST_SUITE(phrase_length_feature)
+
+//TODO: Factor out setup code so that it can be reused
+
+// Test helper: returns a Word whose factor 0 is `text`, registered through the FactorCollection instance.
+static Word MakeWord(string text) {
+  const Factor* factor = FactorCollection::Instance().AddFactor(Input,0,text);
+  Word word;
+  word.SetFactor(0,factor);
+  return word;
+}
+
+
+BOOST_AUTO_TEST_CASE(evaluate) { // checks the features PhraseLengthFeature::Evaluate records for three translation options
+  Word w1 = MakeWord("w1");
+  Word w2 = MakeWord("y2");
+  Word w3 = MakeWord("x3");
+  Word w4 = MakeWord("w4");
+  // three phrases of 3, 2 and 4 words; they are wrapped as target phrases below
+  Phrase p1;
+  p1.AddWord(w1);
+  p1.AddWord(w3);
+  p1.AddWord(w4);
+
+  Phrase p2;
+  p2.AddWord(w1);
+  p2.AddWord(w2);
+
+  Phrase p3;
+  p3.AddWord(w2);
+  p3.AddWord(w1);
+  p3.AddWord(w4);
+  p3.AddWord(w4);
+
+  TargetPhrase tp1(p1);
+  TargetPhrase tp2(p2);
+  TargetPhrase tp3(p3);
+  // input sentence handed to every translation option and context, read with factor order {0}
+  Sentence sentence;
+  vector<FactorType> order;
+  order.push_back(0);
+  stringstream in("the input sentence has 6 words");
+  sentence.Read(in, order);
+  // source spans: the assertions below expect them to count as 1, 3 and 2 covered source words
+  TranslationOption topt1(WordsRange(0,0), tp1, sentence);
+  TranslationOption topt2(WordsRange(1,3), tp2, sentence);
+  TranslationOption topt3(WordsRange(2,3), tp3, sentence);
+
+  PhraseBasedFeatureContext context1(topt1,sentence);
+  PhraseBasedFeatureContext context2(topt2,sentence);
+  PhraseBasedFeatureContext context3(topt3,sentence);
+     
+  PhraseLengthFeature plf;
+
+  ScoreComponentCollection acc1,acc2,acc3; // one accumulator per context so scores are checked independently
+  // expected features, each with score 1: "s<source length>", "t<target length>", "<source length>,<target length>"
+  plf.Evaluate(context1, &acc1);
+  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "s1"),1);
+  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "t3"),1);
+  BOOST_CHECK_EQUAL(acc1.GetScoreForProducer(&plf, "1,3"),1);
+
+  plf.Evaluate(context2, &acc2);
+  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "s3"),1);
+  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "t2"),1);
+  BOOST_CHECK_EQUAL(acc2.GetScoreForProducer(&plf, "3,2"),1);
+
+  plf.Evaluate(context3, &acc3);
+  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "s2"),1);
+  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "t4"),1);
+  BOOST_CHECK_EQUAL(acc3.GetScoreForProducer(&plf, "2,4"),1);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@ -46,6 +46,7 @@ class PhraseExtractionOptions {
  bool includeSentenceIdFlag; //include sentence id in extract file
  bool onlyOutputSpanInfo;
  bool gzOutput;
+  std::string instanceWeightsFile; //weights for each sentence

 public:  
  PhraseExtractionOptions(const int initmaxPhraseLength):
@ -99,7 +100,11 @@ public:
    } 
    void initGzOutput (const bool initgzOutput){
        gzOutput= initgzOutput;
-    } 
+    }
+    void initInstanceWeightsFile(const char* initInstanceWeightsFile) { // stores the file name given as a C string
+      instanceWeightsFile.assign(initInstanceWeightsFile);
+    }
+     
    // functions for getting values
    bool isAllModelsOutputFlag() const {
        return allModelsOutputFlag;
@ -136,7 +141,10 @@ public:
    } 
    bool isGzOutput () const {
        return gzOutput;
-   } 
+    }
+    const std::string& getInstanceWeightsFile() const { // file name stored by initInstanceWeightsFile(); callers test length() to see if it was set
+      return instanceWeightsFile; // returned by const reference so repeated queries do not copy the string
+    }
 };

 }
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@ -54,10 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
  return true;
 }

-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
 {
  using namespace std;
  this->sentenceID = sentenceID;
+  this->weightString = std::string(weightString);

  // process sentence strings and store in target and source members.
  if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@ -35,6 +35,7 @@ public:
  std::vector<int> alignedCountS;
  std::vector<std::vector<int> > alignedToT;
  int sentenceID;
+  std::string weightString;

  virtual ~SentenceAlignment();

@ -43,7 +44,7 @@ public:
  virtual bool processSourceSentence(const char *, int, bool boundaryRules);

  bool create(char targetString[], char sourceString[],
-              char alignmentString[], int sentenceID, bool boundaryRules);
+              char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
  
 };

--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@ -114,7 +114,7 @@ int main(int argc, char* argv[])

 if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
-    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n";
+    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
    exit(1);
  }

@ -144,6 +144,12 @@ int main(int argc, char* argv[])
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.initGzOutput(true);  
+    } else if (strcmp(argv[i], "--InstanceWeights") == 0) {
+      if (i+1 >= argc) {
+        cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
+        exit(1);
+      }
+      options.initInstanceWeightsFile(argv[++i]);
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@ -220,6 +226,13 @@ int main(int argc, char* argv[])
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

+  istream *iwFileP = NULL;
+  auto_ptr<Moses::InputFileStream> instanceWeightsFile;
+  if (options.getInstanceWeightsFile().length()) {
+    instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
+    iwFileP = instanceWeightsFile.get();
+  }
+
  // open output files
  if (options.isTranslationFlag()) {
    string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
@ -238,10 +251,14 @@ int main(int argc, char* argv[])
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
+    char weightString[LINE_MAX_LENGTH] = ""; // must be a valid empty C string: create() copies it even when no --InstanceWeights file is read
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (iwFileP) {
+      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
+    }
    SentenceAlignment sentence;
 	// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
@ -251,7 +268,7 @@ int main(int argc, char* argv[])
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
-	if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
+	if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
   	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation);
      task->Run();
      delete task;
@ -695,6 +712,16 @@ for(int fi=startF; fi<=endF; fi++) {
  if (m_options.isOrientationFlag())
    outextractstrOrientation << orientationInfo;

+  if (m_options.getInstanceWeightsFile().length()) {
+    if (m_options.isTranslationFlag()) {
+      outextractstr << " ||| " << sentence.weightString;
+      outextractstrInv << " ||| " << sentence.weightString;
+    }
+    if (m_options.isOrientationFlag()) {
+      outextractstrOrientation << " ||| " << sentence.weightString;
+    }
+  }
+
  if (m_options.isIncludeSentenceIdFlag()) {
    outextractstr << " ||| " << sentence.sentenceID;
  }
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@ -337,7 +337,7 @@ int main(int argc, char* argv[])
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

-    if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
+    if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(sentence);
      }