Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-12-27 22:14:57 +03:00 · 2015-04-05 16:45:17 +04:00 · 2015-04-05 16:45:17 +04:00 · 4cb8a1837e
commit 4cb8a1837e
parent 7ffdddef13 66cfd14159
10 changed files with 95 additions and 34 deletions
--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include <algorithm>
 #include <cstring>
-#include <unordered_set>
+#include <set>
 #include <glib.h>
 #include <stdexcept>
 #include <boost/thread.hpp>
@@ -1557,9 +1557,9 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
 {
    std::size_t line_no = 0;
    std::size_t perchunk = chunksize ? chunksize : 2000;
-    std::vector< std::string > lines[nthreads];
-    std::vector< std::string > results[nthreads];
-    boost::thread workers[nthreads];
+    std::vector< std::vector< std::string > > lines(nthreads);
+    std::vector< std::vector< std::string > > results(nthreads);
+    std::vector< boost::thread > workers(nthreads);
    bool done_p = !(is.good() && os.good());
    

@@ -1589,20 +1589,20 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
                        results[ithread].resize(line_pos);
                        break;
                    }
-                    lines[ithread][line_pos].clear();
+                    lines[ithread][line_pos].clear(); 
                } else if (skip_xml_p && 
                           (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { 
-                    lines[ithread][line_pos].clear();
+                    lines[ithread][line_pos].clear(); 
                } else {
                    lines[ithread][line_pos] = 
-                        std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
+                        std::string(SPC_BYTE).append(istr).append(SPC_BYTE); 
                }
            } 

-            if (line_pos)
+            if (line_pos) {
                workers[ithread] = 
-                    boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
-
+                    boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread])); 
+            }
        } // end for loop starting threads

        for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@@ -1772,12 +1772,12 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
    std::size_t finilen = 0;
    std::size_t dotslen = 0;

-    static std::size_t SEQ_LIM = 6;
+	  const std::size_t SEQ_LIM = 6;

    charclass_t prev_class = empty;
    charclass_t curr_class = empty;
-    charclass_t seq[SEQ_LIM] = { empty };
-    std::size_t pos[SEQ_LIM] = { 0 };
+    std::vector<charclass_t> seq(SEQ_LIM, empty);
+    std::vector<std::size_t> pos(SEQ_LIM, 0);
    std::size_t seqpos = 0;

    GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
@@ -1785,7 +1785,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
    bool curr_word_p = false;

    std::vector<std::size_t> breaks;
-    std::unordered_set<std::size_t> suppress;
+    std::set<std::size_t> suppress;
    
    for (; icp <= ncp; ++icp) {
        currwc = wchar_t(ucs4[icp]);
@@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
            } else if (currwc >= SMAL_HYPH) {
                curr_word_p = true;
            } else {
-                curr_word_p = currwc >= WAVE_DASH && curr_word_p <= KANA_DHYP; 
+                curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP); 
            }
            break;
        case G_UNICODE_CLOSE_PUNCTUATION:
--- a/mert/HopeFearDecoder.cpp
+++ b/mert/HopeFearDecoder.cpp
@@ -180,7 +180,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
  references_.Load(referenceFiles, vocab_);

  SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights,num_dense_);
  scorer_ = scorer;

  static const string kWeights = "weights";
@@ -243,7 +243,7 @@ void HypergraphHopeFearDecoder::HopeFear(
 {
  size_t sentenceId = *sentenceIdIter_;
  SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights, num_dense_);
  const Graph& graph = *(graphs_[sentenceId]);

  // ValType hope_scale = 1.0;
@@ -338,7 +338,7 @@ void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValTy
  HgHypothesis bestHypo;
  size_t sentenceId = *sentenceIdIter_;
  SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights, num_dense_);
  vector<ValType> bg(scorer_->NumberOfScores());
  //cerr << "Calculating bleu on " << sentenceId << endl;
  Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);
--- a/mert/Jamfile
+++ b/mert/Jamfile
@@ -77,6 +77,7 @@ unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_f
 unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test mira_feature_vector_test : MiraFeatureVectorTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;
--- a/mert/MiraFeatureVectorTest.cpp
+++ b/mert/MiraFeatureVectorTest.cpp
@@ -0,0 +1,49 @@
+#include "MiraFeatureVector.h"
+#include "MiraWeightVector.h"
+
+#define BOOST_TEST_MODULE MiraFeatureVector
+#include <boost/test/unit_test.hpp>
+
+using namespace MosesTuning;
+
+/* Note that the conversion to and from SparseVector needs to know
+how many of the features are really "dense". This is because in hg mira
+all features (sparse and dense) are to get rolled in to SparseVector
+*/
+
+BOOST_AUTO_TEST_CASE(from_sparse) {
+  SparseVector sp;
+  sp.set("dense0", 0.2);
+  sp.set("dense1", 0.3);
+  sp.set("sparse0", 0.7);
+  sp.set("sparse1", 0.9);
+  sp.set("sparse2", 0.1);
+
+  MiraFeatureVector mfv(sp,2);
+  BOOST_CHECK_EQUAL(mfv.size(),5);
+
+  BOOST_CHECK_EQUAL(mfv.feat(0),0);
+  BOOST_CHECK_EQUAL(mfv.feat(1),1);
+  BOOST_CHECK_EQUAL(mfv.feat(2),4);
+  BOOST_CHECK_EQUAL(mfv.feat(3),5);
+  BOOST_CHECK_EQUAL(mfv.feat(4),6);
+
+  BOOST_CHECK_CLOSE(mfv.val(0), 0.2,1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(1), 0.3,1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(2), 0.7,1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(3), 0.9,1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(4), 0.1,1e-5);
+
+  MiraWeightVector mwv;
+  mwv.update(mfv,1.0);
+  SparseVector sp2;
+  mwv.ToSparse(&sp2,2);
+
+  //check we get back what we started with
+  BOOST_CHECK_CLOSE(sp2.get("dense0"), 0.2,1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("dense1"), 0.3,1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse0"), 0.7,1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse1"), 0.9,1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse2"), 0.1,1e-5);
+
+}
--- a/mert/MiraWeightVector.cpp
+++ b/mert/MiraWeightVector.cpp
@@ -93,11 +93,17 @@ void MiraWeightVector::update(size_t index, ValType delta)
  m_lastUpdated[index] = m_numUpdates;
 }

-void MiraWeightVector::ToSparse(SparseVector* sparse) const
+void MiraWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
 {
  for (size_t i = 0; i < m_weights.size(); ++i) {
    if(abs(m_weights[i])>1e-8) {
-      sparse->set(i,m_weights[i]);
+      if (i < denseSize) {
+        sparse->set(i,m_weights[i]);
+      } else {
+        //The ids in MiraFeatureVector/MiraWeightVector for sparse features
+        //need to be translated when converting back to SparseVector.
+        sparse->set(i-denseSize, m_weights[i]);
+      }
    }
  }
 }
@@ -172,12 +178,18 @@ size_t AvgWeightVector::size() const
  return m_wv.m_weights.size();
 }

-void AvgWeightVector::ToSparse(SparseVector* sparse) const
+void AvgWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
 {
  for (size_t i = 0; i < size(); ++i) {
    ValType w = weight(i);
    if(abs(w)>1e-8) {
-      sparse->set(i,w);
+      if (i < denseSize) {
+        sparse->set(i,w);
+      } else {
+        //The ids in MiraFeatureVector/MiraWeightVector for sparse features
+        //need to be translated when converting back to SparseVector.
+        sparse->set(i-denseSize, w);
+      }
    }
  }
 }
--- a/mert/MiraWeightVector.h
+++ b/mert/MiraWeightVector.h
@@ -64,9 +64,9 @@ public:
  AvgWeightVector avg();

  /**
-    * Convert to sparse vector, interpreting all features as sparse.
+    * Convert to sparse vector, interpreting all features as sparse. Only used by hgmira.
   **/
-  void ToSparse(SparseVector* sparse) const;
+  void ToSparse(SparseVector* sparse, size_t denseSize) const;

  friend class AvgWeightVector;

@@ -104,7 +104,7 @@ public:
  ValType score(const MiraFeatureVector& fv) const;
  ValType weight(std::size_t index) const;
  std::size_t size() const;
-  void ToSparse(SparseVector* sparse) const;
+  void ToSparse(SparseVector* sparse, size_t num_dense) const;
 private:
  const MiraWeightVector& m_wv;
 };
--- a/moses/FF/VW/VWFeatureBase.h
+++ b/moses/FF/VW/VWFeatureBase.h
@@ -3,7 +3,7 @@
 #include <string>
 #include <boost/thread/tss.hpp>

-#include "Classifier.h"
+#include "vw/Classifier.h"
 #include "moses/TypeDef.h"
 #include "moses/Util.h"
 #include "moses/FF/StatelessFeatureFunction.h"
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -484,9 +484,7 @@ namespace Moses
      
      targetOffset += tp.GetSize();
    }
-    // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
-    // Or fix it somewhere else.
-    out << std::endl;
+    // Used by --print-alignment-info, so no endl
  }

  void 
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -1868,8 +1868,7 @@ void Manager::OutputAlignment(ostream &out, const vector<const Hypothesis *> &ed

    targetOffset += tp.GetSize();
  }
-  // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
-  // Or fix it somewhere else.
+  // Used by --alignment-output-file so requires endl
  out << std::endl;
 }

@@ -2024,6 +2023,8 @@ void Manager::OutputBestHypo(const Moses::TrellisPath &path, long /*translationI
 void Manager::OutputAlignment(std::ostringstream &out, const TrellisPath &path) const
 {
  Hypothesis::OutputAlignment(out, path.GetEdges());
+  // Used by --alignment-output-file so requires endl
+  out << std::endl;
 }

 } // namespace
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -49,7 +49,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
  $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
    die "Error: $l1input does not exist";
 }
@@ -59,7 +59,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
  $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else  {
 die "Error: $l2input does not exist";
 }
@@ -154,7 +154,7 @@ print STDERR "Input sentences: $innr  Output sentences:  $outnr\n";
 sub word_count {
  my ($line) = @_;
  if ($ignore_xml) {
-    $line =~ s/<\S[^>]*\S>//g;
+    $line =~ s/<\S[^>]*\S>/ /g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //g;
    $line =~ s/ $//g;