Got tired of waiting for loading.

2024-12-26 13:23:25 +03:00 · 2012-10-14 17:35:58 +01:00 · 2012-10-14 17:35:58 +01:00 · a0ce62e795
commit a0ce62e795
parent 0d9095983d
7 changed files with 226 additions and 33 deletions
--- a/moses/src/Phrase.cpp
+++ b/moses/src/Phrase.cpp
@@ -160,13 +160,15 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const

 void Phrase::CreateFromStringNewFormat(FactorDirection direction
                                       , const std::vector<FactorType> &factorOrder
-                                       , const std::string &phraseString
+                                       , const StringPiece &phraseString
                                       , const std::string & /*factorDelimiter */
                                       , Word &lhs)
 {
  // parse
-  vector<string> annotatedWordVector;
-  Tokenize(annotatedWordVector, phraseString);
+  vector<StringPiece> annotatedWordVector;
+  for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
+    annotatedWordVector.push_back(*it);
+  }
  // KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
  //		to
  // "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
@@ -174,7 +176,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
  m_words.reserve(annotatedWordVector.size()-1);

  for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() -  1 ; phrasePos++) {
-    string &annotatedWord = annotatedWordVector[phrasePos];
+    StringPiece &annotatedWord = annotatedWordVector[phrasePos];
    bool isNonTerminal;
    if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
      // non-term
@@ -197,7 +199,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
  }

  // lhs
-  string &annotatedWord = annotatedWordVector.back();
+  StringPiece &annotatedWord = annotatedWordVector.back();
  CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
  annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);

--- a/moses/src/Phrase.h
+++ b/moses/src/Phrase.h
@@ -70,7 +70,7 @@ public:

  void CreateFromStringNewFormat(FactorDirection direction
                                 , const std::vector<FactorType> &factorOrder
-                                 , const std::string &phraseString
+                                 , const StringPiece &phraseString
                                 , const std::string &factorDelimiter
                                 , Word &lhs);

--- a/moses/src/RuleTable/LoaderStandard.cpp
+++ b/moses/src/RuleTable/LoaderStandard.cpp
@@ -24,6 +24,7 @@
 #include <iterator>
 #include <algorithm>
 #include <sys/stat.h>
+#include <stdlib.h>
 #include "RuleTable/Trie.h"
 #include "FactorCollection.h"
 #include "Word.h"
@@ -34,6 +35,8 @@
 #include "UserMessage.h"
 #include "ChartTranslationOptionList.h"
 #include "FactorCollection.h"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"

 using namespace std;

@@ -159,6 +162,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
  string lineOrig;
  size_t count = 0;

+  vector<float> scoreVector;
  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
@@ -168,31 +172,32 @@ bool RuleTableLoaderStandard::Load(FormatType format
    { // do nothing to format of line
      line = &lineOrig;
    }
+
+    util::TokenIter<util::MultiCharacter> pipes(*line, "|||");
+    StringPiece sourcePhraseString(*pipes);
+    StringPiece targetPhraseString(*++pipes);
+    StringPiece scoreString(*++pipes);
+    StringPiece alignString(*++pipes);
    
-    vector<string> tokens;
-    vector<float> scoreVector;
-
-    TokenizeMultiCharSeparator(tokens, *line , "|||" );
-
-    if (tokens.size() != 4 && tokens.size() != 5) {
+    if (++pipes && ++pipes) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

-    const string &sourcePhraseString = tokens[0]
-               , &targetPhraseString = tokens[1]
-               , &scoreString        = tokens[2]
-               , &alignString        = tokens[3];
-
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

-    Tokenize<float>(scoreVector, scoreString);
+    scoreVector.clear();
+    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+      char *err_ind;
+      scoreVector.push_back(strtod(s->data(), &err_ind));
+      UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
+    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
@@ -201,7 +206,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
      UserMessage::Add(strme.str());
      abort();
    }
-    CHECK(scoreVector.size() == numScoreComponents);

    // parse source & find pt node

--- a/moses/src/Word.cpp
+++ b/moses/src/Word.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "Word.h"
 #include "TypeDef.h"
 #include "StaticData.h"  // needed to determine the FactorDelimiter
+#include "util/tokenize_piece.hh"

 using namespace std;

@@ -87,23 +88,16 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan

 void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
-                            , const std::string &str
+                            , const StringPiece &str
                            , bool isNonTerminal)
 {
  FactorCollection &factorCollection = FactorCollection::Instance();

-  vector<string> wordVec;
-  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
-  TokenizeMultiCharSeparator(wordVec, str, factorDelimiter);
-  //Tokenize(wordVec, str, "|");
-  CHECK(wordVec.size() <= factorOrder.size());
-
-  const Factor *factor;
-  for (size_t ind = 0; ind < wordVec.size(); ++ind) {
-    FactorType factorType = factorOrder[ind];
-    factor = factorCollection.AddFactor(direction, factorType, wordVec[ind]);
-    m_factorArray[factorType] = factor;
+  util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
+  for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
+    m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
  }
+  CHECK(!fit);

  // assume term/non-term same for all factors
  m_isNonTerminal = isNonTerminal;
--- a/moses/src/Word.h
+++ b/moses/src/Word.h
@@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "TypeDef.h"
 #include "Factor.h"
 #include "Util.h"
+#include "util/string_piece.hh"

 namespace Moses
 {
@@ -129,7 +130,7 @@ public:

  void CreateFromString(FactorDirection direction
                        , const std::vector<FactorType> &factorOrder
-                        , const std::string &str
+                        , const StringPiece &str
                        , bool isNonTerminal);

  void CreateUnknownWord(const Word &sourceWord);
--- a/util/Jamfile
+++ b/util/Jamfile
@@ -1,4 +1,4 @@
-lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ..//z : <include>.. : : <include>.. ;
+lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc string_piece.cc usage.cc ..//z : <include>.. : : <include>.. ;

 import testing ;

--- a/util/string_piece.cc
+++ b/util/string_piece.cc
@@ -0,0 +1,192 @@
+// Copyright 2004 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in string_piece.hh.
+
+#include "util/string_piece.hh"
+
+#include <algorithm>
+
+#include <limits.h>
+
+#ifndef HAVE_ICU
+
+typedef StringPiece::size_type size_type;
+
+void StringPiece::CopyToString(std::string* target) const {
+  target->assign(ptr_, length_);
+}
+
+size_type StringPiece::find(const StringPiece& s, size_type pos) const {
+  if (length_ < 0 || pos > static_cast<size_type>(length_))
+    return npos;
+
+  const char* result = std::search(ptr_ + pos, ptr_ + length_,
+                                   s.ptr_, s.ptr_ + s.length_);
+  const size_type xpos = result - ptr_;
+  return xpos + s.length_ <= length_ ? xpos : npos;
+}
+
+size_type StringPiece::find(char c, size_type pos) const {
+  if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
+    return npos;
+  }
+  const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
+  return result != ptr_ + length_ ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
+  if (length_ < s.length_) return npos;
+  const size_t ulen = length_;
+  if (s.length_ == 0) return std::min(ulen, pos);
+
+  const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
+  const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
+  return result != last ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(char c, size_type pos) const {
+  if (length_ <= 0) return npos;
+  for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
+       i >= 0; --i) {
+    if (ptr_[i] == c) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table.  This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char.  Thus it should be declared
+// as follows:
+//   bool table[UCHAR_MAX + 1]
+static inline void BuildLookupTable(const StringPiece& characters_wanted,
+                                    bool* table) {
+  const size_type length = characters_wanted.length();
+  const char* const data = characters_wanted.data();
+  for (size_type i = 0; i < length; ++i) {
+    table[static_cast<unsigned char>(data[i])] = true;
+  }
+}
+
+size_type StringPiece::find_first_of(const StringPiece& s,
+                                     size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(const StringPiece& s,
+                                         size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  if (s.length_ == 0)
+    return 0;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (; pos < length_; ++pos) {
+    if (ptr_[pos] != c) {
+      return pos;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(const StringPiece& s,
+                                        size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  size_type i = std::min(pos, length_ - 1);
+  if (s.length_ == 0)
+    return i;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (; ; --i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (ptr_[i] != c)
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+  if (pos > length_) pos = length_;
+  if (n > length_ - pos) n = length_ - pos;
+  return StringPiece(ptr_ + pos, n);
+}
+
+const size_type StringPiece::npos = size_type(-1);
+
+#endif // !HAVE_ICU