Got tired of waiting for loading.

2024-12-26 13:23:25 +03:00 · 2012-10-14 17:35:58 +01:00 · 2012-10-14 17:35:58 +01:00 · a0ce62e795
commit a0ce62e795
parent 0d9095983d
7 changed files with 226 additions and 33 deletions
--- a/moses/src/Phrase.cpp
+++ b/moses/src/Phrase.cpp
@ -160,13 +160,15 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
 void Phrase::CreateFromStringNewFormat(FactorDirection direction
                                       , const std::vector<FactorType> &factorOrder
-                                       , const std::string &phraseString
+                                       , const StringPiece &phraseString
                                       , const std::string & /*factorDelimiter */
                                       , Word &lhs)
 {
  // parse
-  vector<string> annotatedWordVector;
+  vector<StringPiece> annotatedWordVector;
-  Tokenize(annotatedWordVector, phraseString);
+  for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
    annotatedWordVector.push_back(*it);
  }
  // KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
  //		to
  // "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
@ -174,7 +176,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
  m_words.reserve(annotatedWordVector.size()-1);
  for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() -  1 ; phrasePos++) {
-    string &annotatedWord = annotatedWordVector[phrasePos];
+    StringPiece &annotatedWord = annotatedWordVector[phrasePos];
    bool isNonTerminal;
    if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
      // non-term
@ -197,7 +199,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
  }
  // lhs
-  string &annotatedWord = annotatedWordVector.back();
+  StringPiece &annotatedWord = annotatedWordVector.back();
  CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
  annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);
--- a/moses/src/Phrase.h
+++ b/moses/src/Phrase.h
@ -70,7 +70,7 @@ public:
  void CreateFromStringNewFormat(FactorDirection direction
                                 , const std::vector<FactorType> &factorOrder
-                                 , const std::string &phraseString
+                                 , const StringPiece &phraseString
                                 , const std::string &factorDelimiter
                                 , Word &lhs);
--- a/moses/src/RuleTable/LoaderStandard.cpp
+++ b/moses/src/RuleTable/LoaderStandard.cpp
@ -24,6 +24,7 @@
 #include <iterator>
 #include <algorithm>
 #include <sys/stat.h>
 #include <stdlib.h>
 #include "RuleTable/Trie.h"
 #include "FactorCollection.h"
 #include "Word.h"
@ -34,6 +35,8 @@
 #include "UserMessage.h"
 #include "ChartTranslationOptionList.h"
 #include "FactorCollection.h"
 #include "util/string_piece.hh"
 #include "util/tokenize_piece.hh"
 using namespace std;
@ -159,6 +162,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
  string lineOrig;
  size_t count = 0;
  vector<float> scoreVector;
  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
@ -168,31 +172,32 @@ bool RuleTableLoaderStandard::Load(FormatType format
    { // do nothing to format of line
      line = &lineOrig;
    }
    util::TokenIter<util::MultiCharacter> pipes(*line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);
    StringPiece alignString(*++pipes);
-    vector<string> tokens;
+    if (++pipes && ++pipes) {
    vector<float> scoreVector;
    TokenizeMultiCharSeparator(tokens, *line , "|||" );
    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }
    const string &sourcePhraseString = tokens[0]
               , &targetPhraseString = tokens[1]
               , &scoreString        = tokens[2]
               , &alignString        = tokens[3];
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }
-    Tokenize<float>(scoreVector, scoreString);
+    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      char *err_ind;
      scoreVector.push_back(strtod(s->data(), &err_ind));
      UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
@ -201,7 +206,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
      UserMessage::Add(strme.str());
      abort();
    }
    CHECK(scoreVector.size() == numScoreComponents);
    // parse source & find pt node
--- a/moses/src/Word.cpp
+++ b/moses/src/Word.cpp
@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "Word.h"
 #include "TypeDef.h"
 #include "StaticData.h"  // needed to determine the FactorDelimiter
 #include "util/tokenize_piece.hh"
 using namespace std;
@ -87,23 +88,16 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
 void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
-                            , const std::string &str
+                            , const StringPiece &str
                            , bool isNonTerminal)
 {
  FactorCollection &factorCollection = FactorCollection::Instance();
-  vector<string> wordVec;
+  util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
-  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+  for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
-  TokenizeMultiCharSeparator(wordVec, str, factorDelimiter);
+    m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
  //Tokenize(wordVec, str, "|");
  CHECK(wordVec.size() <= factorOrder.size());
  const Factor *factor;
  for (size_t ind = 0; ind < wordVec.size(); ++ind) {
    FactorType factorType = factorOrder[ind];
    factor = factorCollection.AddFactor(direction, factorType, wordVec[ind]);
    m_factorArray[factorType] = factor;
  }
  CHECK(!fit);
  // assume term/non-term same for all factors
  m_isNonTerminal = isNonTerminal;
--- a/moses/src/Word.h
+++ b/moses/src/Word.h
@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "TypeDef.h"
 #include "Factor.h"
 #include "Util.h"
 #include "util/string_piece.hh"
 namespace Moses
 {
@ -129,7 +130,7 @@ public:
  void CreateFromString(FactorDirection direction
                        , const std::vector<FactorType> &factorOrder
-                        , const std::string &str
+                        , const StringPiece &str
                        , bool isNonTerminal);
  void CreateUnknownWord(const Word &sourceWord);
--- a/util/Jamfile
+++ b/util/Jamfile
@ -1,4 +1,4 @@
-lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ..//z : <include>.. : : <include>.. ;
+lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc string_piece.cc usage.cc ..//z : <include>.. : : <include>.. ;
 import testing ;
--- a/util/string_piece.cc
+++ b/util/string_piece.cc
@ -0,0 +1,192 @@
 // Copyright 2004 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in string_piece.hh.
 #include "util/string_piece.hh"
 #include <algorithm>
 #include <limits.h>
 #ifndef HAVE_ICU
 // File-local shorthand for StringPiece's index/length type, used by every
 // definition below.
 typedef StringPiece::size_type size_type;
 // Replaces the contents of *target with a copy of the length_ bytes that
 // start at ptr_.
 void StringPiece::CopyToString(std::string* target) const {
  const char* const first = ptr_;
  const char* const last = first + length_;
  target->assign(first, last);
 }
 // Returns the index of the first occurrence of `s` that starts at or after
 // `pos`, or npos when there is no such occurrence or `pos` lies past the end
 // of this piece.
 //
 // length_ is unsigned here (the rest of this file compares it directly with
 // size_type values), so the former `length_ < 0` test and the cast to
 // size_type were dead code and have been dropped.
 size_type StringPiece::find(const StringPiece& s, size_type pos) const {
  if (pos > length_) {
    return npos;
  }
  const char* const haystack_end = ptr_ + length_;
  const char* const hit =
      std::search(ptr_ + pos, haystack_end, s.ptr_, s.ptr_ + s.length_);
  const size_type offset = hit - ptr_;
  // std::search returns haystack_end on failure; in that case the needle
  // would not fit between offset and the end, which is what this detects.
  if (offset + s.length_ > length_) {
    return npos;
  }
  return offset;
 }
 // Returns the index of the first byte equal to `c` at or after `pos`, or
 // npos if there is none.
 //
 // length_ is unsigned, so `pos >= length_` already covers the empty piece;
 // the former `length_ <= 0` test and the cast to size_type were redundant.
 size_type StringPiece::find(char c, size_type pos) const {
  if (pos >= length_) {
    return npos;
  }
  const char* const end = ptr_ + length_;
  const char* const hit = std::find(ptr_ + pos, end, c);
  if (hit == end) {
    return npos;
  }
  return hit - ptr_;
 }
 // Returns the index of the last occurrence of `s` that starts at or before
 // `pos`, or npos if there is none.
 size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
  const size_t self_len = length_;
  const size_t needle_len = s.length_;

  // A needle longer than this piece can never match.
  if (self_len < needle_len) {
    return npos;
  }
  // The empty needle matches at every position; report the right-most one
  // that does not exceed `pos`.
  if (needle_len == 0) {
    return std::min(self_len, pos);
  }

  // search_end is one past the last byte that a match starting at or before
  // `pos` is allowed to cover.
  const size_t latest_start = std::min(self_len - needle_len, pos);
  const char* const search_end = ptr_ + latest_start + needle_len;
  const char* const hit =
      std::find_end(ptr_, search_end, s.ptr_, s.ptr_ + needle_len);
  if (hit == search_end) {
    return npos;
  }
  return hit - ptr_;
 }
 // Returns the index of the last byte equal to `c` at or before `pos`, or
 // npos if there is none.
 //
 // The scan index is a size_type rather than an int: narrowing the start
 // index to int gives wrong results for pieces longer than INT_MAX. The
 // explicit `i == 0` exit replaces the `i >= 0` test, which can never fail
 // for an unsigned index.
 size_type StringPiece::rfind(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  for (size_type i = std::min(pos, length_ - 1); ; --i) {
    if (ptr_[i] == c) {
      return i;
    }
    if (i == 0) {
      break;
    }
  }
  return npos;
 }
 // For each character in characters_wanted, sets the index corresponding
 // to the ASCII code of that character to 1 in table.  This is used by
 // the find_.*_of methods below to tell whether or not a character is in
 // the lookup table in constant time.
 // The argument `table' must be an array that is large enough to hold all
 // the possible values of an unsigned char.  Thus it should be be declared
 // as follows:
 //   bool table[UCHAR_MAX + 1]
 static inline void BuildLookupTable(const StringPiece& characters_wanted,
                                    bool* table) {
  const size_type length = characters_wanted.length();
  const char* const data = characters_wanted.data();
  for (size_type i = 0; i < length; ++i) {
    table[static_cast<unsigned char>(data[i])] = true;
  }
 }
 // Returns the index of the first byte at or after `pos` that is one of the
 // bytes in `s`, or npos if there is none (including when either this piece
 // or `s` is empty).
 size_type StringPiece::find_first_of(const StringPiece& s,
                                      size_type pos) const {
  const bool both_non_empty = (length_ != 0) && (s.length_ != 0);
  if (!both_non_empty) {
    return npos;
  }
  // A one-character set does not need a lookup table; defer to the
  // single-character overload.
  if (s.length_ == 1) {
    return find_first_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);
  size_type idx = pos;
  while (idx < length_) {
    if (wanted[static_cast<unsigned char>(ptr_[idx])]) {
      return idx;
    }
    ++idx;
  }
  return npos;
 }
 // Returns the index of the first byte at or after `pos` that is NOT one of
 // the bytes in `s`, or npos if every byte from `pos` onwards is in `s`, if
 // `pos` is past the end, or if this piece is empty.
 size_type StringPiece::find_first_not_of(const StringPiece& s,
                                          size_type pos) const {
  if (length_ == 0)
    return npos;
  // With an empty set every byte qualifies, so the answer is `pos` itself
  // when it is in range.  Returning 0 unconditionally, as this function used
  // to, ignored `pos` and could report an index before the requested start.
  if (s.length_ == 0)
    return pos < length_ ? pos : npos;
  // Avoid the cost of BuildLookupTable() for a single-character search.
  if (s.length_ == 1)
    return find_first_not_of(s.ptr_[0], pos);

  bool lookup[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, lookup);
  for (size_type i = pos; i < length_; ++i) {
    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
      return i;
    }
  }
  return npos;
 }
 // Returns the index of the first byte at or after `pos` that differs from
 // `c`, or npos if there is none.
 size_type StringPiece::find_first_not_of(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  // Skip the run of bytes equal to `c`; whatever index we stop on is either
  // the answer or the end of the piece.
  size_type idx = pos;
  while (idx < length_ && ptr_[idx] == c) {
    ++idx;
  }
  return idx < length_ ? idx : npos;
 }
 // Returns the index of the last byte at or before `pos` that is one of the
 // bytes in `s`, or npos if there is none (including when either this piece
 // or `s` is empty).
 size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
  const bool both_non_empty = (length_ != 0) && (s.length_ != 0);
  if (!both_non_empty) {
    return npos;
  }
  // A one-character set does not need a lookup table; defer to the
  // single-character overload.
  if (s.length_ == 1) {
    return find_last_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);
  // Walk backwards from the starting index down to and including index 0.
  // The index is unsigned, so the loop exits on an explicit test for 0.
  size_type idx = std::min(pos, length_ - 1);
  do {
    if (wanted[static_cast<unsigned char>(ptr_[idx])]) {
      return idx;
    }
  } while (idx-- != 0);
  return npos;
 }
 // Returns the index of the last byte at or before `pos` that is NOT one of
 // the bytes in `s`, or npos if there is none or this piece is empty.
 size_type StringPiece::find_last_not_of(const StringPiece& s,
                                         size_type pos) const {
  if (length_ == 0) {
    return npos;
  }

  // Clamp the starting index to the last valid position.
  size_type idx = std::min(pos, length_ - 1);
  // With an empty set every byte qualifies, so the starting index itself is
  // the answer.
  if (s.length_ == 0) {
    return idx;
  }
  // A one-character set does not need a lookup table; defer to the
  // single-character overload.
  if (s.length_ == 1) {
    return find_last_not_of(s.ptr_[0], pos);
  }

  bool excluded[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, excluded);
  // Walk backwards down to and including index 0; the index is unsigned, so
  // the loop exits on an explicit test for 0.
  do {
    if (!excluded[static_cast<unsigned char>(ptr_[idx])]) {
      return idx;
    }
  } while (idx-- != 0);
  return npos;
 }
 // Returns the index of the last byte at or before `pos` that differs from
 // `c`, or npos if there is none or this piece is empty.
 size_type StringPiece::find_last_not_of(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  // `remaining` counts the candidate positions still to inspect, so the
  // index examined on each pass is remaining - 1.
  size_type remaining = std::min(pos, length_ - 1) + 1;
  while (remaining != 0) {
    const size_type idx = remaining - 1;
    if (ptr_[idx] != c) {
      return idx;
    }
    remaining = idx;
  }
  return npos;
 }
 // Returns a piece referring to at most `n` bytes of this piece starting at
 // `pos`.  Both values are clamped: a `pos` past the end is treated as the
 // end, and `n` is cut down to the number of bytes available after `pos`.
 // No bytes are copied; the result points into the same buffer.
 StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, length_);
  const size_type count = std::min(n, length_ - start);
  return StringPiece(ptr_ + start, count);
 }
 // Sentinel returned by the find functions when nothing matches: the largest
 // value representable in size_type.
 const size_type StringPiece::npos = static_cast<size_type>(-1);
 #endif // !HAVE_ICU
`@ -1,4 +1,4 @@`
	`lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ..//z : <include>.. : : <include>.. ;`	`lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc string_piece.cc usage.cc ..//z : <include>.. : : <include>.. ;`

	`import testing ;`	`import testing ;`