Got tired of waiting for loading.

This commit is contained in:
Kenneth Heafield 2012-10-14 17:35:58 +01:00
parent 0d9095983d
commit a0ce62e795
7 changed files with 226 additions and 33 deletions

View File

@ -160,13 +160,15 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
void Phrase::CreateFromStringNewFormat(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const std::string &phraseString
, const StringPiece &phraseString
, const std::string & /*factorDelimiter */
, Word &lhs)
{
// parse
vector<string> annotatedWordVector;
Tokenize(annotatedWordVector, phraseString);
vector<StringPiece> annotatedWordVector;
for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
annotatedWordVector.push_back(*it);
}
// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
// to
// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
@ -174,7 +176,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
m_words.reserve(annotatedWordVector.size()-1);
for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() - 1 ; phrasePos++) {
string &annotatedWord = annotatedWordVector[phrasePos];
StringPiece &annotatedWord = annotatedWordVector[phrasePos];
bool isNonTerminal;
if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
// non-term
@ -197,7 +199,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
}
// lhs
string &annotatedWord = annotatedWordVector.back();
StringPiece &annotatedWord = annotatedWordVector.back();
CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);

View File

@ -70,7 +70,7 @@ public:
void CreateFromStringNewFormat(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const std::string &phraseString
, const StringPiece &phraseString
, const std::string &factorDelimiter
, Word &lhs);

View File

@ -24,6 +24,7 @@
#include <iterator>
#include <algorithm>
#include <sys/stat.h>
#include <stdlib.h>
#include "RuleTable/Trie.h"
#include "FactorCollection.h"
#include "Word.h"
@ -34,6 +35,8 @@
#include "UserMessage.h"
#include "ChartTranslationOptionList.h"
#include "FactorCollection.h"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
using namespace std;
@ -159,6 +162,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
string lineOrig;
size_t count = 0;
vector<float> scoreVector;
while(getline(inStream, lineOrig)) {
const string *line;
if (format == HieroFormat) { // reformat line
@ -168,31 +172,32 @@ bool RuleTableLoaderStandard::Load(FormatType format
{ // do nothing to format of line
line = &lineOrig;
}
util::TokenIter<util::MultiCharacter> pipes(*line, "|||");
StringPiece sourcePhraseString(*pipes);
StringPiece targetPhraseString(*++pipes);
StringPiece scoreString(*++pipes);
StringPiece alignString(*++pipes);
vector<string> tokens;
vector<float> scoreVector;
TokenizeMultiCharSeparator(tokens, *line , "|||" );
if (tokens.size() != 4 && tokens.size() != 5) {
if (++pipes && ++pipes) {
stringstream strme;
strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
UserMessage::Add(strme.str());
abort();
}
const string &sourcePhraseString = tokens[0]
, &targetPhraseString = tokens[1]
, &scoreString = tokens[2]
, &alignString = tokens[3];
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
continue;
}
Tokenize<float>(scoreVector, scoreString);
scoreVector.clear();
for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
char *err_ind;
scoreVector.push_back(strtod(s->data(), &err_ind));
UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
}
const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
if (scoreVector.size() != numScoreComponents) {
stringstream strme;
@ -201,7 +206,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
UserMessage::Add(strme.str());
abort();
}
CHECK(scoreVector.size() == numScoreComponents);
// parse source & find pt node

View File

@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Word.h"
#include "TypeDef.h"
#include "StaticData.h" // needed to determine the FactorDelimiter
#include "util/tokenize_piece.hh"
using namespace std;
@ -87,23 +88,16 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
void Word::CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const std::string &str
, const StringPiece &str
, bool isNonTerminal)
{
FactorCollection &factorCollection = FactorCollection::Instance();
vector<string> wordVec;
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
TokenizeMultiCharSeparator(wordVec, str, factorDelimiter);
//Tokenize(wordVec, str, "|");
CHECK(wordVec.size() <= factorOrder.size());
const Factor *factor;
for (size_t ind = 0; ind < wordVec.size(); ++ind) {
FactorType factorType = factorOrder[ind];
factor = factorCollection.AddFactor(direction, factorType, wordVec[ind]);
m_factorArray[factorType] = factor;
util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
}
CHECK(!fit);
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;

View File

@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
#include "Factor.h"
#include "Util.h"
#include "util/string_piece.hh"
namespace Moses
{
@ -129,7 +130,7 @@ public:
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const std::string &str
, const StringPiece &str
, bool isNonTerminal);
void CreateUnknownWord(const Word &sourceWord);

View File

@ -1,4 +1,4 @@
lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ..//z : <include>.. : : <include>.. ;
lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc string_piece.cc usage.cc ..//z : <include>.. : : <include>.. ;
import testing ;

192
util/string_piece.cc Normal file
View File

@ -0,0 +1,192 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in string_piece.hh.
#include "util/string_piece.hh"
#include <algorithm>
#include <limits.h>
#ifndef HAVE_ICU
// File-local shorthand so the definitions below can name the index type
// without the StringPiece:: qualifier.
typedef StringPiece::size_type size_type;
// Overwrites *target with a copy of the length_ bytes this piece refers to.
void StringPiece::CopyToString(std::string* target) const {
  const char* const first = ptr_;
  target->assign(first, first + length_);
}
// Returns the index of the first occurrence of s that starts at or after pos,
// or npos when there is none. pos == length_ is accepted so that an empty s
// still matches at the very end of the piece.
size_type StringPiece::find(const StringPiece& s, size_type pos) const {
  if (pos > length_) {
    return npos;
  }
  const char* const haystack_end = ptr_ + length_;
  const char* const hit =
      std::search(ptr_ + pos, haystack_end, s.ptr_, s.ptr_ + s.length_);
  const size_type offset = static_cast<size_type>(hit - ptr_);
  // std::search returns haystack_end on failure; in that case the needle
  // would run past the end of the piece, which is how a miss is detected.
  if (offset + s.length_ > length_) {
    return npos;
  }
  return offset;
}
// Returns the index of the first occurrence of c at or after pos, or npos
// when c does not occur there (which includes pos being at or past the end).
size_type StringPiece::find(char c, size_type pos) const {
  for (size_type index = pos; index < length_; ++index) {
    if (ptr_[index] == c) {
      return index;
    }
  }
  return npos;
}
// Returns the index of the last occurrence of s that starts at or before pos,
// or npos when there is none.
size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
  if (s.length_ > length_) {
    return npos;
  }
  if (s.length_ == 0) {
    // An empty needle matches at the latest permitted position.
    return std::min(length_, pos);
  }
  // Only matches starting at or before latest_start are allowed, so the
  // searched window ends right after a match that would start there.
  const size_type latest_start = std::min(length_ - s.length_, pos);
  const char* const window_end = ptr_ + latest_start + s.length_;
  const char* const match =
      std::find_end(ptr_, window_end, s.ptr_, s.ptr_ + s.length_);
  if (match == window_end) {
    return npos;
  }
  return static_cast<size_type>(match - ptr_);
}
// Returns the index of the last occurrence of c at or before pos, or npos
// when c does not occur there (or the piece is empty).
//
// The index is kept as size_type rather than int so that pieces longer than
// INT_MAX are scanned from the correct starting position; the loop therefore
// tests for index 0 explicitly instead of relying on a negative sentinel.
size_type StringPiece::rfind(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  for (size_type i = std::min(pos, length_ - 1); ; --i) {
    if (ptr_[i] == c) {
      return i;
    }
    if (i == 0) {
      break;
    }
  }
  return npos;
}
// For each character in characters_wanted, sets the index corresponding
// to the ASCII code of that character to 1 in table. This is used by
// the find_.*_of methods below to tell whether or not a character is in
// the lookup table in constant time.
// The argument `table' must be an array that is large enough to hold all
// the possible values of an unsigned char. Thus it should be be declared
// as follows:
// bool table[UCHAR_MAX + 1]
static inline void BuildLookupTable(const StringPiece& characters_wanted,
bool* table) {
const size_type length = characters_wanted.length();
const char* const data = characters_wanted.data();
for (size_type i = 0; i < length; ++i) {
table[static_cast<unsigned char>(data[i])] = true;
}
}
// Returns the index of the first character at or after pos that appears in s,
// or npos when there is none (including when either piece is empty).
size_type StringPiece::find_first_of(const StringPiece& s,
                                     size_type pos) const {
  const bool nothing_to_match = (length_ == 0) || (s.length_ == 0);
  if (nothing_to_match) {
    return npos;
  }
  if (s.length_ == 1) {
    // A single wanted character: delegate to the char overload rather than
    // paying for BuildLookupTable().
    return find_first_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);
  size_type index = pos;
  while (index < length_) {
    if (wanted[static_cast<unsigned char>(ptr_[index])]) {
      return index;
    }
    ++index;
  }
  return npos;
}
// Returns the index of the first character at or after pos that does NOT
// appear in s, or npos when every remaining character is in s (or the piece
// is empty, or pos is at or past the end).
size_type StringPiece::find_first_not_of(const StringPiece& s,
                                         size_type pos) const {
  if (length_ == 0)
    return npos;

  // With an empty exclusion set every character qualifies, so the answer is
  // the starting position itself, provided it lies inside the piece. Returning
  // 0 unconditionally would ignore pos.
  if (s.length_ == 0)
    return pos < length_ ? pos : npos;

  // Avoid the cost of BuildLookupTable() for a single-character search.
  if (s.length_ == 1)
    return find_first_not_of(s.ptr_[0], pos);

  bool lookup[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, lookup);
  for (size_type i = pos; i < length_; ++i) {
    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
      return i;
    }
  }
  return npos;
}
// Returns the index of the first character at or after pos that differs from
// c, or npos when there is no such character.
size_type StringPiece::find_first_not_of(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  size_type index = pos;
  // Skip over the run of characters equal to c.
  while (index < length_ && ptr_[index] == c) {
    ++index;
  }
  return index < length_ ? index : npos;
}
// Returns the index of the last character at or before pos that appears in s,
// or npos when there is none (including when either piece is empty).
size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
  const bool nothing_to_match = (length_ == 0) || (s.length_ == 0);
  if (nothing_to_match) {
    return npos;
  }
  if (s.length_ == 1) {
    // A single wanted character: delegate to the char overload rather than
    // paying for BuildLookupTable().
    return find_last_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);
  // Count down with an unsigned counter: `remaining` is one past the index
  // about to be examined, so the loop ends cleanly after index 0.
  size_type remaining = std::min(pos, length_ - 1) + 1;
  while (remaining != 0) {
    --remaining;
    if (wanted[static_cast<unsigned char>(ptr_[remaining])]) {
      return remaining;
    }
  }
  return npos;
}
// Returns the index of the last character at or before pos that does NOT
// appear in s, or npos when every examined character is in s (or the piece is
// empty).
size_type StringPiece::find_last_not_of(const StringPiece& s,
                                        size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  const size_type start = std::min(pos, length_ - 1);
  if (s.length_ == 0) {
    // Nothing is excluded, so the starting character itself qualifies.
    return start;
  }
  if (s.length_ == 1) {
    // A single excluded character: delegate to the char overload rather than
    // paying for BuildLookupTable().
    return find_last_not_of(s.ptr_[0], pos);
  }

  bool excluded[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, excluded);
  // `remaining` is one past the index about to be examined, so the unsigned
  // countdown stops cleanly after index 0.
  size_type remaining = start + 1;
  while (remaining != 0) {
    --remaining;
    if (!excluded[static_cast<unsigned char>(ptr_[remaining])]) {
      return remaining;
    }
  }
  return npos;
}
// Returns the index of the last character at or before pos that differs from
// c, or npos when there is no such character (or the piece is empty).
size_type StringPiece::find_last_not_of(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }
  // `remaining` is one past the index about to be examined, so the unsigned
  // countdown stops cleanly after index 0.
  size_type remaining = std::min(pos, length_ - 1) + 1;
  while (remaining != 0) {
    --remaining;
    if (ptr_[remaining] != c) {
      return remaining;
    }
  }
  return npos;
}
// Returns a piece referring to at most n characters of this piece starting at
// pos. Both pos and n are clamped to the available range, so an out-of-range
// pos yields an empty piece positioned at the end instead of failing.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, length_);
  const size_type count = std::min(n, length_ - start);
  return StringPiece(ptr_ + start, count);
}
// Out-of-line definition of the "not found" sentinel returned by the search
// methods in this file; its value is size_type(-1).
const size_type StringPiece::npos = size_type(-1);
#endif // !HAVE_ICU