mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
Got tired of waiting for loading.
This commit is contained in:
parent
0d9095983d
commit
a0ce62e795
@ -160,13 +160,15 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
|
||||
|
||||
void Phrase::CreateFromStringNewFormat(FactorDirection direction
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const std::string &phraseString
|
||||
, const StringPiece &phraseString
|
||||
, const std::string & /*factorDelimiter */
|
||||
, Word &lhs)
|
||||
{
|
||||
// parse
|
||||
vector<string> annotatedWordVector;
|
||||
Tokenize(annotatedWordVector, phraseString);
|
||||
vector<StringPiece> annotatedWordVector;
|
||||
for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
|
||||
annotatedWordVector.push_back(*it);
|
||||
}
|
||||
// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
|
||||
// to
|
||||
// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
|
||||
@ -174,7 +176,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
|
||||
m_words.reserve(annotatedWordVector.size()-1);
|
||||
|
||||
for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() - 1 ; phrasePos++) {
|
||||
string &annotatedWord = annotatedWordVector[phrasePos];
|
||||
StringPiece &annotatedWord = annotatedWordVector[phrasePos];
|
||||
bool isNonTerminal;
|
||||
if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
|
||||
// non-term
|
||||
@ -197,7 +199,7 @@ void Phrase::CreateFromStringNewFormat(FactorDirection direction
|
||||
}
|
||||
|
||||
// lhs
|
||||
string &annotatedWord = annotatedWordVector.back();
|
||||
StringPiece &annotatedWord = annotatedWordVector.back();
|
||||
CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
|
||||
annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);
|
||||
|
||||
|
@ -70,7 +70,7 @@ public:
|
||||
|
||||
void CreateFromStringNewFormat(FactorDirection direction
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const std::string &phraseString
|
||||
, const StringPiece &phraseString
|
||||
, const std::string &factorDelimiter
|
||||
, Word &lhs);
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <sys/stat.h>
|
||||
#include <stdlib.h>
|
||||
#include "RuleTable/Trie.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "Word.h"
|
||||
@ -34,6 +35,8 @@
|
||||
#include "UserMessage.h"
|
||||
#include "ChartTranslationOptionList.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "util/string_piece.hh"
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -159,6 +162,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
|
||||
string lineOrig;
|
||||
size_t count = 0;
|
||||
|
||||
vector<float> scoreVector;
|
||||
while(getline(inStream, lineOrig)) {
|
||||
const string *line;
|
||||
if (format == HieroFormat) { // reformat line
|
||||
@ -168,31 +172,32 @@ bool RuleTableLoaderStandard::Load(FormatType format
|
||||
{ // do nothing to format of line
|
||||
line = &lineOrig;
|
||||
}
|
||||
|
||||
util::TokenIter<util::MultiCharacter> pipes(*line, "|||");
|
||||
StringPiece sourcePhraseString(*pipes);
|
||||
StringPiece targetPhraseString(*++pipes);
|
||||
StringPiece scoreString(*++pipes);
|
||||
StringPiece alignString(*++pipes);
|
||||
|
||||
vector<string> tokens;
|
||||
vector<float> scoreVector;
|
||||
|
||||
TokenizeMultiCharSeparator(tokens, *line , "|||" );
|
||||
|
||||
if (tokens.size() != 4 && tokens.size() != 5) {
|
||||
if (++pipes && ++pipes) {
|
||||
stringstream strme;
|
||||
strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
|
||||
UserMessage::Add(strme.str());
|
||||
abort();
|
||||
}
|
||||
|
||||
const string &sourcePhraseString = tokens[0]
|
||||
, &targetPhraseString = tokens[1]
|
||||
, &scoreString = tokens[2]
|
||||
, &alignString = tokens[3];
|
||||
|
||||
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
|
||||
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
|
||||
TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
Tokenize<float>(scoreVector, scoreString);
|
||||
scoreVector.clear();
|
||||
for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
|
||||
char *err_ind;
|
||||
scoreVector.push_back(strtod(s->data(), &err_ind));
|
||||
UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
|
||||
}
|
||||
const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
|
||||
if (scoreVector.size() != numScoreComponents) {
|
||||
stringstream strme;
|
||||
@ -201,7 +206,6 @@ bool RuleTableLoaderStandard::Load(FormatType format
|
||||
UserMessage::Add(strme.str());
|
||||
abort();
|
||||
}
|
||||
CHECK(scoreVector.size() == numScoreComponents);
|
||||
|
||||
// parse source & find pt node
|
||||
|
||||
|
@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "Word.h"
|
||||
#include "TypeDef.h"
|
||||
#include "StaticData.h" // needed to determine the FactorDelimiter
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -87,23 +88,16 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
|
||||
|
||||
void Word::CreateFromString(FactorDirection direction
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const std::string &str
|
||||
, const StringPiece &str
|
||||
, bool isNonTerminal)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
|
||||
vector<string> wordVec;
|
||||
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
TokenizeMultiCharSeparator(wordVec, str, factorDelimiter);
|
||||
//Tokenize(wordVec, str, "|");
|
||||
CHECK(wordVec.size() <= factorOrder.size());
|
||||
|
||||
const Factor *factor;
|
||||
for (size_t ind = 0; ind < wordVec.size(); ++ind) {
|
||||
FactorType factorType = factorOrder[ind];
|
||||
factor = factorCollection.AddFactor(direction, factorType, wordVec[ind]);
|
||||
m_factorArray[factorType] = factor;
|
||||
util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
|
||||
for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
|
||||
m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
|
||||
}
|
||||
CHECK(!fit);
|
||||
|
||||
// assume term/non-term same for all factors
|
||||
m_isNonTerminal = isNonTerminal;
|
||||
|
@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "TypeDef.h"
|
||||
#include "Factor.h"
|
||||
#include "Util.h"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -129,7 +130,7 @@ public:
|
||||
|
||||
void CreateFromString(FactorDirection direction
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const std::string &str
|
||||
, const StringPiece &str
|
||||
, bool isNonTerminal);
|
||||
|
||||
void CreateUnknownWord(const Word &sourceWord);
|
||||
|
@ -1,4 +1,4 @@
|
||||
lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ..//z : <include>.. : : <include>.. ;
|
||||
lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc string_piece.cc usage.cc ..//z : <include>.. : : <include>.. ;
|
||||
|
||||
import testing ;
|
||||
|
||||
|
192
util/string_piece.cc
Normal file
192
util/string_piece.cc
Normal file
@ -0,0 +1,192 @@
|
||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in string_piece.hh.
|
||||
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#ifndef HAVE_ICU
|
||||
|
||||
typedef StringPiece::size_type size_type;
|
||||
|
||||
// Replaces the contents of *target with a copy of the bytes this piece
// refers to (length_ bytes starting at ptr_).
void StringPiece::CopyToString(std::string* target) const {
  const char* const first = ptr_;
  target->assign(first, first + length_);
}
|
||||
|
||||
// Forward substring search.
// Returns the offset of the first occurrence of `s` that starts at or after
// `pos`, or npos when `pos` lies beyond the end of this piece or no
// occurrence exists.
size_type StringPiece::find(const StringPiece& s, size_type pos) const {
  const bool start_is_valid =
      !(length_ < 0) && !(pos > static_cast<size_type>(length_));
  if (!start_is_valid) {
    return npos;
  }

  const char* const haystack_end = ptr_ + length_;
  const char* const hit =
      std::search(ptr_ + pos, haystack_end, s.ptr_, s.ptr_ + s.length_);
  const size_type offset = hit - ptr_;
  // std::search yields haystack_end on failure; in that case the needle
  // would not fit at `offset`, which is what this check detects.
  if (offset + s.length_ <= length_) {
    return offset;
  }
  return npos;
}
|
||||
|
||||
// Forward single-character search.
// Returns the offset of the first byte equal to `c` at or after `pos`, or
// npos when the piece is empty, `pos` is past the last byte, or `c` does not
// occur.
size_type StringPiece::find(char c, size_type pos) const {
  if (length_ > 0 && pos < static_cast<size_type>(length_)) {
    const char* const end = ptr_ + length_;
    const char* const hit = std::find(ptr_ + pos, end, c);
    if (hit != end) {
      return hit - ptr_;
    }
  }
  return npos;
}
|
||||
|
||||
// Backward substring search.
// Returns the offset of the last occurrence of `s` that starts at or before
// `pos`, or npos if there is none. An empty `s` matches at
// min(length of this piece, pos).
size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
  if (length_ < s.length_) {
    return npos;
  }

  const size_t haystack_len = length_;
  if (s.length_ == 0) {
    return std::min(haystack_len, pos);
  }

  // The latest position at which a match may begin, then one past the last
  // byte such a match could cover.
  const size_t latest_start = std::min(haystack_len - s.length_, pos);
  const char* const search_end = ptr_ + latest_start + s.length_;
  const char* const hit =
      std::find_end(ptr_, search_end, s.ptr_, s.ptr_ + s.length_);
  if (hit == search_end) {
    return npos;
  }
  return hit - ptr_;
}
|
||||
|
||||
// Backward single-character search.
// Returns the offset of the last byte equal to `c` at or before `pos`, or
// npos when the piece is empty or `c` does not occur in that range.
//
// The index is kept as size_type (not int) so that pieces longer than
// INT_MAX are scanned correctly; the loop therefore tests for i == 0
// explicitly instead of relying on the index going negative.
size_type StringPiece::rfind(char c, size_type pos) const {
  if (length_ <= 0) return npos;

  for (size_type i = std::min(pos, static_cast<size_type>(length_ - 1)); ; --i) {
    if (ptr_[i] == c) {
      return i;
    }
    if (i == 0) {
      break;
    }
  }
  return npos;
}
|
||||
|
||||
// For each character in characters_wanted, sets the index corresponding
|
||||
// to the ASCII code of that character to 1 in table. This is used by
|
||||
// the find_.*_of methods below to tell whether or not a character is in
|
||||
// the lookup table in constant time.
|
||||
// The argument `table' must be an array that is large enough to hold all
|
||||
// the possible values of an unsigned char. Thus it should be be declared
|
||||
// as follows:
|
||||
// bool table[UCHAR_MAX + 1]
|
||||
static inline void BuildLookupTable(const StringPiece& characters_wanted,
|
||||
bool* table) {
|
||||
const size_type length = characters_wanted.length();
|
||||
const char* const data = characters_wanted.data();
|
||||
for (size_type i = 0; i < length; ++i) {
|
||||
table[static_cast<unsigned char>(data[i])] = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the offset of the first byte at or after `pos` that is one of the
// bytes in `s`, or npos if there is none (including when either this piece
// or `s` is empty).
size_type StringPiece::find_first_of(const StringPiece& s,
                                     size_type pos) const {
  if (length_ == 0) return npos;
  if (s.length_ == 0) return npos;

  // A one-character set does not need the lookup table.
  if (s.length_ == 1) {
    return find_first_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);

  size_type idx = pos;
  while (idx < length_) {
    if (wanted[static_cast<unsigned char>(ptr_[idx])]) {
      return idx;
    }
    ++idx;
  }
  return npos;
}
|
||||
|
||||
// Returns the offset of the first byte at or after `pos` that is NOT one of
// the bytes in `s`, or npos if every remaining byte is in `s` (or this piece
// is empty, or `pos` is past the end).
//
// When `s` is empty no byte can be excluded, so the answer is `pos` itself
// provided it addresses a byte of this piece. This matches
// std::string::find_first_not_of; returning 0 unconditionally would report
// a position before `pos`.
size_type StringPiece::find_first_not_of(const StringPiece& s,
                                         size_type pos) const {
  if (length_ == 0)
    return npos;

  if (s.length_ == 0)
    return pos < length_ ? pos : npos;

  // Avoid the cost of BuildLookupTable() for a single-character search.
  if (s.length_ == 1)
    return find_first_not_of(s.ptr_[0], pos);

  bool lookup[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, lookup);
  for (size_type i = pos; i < length_; ++i) {
    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
      return i;
    }
  }
  return npos;
}
|
||||
|
||||
// Returns the offset of the first byte at or after `pos` that differs from
// `c`, or npos if there is no such byte.
size_type StringPiece::find_first_not_of(char c, size_type pos) const {
  if (length_ != 0) {
    size_type idx = pos;
    while (idx < length_) {
      if (ptr_[idx] != c) {
        return idx;
      }
      ++idx;
    }
  }
  return npos;
}
|
||||
|
||||
// Returns the offset of the last byte at or before `pos` that is one of the
// bytes in `s`, or npos if there is none (including when either this piece
// or `s` is empty).
size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
  if (length_ == 0) return npos;
  if (s.length_ == 0) return npos;

  // A one-character set does not need the lookup table.
  if (s.length_ == 1) {
    return find_last_of(s.ptr_[0], pos);
  }

  bool wanted[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, wanted);

  // `remaining` is one more than the next index to inspect, so the unsigned
  // countdown ends cleanly after index 0 has been examined.
  size_type remaining = std::min(pos, length_ - 1) + 1;
  while (remaining != 0) {
    --remaining;
    if (wanted[static_cast<unsigned char>(ptr_[remaining])]) {
      return remaining;
    }
  }
  return npos;
}
|
||||
|
||||
// Returns the offset of the last byte at or before `pos` that is NOT one of
// the bytes in `s`, or npos if every inspected byte is in `s` or this piece
// is empty. With an empty `s` the result is min(pos, length - 1).
size_type StringPiece::find_last_not_of(const StringPiece& s,
                                        size_type pos) const {
  if (length_ == 0) {
    return npos;
  }

  const size_type start = std::min(pos, length_ - 1);
  if (s.length_ == 0) {
    return start;
  }

  // A one-character set does not need the lookup table.
  if (s.length_ == 1) {
    return find_last_not_of(s.ptr_[0], pos);
  }

  bool excluded[UCHAR_MAX + 1] = { false };
  BuildLookupTable(s, excluded);

  // `remaining` is one more than the next index to inspect, so the unsigned
  // countdown ends cleanly after index 0 has been examined.
  size_type remaining = start + 1;
  while (remaining != 0) {
    --remaining;
    if (!excluded[static_cast<unsigned char>(ptr_[remaining])]) {
      return remaining;
    }
  }
  return npos;
}
|
||||
|
||||
// Returns the offset of the last byte at or before `pos` that differs from
// `c`, or npos if there is no such byte or this piece is empty.
size_type StringPiece::find_last_not_of(char c, size_type pos) const {
  if (length_ == 0) {
    return npos;
  }

  // `remaining` is one more than the next index to inspect, so the unsigned
  // countdown ends cleanly after index 0 has been examined.
  size_type remaining = std::min(pos, length_ - 1) + 1;
  while (remaining != 0) {
    --remaining;
    if (ptr_[remaining] != c) {
      return remaining;
    }
  }
  return npos;
}
|
||||
|
||||
// Returns a piece referring to at most `n` bytes of this piece starting at
// offset `pos`. Both values are clamped: a `pos` past the end is treated as
// the end (giving an empty piece), and `n` is cut down to the bytes that
// remain after `pos`. No data is copied.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = pos > length_ ? length_ : pos;
  const size_type available = length_ - start;
  const size_type count = n > available ? available : n;
  return StringPiece(ptr_ + start, count);
}
|
||||
|
||||
const size_type StringPiece::npos = size_type(-1);
|
||||
|
||||
#endif // !HAVE_ICU
|
Loading…
Reference in New Issue
Block a user