mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-16 06:50:32 +03:00
Support tokenize(const std::string &) as well.
Convenience wrapper: the actual function takes a const char[], but many of the call sites want to pass a string and have to call its c_str() first.
This commit is contained in:
parent
10a0a7b05a
commit
32722ab5b1
@ -442,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
|
|||||||
i++;
|
i++;
|
||||||
if (i%100000 == 0) cerr << "." << flush;
|
if (i%100000 == 0) cerr << "." << flush;
|
||||||
|
|
||||||
const vector<string> token = util::tokenize( line.c_str() );
|
const vector<string> token = util::tokenize( line );
|
||||||
if (token.size() != 4) {
|
if (token.size() != 4) {
|
||||||
cerr << "line " << i << " in " << fileName
|
cerr << "line " << i << " in " << fileName
|
||||||
<< " has wrong number of tokens, skipping:\n"
|
<< " has wrong number of tokens, skipping:\n"
|
||||||
|
@ -18,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
|
|||||||
string line;
|
string line;
|
||||||
while(getline(*fileP, line)) {
|
while(getline(*fileP, line)) {
|
||||||
// read
|
// read
|
||||||
const vector< string > domainSpecLine = util::tokenize( line.c_str() );
|
const vector< string > domainSpecLine = util::tokenize( line );
|
||||||
int lineNumber;
|
int lineNumber;
|
||||||
if (domainSpecLine.size() != 2 ||
|
if (domainSpecLine.size() != 2 ||
|
||||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||||
|
@ -50,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
|
|||||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
target = util::tokenize(targetStringCPP.c_str());
|
target = util::tokenize(targetStringCPP);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,7 +71,7 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
|
|||||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
source = util::tokenize(sourceStringCPP.c_str());
|
source = util::tokenize(sourceStringCPP);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -108,7 +108,7 @@ int main(int argc, char* argv[])
|
|||||||
if (! getLine(fileDirectP, itemDirect ))
|
if (! getLine(fileDirectP, itemDirect ))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
const vector< string > count = util::tokenize( itemDirect[4].c_str() );
|
const vector< string > count = util::tokenize( itemDirect[4] );
|
||||||
float countEF = atof(count[0].c_str());
|
float countEF = atof(count[0].c_str());
|
||||||
float countF = atof(count[1].c_str());
|
float countF = atof(count[1].c_str());
|
||||||
float prob = countF/countEF;
|
float prob = countF/countEF;
|
||||||
|
@ -166,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
|||||||
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
||||||
|
|
||||||
// counts, for debugging
|
// counts, for debugging
|
||||||
const vector<string> directCounts = util::tokenize(itemDirect[4].c_str());
|
const vector<string> directCounts = util::tokenize(itemDirect[4]);
|
||||||
const vector<string> indirectCounts = util::tokenize(itemIndirect[4].c_str());
|
const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
|
||||||
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
||||||
// output rule count if present in either file
|
// output rule count if present in either file
|
||||||
if (indirectCounts.size() > 1) {
|
if (indirectCounts.size() > 1) {
|
||||||
@ -223,7 +223,7 @@ string reverseAlignment(const string &alignments)
|
|||||||
{
|
{
|
||||||
stringstream ret("");
|
stringstream ret("");
|
||||||
|
|
||||||
const vector<string> alignToks = util::tokenize(alignments.c_str());
|
const vector<string> alignToks = util::tokenize(alignments);
|
||||||
|
|
||||||
for (size_t i = 0; i < alignToks.size(); ++i) {
|
for (size_t i = 0; i < alignToks.size(); ++i) {
|
||||||
const string &alignPair = alignToks[i];
|
const string &alignPair = alignToks[i];
|
||||||
|
@ -57,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
|
|||||||
m_tree.ConnectNodes();
|
m_tree.ConnectNodes();
|
||||||
SyntaxNode *root = m_tree.GetTop();
|
SyntaxNode *root = m_tree.GetTop();
|
||||||
assert(root);
|
assert(root);
|
||||||
m_words = util::tokenize(m_line.c_str());
|
m_words = util::tokenize(m_line);
|
||||||
return ConvertTree(*root, m_words);
|
return ConvertTree(*root, m_words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
|||||||
// There is no XML tree.
|
// There is no XML tree.
|
||||||
return std::auto_ptr<PcfgTree>();
|
return std::auto_ptr<PcfgTree>();
|
||||||
}
|
}
|
||||||
m_words = util::tokenize(m_line.c_str());
|
m_words = util::tokenize(m_line);
|
||||||
return ConvertTree(*root, m_words);
|
return ConvertTree(*root, m_words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ int main(int argc, char* argv[])
|
|||||||
map< string, int > topLabelCollection; // count of top labels, not used
|
map< string, int > topLabelCollection; // count of top labels, not used
|
||||||
SyntaxTree tree;
|
SyntaxTree tree;
|
||||||
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
|
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
|
||||||
const vector< string > inWords = util::tokenize( inBufferString.c_str() );
|
const vector< string > inWords = util::tokenize( inBufferString );
|
||||||
|
|
||||||
// output tree
|
// output tree
|
||||||
// cerr << "BEFORE:" << endl << tree;
|
// cerr << "BEFORE:" << endl << tree;
|
||||||
|
@ -322,7 +322,7 @@ void LexicalTable::load( const string &filePath )
|
|||||||
i++;
|
i++;
|
||||||
if (i%100000 == 0) cerr << "." << flush;
|
if (i%100000 == 0) cerr << "." << flush;
|
||||||
|
|
||||||
const vector<string> token = util::tokenize( line.c_str() );
|
const vector<string> token = util::tokenize( line );
|
||||||
if (token.size() != 3) {
|
if (token.size() != 3) {
|
||||||
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
||||||
token.size() << " " << token[0] << " " << line << endl;
|
token.size() << " " << token[0] << " " << line << endl;
|
||||||
|
@ -25,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
|
|||||||
tree_.ConnectNodes();
|
tree_.ConnectNodes();
|
||||||
SyntaxNode *root = tree_.GetTop();
|
SyntaxNode *root = tree_.GetTop();
|
||||||
assert(root);
|
assert(root);
|
||||||
words_ = util::tokenize(line_.c_str());
|
words_ = util::tokenize(line_);
|
||||||
return ConvertTree(*root, words_);
|
return ConvertTree(*root, words_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ void DTable::load( const string& fileName )
|
|||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
const vector<string> token = util::tokenize(line.c_str());
|
const vector<string> token = util::tokenize(line);
|
||||||
if (token.size() < 2) {
|
if (token.size() < 2) {
|
||||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||||
continue;
|
continue;
|
||||||
|
@ -37,6 +37,15 @@ inline std::vector<std::string> tokenize(const char input[])
|
|||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Split input string into a series of tokens.
|
||||||
|
* Convenience overload: forwards input.c_str() to tokenize(const char[]) and returns its result unchanged.
|
||||||
|
* Like tokenize(const char[]), but takes a std::string, so call sites need not call c_str() themselves.
|
||||||
|
*/
|
||||||
|
inline std::vector<std::string> tokenize(const std::string &input)
|
||||||
|
{
|
||||||
|
return tokenize(input.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user