From 32722ab5b1477cb55bce2ba5d0c2620446cff8a9 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Wed, 22 Apr 2015 10:35:18 +0700 Subject: [PATCH] Support tokenize(const std::string &) as well. Convenience wrapper: the actual function takes a const char[], but many of the call sites want to pass a string and have to call its c_str() first. --- .../PhraseDictionaryMultiModelCounts.cpp | 2 +- phrase-extract/DomainFeature.cpp | 2 +- phrase-extract/SentenceAlignmentWithSyntax.cpp | 4 ++-- phrase-extract/consolidate-direct-main.cpp | 2 +- phrase-extract/consolidate-reverse-main.cpp | 6 +++--- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 2 +- phrase-extract/pcfg-common/xml_tree_parser.cc | 2 +- phrase-extract/relax-parse-main.cpp | 2 +- phrase-extract/statistics-main.cpp | 2 +- phrase-extract/syntax-common/xml_tree_parser.cc | 2 +- phrase-extract/tables-core.cpp | 2 +- util/tokenize.hh | 9 +++++++++ 12 files changed, 23 insertions(+), 14 deletions(-) diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp index 444557f9b..773e027cc 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp +++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp @@ -442,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic i++; if (i%100000 == 0) cerr << "." 
<< flush; - const vector<string> token = util::tokenize( line.c_str() ); + const vector<string> token = util::tokenize( line ); if (token.size() != 4) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp index 0d2b96a8a..d5138ba9b 100644 --- a/phrase-extract/DomainFeature.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -18,7 +18,7 @@ void Domain::load( const std::string &domainFileName ) string line; while(getline(*fileP, line)) { // read - const vector< string > domainSpecLine = util::tokenize( line.c_str() ); + const vector< string > domainSpecLine = util::tokenize( line ); int lineNumber; if (domainSpecLine.size() != 2 || ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp index 7403243ab..4fd2355ae 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.cpp +++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp @@ -50,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin << sentenceID << ": " << e.getMsg() << std::endl; return false; } - target = util::tokenize(targetStringCPP.c_str()); + target = util::tokenize(targetStringCPP); return true; } @@ -71,7 +71,7 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin << sentenceID << ": " << e.getMsg() << std::endl; return false; } - source = util::tokenize(sourceStringCPP.c_str()); + source = util::tokenize(sourceStringCPP); return true; } diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp index e85a897ef..d25197372 100644 --- a/phrase-extract/consolidate-direct-main.cpp +++ b/phrase-extract/consolidate-direct-main.cpp @@ -108,7 +108,7 @@ int main(int argc, char* argv[]) if (!
getLine(fileDirectP, itemDirect )) break; - const vector< string > count = util::tokenize( itemDirect[4].c_str() ); + const vector< string > count = util::tokenize( itemDirect[4] ); float countEF = atof(count[0].c_str()); float countF = atof(count[1].c_str()); float prob = countF/countEF; diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp index abba063a3..bce496a0c 100644 --- a/phrase-extract/consolidate-reverse-main.cpp +++ b/phrase-extract/consolidate-reverse-main.cpp @@ -166,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]); // counts, for debugging - const vector<string> directCounts = util::tokenize(itemDirect[4].c_str()); - const vector<string> indirectCounts = util::tokenize(itemIndirect[4].c_str()); + const vector<string> directCounts = util::tokenize(itemDirect[4]); + const vector<string> indirectCounts = util::tokenize(itemIndirect[4]); fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0]; // output rule count if present in either file if (indirectCounts.size() > 1) { @@ -223,7 +223,7 @@ string reverseAlignment(const string &alignments) { stringstream ret(""); - const vector<string> alignToks = util::tokenize(alignments.c_str()); + const vector<string> alignToks = util::tokenize(alignments); for (size_t i = 0; i < alignToks.size(); ++i) { const string &alignPair = alignToks[i]; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 267906b4c..f9800c8e0 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -57,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line) m_tree.ConnectNodes(); SyntaxNode *root = m_tree.GetTop(); assert(root); - m_words = util::tokenize(m_line.c_str()); + m_words = util::tokenize(m_line); return ConvertTree(*root, m_words); } diff --git
a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc index 6a2f3fc51..29e46a9f2 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ b/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -52,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) { // There is no XML tree. return std::auto_ptr<PcfgTree>(); } - m_words = util::tokenize(m_line.c_str()); + m_words = util::tokenize(m_line); return ConvertTree(*root, m_words); } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index ec7bd25fd..5c9daa7ae 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -45,7 +45,7 @@ int main(int argc, char* argv[]) map< string, int > topLabelCollection; // count of top labels, not used SyntaxTree tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); - const vector< string > inWords = util::tokenize( inBufferString.c_str() ); + const vector< string > inWords = util::tokenize( inBufferString ); // output tree // cerr << "BEFORE:" << endl << tree; diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp index d797b9f45..840f18602 100644 --- a/phrase-extract/statistics-main.cpp +++ b/phrase-extract/statistics-main.cpp @@ -322,7 +322,7 @@ void LexicalTable::load( const string &filePath ) i++; if (i%100000 == 0) cerr << "."
<< flush; - const vector<string> token = util::tokenize( line.c_str() ); + const vector<string> token = util::tokenize( line ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 47fda1c2d..c6e3cd3c3 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -25,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { tree_.ConnectNodes(); SyntaxNode *root = tree_.GetTop(); assert(root); - words_ = util::tokenize(line_.c_str()); + words_ = util::tokenize(line_); return ConvertTree(*root, words_); } diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp index 93c5041dd..9aa7aa787 100644 --- a/phrase-extract/tables-core.cpp +++ b/phrase-extract/tables-core.cpp @@ -85,7 +85,7 @@ void DTable::load( const string& fileName ) abort(); } - const vector<string> token = util::tokenize(line.c_str()); + const vector<string> token = util::tokenize(line); if (token.size() < 2) { cerr << "line " << i << " in " << fileName << " too short, skipping\n"; continue; diff --git a/util/tokenize.hh b/util/tokenize.hh index f4f3289bc..5d8430222 100644 --- a/util/tokenize.hh +++ b/util/tokenize.hh @@ -37,6 +37,15 @@ inline std::vector<std::string> tokenize(const char input[]) return token; } +/** Split input string into a series of tokens. + * + * Like tokenize(const char[]), but takes a std::string. + */ +inline std::vector<std::string> tokenize(const std::string &input) +{ + return tokenize(input.c_str()); +} + } // namespace util #endif