diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp index 3ef82e73a..60ab8db66 100644 --- a/biconcor/phrase-lookup.cpp +++ b/biconcor/phrase-lookup.cpp @@ -109,14 +109,17 @@ size_t lookup( string query ) return suffixArray.Count( queryString ); } -vector tokenize( const char input[] ) +// Duplicate of definition in util/tokenize.hh. +// TODO: Can we de-duplicate this? At the time of writing biconcor does not +// use util at all. +vector tokenize(const char input[]) { vector< string > token; bool betweenWords = true; int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); + int i; + for(i = 0; input[i] != '\0'; i++) { + const bool isSpace = (input[i] == ' ' || input[i] == '\t'); if (!isSpace && betweenWords) { start = i; diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp index c632f9ff2..444557f9b 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp +++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp @@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "util/exception.hh" +#include "util/tokenize.hh" #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" using namespace std; @@ -30,29 +31,6 @@ void OutputVec(const vector &vec) cerr << endl; } -// from phrase-extract/tables-core.cpp -inline vector tokenize( const char* input ) -{ - vector< string > token; - bool betweenWords = true; - int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); - - if (!isSpace && betweenWords) { - start = i; - betweenWords = false; - } else if (isSpace && !betweenWords) { - token.push_back( string( input+start, i-start ) ); - betweenWords = true; - } - } - if (!betweenWords) - token.push_back( string( input+start, i-start ) ); - return token; -} - namespace Moses { @@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic i++; if (i%100000 == 0) cerr << "." << flush; - vector token = tokenize( line.c_str() ); + const vector token = util::tokenize( line.c_str() ); if (token.size() != 4) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp index 899eb9f1c..0d2b96a8a 100644 --- a/phrase-extract/DomainFeature.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -2,6 +2,7 @@ #include "ExtractionPhrasePair.h" #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; @@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName ) string line; while(getline(*fileP, line)) { // read - vector< string > domainSpecLine = tokenize( line.c_str() ); + const vector< string > domainSpecLine = util::tokenize( line.c_str() ); int lineNumber; if (domainSpecLine.size() != 2 || ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { @@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName ) exit(1); } // store - string &name = domainSpecLine[1]; + const string &name = domainSpecLine[1]; spec.push_back( make_pair( lineNumber, name )); if (name2id.find( name ) == name2id.end()) { name2id[ name ] = list.size(); diff --git a/phrase-extract/DomainFeature.h b/phrase-extract/DomainFeature.h index 040a5fc72..95babb6c2 100644 --- a/phrase-extract/DomainFeature.h +++ b/phrase-extract/DomainFeature.h @@ -14,8 +14,6 @@ #include "ScoreFeature.h" -extern std::vector tokenize( const char*); - namespace MosesTraining { diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp index ee7f27ed9..21c1a1dbd 100644 --- a/phrase-extract/SentenceAlignment.cpp +++ b/phrase-extract/SentenceAlignment.cpp @@ -24,6 +24,7 @@ #include #include "tables-core.h" +#include "util/tokenize.hh" using namespace std; @@ -40,7 +41,7 @@ void addBoundaryWords(vector &phrase) bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules) { - target = tokenize(targetString); + target = util::tokenize(targetString); if (boundaryRules) addBoundaryWords(target); return true; @@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules) { - source = tokenize(sourceString); + source = util::tokenize(sourceString); if (boundaryRules) addBoundaryWords(source); return true; @@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[], } // reading in alignments - vector alignmentSequence = tokenize( alignmentString ); + vector alignmentSequence = util::tokenize( alignmentString ); for(size_t i=0; i #include "InputFileStream.h" #include "OutputFileStream.h" +#include "util/tokenize.hh" using namespace std; -std::vector tokenize( const char [] ); - vector< string > splitLine(const char *line) { vector< string > item; @@ -109,7 +108,7 @@ int main(int argc, char* argv[]) if (! getLine(fileDirectP, itemDirect )) break; - vector< string > count = tokenize( itemDirect[4].c_str() ); + const vector< string > count = util::tokenize( itemDirect[4].c_str() ); float countEF = atof(count[0].c_str()); float countF = atof(count[1].c_str()); float prob = countF/countEF; diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp index e2b0ad473..abba063a3 100644 --- a/phrase-extract/consolidate-reverse-main.cpp +++ b/phrase-extract/consolidate-reverse-main.cpp @@ -28,6 +28,7 @@ #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; @@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]); // counts, for debugging - vector directCounts = tokenize(itemDirect[4].c_str()); - vector indirectCounts = tokenize(itemIndirect[4].c_str()); + const vector directCounts = util::tokenize(itemDirect[4].c_str()); + const vector indirectCounts = util::tokenize(itemIndirect[4].c_str()); fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0]; // output rule count if present in either file if (indirectCounts.size() > 1) { @@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item ) vector< string > splitLine(const char *line) { vector< string > item; - bool betweenWords = true; int start=0; int i=0; for(; line[i] != '\0'; i++) { @@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments) { stringstream ret(""); - vector alignToks = tokenize(alignments.c_str()); + const vector alignToks = util::tokenize(alignments.c_str()); for (size_t i = 0; i < alignToks.size(); ++i) { - string &alignPair = alignToks[i]; + const string &alignPair = alignToks[i]; vector alignPoints; Tokenize(alignPoints, alignPair, "-"); assert(alignPoints.size() == 2); diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 2f28c3244..267906b4c 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -23,6 +23,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include #include @@ -56,7 +57,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) m_tree.ConnectNodes(); SyntaxNode *root = m_tree.GetTop(); assert(root); - m_words = tokenize(m_line.c_str()); + m_words = util::tokenize(m_line.c_str()); return ConvertTree(*root, m_words); } diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc index 3d9291994..6a2f3fc51 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ b/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -25,6 +25,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include "syntax-common/exception.h" @@ -51,7 +52,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) { // There is no XML tree. return std::auto_ptr(); } - m_words = tokenize(m_line.c_str()); + m_words = util::tokenize(m_line.c_str()); return ConvertTree(*root, m_words); } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index a6d50cef5..ec7bd25fd 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -21,6 +21,7 @@ #include "relax-parse.h" #include "tables-core.h" +#include "util/tokenize.hh" using namespace std; using namespace MosesTraining; @@ -44,7 +45,7 @@ int main(int argc, char* argv[]) map< string, int > topLabelCollection; // count of top labels, not used SyntaxTree tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); - vector< string > inWords = tokenize( inBufferString.c_str() ); + const vector< string > inWords = util::tokenize( inBufferString.c_str() ); // output tree // cerr << "BEFORE:" << endl << tree; @@ -104,7 +105,7 @@ void init(int argc, char* argv[]) } } -void store( SyntaxTree &tree, vector< string > &words ) +void store( SyntaxTree &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ); +void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp index a6c0b74db..d797b9f45 100644 --- a/phrase-extract/statistics-main.cpp +++ b/phrase-extract/statistics-main.cpp @@ -14,6 +14,7 @@ #include "AlignmentPhrase.h" #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; using namespace MosesTraining; @@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) bool PhraseAlignment::create(const char line[], int lineID ) { - vector< string > token = tokenize( line ); + const vector< string > token = util::tokenize( line ); int item = 1; PHRASE phraseF, phraseE; for (size_t j=0; j token = tokenize( line.c_str() ); + const vector token = util::tokenize( line.c_str() ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c4363a3e2..47fda1c2d 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -3,6 +3,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include #include @@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { tree_.ConnectNodes(); SyntaxNode *root = tree_.GetTop(); assert(root); - words_ = tokenize(line_.c_str()); + words_ = util::tokenize(line_.c_str()); return ConvertTree(*root, words_); } diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp index 30c1544e9..93c5041dd 100644 --- a/phrase-extract/tables-core.cpp +++ b/phrase-extract/tables-core.cpp @@ -1,5 +1,6 @@ // $Id$ //#include "beammain.h" +#include "util/tokenize.hh" #include "tables-core.h" #define TABLE_LINE_MAX_LENGTH 1000 @@ -7,29 +8,6 @@ using namespace std; -// as in beamdecoder/tables.cpp -vector tokenize( const char* input ) -{ - vector< string > token; - bool betweenWords = true; - int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); - - if (!isSpace && betweenWords) { - start = i; - betweenWords = false; - } else if (isSpace && !betweenWords) { - token.push_back( string( input+start, i-start ) ); - betweenWords = true; - } - } - if (!betweenWords) - token.push_back( string( input+start, i-start ) ); - return token; -} - namespace MosesTraining { @@ -107,7 +85,7 @@ void DTable::load( const string& fileName ) abort(); } - vector token = tokenize(line.c_str()); + const vector token = util::tokenize(line.c_str()); if (token.size() < 2) { cerr << "line " << i << " in " << fileName << " too short, skipping\n"; continue; diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h index 44545d3a0..011fe09e6 100644 --- a/phrase-extract/tables-core.h +++ b/phrase-extract/tables-core.h @@ -12,8 +12,6 @@ #include #include -extern std::vector tokenize( const char*); - namespace MosesTraining {