mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-17 23:40:50 +03:00
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy.
This commit is contained in:
parent
c15f3ef068
commit
b2d821a141
@ -109,14 +109,17 @@ size_t lookup( string query )
|
||||
return suffixArray.Count( queryString );
|
||||
}
|
||||
|
||||
// Duplicate of definition in util/tokenize.hh.
|
||||
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
|
||||
// use util at all.
|
||||
vector<string> tokenize(const char input[])
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
int i;
|
||||
for(i = 0; input[i] != '\0'; i++) {
|
||||
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
|
@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include "util/exception.hh"
|
||||
#include "util/tokenize.hh"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
|
||||
using namespace std;
|
||||
@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
// from phrase-extract/tables-core.cpp
|
||||
// Split a NUL-terminated C string into whitespace-delimited tokens.
// Only ' ' and '\t' act as separators; runs of separators yield no empty
// tokens, and leading/trailing separators are ignored.
// Duplicate of the definition in util/tokenize.hh (biconcor/tables-core
// cannot depend on util at the time of writing).
inline vector<string> tokenize( const char* input )
{
  vector<string> words;
  bool inWord = false;   // true while scanning the interior of a token
  int wordStart = 0;     // index where the current token began
  int pos = 0;
  for (; input[pos] != '\0'; ++pos) {
    const bool whitespace = (input[pos] == ' ' || input[pos] == '\t');
    if (inWord && whitespace) {
      // Token just ended: emit [wordStart, pos).
      words.push_back( string( input + wordStart, pos - wordStart ) );
      inWord = false;
    } else if (!inWord && !whitespace) {
      // First character of a new token.
      wordStart = pos;
      inWord = true;
    }
  }
  // Flush a token that runs to the end of the input.
  if (inWord)
    words.push_back( string( input + wordStart, pos - wordStart ) );
  return words;
}
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line.c_str() );
|
||||
if (token.size() != 4) {
|
||||
cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:\n"
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
string line;
|
||||
while(getline(*fileP, line)) {
|
||||
// read
|
||||
vector< string > domainSpecLine = tokenize( line.c_str() );
|
||||
const vector< string > domainSpecLine = util::tokenize( line.c_str() );
|
||||
int lineNumber;
|
||||
if (domainSpecLine.size() != 2 ||
|
||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||
@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
exit(1);
|
||||
}
|
||||
// store
|
||||
string &name = domainSpecLine[1];
|
||||
const string &name = domainSpecLine[1];
|
||||
spec.push_back( make_pair( lineNumber, name ));
|
||||
if (name2id.find( name ) == name2id.end()) {
|
||||
name2id[ name ] = list.size();
|
||||
|
@ -14,8 +14,6 @@
|
||||
|
||||
#include "ScoreFeature.h"
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
|
||||
|
||||
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
|
||||
{
|
||||
target = tokenize(targetString);
|
||||
target = util::tokenize(targetString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(target);
|
||||
return true;
|
||||
@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
|
||||
|
||||
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
|
||||
{
|
||||
source = tokenize(sourceString);
|
||||
source = util::tokenize(sourceString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(source);
|
||||
return true;
|
||||
@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
|
||||
}
|
||||
|
||||
// reading in alignments
|
||||
vector<string> alignmentSequence = tokenize( alignmentString );
|
||||
vector<string> alignmentSequence = util::tokenize( alignmentString );
|
||||
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
||||
int s,t;
|
||||
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
target = tokenize(targetStringCPP.c_str());
|
||||
target = util::tokenize(targetStringCPP.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
source = tokenize(sourceStringCPP.c_str());
|
||||
source = util::tokenize(sourceStringCPP.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
||||
|
@ -25,11 +25,10 @@
|
||||
#include <cstdlib>
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
@ -109,7 +108,7 @@ int main(int argc, char* argv[])
|
||||
if (! getLine(fileDirectP, itemDirect ))
|
||||
break;
|
||||
|
||||
vector< string > count = tokenize( itemDirect[4].c_str() );
|
||||
const vector< string > count = util::tokenize( itemDirect[4].c_str() );
|
||||
float countEF = atof(count[0].c_str());
|
||||
float countF = atof(count[1].c_str());
|
||||
float prob = countF/countEF;
|
||||
|
@ -28,6 +28,7 @@
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
||||
|
||||
// counts, for debugging
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
const vector<string> directCounts = util::tokenize(itemDirect[4].c_str());
|
||||
const vector<string> indirectCounts = util::tokenize(itemIndirect[4].c_str());
|
||||
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
||||
// output rule count if present in either file
|
||||
if (indirectCounts.size() > 1) {
|
||||
@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; line[i] != '\0'; i++) {
|
||||
@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
|
||||
{
|
||||
stringstream ret("");
|
||||
|
||||
vector<string> alignToks = tokenize(alignments.c_str());
|
||||
const vector<string> alignToks = util::tokenize(alignments.c_str());
|
||||
|
||||
for (size_t i = 0; i < alignToks.size(); ++i) {
|
||||
string &alignPair = alignToks[i];
|
||||
const string &alignPair = alignToks[i];
|
||||
vector<string> alignPoints;
|
||||
Tokenize(alignPoints, alignPair, "-");
|
||||
assert(alignPoints.size() == 2);
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
|
||||
m_tree.ConnectNodes();
|
||||
SyntaxNode *root = m_tree.GetTop();
|
||||
assert(root);
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line.c_str());
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
|
||||
@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
||||
// There is no XML tree.
|
||||
return std::auto_ptr<PcfgTree>();
|
||||
}
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line.c_str());
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
#include "relax-parse.h"
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -44,7 +45,7 @@ int main(int argc, char* argv[])
|
||||
map< string, int > topLabelCollection; // count of top labels, not used
|
||||
SyntaxTree tree;
|
||||
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
|
||||
vector< string > inWords = tokenize( inBufferString.c_str() );
|
||||
const vector< string > inWords = util::tokenize( inBufferString.c_str() );
|
||||
|
||||
// output tree
|
||||
// cerr << "BEFORE:" << endl << tree;
|
||||
@ -104,7 +105,7 @@ void init(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
void store( SyntaxTree &tree, vector< string > &words )
|
||||
void store( SyntaxTree &tree, const vector< string > &words )
|
||||
{
|
||||
// output words
|
||||
for( size_t i=0; i<words.size(); i++ ) {
|
||||
|
@ -39,7 +39,7 @@ char SAMTLevel = 0;
|
||||
|
||||
// functions
|
||||
void init(int argc, char* argv[]);
|
||||
void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
|
||||
void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
|
||||
void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "AlignmentPhrase.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
|
||||
|
||||
bool PhraseAlignment::create(const char line[], int lineID )
|
||||
{
|
||||
vector< string > token = tokenize( line );
|
||||
const vector< string > token = util::tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
for (size_t j=0; j<token.size(); j++) {
|
||||
@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line.c_str() );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
|
||||
tree_.ConnectNodes();
|
||||
SyntaxNode *root = tree_.GetTop();
|
||||
assert(root);
|
||||
words_ = tokenize(line_.c_str());
|
||||
words_ = util::tokenize(line_.c_str());
|
||||
return ConvertTree(*root, words_);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
// $Id$
|
||||
//#include "beammain.h"
|
||||
#include "util/tokenize.hh"
|
||||
#include "tables-core.h"
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
@ -7,29 +8,6 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
// Tokenize a NUL-terminated C string on spaces and tabs.
// Consecutive separators produce no empty tokens; leading and trailing
// separators are discarded. (Same contract as beamdecoder/tables.cpp.)
vector<string> tokenize( const char* input )
{
  const string text( input );
  vector<string> tokens;
  string::size_type begin = 0;
  while (begin < text.size()) {
    // Skip over any run of separator characters.
    if (text[begin] == ' ' || text[begin] == '\t') {
      ++begin;
      continue;
    }
    // Scan to the end of the current token.
    string::size_type end = begin;
    while (end < text.size() && text[end] != ' ' && text[end] != '\t')
      ++end;
    tokens.push_back( text.substr( begin, end - begin ) );
    begin = end;
  }
  return tokens;
}
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
@ -107,7 +85,7 @@ void DTable::load( const string& fileName )
|
||||
abort();
|
||||
}
|
||||
|
||||
vector<string> token = tokenize(line.c_str());
|
||||
const vector<string> token = util::tokenize(line.c_str());
|
||||
if (token.size() < 2) {
|
||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||
continue;
|
||||
|
@ -12,8 +12,6 @@
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user