Unify tokenize() into util, and unit-test it.

The duplicated definition works fine in environments where the inline
definition is emitted as a weak symbol in the object file, but where the
compiler emits it as a regular definition, the duplicate symbols cause
link errors.
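
Roughly, the failure mode looks like this (a minimal sketch; file names are
hypothetical and the linker message is approximate):

    // tokenize_a.cpp and tokenize_b.cpp each contain this regular
    // (non-inline) definition.  Each object file then carries a strong
    // symbol for tokenize(char const*), and linking the two together
    // fails with something like:
    //   multiple definition of `tokenize(char const*)'
    // A weak (inline/COMDAT) symbol would instead be merged silently.
    #include <string>
    #include <vector>

    std::vector<std::string> tokenize(const char input[])
    {
      return std::vector<std::string>();  // body irrelevant; the symbol clash is the point
    }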

At most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions.  In theory this may also help performance, but it's mainly for
clarity.
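
For example (a hypothetical call site), const turns accidental mutation of
the token list into a compile-time error:

    #include <cstddef>
    #include <string>
    #include <vector>

    #include "util/tokenize.hh"

    std::size_t countTokens(const std::string &line)
    {
      const std::vector<std::string> token = util::tokenize(line.c_str());
      // token.push_back("extra");  // would no longer compile: token is const
      return token.size();
    }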

The comments are based on reverse-engineering, and the unit tests are based
on the comments.  It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!
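
The behaviour the comments pin down is: split on spaces and tabs, treat runs
of whitespace as a single separator, and never produce empty tokens.  A rough
sketch of what the tests check (test names are mine, and I'm assuming the
Boost test framework used elsewhere in util):

    #include <string>
    #include <vector>

    #include "util/tokenize.hh"

    #define BOOST_TEST_MODULE TokenizeTest
    #include <boost/test/unit_test.hpp>

    BOOST_AUTO_TEST_CASE(empty_and_whitespace_only_inputs_yield_no_tokens) {
      BOOST_CHECK(util::tokenize("").empty());
      BOOST_CHECK(util::tokenize(" \t ").empty());
    }

    BOOST_AUTO_TEST_CASE(splits_on_spaces_and_tabs_collapsing_runs) {
      const std::vector<std::string> token = util::tokenize("one\ttwo   three ");
      BOOST_REQUIRE_EQUAL(token.size(), 3u);
      BOOST_CHECK_EQUAL(token[0], "one");
      BOOST_CHECK_EQUAL(token[1], "two");
      BOOST_CHECK_EQUAL(token[2], "three");
    }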

I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
Jeroen Vermeulen 2015-04-22 09:59:05 +07:00
parent c15f3ef068
commit b2d821a141
16 changed files with 41 additions and 82 deletions

View File

@@ -109,14 +109,17 @@ size_t lookup( string query )
   return suffixArray.Count( queryString );
 }
-vector<string> tokenize( const char input[] )
+// Duplicate of definition in util/tokenize.hh.
+// TODO: Can we de-duplicate this? At the time of writing biconcor does not
+// use util at all.
+vector<string> tokenize(const char input[])
 {
   vector< string > token;
   bool betweenWords = true;
   int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
+  int i;
+  for(i = 0; input[i] != '\0'; i++) {
+    const bool isSpace = (input[i] == ' ' || input[i] == '\t');
     if (!isSpace && betweenWords) {
       start = i;

View File

@@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/
 #include "util/exception.hh"
+#include "util/tokenize.hh"
 #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
 using namespace std;
@@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
   cerr << endl;
 }
-// from phrase-extract/tables-core.cpp
-inline vector<string> tokenize( const char* input )
-{
-  vector< string > token;
-  bool betweenWords = true;
-  int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
-    if (!isSpace && betweenWords) {
-      start = i;
-      betweenWords = false;
-    } else if (isSpace && !betweenWords) {
-      token.push_back( string( input+start, i-start ) );
-      betweenWords = true;
-    }
-  }
-  if (!betweenWords)
-    token.push_back( string( input+start, i-start ) );
-  return token;
-}
 namespace Moses
 {
@@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
     i++;
     if (i%100000 == 0) cerr << "." << flush;
-    vector<string> token = tokenize( line.c_str() );
+    const vector<string> token = util::tokenize( line.c_str() );
     if (token.size() != 4) {
       cerr << "line " << i << " in " << fileName
            << " has wrong number of tokens, skipping:\n"

View File

@@ -2,6 +2,7 @@
 #include "ExtractionPhrasePair.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"
 using namespace std;
@@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
   string line;
   while(getline(*fileP, line)) {
     // read
-    vector< string > domainSpecLine = tokenize( line.c_str() );
+    const vector< string > domainSpecLine = util::tokenize( line.c_str() );
     int lineNumber;
     if (domainSpecLine.size() != 2 ||
         ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
@@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
       exit(1);
     }
     // store
-    string &name = domainSpecLine[1];
+    const string &name = domainSpecLine[1];
     spec.push_back( make_pair( lineNumber, name ));
     if (name2id.find( name ) == name2id.end()) {
       name2id[ name ] = list.size();

View File

@@ -14,8 +14,6 @@
 #include "ScoreFeature.h"
-extern std::vector<std::string> tokenize( const char*);
 namespace MosesTraining
 {

View File

@@ -24,6 +24,7 @@
 #include <string>
 #include "tables-core.h"
+#include "util/tokenize.hh"
 using namespace std;
@@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
 bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
 {
-  target = tokenize(targetString);
+  target = util::tokenize(targetString);
   if (boundaryRules)
     addBoundaryWords(target);
   return true;
@@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
 bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
 {
-  source = tokenize(sourceString);
+  source = util::tokenize(sourceString);
   if (boundaryRules)
     addBoundaryWords(source);
   return true;
@@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
   }
   // reading in alignments
-  vector<string> alignmentSequence = tokenize( alignmentString );
+  vector<string> alignmentSequence = util::tokenize( alignmentString );
   for(size_t i=0; i<alignmentSequence.size(); i++) {
     int s,t;
     // cout << "scaning " << alignmentSequence[i].c_str() << endl;

View File

@@ -26,6 +26,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"
 using namespace std;
@@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
         << sentenceID << ": " << e.getMsg() << std::endl;
     return false;
   }
-  target = tokenize(targetStringCPP.c_str());
+  target = util::tokenize(targetStringCPP.c_str());
   return true;
 }
@@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
         << sentenceID << ": " << e.getMsg() << std::endl;
     return false;
   }
-  source = tokenize(sourceStringCPP.c_str());
+  source = util::tokenize(sourceStringCPP.c_str());
   return true;
 }
 } // namespace

View File

@@ -25,11 +25,10 @@
 #include <cstdlib>
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
+#include "util/tokenize.hh"
 using namespace std;
-std::vector<std::string> tokenize( const char [] );
 vector< string > splitLine(const char *line)
 {
   vector< string > item;
@@ -109,7 +108,7 @@ int main(int argc, char* argv[])
     if (! getLine(fileDirectP, itemDirect ))
       break;
-    vector< string > count = tokenize( itemDirect[4].c_str() );
+    const vector< string > count = util::tokenize( itemDirect[4].c_str() );
     float countEF = atof(count[0].c_str());
     float countF = atof(count[1].c_str());
     float prob = countF/countEF;

View File

@@ -28,6 +28,7 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"
 using namespace std;
@@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
     // counts, for debugging
-    vector<string> directCounts = tokenize(itemDirect[4].c_str());
-    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+    const vector<string> directCounts = util::tokenize(itemDirect[4].c_str());
+    const vector<string> indirectCounts = util::tokenize(itemIndirect[4].c_str());
     fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
     // output rule count if present in either file
     if (indirectCounts.size() > 1) {
@@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
 vector< string > splitLine(const char *line)
 {
   vector< string > item;
-  bool betweenWords = true;
   int start=0;
   int i=0;
   for(; line[i] != '\0'; i++) {
@@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
 {
   stringstream ret("");
-  vector<string> alignToks = tokenize(alignments.c_str());
+  const vector<string> alignToks = util::tokenize(alignments.c_str());
   for (size_t i = 0; i < alignToks.size(); ++i) {
-    string &alignPair = alignToks[i];
+    const string &alignPair = alignToks[i];
     vector<string> alignPoints;
     Tokenize(alignPoints, alignPair, "-");
     assert(alignPoints.size() == 2);

View File

@@ -23,6 +23,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"
 #include <cassert>
 #include <vector>
@@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
   m_tree.ConnectNodes();
   SyntaxNode *root = m_tree.GetTop();
   assert(root);
-  m_words = tokenize(m_line.c_str());
+  m_words = util::tokenize(m_line.c_str());
   return ConvertTree(*root, m_words);
 }

View File

@@ -25,6 +25,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"
 #include "syntax-common/exception.h"
@@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
     // There is no XML tree.
     return std::auto_ptr<PcfgTree>();
   }
-  m_words = tokenize(m_line.c_str());
+  m_words = util::tokenize(m_line.c_str());
   return ConvertTree(*root, m_words);
 }

View File

@@ -21,6 +21,7 @@
 #include "relax-parse.h"
 #include "tables-core.h"
+#include "util/tokenize.hh"
 using namespace std;
 using namespace MosesTraining;
@@ -44,7 +45,7 @@ int main(int argc, char* argv[])
     map< string, int > topLabelCollection; // count of top labels, not used
     SyntaxTree tree;
     ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
-    vector< string > inWords = tokenize( inBufferString.c_str() );
+    const vector< string > inWords = util::tokenize( inBufferString.c_str() );
     // output tree
     // cerr << "BEFORE:" << endl << tree;
@@ -104,7 +105,7 @@ void init(int argc, char* argv[])
   }
 }
-void store( SyntaxTree &tree, vector< string > &words )
+void store( SyntaxTree &tree, const vector< string > &words )
 {
   // output words
   for( size_t i=0; i<words.size(); i++ ) {

View File

@@ -39,7 +39,7 @@ char SAMTLevel = 0;
 // functions
 void init(int argc, char* argv[]);
-void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
+void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
 void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
 void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
 void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );

View File

@@ -14,6 +14,7 @@
 #include "AlignmentPhrase.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "util/tokenize.hh"
 using namespace std;
 using namespace MosesTraining;
@@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
 bool PhraseAlignment::create(const char line[], int lineID )
 {
-  vector< string > token = tokenize( line );
+  const vector< string > token = util::tokenize( line );
   int item = 1;
   PHRASE phraseF, phraseE;
   for (size_t j=0; j<token.size(); j++) {
@@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
     i++;
     if (i%100000 == 0) cerr << "." << flush;
-    vector<string> token = tokenize( line.c_str() );
+    const vector<string> token = util::tokenize( line.c_str() );
     if (token.size() != 3) {
       cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
            token.size() << " " << token[0] << " " << line << endl;

View File

@@ -3,6 +3,7 @@
 #include "tables-core.h"
 #include "XmlException.h"
 #include "XmlTree.h"
+#include "util/tokenize.hh"
 #include <cassert>
 #include <vector>
@@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
   tree_.ConnectNodes();
   SyntaxNode *root = tree_.GetTop();
   assert(root);
-  words_ = tokenize(line_.c_str());
+  words_ = util::tokenize(line_.c_str());
   return ConvertTree(*root, words_);
 }

View File

@@ -1,5 +1,6 @@
 // $Id$
 //#include "beammain.h"
+#include "util/tokenize.hh"
 #include "tables-core.h"
 #define TABLE_LINE_MAX_LENGTH 1000
@@ -7,29 +8,6 @@
 using namespace std;
-// as in beamdecoder/tables.cpp
-vector<string> tokenize( const char* input )
-{
-  vector< string > token;
-  bool betweenWords = true;
-  int start=0;
-  int i=0;
-  for(; input[i] != '\0'; i++) {
-    bool isSpace = (input[i] == ' ' || input[i] == '\t');
-    if (!isSpace && betweenWords) {
-      start = i;
-      betweenWords = false;
-    } else if (isSpace && !betweenWords) {
-      token.push_back( string( input+start, i-start ) );
-      betweenWords = true;
-    }
-  }
-  if (!betweenWords)
-    token.push_back( string( input+start, i-start ) );
-  return token;
-}
 namespace MosesTraining
 {
@@ -107,7 +85,7 @@ void DTable::load( const string& fileName )
       abort();
     }
-    vector<string> token = tokenize(line.c_str());
+    const vector<string> token = util::tokenize(line.c_str());
    if (token.size() < 2) {
      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
      continue;

View File

@@ -12,8 +12,6 @@
 #include <map>
 #include <cmath>
-extern std::vector<std::string> tokenize( const char*);
 namespace MosesTraining
 {