mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-17 23:40:50 +03:00
Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy.
This commit is contained in:
parent
c15f3ef068
commit
b2d821a141
@ -109,14 +109,17 @@ size_t lookup( string query )
|
||||
return suffixArray.Count( queryString );
|
||||
}
|
||||
|
||||
// Duplicate of definition in util/tokenize.hh.
|
||||
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
|
||||
// use util at all.
|
||||
vector<string> tokenize(const char input[])
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
int i;
|
||||
for(i = 0; input[i] != '\0'; i++) {
|
||||
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
|
@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include "util/exception.hh"
|
||||
#include "util/tokenize.hh"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
|
||||
using namespace std;
|
||||
@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
// from phrase-extract/tables-core.cpp
|
||||
// Split a NUL-terminated C string into whitespace-delimited tokens.
// Only ' ' and '\t' act as separators; runs of separators yield no empty
// tokens, and leading/trailing separators are ignored.
// Duplicate of the definition in util/tokenize.hh (biconcor/tables-core
// cannot depend on util at the time of writing).
inline vector<string> tokenize( const char* input )
{
  vector<string> words;
  bool inWord = false;   // true while scanning the interior of a token
  int wordStart = 0;     // index where the current token began
  int pos = 0;
  for (; input[pos] != '\0'; ++pos) {
    const bool whitespace = (input[pos] == ' ' || input[pos] == '\t');
    if (inWord && whitespace) {
      // Token just ended: emit [wordStart, pos).
      words.push_back( string( input + wordStart, pos - wordStart ) );
      inWord = false;
    } else if (!inWord && !whitespace) {
      // First character of a new token.
      wordStart = pos;
      inWord = true;
    }
  }
  // Flush a token that runs to the end of the input.
  if (inWord)
    words.push_back( string( input + wordStart, pos - wordStart ) );
  return words;
}
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line.c_str() );
|
||||
if (token.size() != 4) {
|
||||
cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:\n"
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
string line;
|
||||
while(getline(*fileP, line)) {
|
||||
// read
|
||||
vector< string > domainSpecLine = tokenize( line.c_str() );
|
||||
const vector< string > domainSpecLine = util::tokenize( line.c_str() );
|
||||
int lineNumber;
|
||||
if (domainSpecLine.size() != 2 ||
|
||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||
@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
exit(1);
|
||||
}
|
||||
// store
|
||||
string &name = domainSpecLine[1];
|
||||
const string &name = domainSpecLine[1];
|
||||
spec.push_back( make_pair( lineNumber, name ));
|
||||
if (name2id.find( name ) == name2id.end()) {
|
||||
name2id[ name ] = list.size();
|
||||
|
@ -14,8 +14,6 @@
|
||||
|
||||
#include "ScoreFeature.h"
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
|
||||
|
||||
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
|
||||
{
|
||||
target = tokenize(targetString);
|
||||
target = util::tokenize(targetString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(target);
|
||||
return true;
|
||||
@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
|
||||
|
||||
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
|
||||
{
|
||||
source = tokenize(sourceString);
|
||||
source = util::tokenize(sourceString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(source);
|
||||
return true;
|
||||
@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
|
||||
}
|
||||
|
||||
// reading in alignments
|
||||
vector<string> alignmentSequence = tokenize( alignmentString );
|
||||
vector<string> alignmentSequence = util::tokenize( alignmentString );
|
||||
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
||||
int s,t;
|
||||
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
target = tokenize(targetStringCPP.c_str());
|
||||
target = util::tokenize(targetStringCPP.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
source = tokenize(sourceStringCPP.c_str());
|
||||
source = util::tokenize(sourceStringCPP.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
||||
|
@ -25,11 +25,10 @@
|
||||
#include <cstdlib>
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
@ -109,7 +108,7 @@ int main(int argc, char* argv[])
|
||||
if (! getLine(fileDirectP, itemDirect ))
|
||||
break;
|
||||
|
||||
vector< string > count = tokenize( itemDirect[4].c_str() );
|
||||
const vector< string > count = util::tokenize( itemDirect[4].c_str() );
|
||||
float countEF = atof(count[0].c_str());
|
||||
float countF = atof(count[1].c_str());
|
||||
float prob = countF/countEF;
|
||||
|
@ -28,6 +28,7 @@
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
||||
|
||||
// counts, for debugging
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
const vector<string> directCounts = util::tokenize(itemDirect[4].c_str());
|
||||
const vector<string> indirectCounts = util::tokenize(itemIndirect[4].c_str());
|
||||
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
||||
// output rule count if present in either file
|
||||
if (indirectCounts.size() > 1) {
|
||||
@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; line[i] != '\0'; i++) {
|
||||
@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
|
||||
{
|
||||
stringstream ret("");
|
||||
|
||||
vector<string> alignToks = tokenize(alignments.c_str());
|
||||
const vector<string> alignToks = util::tokenize(alignments.c_str());
|
||||
|
||||
for (size_t i = 0; i < alignToks.size(); ++i) {
|
||||
string &alignPair = alignToks[i];
|
||||
const string &alignPair = alignToks[i];
|
||||
vector<string> alignPoints;
|
||||
Tokenize(alignPoints, alignPair, "-");
|
||||
assert(alignPoints.size() == 2);
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
|
||||
m_tree.ConnectNodes();
|
||||
SyntaxNode *root = m_tree.GetTop();
|
||||
assert(root);
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line.c_str());
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
|
||||
@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
||||
// There is no XML tree.
|
||||
return std::auto_ptr<PcfgTree>();
|
||||
}
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line.c_str());
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
#include "relax-parse.h"
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -44,7 +45,7 @@ int main(int argc, char* argv[])
|
||||
map< string, int > topLabelCollection; // count of top labels, not used
|
||||
SyntaxTree tree;
|
||||
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
|
||||
vector< string > inWords = tokenize( inBufferString.c_str() );
|
||||
const vector< string > inWords = util::tokenize( inBufferString.c_str() );
|
||||
|
||||
// output tree
|
||||
// cerr << "BEFORE:" << endl << tree;
|
||||
@ -104,7 +105,7 @@ void init(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
void store( SyntaxTree &tree, vector< string > &words )
|
||||
void store( SyntaxTree &tree, const vector< string > &words )
|
||||
{
|
||||
// output words
|
||||
for( size_t i=0; i<words.size(); i++ ) {
|
||||
|
@ -39,7 +39,7 @@ char SAMTLevel = 0;
|
||||
|
||||
// functions
|
||||
void init(int argc, char* argv[]);
|
||||
void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
|
||||
void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
|
||||
void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "AlignmentPhrase.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
|
||||
|
||||
bool PhraseAlignment::create(const char line[], int lineID )
|
||||
{
|
||||
vector< string > token = tokenize( line );
|
||||
const vector< string > token = util::tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
for (size_t j=0; j<token.size(); j++) {
|
||||
@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line.c_str() );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
|
||||
tree_.ConnectNodes();
|
||||
SyntaxNode *root = tree_.GetTop();
|
||||
assert(root);
|
||||
words_ = tokenize(line_.c_str());
|
||||
words_ = util::tokenize(line_.c_str());
|
||||
return ConvertTree(*root, words_);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
// $Id$
|
||||
//#include "beammain.h"
|
||||
#include "util/tokenize.hh"
|
||||
#include "tables-core.h"
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
@ -7,29 +8,6 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
// Tokenize a NUL-terminated C string on spaces and tabs.
// Consecutive separators produce no empty tokens; leading and trailing
// separators are discarded. (Same contract as beamdecoder/tables.cpp.)
vector<string> tokenize( const char* input )
{
  const string text( input );
  vector<string> tokens;
  string::size_type begin = 0;
  while (begin < text.size()) {
    // Skip over any run of separator characters.
    if (text[begin] == ' ' || text[begin] == '\t') {
      ++begin;
      continue;
    }
    // Scan to the end of the current token.
    string::size_type end = begin;
    while (end < text.size() && text[end] != ' ' && text[end] != '\t')
      ++end;
    tokens.push_back( text.substr( begin, end - begin ) );
    begin = end;
  }
  return tokens;
}
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
@ -107,7 +85,7 @@ void DTable::load( const string& fileName )
|
||||
abort();
|
||||
}
|
||||
|
||||
vector<string> token = tokenize(line.c_str());
|
||||
const vector<string> token = util::tokenize(line.c_str());
|
||||
if (token.size() < 2) {
|
||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||
continue;
|
||||
|
@ -12,8 +12,6 @@
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user