mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
b2d821a141
The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy.
137 lines
3.4 KiB
C++
137 lines
3.4 KiB
C++
#include "SuffixArray.h"
|
|
#include <getopt.h>
|
|
|
|
using namespace std;
|
|
|
|
// Forward declaration; defined further down in this file and called from main().
size_t lookup( string );
|
|
// Forward declaration; defined at the bottom of this file and used by lookup().
vector<string> tokenize( const char input[] );
|
|
// File-wide suffix array instance: main() creates/loads/saves it and
// lookup() queries it.
SuffixArray suffixArray;
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
// handle parameters
|
|
string query;
|
|
string fileNameSuffix;
|
|
string fileNameSource;
|
|
int loadFlag = false;
|
|
int saveFlag = false;
|
|
int createFlag = false;
|
|
int queryFlag = false;
|
|
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
|
|
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
|
|
while(1) {
|
|
static struct option long_options[] = {
|
|
{"load", required_argument, 0, 'l'},
|
|
{"save", required_argument, 0, 's'},
|
|
{"create", required_argument, 0, 'c'},
|
|
{"query", required_argument, 0, 'q'},
|
|
{"stdio", no_argument, 0, 'i'},
|
|
{0, 0, 0, 0}
|
|
};
|
|
int option_index = 0;
|
|
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
|
|
if (c == -1) break;
|
|
switch (c) {
|
|
case 'l':
|
|
fileNameSuffix = string(optarg);
|
|
loadFlag = true;
|
|
break;
|
|
case 's':
|
|
fileNameSuffix = string(optarg);
|
|
saveFlag = true;
|
|
break;
|
|
case 'c':
|
|
fileNameSource = string(optarg);
|
|
createFlag = true;
|
|
break;
|
|
case 'q':
|
|
query = string(optarg);
|
|
queryFlag = true;
|
|
break;
|
|
case 'i':
|
|
stdioFlag = true;
|
|
break;
|
|
default:
|
|
cerr << info;
|
|
exit(1);
|
|
}
|
|
}
|
|
if (stdioFlag) {
|
|
queryFlag = true;
|
|
}
|
|
|
|
// check if parameter settings are legal
|
|
if (saveFlag && !createFlag) {
|
|
cerr << "error: cannot save without creating\n" << info;
|
|
exit(1);
|
|
}
|
|
if (saveFlag && loadFlag) {
|
|
cerr << "error: cannot load and save at the same time\n" << info;
|
|
exit(1);
|
|
}
|
|
if (!loadFlag && !createFlag) {
|
|
cerr << "error: neither load or create - i have no info!\n" << info;
|
|
exit(1);
|
|
}
|
|
|
|
// do your thing
|
|
if (createFlag) {
|
|
cerr << "will create\n";
|
|
cerr << "corpus is in " << fileNameSource << endl;
|
|
suffixArray.Create( fileNameSource );
|
|
if (saveFlag) {
|
|
suffixArray.Save( fileNameSuffix );
|
|
cerr << "will save in " << fileNameSuffix << endl;
|
|
}
|
|
}
|
|
if (loadFlag) {
|
|
cerr << "will load from " << fileNameSuffix << endl;
|
|
suffixArray.Load( fileNameSuffix );
|
|
}
|
|
if (stdioFlag) {
|
|
while(true) {
|
|
string query;
|
|
if (getline(cin, query, '\n').eof()) {
|
|
return 0;
|
|
}
|
|
cout << lookup( query ) << endl;
|
|
}
|
|
} else if (queryFlag) {
|
|
cout << lookup( query ) << endl;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
size_t lookup( string query )
|
|
{
|
|
cerr << "query is " << query << endl;
|
|
vector< string > queryString = tokenize( query.c_str() );
|
|
return suffixArray.Count( queryString );
|
|
}
|
|
|
|
// Split a NUL-terminated string into words.
//
// Only the space and tab characters act as separators.  Consecutive
// separators are treated as one, and leading or trailing separators never
// yield empty tokens.  Any other character (including '\n') is part of a
// word.
//
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this?  At the time of writing biconcor does not
// use util at all.
std::vector<std::string> tokenize(const char input[])
{
  std::vector<std::string> words;
  const char *cursor = input;

  while (true) {
    // Skip the separators in front of the next word (if any).
    while (*cursor == ' ' || *cursor == '\t') {
      ++cursor;
    }
    if (*cursor == '\0') {
      break; // nothing but separators was left
    }

    // Advance to the first separator or the terminator after the word.
    const char *wordBegin = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
      ++cursor;
    }
    words.push_back( std::string( wordBegin, cursor ) );
  }

  return words;
}
|
|
|