|
|
|
#include "SuffixArray.h"
|
|
|
|
#include <getopt.h>
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
size_t lookup( string );
|
|
|
|
vector<string> tokenize( const char input[] );
|
|
|
|
SuffixArray suffixArray;
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
int main(int argc, char* argv[])
|
|
|
|
{
|
2014-09-21 09:03:51 +04:00
|
|
|
// handle parameters
|
|
|
|
string query;
|
|
|
|
string fileNameSuffix;
|
|
|
|
string fileNameSource;
|
|
|
|
int loadFlag = false;
|
|
|
|
int saveFlag = false;
|
|
|
|
int createFlag = false;
|
|
|
|
int queryFlag = false;
|
|
|
|
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
|
|
|
|
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
|
|
|
|
while(1) {
|
|
|
|
static struct option long_options[] = {
|
|
|
|
{"load", required_argument, 0, 'l'},
|
|
|
|
{"save", required_argument, 0, 's'},
|
|
|
|
{"create", required_argument, 0, 'c'},
|
|
|
|
{"query", required_argument, 0, 'q'},
|
|
|
|
{"stdio", no_argument, 0, 'i'},
|
|
|
|
{0, 0, 0, 0}
|
|
|
|
};
|
|
|
|
int option_index = 0;
|
|
|
|
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
|
|
|
|
if (c == -1) break;
|
|
|
|
switch (c) {
|
|
|
|
case 'l':
|
|
|
|
fileNameSuffix = string(optarg);
|
|
|
|
loadFlag = true;
|
|
|
|
break;
|
|
|
|
case 's':
|
|
|
|
fileNameSuffix = string(optarg);
|
|
|
|
saveFlag = true;
|
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
fileNameSource = string(optarg);
|
|
|
|
createFlag = true;
|
|
|
|
break;
|
|
|
|
case 'q':
|
|
|
|
query = string(optarg);
|
|
|
|
queryFlag = true;
|
|
|
|
break;
|
|
|
|
case 'i':
|
|
|
|
stdioFlag = true;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
cerr << info;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (stdioFlag) {
|
|
|
|
queryFlag = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if parameter settings are legal
|
|
|
|
if (saveFlag && !createFlag) {
|
|
|
|
cerr << "error: cannot save without creating\n" << info;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (saveFlag && loadFlag) {
|
|
|
|
cerr << "error: cannot load and save at the same time\n" << info;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (!loadFlag && !createFlag) {
|
|
|
|
cerr << "error: neither load or create - i have no info!\n" << info;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// do your thing
|
|
|
|
if (createFlag) {
|
|
|
|
cerr << "will create\n";
|
|
|
|
cerr << "corpus is in " << fileNameSource << endl;
|
|
|
|
suffixArray.Create( fileNameSource );
|
|
|
|
if (saveFlag) {
|
|
|
|
suffixArray.Save( fileNameSuffix );
|
|
|
|
cerr << "will save in " << fileNameSuffix << endl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (loadFlag) {
|
|
|
|
cerr << "will load from " << fileNameSuffix << endl;
|
|
|
|
suffixArray.Load( fileNameSuffix );
|
|
|
|
}
|
|
|
|
if (stdioFlag) {
|
|
|
|
while(true) {
|
|
|
|
string query;
|
|
|
|
if (getline(cin, query, '\n').eof()) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
cout << lookup( query ) << endl;
|
|
|
|
}
|
2015-01-14 14:07:42 +03:00
|
|
|
} else if (queryFlag) {
|
2014-09-21 09:03:51 +04:00
|
|
|
cout << lookup( query ) << endl;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-01-14 14:07:42 +03:00
|
|
|
size_t lookup( string query )
|
|
|
|
{
|
2014-09-21 09:03:51 +04:00
|
|
|
cerr << "query is " << query << endl;
|
|
|
|
vector< string > queryString = tokenize( query.c_str() );
|
|
|
|
return suffixArray.Count( queryString );
|
|
|
|
}
|
|
|
|
|
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
//
// Split the NUL-terminated string on runs of spaces and tabs (only ' '
// and '\t' are delimiters — other whitespace such as '\n' is kept inside
// tokens). Leading/trailing delimiters produce no empty tokens.
vector<string> tokenize(const char input[])
{
  static const string delims = " \t";
  const string text(input);
  vector<string> words;
  string::size_type begin = text.find_first_not_of(delims);
  while (begin != string::npos) {
    const string::size_type end = text.find_first_of(delims, begin);
    if (end == string::npos) {
      // last token runs to the end of the input
      words.push_back(text.substr(begin));
      break;
    }
    words.push_back(text.substr(begin, end - begin));
    begin = text.find_first_not_of(delims, end);
  }
  return words;
}
|
|
|
|
|