mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-05 15:58:03 +03:00
extended phrase lookup: print sentences, document id
This commit is contained in:
parent
ec3413f7b5
commit
6d0f482361
@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
|
||||
m_wordInSentence(NULL),
|
||||
m_sentence(NULL),
|
||||
m_sentenceLength(NULL),
|
||||
m_document(NULL),
|
||||
m_documentName(NULL),
|
||||
m_documentNameLength(0),
|
||||
m_documentCount(0),
|
||||
m_useDocument(false),
|
||||
m_vcb(),
|
||||
m_size(0),
|
||||
m_sentenceCount(0) { }
|
||||
@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
|
||||
free(m_wordInSentence);
|
||||
free(m_sentence);
|
||||
free(m_sentenceLength);
|
||||
free(m_document);
|
||||
free(m_documentName);
|
||||
}
|
||||
|
||||
void SuffixArray::Create(const string& fileName )
|
||||
@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
|
||||
textFile.open(fileName.c_str());
|
||||
|
||||
if (!textFile) {
|
||||
cerr << "no such file or directory " << fileName << endl;
|
||||
cerr << "Error: no such file or directory " << fileName << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// first pass through data: get size
|
||||
istream *fileP = &textFile;
|
||||
m_size = 0;
|
||||
m_sentenceCount = 0;
|
||||
m_documentCount = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
if (m_useDocument && ProcessDocumentLine(line,0)) continue;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
m_size += words.size() + 1;
|
||||
m_sentenceCount++;
|
||||
}
|
||||
textFile.close();
|
||||
cerr << m_size << " words (incl. sentence boundaries)" << endl;
|
||||
if (m_useDocument) {
|
||||
cerr << m_documentCount << " documents" << endl;
|
||||
if (m_documentCount == 0) {
|
||||
cerr << "Error: no documents found, aborting." << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// allocate memory
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
|
||||
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
|
||||
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
|
||||
// fill the array
|
||||
int wordIndex = 0;
|
||||
int sentenceId = 0;
|
||||
textFile.open(fileName.c_str());
|
||||
|
||||
if (!textFile) {
|
||||
cerr << "no such file or directory " << fileName << endl;
|
||||
exit(1);
|
||||
CheckAllocation(m_array != NULL, "m_array");
|
||||
CheckAllocation(m_index != NULL, "m_index");
|
||||
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
|
||||
CheckAllocation(m_sentence != NULL, "m_sentence");
|
||||
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
|
||||
if (m_useDocument) {
|
||||
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
|
||||
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
|
||||
CheckAllocation(m_document != NULL, "m_document");
|
||||
CheckAllocation(m_documentName != NULL, "m_documentName");
|
||||
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
|
||||
}
|
||||
|
||||
// second pass through data: fill the arrays
|
||||
int wordIndex = 0;
|
||||
int sentenceId = 0;
|
||||
m_documentNameLength = 0; // re-use as counter
|
||||
m_documentCount = 0; // re-use as counter
|
||||
textFile.open(fileName.c_str());
|
||||
fileP = &textFile;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
vector< WORD_ID >::const_iterator i;
|
||||
|
||||
@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
|
||||
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
|
||||
if (m_buffer == NULL) {
|
||||
cerr << "cannot allocate memory to m_buffer" << endl;
|
||||
cerr << "Error: cannot allocate memory to m_buffer" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName )
|
||||
cerr << "done sorting" << endl;
|
||||
}
|
||||
|
||||
// very specific code to deal with common crawl document ids
|
||||
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
|
||||
{
|
||||
size_t i;
|
||||
// first 32 characters are hex-hash
|
||||
for(i=0; i<32; i++) {
|
||||
if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (line[i++] != ' ') return false;
|
||||
|
||||
// second token is float
|
||||
for (; line[i] != ' ' && line[i] != 0; i++) {
|
||||
if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
|
||||
// last token is url (=name)
|
||||
size_t startName = i;
|
||||
for (; line[i] != ' ' && line[i] != 0; i++) {}
|
||||
if (line[i] == ' ') return false;
|
||||
size_t endName = i+1; // include '\0'
|
||||
|
||||
// second pass: record name and sentence number
|
||||
if (m_document != NULL) {
|
||||
m_documentName[m_documentCount] = m_documentNameLength;
|
||||
for(size_t i=startName; i<endName; i++) {
|
||||
m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
|
||||
}
|
||||
m_document[m_documentCount] = sentenceId;
|
||||
}
|
||||
m_documentNameLength += endName-startName;
|
||||
m_documentCount++;
|
||||
return true;
|
||||
}
|
||||
|
||||
// good ol' quick sort
|
||||
void SuffixArray::Sort(INDEX start, INDEX end)
|
||||
{
|
||||
@ -162,7 +228,6 @@ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
|
||||
|
||||
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
|
||||
{
|
||||
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
|
||||
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
|
||||
}
|
||||
|
||||
@ -272,13 +337,74 @@ void SuffixArray::List(INDEX start, INDEX end)
|
||||
}
|
||||
}
|
||||
|
||||
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
|
||||
{
|
||||
cout << "QUERY\t";
|
||||
for(size_t i=0; i<phrase.size(); i++) {
|
||||
if (i>0) cout << " ";
|
||||
cout << phrase[i];
|
||||
}
|
||||
cout << '\t';
|
||||
INDEX start = 0;
|
||||
INDEX end = m_size-1;
|
||||
INDEX mid = FindFirst( phrase, start, end );
|
||||
if (mid == m_size) { // no matches
|
||||
cout << "0 matches" << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
INDEX firstMatch = FindLast( phrase, mid, start, -1 );
|
||||
INDEX lastMatch = FindLast( phrase, mid, end, 1 );
|
||||
|
||||
// loop through all matches
|
||||
cout << (lastMatch-firstMatch+1) << " matches" << endl;
|
||||
for(INDEX i=firstMatch; i<=lastMatch;i++) {
|
||||
// get sentence information
|
||||
INDEX pos = GetPosition( i );
|
||||
INDEX start = pos - GetWordInSentence( pos );
|
||||
char length = GetSentenceLength( GetSentence( pos ) );
|
||||
// print document name
|
||||
if (m_useDocument) {
|
||||
INDEX sentence = GetSentence( pos );
|
||||
INDEX document = GetDocument( sentence );
|
||||
PrintDocumentName( document );
|
||||
cout << '\t';
|
||||
}
|
||||
// print sentence
|
||||
for(char i=0; i<length; i++) {
|
||||
if (i>0) cout << " ";
|
||||
cout << GetWord( start + i );
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
|
||||
SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
|
||||
{
|
||||
// binary search
|
||||
INDEX min = 0;
|
||||
INDEX max = m_documentCount-1;
|
||||
if (sentence >= m_document[max]) {
|
||||
return max;
|
||||
}
|
||||
while(true) {
|
||||
INDEX mid = (min + max) / 2;
|
||||
if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
|
||||
return mid;
|
||||
}
|
||||
if (sentence < m_document[mid]) {
|
||||
max = mid-1;
|
||||
}
|
||||
else {
|
||||
min = mid+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SuffixArray::Save(const string& fileName ) const
|
||||
{
|
||||
FILE *pFile = fopen ( fileName.c_str() , "w" );
|
||||
if (pFile == NULL) {
|
||||
cerr << "Cannot open " << fileName << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (pFile == NULL) Error("cannot open",fileName);
|
||||
|
||||
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
@ -288,6 +414,16 @@ void SuffixArray::Save(const string& fileName ) const
|
||||
|
||||
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
|
||||
char useDocument = m_useDocument; // not sure if that is needed
|
||||
fwrite( &useDocument, sizeof(char), 1, pFile );
|
||||
if (m_useDocument) {
|
||||
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
|
||||
}
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Save( fileName + ".src-vcb" );
|
||||
@ -296,56 +432,81 @@ void SuffixArray::Save(const string& fileName ) const
|
||||
void SuffixArray::Load(const string& fileName )
|
||||
{
|
||||
FILE *pFile = fopen ( fileName.c_str() , "r" );
|
||||
if (pFile == NULL) {
|
||||
cerr << "no such file or directory " << fileName << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (pFile == NULL) Error("no such file or directory", fileName);
|
||||
|
||||
cerr << "loading from " << fileName << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_size from", fileName);
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
|
||||
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
CheckAllocation(m_array != NULL, "m_array");
|
||||
CheckAllocation(m_index != NULL, "m_index");
|
||||
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
|
||||
CheckAllocation(m_sentence != NULL, "m_sentence");
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|
||||
|| Error("could not read m_array from", fileName);
|
||||
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|
||||
|| Error("could not read m_index from", fileName);
|
||||
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|
||||
|| Error("could not read m_wordInSentence from", fileName);
|
||||
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|
||||
|| Error("could not read m_sentence from", fileName);
|
||||
|
||||
if (m_array == NULL) {
|
||||
cerr << "Error: cannot allocate memory to m_array" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (m_index == NULL) {
|
||||
cerr << "Error: cannot allocate memory to m_index" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (m_wordInSentence == NULL) {
|
||||
cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (m_sentence == NULL) {
|
||||
cerr << "Error: cannot allocate memory to m_sentence" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
|
||||
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
|
||||
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_sentenceCount from", fileName);
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
|
||||
if (m_sentenceLength == NULL) {
|
||||
cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
|
||||
exit(1);
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
|
||||
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|
||||
|| Error("could not read m_sentenceLength from", fileName);
|
||||
|
||||
if (m_useDocument) { // do not read it when you do not need it
|
||||
char useDocument;
|
||||
fread( &useDocument, sizeof(char), 1, pFile )
|
||||
|| Error("could not read m_useDocument from", fileName);
|
||||
if (!useDocument) {
|
||||
cerr << "Error: stored suffix array does not have a document index\n";
|
||||
exit(1);
|
||||
}
|
||||
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_documentCount from", fileName);
|
||||
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
CheckAllocation(m_document != NULL, "m_document");
|
||||
CheckAllocation(m_documentName != NULL, "m_documentName");
|
||||
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_document from", fileName);
|
||||
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_documentName from", fileName);
|
||||
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_documentNameLength from", fileName);
|
||||
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
|
||||
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
|
||||
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|
||||
|| Error("could not read m_document from", fileName);
|
||||
}
|
||||
|
||||
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Load( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
|
||||
{
|
||||
if (check) return;
|
||||
cerr << "Error: could not allocate memory for " << dataStructure << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bool SuffixArray::Error( const char *message, const string &fileName) const
|
||||
{
|
||||
cerr << "Error: " << message << " " << fileName << endl;
|
||||
exit(1);
|
||||
return true; // yeah, i know.
|
||||
}
|
||||
|
@ -15,6 +15,12 @@ private:
|
||||
INDEX *m_sentence;
|
||||
char *m_sentenceLength;
|
||||
WORD_ID m_endOfSentence;
|
||||
INDEX *m_document;
|
||||
INDEX *m_documentName;
|
||||
char *m_documentNameBuffer;
|
||||
size_t m_documentNameLength;
|
||||
size_t m_documentCount;
|
||||
bool m_useDocument;
|
||||
Vocabulary m_vcb;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
@ -28,6 +34,7 @@ public:
|
||||
~SuffixArray();
|
||||
|
||||
void Create(const std::string& fileName );
|
||||
bool ProcessDocumentLine( const char* const, const size_t );
|
||||
void Sort(INDEX start, INDEX end);
|
||||
int CompareIndex( INDEX a, INDEX b ) const;
|
||||
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
|
||||
@ -40,6 +47,7 @@ public:
|
||||
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
|
||||
int Match( const std::vector< WORD > &phrase, INDEX index );
|
||||
void List( INDEX start, INDEX end );
|
||||
void PrintSentenceMatches( const std::vector< WORD > &phrase );
|
||||
inline INDEX GetPosition( INDEX index ) const {
|
||||
return m_index[ index ];
|
||||
}
|
||||
@ -58,6 +66,17 @@ public:
|
||||
inline WORD GetWord( INDEX position ) const {
|
||||
return m_vcb.GetWord( m_array[position] );
|
||||
}
|
||||
void UseDocument() {
|
||||
m_useDocument = true;
|
||||
}
|
||||
INDEX GetDocument( INDEX sentence ) const;
|
||||
void PrintDocumentName( INDEX document ) {
|
||||
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
|
||||
std::cout << m_documentNameBuffer[ i ];
|
||||
}
|
||||
}
|
||||
void Save(const std::string& fileName ) const;
|
||||
void Load(const std::string& fileName );
|
||||
void CheckAllocation(bool, const char *dataStructure) const;
|
||||
bool Error( const char* message, const std::string& fileName) const;
|
||||
};
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "SuffixArray.h"
|
||||
#include "../util/tokenize.hh"
|
||||
#include <getopt.h>
|
||||
|
||||
using namespace std;
|
||||
@ -13,10 +14,12 @@ int main(int argc, char* argv[])
|
||||
string query;
|
||||
string fileNameSuffix;
|
||||
string fileNameSource;
|
||||
int loadFlag = false;
|
||||
int saveFlag = false;
|
||||
int createFlag = false;
|
||||
int queryFlag = false;
|
||||
bool loadFlag = false;
|
||||
bool saveFlag = false;
|
||||
bool createFlag = false;
|
||||
bool queryFlag = false;
|
||||
bool querySentenceFlag = false;
|
||||
|
||||
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
|
||||
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
|
||||
while(1) {
|
||||
@ -25,11 +28,14 @@ int main(int argc, char* argv[])
|
||||
{"save", required_argument, 0, 's'},
|
||||
{"create", required_argument, 0, 'c'},
|
||||
{"query", required_argument, 0, 'q'},
|
||||
{"query-sentence", required_argument, 0, 'Q'},
|
||||
{"document", required_argument, 0, 'd'},
|
||||
{"stdio", no_argument, 0, 'i'},
|
||||
{"stdio-sentence", no_argument, 0, 'I'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
int option_index = 0;
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
|
||||
if (c == -1) break;
|
||||
switch (c) {
|
||||
case 'l':
|
||||
@ -48,17 +54,25 @@ int main(int argc, char* argv[])
|
||||
query = string(optarg);
|
||||
queryFlag = true;
|
||||
break;
|
||||
case 'Q':
|
||||
query = string(optarg);
|
||||
querySentenceFlag = true;
|
||||
break;
|
||||
case 'i':
|
||||
stdioFlag = true;
|
||||
break;
|
||||
case 'I':
|
||||
stdioFlag = true;
|
||||
querySentenceFlag = true;
|
||||
break;
|
||||
case 'd':
|
||||
suffixArray.UseDocument();
|
||||
break;
|
||||
default:
|
||||
cerr << info;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (stdioFlag) {
|
||||
queryFlag = true;
|
||||
}
|
||||
|
||||
// check if parameter settings are legal
|
||||
if (saveFlag && !createFlag) {
|
||||
@ -74,7 +88,7 @@ int main(int argc, char* argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// do your thing
|
||||
// get suffix array
|
||||
if (createFlag) {
|
||||
cerr << "will create\n";
|
||||
cerr << "corpus is in " << fileNameSource << endl;
|
||||
@ -88,16 +102,29 @@ int main(int argc, char* argv[])
|
||||
cerr << "will load from " << fileNameSuffix << endl;
|
||||
suffixArray.Load( fileNameSuffix );
|
||||
}
|
||||
|
||||
// do something with it
|
||||
if (stdioFlag) {
|
||||
while(true) {
|
||||
string query;
|
||||
if (getline(cin, query, '\n').eof()) {
|
||||
return 0;
|
||||
}
|
||||
cout << lookup( query ) << endl;
|
||||
if (querySentenceFlag) {
|
||||
vector< string > queryString = util::tokenize( query.c_str() );
|
||||
suffixArray.PrintSentenceMatches( queryString );
|
||||
}
|
||||
else {
|
||||
cout << lookup( query ) << endl;
|
||||
}
|
||||
}
|
||||
} else if (queryFlag) {
|
||||
}
|
||||
else if (queryFlag) {
|
||||
cout << lookup( query ) << endl;
|
||||
}
|
||||
else if (querySentenceFlag) {
|
||||
vector< string > queryString = util::tokenize( query.c_str() );
|
||||
suffixArray.PrintSentenceMatches( queryString );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -105,32 +132,6 @@ int main(int argc, char* argv[])
|
||||
size_t lookup( string query )
|
||||
{
|
||||
cerr << "query is " << query << endl;
|
||||
vector< string > queryString = tokenize( query.c_str() );
|
||||
vector< string > queryString = util::tokenize( query.c_str() );
|
||||
return suffixArray.Count( queryString );
|
||||
}
|
||||
|
||||
// Duplicate of definition in util/tokenize.hh.
|
||||
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
|
||||
// use util at all.
|
||||
vector<string> tokenize(const char input[])
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i;
|
||||
for(i = 0; input[i] != '\0'; i++) {
|
||||
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
betweenWords = false;
|
||||
} else if (isSpace && !betweenWords) {
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
betweenWords = true;
|
||||
}
|
||||
}
|
||||
if (!betweenWords)
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
return token;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user