extended phrase lookup: print sentences, document id

This commit is contained in:
Philipp Koehn 2015-07-20 11:41:48 -04:00
parent ec3413f7b5
commit 6d0f482361
3 changed files with 271 additions and 90 deletions

View File

@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
m_wordInSentence(NULL),
m_sentence(NULL),
m_sentenceLength(NULL),
m_document(NULL),
m_documentName(NULL),
m_documentNameLength(0),
m_documentCount(0),
m_useDocument(false),
m_vcb(),
m_size(0),
m_sentenceCount(0) { }
@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
free(m_wordInSentence);
free(m_sentence);
free(m_sentenceLength);
free(m_document);
free(m_documentName);
}
void SuffixArray::Create(const string& fileName )
@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
cerr << "Error: no such file or directory " << fileName << endl;
exit(1);
}
// first pass through data: get size
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
m_documentCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,0)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1;
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
if (m_useDocument) {
cerr << m_documentCount << " documents" << endl;
if (m_documentCount == 0) {
cerr << "Error: no documents found, aborting." << endl;
exit(1);
}
}
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
if (m_useDocument) {
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
}
// second pass through data: fill the arrays
int wordIndex = 0;
int sentenceId = 0;
m_documentNameLength = 0; // re-use as counter
m_documentCount = 0; // re-use as counter
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
if (m_buffer == NULL) {
cerr << "cannot allocate memory to m_buffer" << endl;
cerr << "Error: cannot allocate memory to m_buffer" << endl;
exit(1);
}
@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName )
cerr << "done sorting" << endl;
}
// very specific code to deal with common crawl document ids
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
{
size_t i;
// first 32 characters are hex-hash
for(i=0; i<32; i++) {
if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
return false;
}
}
if (line[i++] != ' ') return false;
// second token is float
for (; line[i] != ' ' && line[i] != 0; i++) {
if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
return false;
}
}
i++;
// last token is url (=name)
size_t startName = i;
for (; line[i] != ' ' && line[i] != 0; i++) {}
if (line[i] == ' ') return false;
size_t endName = i+1; // include '\0'
// second pass: record name and sentence number
if (m_document != NULL) {
m_documentName[m_documentCount] = m_documentNameLength;
for(size_t i=startName; i<endName; i++) {
m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
}
m_document[m_documentCount] = sentenceId;
}
m_documentNameLength += endName-startName;
m_documentCount++;
return true;
}
// good ol' quick sort
void SuffixArray::Sort(INDEX start, INDEX end)
{
@ -162,7 +228,6 @@ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
// Compare two vocabulary ids by their surface strings.
// Delegates to compare(): negative if a < b, zero if equal, positive if a > b.
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
  const WORD &wordA = m_vcb.GetWord( a );
  const WORD &wordB = m_vcb.GetWord( b );
  return wordA.compare( wordB );
}
@ -272,13 +337,74 @@ void SuffixArray::List(INDEX start, INDEX end)
}
}
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
{
cout << "QUERY\t";
for(size_t i=0; i<phrase.size(); i++) {
if (i>0) cout << " ";
cout << phrase[i];
}
cout << '\t';
INDEX start = 0;
INDEX end = m_size-1;
INDEX mid = FindFirst( phrase, start, end );
if (mid == m_size) { // no matches
cout << "0 matches" << endl;
return;
}
INDEX firstMatch = FindLast( phrase, mid, start, -1 );
INDEX lastMatch = FindLast( phrase, mid, end, 1 );
// loop through all matches
cout << (lastMatch-firstMatch+1) << " matches" << endl;
for(INDEX i=firstMatch; i<=lastMatch;i++) {
// get sentence information
INDEX pos = GetPosition( i );
INDEX start = pos - GetWordInSentence( pos );
char length = GetSentenceLength( GetSentence( pos ) );
// print document name
if (m_useDocument) {
INDEX sentence = GetSentence( pos );
INDEX document = GetDocument( sentence );
PrintDocumentName( document );
cout << '\t';
}
// print sentence
for(char i=0; i<length; i++) {
if (i>0) cout << " ";
cout << GetWord( start + i );
}
cout << endl;
}
}
// Map a sentence id to the index of the document that contains it.
// m_document[d] holds the id of the first sentence of document d (recorded
// in ProcessDocumentLine), so this searches for the last entry <= sentence.
SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
{
// binary search
INDEX min = 0;
INDEX max = m_documentCount-1;
// sentence at or beyond the start of the last document: trivially the last
if (sentence >= m_document[max]) {
return max;
}
while(true) {
INDEX mid = (min + max) / 2;
// found: sentence lies between the start of document mid and of mid+1
if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
return mid;
}
if (sentence < m_document[mid]) {
// NOTE(review): if INDEX is unsigned and sentence < m_document[0],
// 'mid-1' can wrap around at mid==0 -- confirm callers only pass
// sentence ids that were actually indexed under a document
max = mid-1;
}
else {
min = mid+1;
}
}
}
void SuffixArray::Save(const string& fileName ) const
{
FILE *pFile = fopen ( fileName.c_str() , "w" );
if (pFile == NULL) {
cerr << "Cannot open " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("cannot open",fileName);
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
@ -288,6 +414,16 @@ void SuffixArray::Save(const string& fileName ) const
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
char useDocument = m_useDocument; // not sure if that is needed
fwrite( &useDocument, sizeof(char), 1, pFile );
if (m_useDocument) {
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
}
fclose( pFile );
m_vcb.Save( fileName + ".src-vcb" );
@ -296,56 +432,81 @@ void SuffixArray::Save(const string& fileName ) const
void SuffixArray::Load(const string& fileName )
{
FILE *pFile = fopen ( fileName.c_str() , "r" );
if (pFile == NULL) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("no such file or directory", fileName);
cerr << "loading from " << fileName << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
fread( &m_size, sizeof(INDEX), 1, pFile )
|| Error("could not read m_size from", fileName);
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|| Error("could not read m_array from", fileName);
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|| Error("could not read m_index from", fileName);
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|| Error("could not read m_wordInSentence from", fileName);
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|| Error("could not read m_sentence from", fileName);
if (m_array == NULL) {
cerr << "Error: cannot allocate memory to m_array" << endl;
exit(1);
}
if (m_index == NULL) {
cerr << "Error: cannot allocate memory to m_index" << endl;
exit(1);
}
if (m_wordInSentence == NULL) {
cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
exit(1);
}
if (m_sentence == NULL) {
cerr << "Error: cannot allocate memory to m_sentence" << endl;
exit(1);
}
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_sentenceCount from", fileName);
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
if (m_sentenceLength == NULL) {
cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
exit(1);
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|| Error("could not read m_sentenceLength from", fileName);
if (m_useDocument) { // do not read it when you do not need it
char useDocument;
fread( &useDocument, sizeof(char), 1, pFile )
|| Error("could not read m_useDocument from", fileName);
if (!useDocument) {
cerr << "Error: stored suffix array does not have a document index\n";
exit(1);
}
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentCount from", fileName);
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_document from", fileName);
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_documentName from", fileName);
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentNameLength from", fileName);
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|| Error("could not read m_document from", fileName);
}
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Load( fileName + ".src-vcb" );
}
// Abort with a diagnostic when an allocation failed.
// 'check' is the result of a != NULL test; 'dataStructure' names the
// affected member for the error message.
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
{
  if (!check) {
    cerr << "Error: could not allocate memory for " << dataStructure << endl;
    exit(1);
  }
}
// Print a fatal error ("Error: <message> <fileName>") and terminate.
// Declared bool so calls can be chained after I/O operations, e.g.
// fread(...) || Error("could not read ...", fileName) as done in Load().
bool SuffixArray::Error( const char *message, const string &fileName) const
{
cerr << "Error: " << message << " " << fileName << endl;
exit(1);
return true; // yeah, i know. (unreachable: exit() does not return)
}

View File

@ -15,6 +15,12 @@ private:
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
INDEX *m_document;
INDEX *m_documentName;
char *m_documentNameBuffer;
size_t m_documentNameLength;
size_t m_documentCount;
bool m_useDocument;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
@ -28,6 +34,7 @@ public:
~SuffixArray();
void Create(const std::string& fileName );
bool ProcessDocumentLine( const char* const, const size_t );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
@ -40,6 +47,7 @@ public:
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const std::vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
void PrintSentenceMatches( const std::vector< WORD > &phrase );
inline INDEX GetPosition( INDEX index ) const {
return m_index[ index ];
}
@ -58,6 +66,17 @@ public:
inline WORD GetWord( INDEX position ) const {
return m_vcb.GetWord( m_array[position] );
}
// Enable the document index. Must be called before Create() (so document
// header lines are parsed) and before Load() (so the stored index is read).
void UseDocument() {
m_useDocument = true;
}
// Return the index of the document containing the given sentence id.
INDEX GetDocument( INDEX sentence ) const;
// Print a document's name (URL) to stdout. Names are stored back-to-back
// and NUL-terminated in m_documentNameBuffer; m_documentName[document] is
// the start offset of this document's name.
void PrintDocumentName( INDEX document ) {
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
std::cout << m_documentNameBuffer[ i ];
}
}
void Save(const std::string& fileName ) const;
void Load(const std::string& fileName );
void CheckAllocation(bool, const char *dataStructure) const;
bool Error( const char* message, const std::string& fileName) const;
};

View File

@ -1,4 +1,5 @@
#include "SuffixArray.h"
#include "../util/tokenize.hh"
#include <getopt.h>
using namespace std;
@ -13,10 +14,12 @@ int main(int argc, char* argv[])
string query;
string fileNameSuffix;
string fileNameSource;
int loadFlag = false;
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
bool loadFlag = false;
bool saveFlag = false;
bool createFlag = false;
bool queryFlag = false;
bool querySentenceFlag = false;
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
while(1) {
@ -25,11 +28,14 @@ int main(int argc, char* argv[])
{"save", required_argument, 0, 's'},
{"create", required_argument, 0, 'c'},
{"query", required_argument, 0, 'q'},
{"query-sentence", required_argument, 0, 'Q'},
{"document", required_argument, 0, 'd'},
{"stdio", no_argument, 0, 'i'},
{"stdio-sentence", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@ -48,17 +54,25 @@ int main(int argc, char* argv[])
query = string(optarg);
queryFlag = true;
break;
case 'Q':
query = string(optarg);
querySentenceFlag = true;
break;
case 'i':
stdioFlag = true;
break;
case 'I':
stdioFlag = true;
querySentenceFlag = true;
break;
case 'd':
suffixArray.UseDocument();
break;
default:
cerr << info;
exit(1);
}
}
if (stdioFlag) {
queryFlag = true;
}
// check if parameter settings are legal
if (saveFlag && !createFlag) {
@ -74,7 +88,7 @@ int main(int argc, char* argv[])
exit(1);
}
// do your thing
// get suffix array
if (createFlag) {
cerr << "will create\n";
cerr << "corpus is in " << fileNameSource << endl;
@ -88,16 +102,29 @@ int main(int argc, char* argv[])
cerr << "will load from " << fileNameSuffix << endl;
suffixArray.Load( fileNameSuffix );
}
// do something with it
if (stdioFlag) {
while(true) {
string query;
if (getline(cin, query, '\n').eof()) {
return 0;
}
cout << lookup( query ) << endl;
if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
}
else {
cout << lookup( query ) << endl;
}
}
} else if (queryFlag) {
}
else if (queryFlag) {
cout << lookup( query ) << endl;
}
else if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
}
return 0;
}
@ -105,32 +132,6 @@ int main(int argc, char* argv[])
size_t lookup( string query )
{
cerr << "query is " << query << endl;
vector< string > queryString = tokenize( query.c_str() );
vector< string > queryString = util::tokenize( query.c_str() );
return suffixArray.Count( queryString );
}
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
// Split the input on runs of spaces and tabs; returns the tokens in order.
// Leading/trailing whitespace yields no empty tokens.
vector<string> tokenize(const char input[])
{
  vector<string> tokens;
  int tokenStart = 0;
  bool inToken = false;
  int pos = 0;
  for (; input[pos] != '\0'; pos++) {
    const char c = input[pos];
    if (c == ' ' || c == '\t') {
      // whitespace closes any token in progress
      if (inToken) {
        tokens.push_back(string(input + tokenStart, pos - tokenStart));
        inToken = false;
      }
    } else if (!inToken) {
      // first character of a new token
      tokenStart = pos;
      inToken = true;
    }
  }
  // flush a token that runs to the end of the input
  if (inToken)
    tokens.push_back(string(input + tokenStart, pos - tokenStart));
  return tokens;
}