open-source-search-engine/Wiktionary.h
2013-08-02 13:12:24 -07:00

181 lines
4.7 KiB
C++

// Matt Wells, copyright Aug 2012
#ifndef _WIKTIONARY_H_
#define _WIKTIONARY_H_
// Bit flags attached to a word entry. Every value below is a distinct
// single bit, so several flags can be OR'ed together into one mask (the
// disabled WF_ALLPOSFLAGS at the end of this list does exactly that).
// NOTE(review): Wiktionary::addWord() declares its posFlag parameter as
// uint8_t, which cannot represent the flags at 0x0100 and above -- confirm
// how those flags are meant to be passed around.
#define WF_NOUN 0x0001
#define WF_VERB 0x0002
#define WF_PREPOSITION 0x0004
#define WF_PRONOUN 0x0008
#define WF_ADJECTIVE 0x0010
#define WF_ADVERB 0x0020
#define WF_ARTICLE 0x0040
#define WF_INTERJECTION 0x0080
#define WF_ABBREVIATION 0x0100
#define WF_INITIALISM 0x0200
#define WF_LETTER 0x0400
#define WF_MANUAL 0x0800
// disabled: union of the eight flags from WF_NOUN through WF_INTERJECTION
//#define WF_ALLPOSFLAGS (WF_NOUN|WF_VERB|WF_PREPOSITION|WF_PRONOUN|WF_ADJECTIVE|WF_ADVERB|WF_ARTICLE|WF_INTERJECTION)
#include "BigFile.h"
#include "HashTableX.h"
class Wiktionary {
public:
Wiktionary();
~Wiktionary();
void reset();
/*
uint8_t getLangId ( long long *wid ) {
long slot = m_langTable.getSlot ( wid );
if ( slot < 0 ) return langUnknown;
// amibguous?
//if ( m_langTable.getNextSlot(slot,wid) >= 0 )
// return langTranslingual;
// ok, its unique
uint8_t *data = (uint8_t *)m_langTable.getDataFromSlot(slot);
// langid is lower 8 bits i think
return *data;
};
uint8_t getPosFlags ( long long *wid , uint8_t langId ) {
long slot = m_langTable.getSlot ( wid );
if ( slot < 0 ) return 0;
// amibguous?
if ( m_langTable.getNextSlot(slot,wid) >= 0 )
return langTranslingual;
// ok, its unique
uint8_t *data = (uint8_t *)m_langTable.getDataFromSlot(slot);
// langid is lower 8 bits i think
return *data;
};
*/
// returns a line like:
// "pt|holandesa,holandeses,holandesas\n"
// or
// "en|bushmeat,bushmeats\n"
// so you can parse the word forms out and index them
// LATER we could add the Part of Speech...
char *getSynSet ( long long wid , uint8_t langId ) {
// 0? that's bad
if ( wid == 0LL ) { char *xx=NULL;*xx=0;}//return NULL;
// hash it up like we did when adding to m_tmp table
wid ^= g_hashtab[0][langId];
//wid ^= g_hashtab[1][posFlag];
long *offPtr ;
// . try local table first so it overrides
// . now this will be the one and only synset so that
// we can fix 'wells' from mapping to 'well,better,...'
// in the wikitionary-buf.txt file!
offPtr = (long *)m_localTable.getValue ( &wid );
if ( offPtr ) return m_localBuf.getBufStart() + *offPtr;
// try wiktionary table now
offPtr = (long *)m_synTable.getValue ( &wid );
// got one?
if ( offPtr ) return m_synBuf.getBufStart() + *offPtr;
// nothing!
return NULL;
//if ( ! offPtr ) return NULL;
//if ( *offPtr < 0 ) { char *xx=NULL;*xx=0; }
//return m_synBuf.getBufStart() + *offPtr;
};
char *getNextSynSet ( long long wid , uint8_t langId , char *prev ) {
// hash it up like we did when adding to m_tmp table
wid ^= g_hashtab[0][langId];
long slot;
bool gotIt = false;
// try local table BEFORE wiktionary table
slot = m_localTable.getSlot ( &wid );
for ( ; slot >= 0 ; slot =m_localTable.getNextSlot(slot,&wid)){
long *offPtr=(long *)m_localTable.getValueFromSlot(slot);
char *ptr = m_localBuf.getBufStart() + *offPtr;
// make sure our mysynonyms.txt table OVERRIDES
// the wiktionary junk, cuz we need to do that to
// fix bugs in wiktionary like for 'wells' we do not
// want mapping to 'well,better,...' so do not allow
// any after the synset in mysynonyms.txt!
if ( ptr ) return NULL;
if ( gotIt ) return ptr;
if ( ptr == prev ) gotIt = true;
}
//wid ^= g_hashtab[1][posFlag];
slot = m_synTable.getSlot ( &wid );
for ( ; slot >= 0 ; slot = m_synTable.getNextSlot(slot,&wid)){
long *offPtr = (long *)m_synTable.getValueFromSlot(slot);
char *ptr = m_synBuf.getBufStart() + *offPtr;
if ( gotIt ) return ptr;
if ( ptr == prev ) gotIt = true;
}
return NULL;
};
//WikiEntry *getWiktionaryEntry ( uint64_t wid ) {
// return m_ht.getValue ( &h ); }
// . load from disk
// . wikititles.txt (loads wikititles.dat if and date is newer)
bool load();
bool test();
bool test2();
bool generateWiktionaryTxt ();
bool generateHashTableFromWiktionaryTxt ( long fileSize );
bool addSynsets ( char *filename ) ;
bool integrateUnifiedDict ( );
// save the binary hash table to disk to make loading faster
//bool saveHashTableBinary();
bool addWord ( char *word ,
uint8_t posFlag ,
uint8_t langId ,
char *formOf ) ;
bool compile ( ) ;
HashTableX m_debugMap;
SafeBuf m_debugBuf;
SafeBuf m_localBuf;
HashTableX m_localTable;
HashTableX m_dedup;
HashTableX m_tmp;
HashTableX m_synTable;
SafeBuf m_synBuf;
SafeBuf m_langBuf;
char m_buf[5000];
BigFile m_f;
void *m_state;
void (* m_callback)(void *);
long m_txtSize;
long m_errno;
char m_opened;
FileState m_fs;
};
// global Wiktionary instance; this is only an extern declaration, so the
// object itself must be defined in another translation unit
extern class Wiktionary g_wiktionary;
#endif