// open-source-search-engine/LangList.cpp
// (605 lines, 23 KiB, C++; last change 2013-08-03 00:12:24 +04:00)
#include "gb-include.h"
#include "LangList.h"
#include "Indexdb.h"
// Global LangList instance defined by this translation unit.
LangList g_langList;
// One row of the s_tldInfo table below.
// Field order matters: the table uses positional aggregate initialisers.
struct TldInfo {
	// TLD label without the leading dot, e.g. "com" or "de"
	char          *m_tld;
	// human-readable country / purpose text returned by getCountryFromTld()
	char          *m_country;
	// comma-separated language names, or "unknown"
	char          *m_languages;
	// bit vector of languages; the table initialises it to 0xffffffff,
	// which isLangValidForTld() treats as "not computed yet"
	unsigned long  m_languagebv;
};
// Number of entries in s_tldInfo. Stays 0 until LangList::tldInit() runs,
// so the lookup functions use "== 0" as their "not initialised yet" test.
static long s_numTlds = 0;
// Per-TLD metadata: { tld, country/purpose, languages, language bit vector }.
//
// . m_languages is a comma-separated list of lower-case language names;
//   LangList::isLangValidForTld() substring-matches it against
//   getLanguageString() to build m_languagebv on first use
// . 0xffffffff in the last column means "bit vector not computed yet"
// . a row whose list names no supported language ends up with an empty
//   bit vector, which isLangValidForTld() treats as "accept any language";
//   "unknown" is used for that purpose
//
// Data corrections relative to the previous revision:
// . "do", "ng", "ro", "so" listed the wrong languages
// . "gw" and "pg" carried Guinea's list, "gs" carried Georgia's and
//   "io" carried India's
// . mis-encoded accented characters were repaired
static TldInfo s_tldInfo[] = {
	// infrastructure / generic TLDs
	{ "arpa", "Address and Routing Parameter Area", "unknown", 0xffffffff },
	{ "root", "N/A", "unknown", 0xffffffff },
	{ "aero", "air-transport industry", "unknown", 0xffffffff },
	{ "biz", "business", "unknown", 0xffffffff },
	{ "cat", "Catalan", "unknown", 0xffffffff },
	{ "com", "commercial", "unknown", 0xffffffff },
	{ "coop", "cooperatives", "unknown", 0xffffffff },
	{ "edu", "educational", "unknown", 0xffffffff },
	{ "gov", "governmental", "unknown", 0xffffffff },
	{ "info", "information", "unknown", 0xffffffff },
	{ "int", "international organizations", "unknown", 0xffffffff },
	{ "jobs", "companies", "unknown", 0xffffffff },
	{ "mil", "United States Military", "english,carolinian,chamorro,"
	  "hawaiian,samoan,spanish", 0xffffffff },
	{ "mobi", "mobile devices", "unknown", 0xffffffff },
	{ "museum", "museums", "unknown", 0xffffffff },
	{ "name", "individuals, by name", "unknown", 0xffffffff },
	{ "net", "network", "unknown", 0xffffffff },
	{ "org", "organization", "unknown", 0xffffffff },
	{ "pro", "professions", "unknown", 0xffffffff },
	{ "travel", "travel and travel-agency related sites", "unknown", 0xffffffff },
	// country-code TLDs
	{ "ac", "Ascension Island", "unknown", 0xffffffff },
	{ "ad", "Andorra", "catalan", 0xffffffff },
	{ "ae", "United Arab Emirates", "arabic", 0xffffffff },
	{ "af", "Afghanistan", "arabic,balochi,dari,nuristani,pashto,pamiri,"
	  "pashai,turkmen,uzbek", 0xffffffff },
	{ "ag", "Antigua and Barbuda", "english", 0xffffffff },
	{ "ai", "Anguilla", "unknown", 0xffffffff },
	{ "al", "Albania", "albanian", 0xffffffff },
	{ "am", "Armenia", "armenian,armenian", 0xffffffff },
	{ "an", "Netherlands Antilles", "dutch,frisian", 0xffffffff },
	{ "ao", "Angola", "portuguese", 0xffffffff },
	{ "aq", "Antarctica", "unknown", 0xffffffff },
	{ "ar", "Argentina", "spanish,guarani", 0xffffffff },
	{ "as", "American Samoa", "english,samoan", 0xffffffff },
	{ "at", "Austria", "croatian,czech,german,hungarian,slovak,"
	  "slovenian,romani", 0xffffffff },
	{ "au", "Australia", "australia", 0xffffffff },
	{ "aw", "Aruba", "unknown", 0xffffffff },
	{ "ax", "Åland", "unknown", 0xffffffff },
	{ "az", "Azerbaijan", "azerbaijani", 0xffffffff },
	{ "ba", "Bosnia and Herzegovina", "bosnian,croatian,serbian", 0xffffffff },
	{ "bb", "Barbados", "english", 0xffffffff },
	{ "bd", "Bangladesh", "bengala", 0xffffffff },
	{ "be", "Belgium", "dutch,french,german", 0xffffffff },
	{ "bf", "Burkina Faso", "french,more,jula,fula", 0xffffffff },
	{ "bg", "Bulgaria", "bulgarian", 0xffffffff },
	{ "bh", "Bahrain", "arabic", 0xffffffff },
	{ "bi", "Burundi", "french,kirundi", 0xffffffff },
	{ "bj", "Benin", "french", 0xffffffff },
	{ "bm", "Bermuda", "unknown", 0xffffffff },
	{ "bn", "Brunei Darussalam", "malay", 0xffffffff },
	{ "bo", "Bolivia", "spanish,aymara,quechua", 0xffffffff },
	{ "br", "Brazil", "portuguese", 0xffffffff },
	{ "bs", "Bahamas", "unknown", 0xffffffff },
	{ "bt", "Bhutan", "dzongkha,english", 0xffffffff },
	{ "bv", "Bouvet Island", "unknown", 0xffffffff },
	{ "bw", "Botswana", "english,kalanga,tswana", 0xffffffff },
	{ "by", "Belarus", "belarusian,russian", 0xffffffff },
	{ "bz", "Belize", "english", 0xffffffff },
	{ "ca", "Canada", "chipewyan,cree,dogrib,english,french,gwich?in,inuinnaqtun,"
	  "inuktitut,inuvialuktun,slavey", 0xffffffff },
	{ "cc", "Cocos (Keeling) Islands", "unknown", 0xffffffff },
	{ "cd", "Democratic Republic of the Congo", "french,lingala,kikongo,swahili,"
	  "tshiluba", 0xffffffff },
	{ "cf", "Central African Republic", "french,sango", 0xffffffff },
	{ "cg", "Republic of the Congo", "french,lingala,munukutuba", 0xffffffff },
	{ "ch", "Switzerland (Confoederatio Helvetica)", "french,german,italian,"
	  "romansh", 0xffffffff },
	{ "ci", "Côte d'Ivoire", "french", 0xffffffff },
	{ "ck", "Cook Islands", "unknown", 0xffffffff },
	{ "cl", "Chile", "spanish", 0xffffffff },
	{ "cm", "Cameroon", "english,french", 0xffffffff },
	{ "cn", "People's Republic of China", "cantonese,english,kazakh,korean,"
	  "mandarin,mongolian,portuguese,tajik,tibetan,uyghur,zhuang",
	  0xffffffff },
	{ "co", "Colombia", "spanish", 0xffffffff },
	{ "cr", "Costa Rica", "spanish", 0xffffffff },
	{ "cu", "Cuba", "spanish", 0xffffffff },
	{ "cv", "Cape Verde", "crioulo,portuguese", 0xffffffff },
	{ "cx", "Christmas Island", "unknown", 0xffffffff },
	{ "cy", "Cyprus", "greek,turkish", 0xffffffff },
	{ "cz", "Czech Republic", "czech", 0xffffffff },
	{ "de", "Germany (Deutschland)", "danish,frisian,german,romani,"
	  "lower sorbian,upper sorbian", 0xffffffff },
	{ "dj", "Djibouti", "arabic,french", 0xffffffff },
	{ "dk", "Denmark", "danish,faroese,kalaallisut", 0xffffffff },
	{ "dm", "Dominica", "english", 0xffffffff },
	// was "english"; the Dominican Republic is Spanish-speaking
	{ "do", "Dominican Republic", "spanish", 0xffffffff },
	{ "dz", "Algeria", "arabic,tamazight", 0xffffffff },
	{ "ec", "Ecuador", "spanish,quechua", 0xffffffff },
	{ "ee", "Estonia", "estonian", 0xffffffff },
	{ "eg", "Egypt", "arabic", 0xffffffff },
	{ "er", "Eritrea", "arabic,english,tigrinya", 0xffffffff },
	{ "es", "Spain (España)", "basque,catalan,galician,occitan,"
	  "spanish", 0xffffffff },
	{ "et", "Ethiopia", "amharic", 0xffffffff },
	{ "eu", "European Union", "unknown", 0xffffffff },
	{ "fi", "Finland", "finnish,sami,swedish", 0xffffffff },
	{ "fj", "Fiji", "english,fijian,hindustani", 0xffffffff },
	{ "fk", "Falkland Islands", "unknown", 0xffffffff },
	{ "fm", "Federated States of Micronesia", "chuuk,english,kosraean,ponapean,"
	  "ulithian,yapese", 0xffffffff },
	{ "fo", "Faroe Islands", "unknown", 0xffffffff },
	{ "fr", "France", "french,tahitian", 0xffffffff },
	{ "ga", "Gabon", "french", 0xffffffff },
	{ "gb", "United Kingdom (Great Britain)", "english,cornish,"
	  "dgèrnésiais,english,french,irish,jèrriais,"
	  "pitcairnese,scots,scottish gaelic,welsh", 0xffffffff },
	{ "gd", "Grenada", "english", 0xffffffff },
	{ "ge", "Georgia", "abkhaz,georgian,ossetic,russian", 0xffffffff },
	{ "gf", "French Guiana", "unknown", 0xffffffff },
	{ "gg", "Guernsey", "unknown", 0xffffffff },
	{ "gh", "Ghana", "adangme,dagaare,dagbani,english,ewe,ga,gonja,kasem,"
	  "nzema,twi", 0xffffffff },
	{ "gi", "Gibraltar", "unknown", 0xffffffff },
	{ "gl", "Greenland", "unknown", 0xffffffff },
	{ "gm", "The Gambia", "unknown", 0xffffffff },
	{ "gn", "Guinea", "french,fula", 0xffffffff },
	{ "gp", "Guadeloupe", "unknown", 0xffffffff },
	{ "gq", "Equatorial Guinea", "french,spanish", 0xffffffff },
	{ "gr", "Greece", "greek", 0xffffffff },
	// previously carried the language list of the country Georgia
	{ "gs", "South Georgia and the South Sandwich Islands", "unknown",
	  0xffffffff },
	{ "gt", "Guatemala", "spanish", 0xffffffff },
	{ "gu", "Guam", "unknown", 0xffffffff },
	// previously carried Guinea's list; official language is Portuguese
	{ "gw", "Guinea-Bissau", "portuguese", 0xffffffff },
	{ "gy", "Guyana", "english", 0xffffffff },
	{ "hk", "Hong Kong", "unknown", 0xffffffff },
	{ "hm", "Heard Island and McDonald Islands", "unknown", 0xffffffff },
	{ "hn", "Honduras", "spanish", 0xffffffff },
	{ "hr", "Croatia (Hrvatska)", "croatian,italian", 0xffffffff },
	{ "ht", "Haiti", "french,haitian creole", 0xffffffff },
	{ "hu", "Hungary", "hungarian", 0xffffffff },
	{ "id", "Indonesia", "balinese,javanese,indonesian,sundanese", 0xffffffff },
	{ "ie", "Ireland (Éire)", "unknown", 0xffffffff },
	{ "il", "Israel", "arabic,hebrew", 0xffffffff },
	{ "im", "Isle of Man", "unknown", 0xffffffff },
	{ "in", "India", "assamese,bengala,bodo,dogri,english,gujarati,hindi,kannada,"
	  "kashmiri,konkani,maithili,malayalam,meitei,marathi,nepali,oriya,"
	  "punjabi,sanskrit,santali,sindhi,tamil,telugu,urdu,french,karbi,"
	  "bhojpuri,magadhi,maithili,chhattisgarhi,portuguese,pahari,tulu,"
	  "garo,khasi,mizo,rajasthani,kokborok,nicobarese", 0xffffffff },
	// previously carried a copy of India's language list
	{ "io", "British Indian Ocean Territory", "unknown", 0xffffffff },
	{ "iq", "Iraq", "arabic,kurdish", 0xffffffff },
	{ "ir", "Iran", "persian", 0xffffffff },
	{ "is", "Iceland (Island)", "icelandic", 0xffffffff },
	{ "it", "Italy", "italian",
	  //"albanian,catalan,croatian,franco-provençal,french,"
	  // "friulian,german,greek,italian,ladin,occitan,sardinian,slovenian",
	  0xffffffff },
	{ "je", "Jersey", "unknown", 0xffffffff },
	{ "jm", "Jamaica", "english", 0xffffffff },
	{ "jo", "Jordan", "arabic", 0xffffffff },
	{ "jp", "Japan", "japanese", 0xffffffff },
	{ "ke", "Kenya", "english,swahili", 0xffffffff },
	{ "kg", "Kyrgyzstan", "kirghiz,russian", 0xffffffff },
	{ "kh", "Cambodia (Khmer)", "khmer", 0xffffffff },
	{ "ki", "Kiribati", "english,kiribati", 0xffffffff },
	{ "km", "Comoros", "arabic,comorian,french", 0xffffffff },
	{ "kn", "Saint Kitts and Nevis", "english", 0xffffffff },
	{ "kr", "South Korea", "korean", 0xffffffff },
	{ "kw", "Kuwait", "arabic", 0xffffffff },
	{ "ky", "Cayman Islands", "unknown", 0xffffffff },
	{ "kz", "Kazakhstan", "kazakh,russian", 0xffffffff },
	{ "la", "Laos", "lao,french", 0xffffffff },
	{ "lb", "Lebanon", "arabic", 0xffffffff },
	{ "lc", "Saint Lucia", "english", 0xffffffff },
	{ "li", "Liechtenstein", "german", 0xffffffff },
	{ "lk", "Sri Lanka", "sinhala,tamil", 0xffffffff },
	{ "lr", "Liberia", "english", 0xffffffff },
	{ "ls", "Lesotho", "english,sotho", 0xffffffff },
	{ "lt", "Lithuania", "lithuanian", 0xffffffff },
	{ "lu", "Luxembourg", "french,german,luxembourgish", 0xffffffff },
	{ "lv", "Latvia", "latvian", 0xffffffff },
	{ "ly", "Libya", "arabic", 0xffffffff },
	{ "ma", "Morocco", "arabic", 0xffffffff },
	{ "mc", "Monaco", "french", 0xffffffff },
	{ "md", "Moldova", "gagauz,moldovan,russian,ukrainian", 0xffffffff },
	{ "mg", "Madagascar", "french,malagasy", 0xffffffff },
	{ "mh", "Marshall Islands", "english,marshallese", 0xffffffff },
	{ "mk", "Republic of Macedonia", "unknown", 0xffffffff },
	{ "ml", "Mali", "french", 0xffffffff },
	{ "mm", "Myanmar", "burmese", 0xffffffff },
	{ "mn", "Mongolia", "mongolian", 0xffffffff },
	{ "mo", "Macau", "unknown", 0xffffffff },
	{ "mp", "Northern Mariana Islands", "unknown", 0xffffffff },
	{ "mq", "Martinique", "unknown", 0xffffffff },
	{ "mr", "Mauritania", "arabic,fula,soninke,wolof", 0xffffffff },
	{ "ms", "Montserrat", "unknown", 0xffffffff },
	{ "mt", "Malta", "english,maltese", 0xffffffff },
	{ "mu", "Mauritius", "english,french", 0xffffffff },
	{ "mv", "Maldives", "dhivehi", 0xffffffff },
	{ "mw", "Malawi", "chichewa,english", 0xffffffff },
	{ "mx", "Mexico", "spanish", 0xffffffff },
	{ "my", "Malaysia", "malay", 0xffffffff },
	{ "mz", "Mozambique", "portuguese", 0xffffffff },
	{ "na", "Namibia", "english", 0xffffffff },
	{ "nc", "New Caledonia", "unknown", 0xffffffff },
	{ "ne", "Niger", "french", 0xffffffff },
	{ "nf", "Norfolk Island", "unknown", 0xffffffff },
	// was "french"; Nigeria's official language is English
	{ "ng", "Nigeria", "english", 0xffffffff },
	{ "ni", "Nicaragua", "spanish", 0xffffffff },
	{ "nl", "Netherlands", "dutch,frisian", 0xffffffff },
	{ "no", "Norway", "norwegian,norwegian,sami", 0xffffffff },
	{ "np", "Nepal", "nepali", 0xffffffff },
	{ "nr", "Nauru", "english,nauruan", 0xffffffff },
	{ "nu", "Niue", "unknown", 0xffffffff },
	{ "nz", "New Zealand", "english,maori,new zealand sign language,"
	  "cook islands maori,niuean,tokelauan", 0xffffffff },
	{ "om", "Oman", "arabic", 0xffffffff },
	{ "pa", "Panama", "spanish", 0xffffffff },
	{ "pe", "Peru", "quechua,aymara,spanish", 0xffffffff },
	{ "pf", "French Polynesia", "unknown", 0xffffffff },
	// previously carried Guinea's language list
	{ "pg", "Papua New Guinea", "english,tok pisin,hiri motu", 0xffffffff },
	{ "ph", "Philippines", "arabic,bikol,cebuano,english,filipino,"
	  "hiligaynon,ilokano,kapampangan,kinaray-a,maranao,"
	  "maguindanao,pangasinan,spanish,tagalog,tausug,"
	  "waray-waray", 0xffffffff },
	{ "pk", "Pakistan", "english,urdu", 0xffffffff },
	{ "pl", "Poland", "polish", 0xffffffff },
	{ "pm", "Saint-Pierre and Miquelon", "unknown", 0xffffffff },
	{ "pn", "Pitcairn Islands", "unknown", 0xffffffff },
	{ "pr", "Puerto Rico", "unknown", 0xffffffff },
	{ "ps", "Palestinian territories", "unknown", 0xffffffff },
	{ "pt", "Portugal", "portuguese,mirandese", 0xffffffff },
	{ "pw", "Palau", "english,palauan,japanese", 0xffffffff },
	{ "py", "Paraguay", "guaraní,spanish", 0xffffffff },
	{ "qa", "Qatar", "arabic", 0xffffffff },
	{ "re", "Réunion", "unknown", 0xffffffff },
	// was "arabic"
	{ "ro", "Romania", "romanian", 0xffffffff },
	{ "ru", "Russia", "abaza,adyghe,agul,altay,avar,bashkir,"
	  "buryat,chechen,chukchi,chuvash,dargin,dolgan,"
	  "erzya,evenk,ingush,kabardian,kalmyk,karachay-balkar,"
	  "khakas,khanty,komi-permyak,komi-zyrian,koryak,kumyk,"
	  "lak,lezgi,mansi,mari,moksha,nogai,nenets,ossetic,russian,"
	  "tabasaran,tatar,tuvin,udmurt,yakut,yiddish", 0xffffffff },
	{ "rw", "Rwanda", "english,french,kinyarwanda", 0xffffffff },
	{ "sa", "Saudi Arabia", "arabic", 0xffffffff },
	{ "sb", "Solomon Islands", "english", 0xffffffff },
	{ "sc", "Seychelles", "english,french,seselwa", 0xffffffff },
	{ "sd", "Sudan", "arabic,english", 0xffffffff },
	{ "se", "Sweden", "swedish,finnish,meänkieli,romani,sami,"
	  "yiddish", 0xffffffff },
	{ "sg", "Singapore", "english,malay,mandarin,tamil", 0xffffffff },
	{ "sh", "Saint Helena", "unknown", 0xffffffff },
	{ "si", "Slovenia", "hungarian,italian,slovenian", 0xffffffff },
	{ "sj", "Svalbard and Jan Mayen Islands", "unknown", 0xffffffff },
	{ "sk", "Slovakia", "slovak", 0xffffffff },
	{ "sl", "Sierra Leone", "english", 0xffffffff },
	{ "sm", "San Marino", "italian", 0xffffffff },
	{ "sn", "Senegal", "french,jola-fogny,malinke,mandinka,pulaar,"
	  "serer-sine,wolof", 0xffffffff },
	// was "french"; Somalia's official languages are Somali and Arabic
	{ "so", "Somalia", "somali,arabic", 0xffffffff },
	{ "sr", "Suriname", "dutch", 0xffffffff },
	{ "st", "São Tomé and Príncipe", "portuguese",
	  0xffffffff },
	{ "su", "former Soviet Union", "unknown", 0xffffffff },
	{ "sv", "El Salvador", "spanish", 0xffffffff },
	{ "sy", "Syria", "arabic,french", 0xffffffff },
	{ "sz", "Swaziland", "english,swazi", 0xffffffff },
	{ "tc", "Turks and Caicos Islands", "unknown", 0xffffffff },
	{ "td", "Chad", "arabic,french", 0xffffffff },
	{ "tf", "French Southern and Antarctic Lands", "unknown", 0xffffffff },
	{ "tg", "Togo", "french", 0xffffffff },
	{ "th", "Thailand", "thai", 0xffffffff },
	{ "tj", "Tajikistan", "tajik", 0xffffffff },
	{ "tk", "Tokelau", "unknown", 0xffffffff },
	{ "tl", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tm", "Turkmenistan", "turkmen", 0xffffffff },
	{ "tn", "Tunisia", "arabic", 0xffffffff },
	{ "to", "Tonga", "english,tongan", 0xffffffff },
	{ "tp", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tr", "Turkey", "turkish", 0xffffffff },
	{ "tt", "Trinidad and Tobago", "english", 0xffffffff },
	{ "tv", "Tuvalu", "english,tuvaluan", 0xffffffff },
	{ "tw", "Taiwan, Republic of China", "mandarin", 0xffffffff },
	{ "tz", "Tanzania", "english,swahili", 0xffffffff },
	{ "ua", "Ukraine", "ukrainian", 0xffffffff },
	{ "ug", "Uganda", "english,swahili", 0xffffffff },
	{ "uk", "United Kingdom", "british,cornish,dgèrnésiais,"
	  "irish,jèrriais,pitcairnese,scots,scottish gaelic,"
	  "welsh", 0xffffffff },
	{ "um", "United States Minor Outlying Islands", "english,carolinian,chamorro,"
	  "hawaiian,samoan,spanish", 0xffffffff },
	{ "us", "United States of America", "english,carolinian,chamorro,english,"
	  "hawaiian,samoan,spanish", 0xffffffff },
	{ "uy", "Uruguay", "spanish", 0xffffffff },
	{ "uz", "Uzbekistan", "uzbek", 0xffffffff },
	{ "va", "Vatican City State", "latin", 0xffffffff },
	{ "vc", "Saint Vincent and the Grenadines", "english", 0xffffffff },
	{ "ve", "Venezuela", "spanish", 0xffffffff },
	{ "vg", "British Virgin Islands", "unknown", 0xffffffff },
	{ "vi", "U.S. Virgin Islands", "unknown", 0xffffffff },
	{ "vn", "Vietnam", "vietnamese", 0xffffffff },
	{ "vu", "Vanuatu", "bislama,english,french", 0xffffffff },
	{ "wf", "Wallis and Futuna", "unknown", 0xffffffff },
	{ "ws", "Samoa", "english,samoan", 0xffffffff },
	{ "ye", "Yemen", "arabic", 0xffffffff },
	{ "yt", "Mayotte", "unknown", 0xffffffff },
	{ "yu", "Yugoslavia", "unknown", 0xffffffff },
	{ "za", "South Africa (Zuid-Afrika)", "afrikaans,english,ndebele,"
	  "northern sotho,sotho,swazi,tsonga,tswana,venda,xhosa,zulu",
	  0xffffffff },
	{ "zm", "Zambia", "english", 0xffffffff },
	{ "zw", "Zimbabwe", "unknown", 0xffffffff },
};
// Maps a language id (the array index) to its category id.
// A value of 0 means no category is assigned to that language.
// The final entry is an end-of-list terminator.
static int s_langToCatId[] = {
	/*  0 langUnknown     */ 0,
	/*  1 langEnglish     */ 0,
	/*  2 langFrench      */ 476,
	/*  3 langSpanish     */ 471,
	/*  4 langRussian     */ 484,
	/*  5 langJapanese    */ 49884,
	/*  6 langChineseTrad */ 472,
	/*  7 langChineseSimp */ 494,
	/*  8 langKorean      */ 493,
	/*  9 langGerman      */ 911729,
	/* 10 langDutch       */ 478,
	/* 11 langItalian     */ 477,
	/* 12 langFinnish     */ 503,
	/* 13 langSwedish     */ 485,
	/* 14 langNorwegian   */ 487,
	/* 15 langPortuguese  */ 483,
	/* 16 langVietnamese  */ 116289,
	/* 17 langArabic      */ 88070,
	/* 18 langHebrew      */ 118215,
	/* 19 langIndonesian  */ 464465,
	/* 20 langGreek       */ 482,
	/* 21 langThai        */ 501,
	/* 22 langHindi       */ 51663,
	/* 23 langBengala     */ 241315,
	/* 24 langPolish      */ 480,
	/* 25 langTagalog     */ 173548,
	// there are no British, UK, or Australian topics
	/* 26 langBritish     */ 0,
	/* 27 langAustralia   */ 0,
	/* 28 end of list     */ 0
};
// Nothing to do at construction: the word table is filled by loadLists()
// and the TLD table by tldInit().
LangList::LangList() {}
// Clears both hash tables via reset() on destruction.
LangList::~LangList()
{
	reset();
}
// Resets the word->language table and the tld->country table.
void LangList::reset()
{
	m_langTable.reset();
	m_tldToCountry.reset();
}
// . returns false and sets errno on error
// . loads language lists into memory
// . looks under the langlist/ directory for langlist.# files
// each number corrisponds to a language
bool LangList::loadLists ( ) {
//log ( LOG_INIT, "lang: Loading Language Lists.");
2013-08-03 00:12:24 +04:00
// init the term table
m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
// loop over the languages and load the files
long listCount = 0;
long dupCount = 0;
long allocSize = 0;
char *buf = NULL;
Words w;
for ( long i = 0; i < MAX_LANGUAGES; i++ ) {
// load the file for reading
char ff[128];
sprintf(ff, "%slanglist/langlist.%li", g_hostdb.m_dir, i );
int fd = open ( ff, O_RDONLY );
// no language file, don't complain
if ( fd < 0 ) continue;
// get the size
struct stat stats;
stats.st_size = 0;
int status = stat ( ff, &stats );
if ( status != 0 ) {
close(fd);
2013-08-03 00:12:24 +04:00
log ( "lang: Could not stat %s: %s.",
ff, strerror(errno) );
return false;
}
long fileSize = stats.st_size;
// read the file into a buffer
long thisAllocSize = 3 * fileSize;
if(thisAllocSize > allocSize) {
buf = (char*)mrealloc(buf, allocSize, thisAllocSize,
"LangList");
allocSize = thisAllocSize;
}
if ( !buf ) {
close(fd);
log ( "lang: Could not allocate %li bytes for "
"langlist buffer: %s.",
thisAllocSize, mstrerror(g_errno) );
return false;
}
if ( read ( fd, buf, fileSize ) != fileSize ) {
close(fd);
log ( "lang: Could not read %s: %s.",
ff, strerror(errno) );
return false;
}
close(fd);
// read the words out of the file
// char *p = buf;
// char *pEnd = buf + fileSize;
// *pEnd = '\0';
//UChar* ucBuf = (UChar*)(buf + fileSize);
//long ucBufLen = fileSize * 2;
long wordsInList = 0;
long writtenLen = gbstrlen(buf);
//long writtenLen = ucToUnicode(ucBuf, ucBufLen,
// buf, fileSize,
// "UTF-8", -1,
// TITLEREC_CURRENT_VERSION);
w.reset();
//doubling the written length seems hackish, may
//need to be fixed in ucToUnicode.
if(!w.set (buf ,
fileSize ,
TITLEREC_CURRENT_VERSION,true, false)) {
char *xx = NULL; *xx = 0;
return false;
}
long numWords = w.getNumWords();
for(long j = 0; j < numWords; j++) {
long long wordId = w.m_wordIds[j];
if(wordId == 0) continue;
// add it to the table
unsigned long score = m_langTable.getScore(&wordId);
//log(LOG_WARN,
// "lang: Successfully hash %lli from %s dictionary.",
//wordId, getLanguageString(i));
if ( score != (unsigned long)i ) {
if ( score > 0 ) {
dupCount++;
if ( score != 0x7fffffff )
m_langTable.addTerm ( &wordId,
0x7fffffff);
}
else {
m_langTable.addTerm ( &wordId, i );
wordsInList++;
}
}
}
// count the list
listCount++;
if ( wordsInList > 0 )
2013-08-03 00:12:24 +04:00
log ( LOG_DEBUG,
"lang: Successfully Loaded %li out of %li (%li bytes) "
"words from %s dictionary.",
wordsInList, numWords>>1, writtenLen, getLanguageString(i) );
}
// free the buffer
if(buf) mfree ( buf, allocSize, "LangList" );
log ( LOG_INIT, "lang: Successfully Loaded %li Language Lists and "
"%li duplicate word hashes.",
listCount, dupCount );
// all good
return true;
}
// . looks up a term id in the language word table
// . returns true and stores the language id in *lang when the stored
//   score is a usable language id (non-zero and below MAX_LANGUAGES)
// . otherwise stores 0 in *lang and returns false
bool LangList::lookup ( long long termId,
			unsigned char *lang ) {
	unsigned long langScore = m_langTable.getScore(&termId);
	// 0 means "not in the table"; anything >= MAX_LANGUAGES is not a
	// language id either
	bool known = ( langScore != 0 && langScore < MAX_LANGUAGES );
	*lang = known ? (unsigned char)langScore : 0;
	return known;
}
// . returns the country/purpose text of the s_tldInfo row matching "tld"
// . tld/tldLen may still contain a label before the first '.', e.g.
//   "co.uk"; everything up to and including that first dot is skipped
// . returns NULL for NULL/empty input or when the TLD is not in the table
char* LangList::getCountryFromTld(char* tld, long tldLen) {
	if ( ! tld || tldLen <= 0 ) return NULL;
	// initialize if not already initialized.
	if ( s_numTlds == 0 ) tldInit();
	// skip past the first dot, if there is one
	for ( long j = 0; j < tldLen; j++ ) {
		if ( tld[j] != '.' ) continue;
		j++; // skip .
		tld     = &(tld[j]);
		tldLen -= j;
		break;
	}
	// m_tldToCountry is declared with 4-byte keys and 4-byte values, so
	// use fixed-width types; a plain "long" is 8 bytes on LP64 and
	// reading the value through a long* would pick up 4 foreign bytes
	int32_t key  = (int32_t)hash32(tld, tldLen);
	long    slot = m_tldToCountry.getSlot(&key);
	if ( slot < 0 ) return NULL;
	int32_t *row = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	if ( ! row ) return NULL;
	// never index outside the static table
	if ( *row < 0 || *row >= s_numTlds ) return NULL;
	return s_tldInfo[*row].m_country;
}
// . returns true if language "lang" is acceptable for the given TLD
// . tld/tldLen may still contain a label before the first '.'; everything
//   up to and including that first dot is skipped
// . returns true when nothing can be said: lang is langUnknown, the TLD is
//   not in the table, or the TLD's row names no supported language
// . the per-TLD bit vector (bit lang-1 set means "lang is listed") is built
//   on first use by substring-matching getLanguageString() of every
//   language from 1 to langTagalog against the row's m_languages text
bool LangList::isLangValidForTld(char* tld, long tldLen, unsigned char lang) {
	if ( lang == langUnknown ) return true; // not much we can do here.
	// initialize if not already initialized.
	if ( s_numTlds == 0 ) tldInit();
	// skip past the first dot, if there is one
	for ( long j = 0; j < tldLen; j++ ) {
		if ( tld[j] != '.' ) continue;
		j++; // skip .
		tld     = &(tld[j]);
		tldLen -= j;
		break;
	}
	// the table holds 4-byte keys and values: use fixed-width types so
	// the value is not read through an 8-byte long* on LP64
	int32_t key  = (int32_t)hash32(tld, tldLen);
	long    slot = m_tldToCountry.getSlot(&key);
	if ( slot < 0 ) return true;
	int32_t *tip = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	// deliberate crash on a corrupt table, as in the original code
	if ( ! tip ) { char *xx=NULL;*xx=0; }
	// a row index outside the table tells us nothing about the TLD
	if ( *tip < 0 || *tip >= s_numTlds ) return true;
	TldInfo* t = &s_tldInfo[*tip];
	// it is uninitialized, init on demand.
	if ( t->m_languagebv == 0xffffffff ) {
		t->m_languagebv = 0;
		for ( long i = 1; i <= langTagalog; i++ ) {
			const char *name = getLanguageString(i);
			if ( ! name ) continue;
			if ( strstr(t->m_languages, name) == NULL )
				continue;
			// set the bit corresponding to lang; shift an
			// unsigned long so a high bit cannot overflow an int
			t->m_languagebv |= 1UL << (i-1);
		}
	}
	if ( t->m_languagebv == 0 ) return true; // its unknown.
	// ids beyond the 32 tracked bits can never be in the vector, and
	// shifting by 32 or more would be undefined behavior
	if ( lang < 1 || lang > 32 ) return false;
	return ( t->m_languagebv & ( 1UL << (lang-1) ) ) != 0;
}
// . builds m_tldToCountry: hash32n(tld) -> row index into s_tldInfo
// . sets s_numTlds, which the lookup functions use as their
//   "already initialized" flag
// . returns false if a key could not be added; s_numTlds is already set
//   at that point, so initialization is not attempted again
bool LangList::tldInit() {
	s_numTlds = sizeof(s_tldInfo) / sizeof(TldInfo);
	// 4-byte keys mapped to 4-byte values
	m_tldToCountry.set(4,4,0,NULL,0,false,0,"tldctrytbl");
	for ( int32_t i = 0; i < s_numTlds; i++ ) {
		// key and value are passed by address, so they must be exactly
		// as wide as the table's key/value size; a "long" is 8 bytes
		// on LP64 and only its first 4 bytes in memory would be stored
		int32_t ndx = (int32_t)hash32n(s_tldInfo[i].m_tld);
		if ( ! m_tldToCountry.addKey(&ndx , &i ) ) return false;
	}
	return true;
}
// . returns the language id whose s_langToCatId entry equals "catid"
// . returns 0 when no entry matches; catid 0 also yields 0 because
//   entry 0 holds the value 0
uint8_t LangList::catIdToLang(uint32_t catid) {
	// element count derived from the array's own element type
	// (the "register" specifier was dropped: it is ill-formed in C++17)
	const uint32_t numEntries =
		sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	for ( uint32_t i = 0; i < numEntries; i++ ) {
		if ( catid == (uint32_t)s_langToCatId[i] ) return (uint8_t)i;
	}
	return 0;
}
// . returns the category id stored in s_langToCatId for language "lang"
// . returns 0 (the table's "no category" value) for a language id that is
//   beyond the end of the table instead of reading out of bounds
uint32_t LangList::langToCatId(uint8_t lang) {
	const uint32_t numEntries =
		sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	if ( lang >= numEntries ) return 0;
	return s_langToCatId[lang];
}
// . returns the language id whose s_langToCatId entry equals "catid",
//   searching language ids below MAX_LANGUAGES
// . returns langUnknown when there is no match
uint8_t LangList::isLangCat(int catid) {
	// never scan past the end of the table, even if MAX_LANGUAGES is
	// larger than the number of entries in it
	int limit = (int)( sizeof(s_langToCatId) / sizeof(s_langToCatId[0]) );
	if ( MAX_LANGUAGES < limit ) limit = MAX_LANGUAGES;
	for ( int x = 0; x < limit; x++ )
		if ( catid == s_langToCatId[x] )
			return(x);
	return(langUnknown);
}