open-source-search-engine/LangList.cpp
2013-08-02 13:12:24 -07:00

603 lines
23 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "gb-include.h"
#include "LangList.h"
#include "Indexdb.h"
// The process-wide LangList instance defined by this file.
LangList g_langList;
// One row of the top-level-domain table (s_tldInfo).
struct TldInfo {
	// the TLD itself, without a leading dot, e.g. "com" or "uk"
	char          *m_tld;
	// human readable country/purpose, returned by getCountryFromTld()
	char          *m_country;
	// comma separated, lower case language names used under this TLD
	char          *m_languages;
	// . bit vector derived lazily from m_languages by isLangValidForTld()
	// . bit (id-1) is set when the name of language "id" occurs in
	//   m_languages
	// . 0xffffffff means "not computed yet", 0 means "no known language"
	unsigned long  m_languagebv;
};
// number of rows in s_tldInfo; stays 0 until LangList::tldInit() has run
static long s_numTlds = 0;
// . one row per top-level domain: { tld, country/purpose, languages, bitvec }
// . the languages field is a comma separated list of lower case language
//   names; isLangValidForTld() substring-matches the names returned by
//   getLanguageString() against it to build the bit vector
// . a list that names none of those languages (e.g. "unknown") places no
//   restriction on the TLD
// . every bit vector starts as 0xffffffff, meaning "not computed yet"
static TldInfo s_tldInfo[] = {
	{ "arpa", "Address and Routing Parameter Area", "unknown", 0xffffffff },
	{ "root", "N/A", "unknown", 0xffffffff },
	{ "aero", "air-transport industry", "unknown", 0xffffffff },
	{ "biz", "business", "unknown", 0xffffffff },
	{ "cat", "Catalan", "unknown", 0xffffffff },
	{ "com", "commercial", "unknown", 0xffffffff },
	{ "coop", "cooperatives", "unknown", 0xffffffff },
	{ "edu", "educational", "unknown", 0xffffffff },
	{ "gov", "governmental", "unknown", 0xffffffff },
	{ "info", "information", "unknown", 0xffffffff },
	{ "int", "international organizations", "unknown", 0xffffffff },
	{ "jobs", "companies", "unknown", 0xffffffff },
	{ "mil", "United States Military", "english,carolinian,chamorro,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "mobi", "mobile devices", "unknown", 0xffffffff },
	{ "museum", "museums", "unknown", 0xffffffff },
	{ "name", "individuals, by name", "unknown", 0xffffffff },
	{ "net", "network", "unknown", 0xffffffff },
	{ "org", "organization", "unknown", 0xffffffff },
	{ "pro", "professions", "unknown", 0xffffffff },
	{ "travel", "travel and travel-agency related sites", "unknown", 0xffffffff },
	{ "ac", "Ascension Island", "unknown", 0xffffffff },
	{ "ad", "Andorra", "catalan", 0xffffffff },
	{ "ae", "United Arab Emirates", "arabic", 0xffffffff },
	{ "af", "Afghanistan", "arabic,balochi,dari,nuristani,pashto,pamiri,"
		"pashai,turkmen,uzbek", 0xffffffff },
	{ "ag", "Antigua and Barbuda", "english", 0xffffffff },
	{ "ai", "Anguilla", "unknown", 0xffffffff },
	{ "al", "Albania", "albanian", 0xffffffff },
	{ "am", "Armenia", "armenian,armenian", 0xffffffff },
	{ "an", "Netherlands Antilles", "dutch,frisian", 0xffffffff },
	{ "ao", "Angola", "portuguese", 0xffffffff },
	{ "aq", "Antarctica", "unknown", 0xffffffff },
	{ "ar", "Argentina", "spanish,guarani", 0xffffffff },
	{ "as", "American Samoa", "english,samoan", 0xffffffff },
	{ "at", "Austria", "croatian,czech,german,hungarian,slovak,"
		"slovenian,romani", 0xffffffff },
	{ "au", "Australia", "australia", 0xffffffff },
	{ "aw", "Aruba", "unknown", 0xffffffff },
	{ "ax", "Åland", "unknown", 0xffffffff },
	{ "az", "Azerbaijan", "azerbaijani", 0xffffffff },
	{ "ba", "Bosnia and Herzegovina", "bosnian,croatian,serbian", 0xffffffff },
	{ "bb", "Barbados", "english", 0xffffffff },
	{ "bd", "Bangladesh", "bengala", 0xffffffff },
	{ "be", "Belgium", "dutch,french,german", 0xffffffff },
	{ "bf", "Burkina Faso", "french,more,jula,fula", 0xffffffff },
	{ "bg", "Bulgaria", "bulgarian", 0xffffffff },
	{ "bh", "Bahrain", "arabic", 0xffffffff },
	{ "bi", "Burundi", "french,kirundi", 0xffffffff },
	{ "bj", "Benin", "french", 0xffffffff },
	{ "bm", "Bermuda", "unknown", 0xffffffff },
	{ "bn", "Brunei Darussalam", "malay", 0xffffffff },
	{ "bo", "Bolivia", "spanish,aymara,quechua", 0xffffffff },
	{ "br", "Brazil", "portuguese", 0xffffffff },
	{ "bs", "Bahamas", "unknown", 0xffffffff },
	{ "bt", "Bhutan", "dzongkha,english", 0xffffffff },
	{ "bv", "Bouvet Island", "unknown", 0xffffffff },
	{ "bw", "Botswana", "english,kalanga,tswana", 0xffffffff },
	{ "by", "Belarus", "belarusian,russian", 0xffffffff },
	{ "bz", "Belize", "english", 0xffffffff },
	{ "ca", "Canada", "chipewyan,cree,dogrib,english,french,gwich'in,inuinnaqtun,"
		"inuktitut,inuvialuktun,slavey", 0xffffffff },
	{ "cc", "Cocos (Keeling) Islands", "unknown", 0xffffffff },
	{ "cd", "Democratic Republic of the Congo", "french,lingala,kikongo,swahili,"
		"tshiluba", 0xffffffff },
	{ "cf", "Central African Republic", "french,sango", 0xffffffff },
	{ "cg", "Republic of the Congo", "french,lingala,munukutuba", 0xffffffff },
	{ "ch", "Switzerland (Confoederatio Helvetica)", "french,german,italian,"
		"romansh", 0xffffffff },
	{ "ci", "Côte d'Ivoire", "french", 0xffffffff },
	{ "ck", "Cook Islands", "unknown", 0xffffffff },
	{ "cl", "Chile", "spanish", 0xffffffff },
	{ "cm", "Cameroon", "english,french", 0xffffffff },
	{ "cn", "People's Republic of China", "cantonese,english,kazakh,korean,"
		"mandarin,mongolian,portuguese,tajik,tibetan,uyghur,zhuang",
		0xffffffff },
	{ "co", "Colombia", "spanish", 0xffffffff },
	{ "cr", "Costa Rica", "spanish", 0xffffffff },
	{ "cu", "Cuba", "spanish", 0xffffffff },
	{ "cv", "Cape Verde", "crioulo,portuguese", 0xffffffff },
	{ "cx", "Christmas Island", "unknown", 0xffffffff },
	{ "cy", "Cyprus", "greek,turkish", 0xffffffff },
	{ "cz", "Czech Republic", "czech", 0xffffffff },
	{ "de", "Germany (Deutschland)", "danish,frisian,german,romani,"
		"lower sorbian,upper sorbian", 0xffffffff },
	{ "dj", "Djibouti", "arabic,french", 0xffffffff },
	{ "dk", "Denmark", "danish,faroese,kalaallisut", 0xffffffff },
	{ "dm", "Dominica", "english", 0xffffffff },
	// Spanish speaking; not to be confused with Dominica (.dm) above
	{ "do", "Dominican Republic", "spanish", 0xffffffff },
	{ "dz", "Algeria", "arabic,tamazight", 0xffffffff },
	{ "ec", "Ecuador", "spanish,quechua", 0xffffffff },
	{ "ee", "Estonia", "estonian", 0xffffffff },
	{ "eg", "Egypt", "arabic", 0xffffffff },
	{ "er", "Eritrea", "arabic,english,tigrinya", 0xffffffff },
	{ "es", "Spain (España)", "basque,catalan,galician,occitan,"
		"spanish", 0xffffffff },
	{ "et", "Ethiopia", "amharic", 0xffffffff },
	{ "eu", "European Union", "unknown", 0xffffffff },
	{ "fi", "Finland", "finnish,sami,swedish", 0xffffffff },
	{ "fj", "Fiji", "english,fijian,hindustani", 0xffffffff },
	{ "fk", "Falkland Islands", "unknown", 0xffffffff },
	{ "fm", "Federated States of Micronesia", "chuuk,english,kosraean,ponapean,"
		"ulithian,yapese", 0xffffffff },
	{ "fo", "Faroe Islands", "unknown", 0xffffffff },
	{ "fr", "France", "french,tahitian", 0xffffffff },
	{ "ga", "Gabon", "french", 0xffffffff },
	{ "gb", "United Kingdom (Great Britain)", "english,cornish,"
		"dgèrnésiais,english,french,irish,jèrriais,"
		"pitcairnese,scots,scottish gaelic,welsh", 0xffffffff },
	{ "gd", "Grenada", "english", 0xffffffff },
	{ "ge", "Georgia", "abkhaz,georgian,ossetic,russian", 0xffffffff },
	{ "gf", "French Guiana", "unknown", 0xffffffff },
	{ "gg", "Guernsey", "unknown", 0xffffffff },
	{ "gh", "Ghana", "adangme,dagaare,dagbani,english,ewe,ga,gonja,kasem,"
		"nzema,twi", 0xffffffff },
	{ "gi", "Gibraltar", "unknown", 0xffffffff },
	{ "gl", "Greenland", "unknown", 0xffffffff },
	{ "gm", "The Gambia", "unknown", 0xffffffff },
	{ "gn", "Guinea", "french,fula", 0xffffffff },
	{ "gp", "Guadeloupe", "unknown", 0xffffffff },
	{ "gq", "Equatorial Guinea", "french,spanish", 0xffffffff },
	{ "gr", "Greece", "greek", 0xffffffff },
	// an Atlantic island territory, unrelated to the country Georgia (.ge)
	{ "gs", "South Georgia and the South Sandwich Islands", "unknown",
		0xffffffff },
	{ "gt", "Guatemala", "spanish", 0xffffffff },
	{ "gu", "Guam", "unknown", 0xffffffff },
	// Portuguese speaking; not to be confused with Guinea (.gn)
	{ "gw", "Guinea-Bissau", "portuguese", 0xffffffff },
	{ "gy", "Guyana", "english", 0xffffffff },
	{ "hk", "Hong Kong", "unknown", 0xffffffff },
	{ "hm", "Heard Island and McDonald Islands", "unknown", 0xffffffff },
	{ "hn", "Honduras", "spanish", 0xffffffff },
	{ "hr", "Croatia (Hrvatska)", "croatian,italian", 0xffffffff },
	{ "ht", "Haiti", "french,haitian creole", 0xffffffff },
	{ "hu", "Hungary", "hungarian", 0xffffffff },
	{ "id", "Indonesia", "balinese,javanese,indonesian,sundanese", 0xffffffff },
	{ "ie", "Ireland (Éire)", "unknown", 0xffffffff },
	{ "il", "Israel", "arabic,hebrew", 0xffffffff },
	{ "im", "Isle of Man", "unknown", 0xffffffff },
	{ "in", "India", "assamese,bengala,bodo,dogri,english,gujarati,hindi,kannada,"
		"kashmiri,konkani,maithili,malayalam,meitei,marathi,nepali,oriya,"
		"punjabi,sanskrit,santali,sindhi,tamil,telugu,urdu,french,karbi,"
		"bhojpuri,magadhi,maithili,chhattisgarhi,portuguese,pahari,tulu,"
		"garo,khasi,mizo,rajasthani,kokborok,nicobarese", 0xffffffff },
	// a British territory, unrelated to India (.in)
	{ "io", "British Indian Ocean Territory", "unknown", 0xffffffff },
	{ "iq", "Iraq", "arabic,kurdish", 0xffffffff },
	{ "ir", "Iran", "persian", 0xffffffff },
	{ "is", "Iceland (Island)", "icelandic", 0xffffffff },
	{ "it", "Italy", "italian",
		//"albanian,catalan,croatian,franco-provençal,french,"
		// "friulian,german,greek,italian,ladin,occitan,sardinian,slovenian",
		0xffffffff },
	{ "je", "Jersey", "unknown", 0xffffffff },
	{ "jm", "Jamaica", "english", 0xffffffff },
	{ "jo", "Jordan", "arabic", 0xffffffff },
	{ "jp", "Japan", "japanese", 0xffffffff },
	{ "ke", "Kenya", "english,swahili", 0xffffffff },
	{ "kg", "Kyrgyzstan", "kirghiz,russian", 0xffffffff },
	{ "kh", "Cambodia (Khmer)", "khmer", 0xffffffff },
	{ "ki", "Kiribati", "english,kiribati", 0xffffffff },
	{ "km", "Comoros", "arabic,comorian,french", 0xffffffff },
	{ "kn", "Saint Kitts and Nevis", "english", 0xffffffff },
	{ "kr", "South Korea", "korean", 0xffffffff },
	{ "kw", "Kuwait", "arabic", 0xffffffff },
	{ "ky", "Cayman Islands", "unknown", 0xffffffff },
	{ "kz", "Kazakhstan", "kazakh,russian", 0xffffffff },
	{ "la", "Laos", "lao,french", 0xffffffff },
	{ "lb", "Lebanon", "arabic", 0xffffffff },
	{ "lc", "Saint Lucia", "english", 0xffffffff },
	{ "li", "Liechtenstein", "german", 0xffffffff },
	{ "lk", "Sri Lanka", "sinhala,tamil", 0xffffffff },
	{ "lr", "Liberia", "english", 0xffffffff },
	{ "ls", "Lesotho", "english,sotho", 0xffffffff },
	{ "lt", "Lithuania", "lithuanian", 0xffffffff },
	{ "lu", "Luxembourg", "french,german,luxembourgish", 0xffffffff },
	{ "lv", "Latvia", "latvian", 0xffffffff },
	{ "ly", "Libya", "arabic", 0xffffffff },
	{ "ma", "Morocco", "arabic", 0xffffffff },
	{ "mc", "Monaco", "french", 0xffffffff },
	{ "md", "Moldova", "gagauz,moldovan,russian,ukrainian", 0xffffffff },
	{ "mg", "Madagascar", "french,malagasy", 0xffffffff },
	{ "mh", "Marshall Islands", "english,marshallese", 0xffffffff },
	{ "mk", "Republic of Macedonia", "unknown", 0xffffffff },
	{ "ml", "Mali", "french", 0xffffffff },
	{ "mm", "Myanmar", "burmese", 0xffffffff },
	{ "mn", "Mongolia", "mongolian", 0xffffffff },
	{ "mo", "Macau", "unknown", 0xffffffff },
	{ "mp", "Northern Mariana Islands", "unknown", 0xffffffff },
	{ "mq", "Martinique", "unknown", 0xffffffff },
	{ "mr", "Mauritania", "arabic,fula,soninke,wolof", 0xffffffff },
	{ "ms", "Montserrat", "unknown", 0xffffffff },
	{ "mt", "Malta", "english,maltese", 0xffffffff },
	{ "mu", "Mauritius", "english,french", 0xffffffff },
	{ "mv", "Maldives", "dhivehi", 0xffffffff },
	{ "mw", "Malawi", "chichewa,english", 0xffffffff },
	{ "mx", "Mexico", "spanish", 0xffffffff },
	{ "my", "Malaysia", "malay", 0xffffffff },
	{ "mz", "Mozambique", "portuguese", 0xffffffff },
	{ "na", "Namibia", "english", 0xffffffff },
	{ "nc", "New Caledonia", "unknown", 0xffffffff },
	{ "ne", "Niger", "french", 0xffffffff },
	{ "nf", "Norfolk Island", "unknown", 0xffffffff },
	// English speaking; not to be confused with Niger (.ne) above
	{ "ng", "Nigeria", "english", 0xffffffff },
	{ "ni", "Nicaragua", "spanish", 0xffffffff },
	{ "nl", "Netherlands", "dutch,frisian", 0xffffffff },
	{ "no", "Norway", "norwegian,norwegian,sami", 0xffffffff },
	{ "np", "Nepal", "nepali", 0xffffffff },
	{ "nr", "Nauru", "english,nauruan", 0xffffffff },
	{ "nu", "Niue", "unknown", 0xffffffff },
	{ "nz", "New Zealand", "english,maori,new zealand sign language,"
		"cook islands maori,niuean,tokelauan", 0xffffffff },
	{ "om", "Oman", "arabic", 0xffffffff },
	{ "pa", "Panama", "spanish", 0xffffffff },
	{ "pe", "Peru", "quechua,aymara,spanish", 0xffffffff },
	{ "pf", "French Polynesia", "unknown", 0xffffffff },
	// not to be confused with Guinea (.gn)
	{ "pg", "Papua New Guinea", "english,tok pisin,hiri motu", 0xffffffff },
	{ "ph", "Philippines", "arabic,bikol,cebuano,english,filipino,"
		"hiligaynon,ilokano,kapampangan,kinaray-a,maranao,"
		"maguindanao,pangasinan,spanish,tagalog,tausug,"
		"waray-waray", 0xffffffff },
	{ "pk", "Pakistan", "english,urdu", 0xffffffff },
	{ "pl", "Poland", "polish", 0xffffffff },
	{ "pm", "Saint-Pierre and Miquelon", "unknown", 0xffffffff },
	{ "pn", "Pitcairn Islands", "unknown", 0xffffffff },
	{ "pr", "Puerto Rico", "unknown", 0xffffffff },
	{ "ps", "Palestinian territories", "unknown", 0xffffffff },
	{ "pt", "Portugal", "portuguese,mirandese", 0xffffffff },
	{ "pw", "Palau", "english,palauan,japanese", 0xffffffff },
	{ "py", "Paraguay", "guaraní,spanish", 0xffffffff },
	{ "qa", "Qatar", "arabic", 0xffffffff },
	{ "re", "Réunion", "unknown", 0xffffffff },
	{ "ro", "Romania", "romanian", 0xffffffff },
	{ "ru", "Russia", "abaza,adyghe,agul,altay,avar,bashkir,"
		"buryat,chechen,chukchi,chuvash,dargin,dolgan,"
		"erzya,evenk,ingush,kabardian,kalmyk,karachay-balkar,"
		"khakas,khanty,komi-permyak,komi-zyrian,koryak,kumyk,"
		"lak,lezgi,mansi,mari,moksha,nogai,nenets,ossetic,russian,"
		"tabasaran,tatar,tuvin,udmurt,yakut,yiddish", 0xffffffff },
	{ "rw", "Rwanda", "english,french,kinyarwanda", 0xffffffff },
	{ "sa", "Saudi Arabia", "arabic", 0xffffffff },
	{ "sb", "Solomon Islands", "english", 0xffffffff },
	{ "sc", "Seychelles", "english,french,seselwa", 0xffffffff },
	{ "sd", "Sudan", "arabic,english", 0xffffffff },
	{ "se", "Sweden", "swedish,finnish,meänkieli,romani,sami,"
		"yiddish", 0xffffffff },
	{ "sg", "Singapore", "english,malay,mandarin,tamil", 0xffffffff },
	{ "sh", "Saint Helena", "unknown", 0xffffffff },
	{ "si", "Slovenia", "hungarian,italian,slovenian", 0xffffffff },
	{ "sj", "Svalbard and Jan Mayen Islands", "unknown", 0xffffffff },
	{ "sk", "Slovakia", "slovak", 0xffffffff },
	{ "sl", "Sierra Leone", "english", 0xffffffff },
	{ "sm", "San Marino", "italian", 0xffffffff },
	{ "sn", "Senegal", "french,jola-fogny,malinke,mandinka,pulaar,"
		"serer-sine,wolof", 0xffffffff },
	{ "so", "Somalia", "somali,arabic", 0xffffffff },
	{ "sr", "Suriname", "dutch", 0xffffffff },
	{ "st", "São Tomé and Príncipe", "portuguese",
		0xffffffff },
	{ "su", "former Soviet Union", "unknown", 0xffffffff },
	{ "sv", "El Salvador", "spanish", 0xffffffff },
	{ "sy", "Syria", "arabic,french", 0xffffffff },
	{ "sz", "Swaziland", "english,swazi", 0xffffffff },
	{ "tc", "Turks and Caicos Islands", "unknown", 0xffffffff },
	{ "td", "Chad", "arabic,french", 0xffffffff },
	{ "tf", "French Southern and Antarctic Lands", "unknown", 0xffffffff },
	{ "tg", "Togo", "french", 0xffffffff },
	{ "th", "Thailand", "thai", 0xffffffff },
	{ "tj", "Tajikistan", "tajik", 0xffffffff },
	{ "tk", "Tokelau", "unknown", 0xffffffff },
	{ "tl", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tm", "Turkmenistan", "turkmen", 0xffffffff },
	{ "tn", "Tunisia", "arabic", 0xffffffff },
	{ "to", "Tonga", "english,tongan", 0xffffffff },
	{ "tp", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tr", "Turkey", "turkish", 0xffffffff },
	{ "tt", "Trinidad and Tobago", "english", 0xffffffff },
	{ "tv", "Tuvalu", "english,tuvaluan", 0xffffffff },
	{ "tw", "Taiwan, Republic of China", "mandarin", 0xffffffff },
	{ "tz", "Tanzania", "english,swahili", 0xffffffff },
	{ "ua", "Ukraine", "ukrainian", 0xffffffff },
	{ "ug", "Uganda", "english,swahili", 0xffffffff },
	{ "uk", "United Kingdom", "british,cornish,dgèrnésiais,"
		"irish,jèrriais,pitcairnese,scots,scottish gaelic,"
		"welsh", 0xffffffff },
	{ "um", "United States Minor Outlying Islands", "english,carolinian,chamorro,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "us", "United States of America", "english,carolinian,chamorro,english,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "uy", "Uruguay", "spanish", 0xffffffff },
	{ "uz", "Uzbekistan", "uzbek", 0xffffffff },
	{ "va", "Vatican City State", "latin", 0xffffffff },
	{ "vc", "Saint Vincent and the Grenadines", "english", 0xffffffff },
	{ "ve", "Venezuela", "spanish", 0xffffffff },
	{ "vg", "British Virgin Islands", "unknown", 0xffffffff },
	{ "vi", "U.S. Virgin Islands", "unknown", 0xffffffff },
	{ "vn", "Vietnam", "vietnamese", 0xffffffff },
	{ "vu", "Vanuatu", "bislama,english,french", 0xffffffff },
	{ "wf", "Wallis and Futuna", "unknown", 0xffffffff },
	{ "ws", "Samoa", "english,samoan", 0xffffffff },
	{ "ye", "Yemen", "arabic", 0xffffffff },
	{ "yt", "Mayotte", "unknown", 0xffffffff },
	{ "yu", "Yugoslavia", "unknown", 0xffffffff },
	{ "za", "South Africa (Zuid-Afrika)", "afrikaans,english,ndebele,"
		"northern sotho,sotho,swazi,tsonga,tswana,venda,xhosa,zulu",
		0xffffffff },
	{ "zm", "Zambia", "english", 0xffffffff },
	{ "zw", "Zimbabwe", "unknown", 0xffffffff },
};
// . category id for each language, indexed by language id
// . 0 means the language has no category of its own
static int s_langToCatId[] = {
	/*  0 langUnknown     */ 0,
	/*  1 langEnglish     */ 0,
	/*  2 langFrench      */ 476,
	/*  3 langSpanish     */ 471,
	/*  4 langRussian     */ 484,
	/*  5 langJapanese    */ 49884,
	/*  6 langChineseTrad */ 472,
	/*  7 langChineseSimp */ 494,
	/*  8 langKorean      */ 493,
	/*  9 langGerman      */ 911729,
	/* 10 langDutch       */ 478,
	/* 11 langItalian     */ 477,
	/* 12 langFinnish     */ 503,
	/* 13 langSwedish     */ 485,
	/* 14 langNorwegian   */ 487,
	/* 15 langPortuguese  */ 483,
	/* 16 langVietnamese  */ 116289,
	/* 17 langArabic      */ 88070,
	/* 18 langHebrew      */ 118215,
	/* 19 langIndonesian  */ 464465,
	/* 20 langGreek       */ 482,
	/* 21 langThai        */ 501,
	/* 22 langHindi       */ 51663,
	/* 23 langBengala     */ 241315,
	/* 24 langPolish      */ 480,
	/* 25 langTagalog     */ 173548,
	// sadly, there are no British, UK, or Australian topics
	/* 26 langBritish     */ 0,
	/* 27 langAustralia   */ 0,
	/* 28 langUnknown, end of list */ 0
};
// . nothing to set up at construction time
// . m_langTable is filled by loadLists() and m_tldToCountry by tldInit()
LangList::LangList ( ) {
}
// releases both lookup tables via reset()
LangList::~LangList ( ) {
reset();
}
// . resets the word-to-language table and the tld-to-country table
// . NOTE(review): s_numTlds is not cleared here, so the lazy tldInit() in
//   the TLD lookups will not rebuild m_tldToCountry afterwards -- confirm
//   that is intended
void LangList::reset ( ) {
m_langTable.reset();
m_tldToCountry.reset();
}
// . returns false on error
// . loads the language word lists into m_langTable
// . looks under <g_hostdb.m_dir>langlist/ for langlist.# files, where each
//   number corresponds to a language id
// . a missing or empty file is not an error, that language is just skipped
// . each word id is added to the table with the id of the language whose
//   list it came from
// . a word id already present under another language is re-added with
//   0x7fffffff, which lookup() reports as unknown since it is
//   >= MAX_LANGUAGES
bool LangList::loadLists ( ) {
	log ( LOG_INIT, "lang: Loading Language Lists.");
	// init the term table
	m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
	long listCount = 0;
	long dupCount  = 0;
	// one read buffer is shared by all the files, it only ever grows
	long  allocSize = 0;
	char *buf       = NULL;
	Words w;
	// loop over the languages and load the files
	for ( long i = 0; i < MAX_LANGUAGES; i++ ) {
		// build the filename, refusing to truncate it silently
		char ff[128];
		int ffLen = snprintf ( ff, sizeof(ff),
				       "%slanglist/langlist.%li",
				       g_hostdb.m_dir, i );
		if ( ffLen < 0 || ffLen >= (int)sizeof(ff) ) {
			log ( "lang: Path of langlist.%li is too long.", i );
			if ( buf ) mfree ( buf, allocSize, "LangList" );
			return false;
		}
		int fd = open ( ff, O_RDONLY );
		// no language file, don't complain
		if ( fd < 0 ) continue;
		// get the size from the descriptor we just opened, so it is
		// the size of the very file we are about to read
		struct stat stats;
		stats.st_size = 0;
		if ( fstat ( fd, &stats ) != 0 ) {
			int err = errno;
			close ( fd );
			log ( "lang: Could not stat %s: %s.",
			      ff, strerror(err) );
			if ( buf ) mfree ( buf, allocSize, "LangList" );
			return false;
		}
		long fileSize = stats.st_size;
		// an empty list has no words to add
		if ( fileSize <= 0 ) {
			close ( fd );
			continue;
		}
		// grow the buffer if this file needs more room
		long thisAllocSize = 3 * fileSize;
		if ( thisAllocSize > allocSize ) {
			char *grown = (char*)mrealloc ( buf, allocSize,
							thisAllocSize,
							"LangList" );
			if ( ! grown ) {
				close ( fd );
				log ( "lang: Could not allocate %li bytes for "
				      "langlist buffer: %s.",
				      thisAllocSize, mstrerror(g_errno) );
				// NOTE(review): the old buffer is left alone
				// because mrealloc's contract on failure is
				// not visible from here -- confirm and free
				// it if mrealloc leaves it allocated
				return false;
			}
			buf       = grown;
			allocSize = thisAllocSize;
		}
		// read the file into the buffer
		if ( read ( fd, buf, fileSize ) != fileSize ) {
			// grab errno before close() can change it
			int err = errno;
			close ( fd );
			log ( "lang: Could not read %s: %s.",
			      ff, strerror(err) );
			mfree ( buf, allocSize, "LangList" );
			return false;
		}
		close ( fd );
		// . terminate the text so gbstrlen() below stops at the end
		//   of this file and not in leftovers of a previous one
		// . the buffer holds at least 3*fileSize bytes and fileSize
		//   is at least 1, so there is room for the terminator
		buf[fileSize] = '\0';
		long wordsInList = 0;
		long writtenLen  = gbstrlen(buf);
		// read the words out of the file
		w.reset();
		if ( ! w.set ( buf ,
			       fileSize ,
			       TITLEREC_CURRENT_VERSION, true, false ) ) {
			// deliberate crash, this is not expected to fail
			char *xx = NULL; *xx = 0;
			return false;
		}
		long numWords = w.getNumWords();
		for ( long j = 0; j < numWords; j++ ) {
			long long wordId = w.m_wordIds[j];
			if ( wordId == 0 ) continue;
			// see what the table already has for this word
			unsigned long score = m_langTable.getScore(&wordId);
			// already filed under this very language
			if ( score == (unsigned long)i ) continue;
			// a score of 0 means we have not seen it before
			if ( score == 0 ) {
				m_langTable.addTerm ( &wordId, i );
				wordsInList++;
				continue;
			}
			// otherwise it belongs to another language as well
			dupCount++;
			if ( score != 0x7fffffff )
				m_langTable.addTerm ( &wordId, 0x7fffffff );
		}
		// count the list
		listCount++;
		log ( LOG_DEBUG,
		      "lang: Successfully Loaded %li out of %li (%li bytes) "
		      "words from %s dictionary.",
		      wordsInList, numWords>>1, writtenLen,
		      getLanguageString(i) );
	}
	// free the buffer
	if ( buf ) mfree ( buf, allocSize, "LangList" );
	log ( LOG_INIT, "lang: Successfully Loaded %li Language Lists and "
	      "%li duplicate word hashes.",
	      listCount, dupCount );
	// all good
	return true;
}
// . looks up a word (term id) in the loaded language lists
// . returns true and sets *lang to the language id if the word is known
// . returns false and sets *lang to 0 otherwise
bool LangList::lookup ( long long termId,
			unsigned char *lang ) {
	// the table stores the language id as the term's score
	unsigned long langScore = m_langTable.getScore(&termId);
	// a usable score is nonzero and below MAX_LANGUAGES
	if ( langScore != 0 && langScore < MAX_LANGUAGES ) {
		*lang = (unsigned char)langScore;
		return true;
	}
	// not in any list, or not a valid language id
	*lang = 0;
	return false;
}
// . returns the country (or purpose) description for a TLD
// . "tld" need not be NUL terminated, "tldLen" is its length in bytes
// . everything up to and including the first '.' is ignored, so ".uk" and
//   "co.uk" are both looked up as "uk"
// . returns NULL if the TLD is not in s_tldInfo or the table could not
//   be built
char* LangList::getCountryFromTld(char* tld, long tldLen) {
	if ( ! tld || tldLen <= 0 ) return NULL;
	// initialize if not already initialized
	if ( s_numTlds == 0 && ! tldInit() ) return NULL;
	// skip to just past the first dot, if there is one
	for ( long j = 0; j < tldLen; j++ ) {
		if ( tld[j] != '.' ) continue;
		tld    += j + 1;
		tldLen -= j + 1;
		break;
	}
	// . m_tldToCountry is set up with 4-byte keys and 4-byte values
	// . so use 32-bit variables; reading the value through a "long *"
	//   would fetch 8 bytes where long is 64 bits wide
	int32_t key  = (int32_t)hash32(tld, tldLen);
	long    slot = m_tldToCountry.getSlot(&key);
	if ( slot < 0 ) return NULL;
	int32_t *ndx = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	// never index the static table with something it does not cover
	if ( ! ndx || *ndx < 0 || *ndx >= s_numTlds ) return NULL;
	return s_tldInfo[*ndx].m_country;
}
// . returns true if language id "lang" is plausible for the given TLD
// . "tld" need not be NUL terminated, "tldLen" is its length in bytes
// . everything up to and including the first '.' is ignored
// . returns true whenever nothing can be ruled out: lang is langUnknown,
//   the TLD is not in s_tldInfo, or the TLD lists no language we know
// . otherwise returns whether the TLD's language list names "lang"
bool LangList::isLangValidForTld(char* tld, long tldLen, unsigned char lang) {
	if ( lang == langUnknown ) return true; //not much we can do here.
	if ( ! tld || tldLen <= 0 ) return true;
	// initialize if not already initialized; without the table we
	// cannot rule anything out
	if ( s_numTlds == 0 && ! tldInit() ) return true;
	// skip to just past the first dot, if there is one
	for ( long j = 0; j < tldLen; j++ ) {
		if ( tld[j] != '.' ) continue;
		tld    += j + 1;
		tldLen -= j + 1;
		break;
	}
	// . m_tldToCountry is set up with 4-byte keys and 4-byte values
	// . so use 32-bit variables; reading the value through a "long *"
	//   would fetch 8 bytes where long is 64 bits wide
	int32_t key  = (int32_t)hash32(tld, tldLen);
	long    slot = m_tldToCountry.getSlot(&key);
	if ( slot < 0 ) return true;
	int32_t *tip = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	// deliberate crash, a found slot must have a value
	if ( ! tip ) { char *xx=NULL;*xx=0; }
	// never index the static table with something it does not cover
	if ( *tip < 0 || *tip >= s_numTlds ) return true;
	TldInfo* t = &s_tldInfo[*tip];
	// it is uninitialized, init on demand
	if ( t->m_languagebv == 0xffffffff ) {
		unsigned long bv = 0;
		for ( long i = 1; i <= langTagalog; i++ ) {
			if ( strstr(t->m_languages,getLanguageString(i)) == NULL )
				continue;
			// set the bit corresponding to this language
			bv |= 1UL << (i-1);
		}
		t->m_languagebv = bv;
	}
	if ( t->m_languagebv == 0 ) return true; //its unknown.
	// . the vector only has bits for language ids 1..32
	// . a larger shift count is undefined, and such an id can not have
	//   been set above anyway
	if ( lang > 32 ) return false;
	unsigned long mask = 1UL << (lang-1);
	return ( mask & t->m_languagebv ) != 0;
}
bool LangList::tldInit() {
s_numTlds = sizeof(s_tldInfo) / sizeof(TldInfo);
m_tldToCountry.set(4,4,0,NULL,0,false,0,"tldctrytbl");
for(long i = 0; i < s_numTlds; i++) {
long ndx = hash32n(s_tldInfo[i].m_tld);
if ( ! m_tldToCountry.addKey(&ndx , &i ) ) return false;
}
return true;
}
// . returns the language id whose entry in s_langToCatId equals "catid"
// . returns 0 if no entry matches
// . the first match wins, so a catid of 0 also yields 0
uint8_t LangList::catIdToLang(uint32_t catid) {
	// count the entries using the array's own element type
	const uint32_t numEntries =
		sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	for ( uint32_t i = 0; i < numEntries; i++ ) {
		if ( catid == (uint32_t)s_langToCatId[i] ) return (uint8_t)i;
	}
	return 0;
}
// . returns the category id stored in s_langToCatId for language id "lang"
// . returns 0, the table's "no category" value, for ids beyond the table
uint32_t LangList::langToCatId(uint8_t lang) {
	const uint32_t numEntries =
		sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	// lang can be anything up to 255, the table is much shorter
	if ( lang >= numEntries ) return 0;
	return (uint32_t)s_langToCatId[lang];
}
// . returns the language id whose entry in s_langToCatId equals "catid"
// . returns langUnknown if no entry matches
// . only ids below MAX_LANGUAGES are considered
uint8_t LangList::isLangCat(int catid) {
	// stay inside the table even if MAX_LANGUAGES exceeds its length
	const int tableSize =
		(int)( sizeof(s_langToCatId) / sizeof(s_langToCatId[0]) );
	const int maxLangs = (int)MAX_LANGUAGES;
	const int limit = ( maxLangs < tableSize ) ? maxLangs : tableSize;
	for ( int x = 0; x < limit; x++ )
		if ( catid == s_langToCatId[x] )
			return (uint8_t)x;
	return langUnknown;
}