open-source-search-engine/LangList.cpp
2014-11-10 14:45:11 -08:00

605 lines
23 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "gb-include.h"
#include "LangList.h"
#include "Indexdb.h"
// File-scope LangList instance defined by this translation unit.
LangList g_langList;
// One row of the static TLD table below.
struct TldInfo {
	// TLD without the leading dot, e.g. "com" or "de"
	char* m_tld;
	// human readable country / purpose; returned by getCountryFromTld()
	char* m_country;
	// comma separated, lower case language names, or "unknown".
	// isLangValidForTld() substring-matches getLanguageString() against it.
	char* m_languages;
	// bit (langId-1) is set if that language is listed in m_languages.
	// 0xffffffff means "not computed yet"; isLangValidForTld() fills it
	// in on first use. 0 means no recognized language (treated as unknown).
	uint32_t m_languagebv;
};

// number of entries in s_tldInfo. stays 0 until tldInit() has run, which
// is how the lookup functions know they still have to initialize.
static int32_t s_numTlds = 0;

// . static table of known TLDs
// . country names are stored as UTF-8
// . language lists were corrected for: do, gs, gw, io, ng, pg, ro, so
//   (they had been copy/pasted from neighboring or similarly named rows)
static TldInfo s_tldInfo[] = {
	{ "arpa", "Address and Routing Parameter Area", "unknown", 0xffffffff },
	{ "root", "N/A", "unknown", 0xffffffff },
	{ "aero", "air-transport industry", "unknown", 0xffffffff },
	{ "biz", "business", "unknown", 0xffffffff },
	{ "cat", "Catalan", "unknown", 0xffffffff },
	{ "com", "commercial", "unknown", 0xffffffff },
	{ "coop", "cooperatives", "unknown", 0xffffffff },
	{ "edu", "educational", "unknown", 0xffffffff },
	{ "gov", "governmental", "unknown", 0xffffffff },
	{ "info", "information", "unknown", 0xffffffff },
	{ "int", "international organizations", "unknown", 0xffffffff },
	{ "jobs", "companies", "unknown", 0xffffffff },
	{ "mil", "United States Military", "english,carolinian,chamorro,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "mobi", "mobile devices", "unknown", 0xffffffff },
	{ "museum", "museums", "unknown", 0xffffffff },
	{ "name", "individuals, by name", "unknown", 0xffffffff },
	{ "net", "network", "unknown", 0xffffffff },
	{ "org", "organization", "unknown", 0xffffffff },
	{ "pro", "professions", "unknown", 0xffffffff },
	{ "travel", "travel and travel-agency related sites", "unknown", 0xffffffff },
	{ "ac", "Ascension Island", "unknown", 0xffffffff },
	{ "ad", "Andorra", "catalan", 0xffffffff },
	{ "ae", "United Arab Emirates", "arabic", 0xffffffff },
	{ "af", "Afghanistan", "arabic,balochi,dari,nuristani,pashto,pamiri,"
		"pashai,turkmen,uzbek", 0xffffffff },
	{ "ag", "Antigua and Barbuda", "english", 0xffffffff },
	{ "ai", "Anguilla", "unknown", 0xffffffff },
	{ "al", "Albania", "albanian", 0xffffffff },
	{ "am", "Armenia", "armenian,armenian", 0xffffffff },
	{ "an", "Netherlands Antilles", "dutch,frisian", 0xffffffff },
	{ "ao", "Angola", "portuguese", 0xffffffff },
	{ "aq", "Antarctica", "unknown", 0xffffffff },
	{ "ar", "Argentina", "spanish,guarani", 0xffffffff },
	{ "as", "American Samoa", "english,samoan", 0xffffffff },
	{ "at", "Austria", "croatian,czech,german,hungarian,slovak,"
		"slovenian,romani", 0xffffffff },
	{ "au", "Australia", "australia", 0xffffffff },
	{ "aw", "Aruba", "unknown", 0xffffffff },
	{ "ax", "Åland", "unknown", 0xffffffff },
	{ "az", "Azerbaijan", "azerbaijani", 0xffffffff },
	{ "ba", "Bosnia and Herzegovina", "bosnian,croatian,serbian", 0xffffffff },
	{ "bb", "Barbados", "english", 0xffffffff },
	{ "bd", "Bangladesh", "bengala", 0xffffffff },
	{ "be", "Belgium", "dutch,french,german", 0xffffffff },
	{ "bf", "Burkina Faso", "french,more,jula,fula", 0xffffffff },
	{ "bg", "Bulgaria", "bulgarian", 0xffffffff },
	{ "bh", "Bahrain", "arabic", 0xffffffff },
	{ "bi", "Burundi", "french,kirundi", 0xffffffff },
	{ "bj", "Benin", "french", 0xffffffff },
	{ "bm", "Bermuda", "unknown", 0xffffffff },
	{ "bn", "Brunei Darussalam", "malay", 0xffffffff },
	{ "bo", "Bolivia", "spanish,aymara,quechua", 0xffffffff },
	{ "br", "Brazil", "portuguese", 0xffffffff },
	{ "bs", "Bahamas", "unknown", 0xffffffff },
	{ "bt", "Bhutan", "dzongkha,english", 0xffffffff },
	{ "bv", "Bouvet Island", "unknown", 0xffffffff },
	{ "bw", "Botswana", "english,kalanga,tswana", 0xffffffff },
	{ "by", "Belarus", "belarusian,russian", 0xffffffff },
	{ "bz", "Belize", "english", 0xffffffff },
	{ "ca", "Canada", "chipewyan,cree,dogrib,english,french,gwich?in,inuinnaqtun,"
		"inuktitut,inuvialuktun,slavey", 0xffffffff },
	{ "cc", "Cocos (Keeling) Islands", "unknown", 0xffffffff },
	{ "cd", "Democratic Republic of the Congo", "french,lingala,kikongo,swahili,"
		"tshiluba", 0xffffffff },
	{ "cf", "Central African Republic", "french,sango", 0xffffffff },
	{ "cg", "Republic of the Congo", "french,lingala,munukutuba", 0xffffffff },
	{ "ch", "Switzerland (Confoederatio Helvetica)", "french,german,italian,"
		"romansh", 0xffffffff },
	{ "ci", "Côte d'Ivoire", "french", 0xffffffff },
	{ "ck", "Cook Islands", "unknown", 0xffffffff },
	{ "cl", "Chile", "spanish", 0xffffffff },
	{ "cm", "Cameroon", "english,french", 0xffffffff },
	{ "cn", "People's Republic of China", "cantonese,english,kazakh,korean,"
		"mandarin,mongolian,portuguese,tajik,tibetan,uyghur,zhuang",
		0xffffffff },
	{ "co", "Colombia", "spanish", 0xffffffff },
	{ "cr", "Costa Rica", "spanish", 0xffffffff },
	{ "cu", "Cuba", "spanish", 0xffffffff },
	{ "cv", "Cape Verde", "crioulo,portuguese", 0xffffffff },
	{ "cx", "Christmas Island", "unknown", 0xffffffff },
	{ "cy", "Cyprus", "greek,turkish", 0xffffffff },
	{ "cz", "Czech Republic", "czech", 0xffffffff },
	{ "de", "Germany (Deutschland)", "danish,frisian,german,romani,"
		"lower sorbian,upper sorbian", 0xffffffff },
	{ "dj", "Djibouti", "arabic,french", 0xffffffff },
	{ "dk", "Denmark", "danish,faroese,kalaallisut", 0xffffffff },
	{ "dm", "Dominica", "english", 0xffffffff },
	// was "english"; the official language is spanish
	{ "do", "Dominican Republic", "spanish", 0xffffffff },
	{ "dz", "Algeria", "arabic,tamazight", 0xffffffff },
	{ "ec", "Ecuador", "spanish,quechua", 0xffffffff },
	{ "ee", "Estonia", "estonian", 0xffffffff },
	{ "eg", "Egypt", "arabic", 0xffffffff },
	{ "er", "Eritrea", "arabic,english,tigrinya", 0xffffffff },
	{ "es", "Spain (España)", "basque,catalan,galician,occitan,"
		"spanish", 0xffffffff },
	{ "et", "Ethiopia", "amharic", 0xffffffff },
	{ "eu", "European Union", "unknown", 0xffffffff },
	{ "fi", "Finland", "finnish,sami,swedish", 0xffffffff },
	{ "fj", "Fiji", "english,fijian,hindustani", 0xffffffff },
	{ "fk", "Falkland Islands", "unknown", 0xffffffff },
	{ "fm", "Federated States of Micronesia", "chuuk,english,kosraean,ponapean,"
		"ulithian,yapese", 0xffffffff },
	{ "fo", "Faroe Islands", "unknown", 0xffffffff },
	{ "fr", "France", "french,tahitian", 0xffffffff },
	{ "ga", "Gabon", "french", 0xffffffff },
	{ "gb", "United Kingdom (Great Britain)", "english,cornish,"
		"dgèrnésiais,english,french,irish,jèrriais,"
		"pitcairnese,scots,scottish gaelic,welsh", 0xffffffff },
	{ "gd", "Grenada", "english", 0xffffffff },
	{ "ge", "Georgia", "abkhaz,georgian,ossetic,russian", 0xffffffff },
	{ "gf", "French Guiana", "unknown", 0xffffffff },
	{ "gg", "Guernsey", "unknown", 0xffffffff },
	{ "gh", "Ghana", "adangme,dagaare,dagbani,english,ewe,ga,gonja,kasem,"
		"nzema,twi", 0xffffffff },
	{ "gi", "Gibraltar", "unknown", 0xffffffff },
	{ "gl", "Greenland", "unknown", 0xffffffff },
	{ "gm", "The Gambia", "unknown", 0xffffffff },
	{ "gn", "Guinea", "french,fula", 0xffffffff },
	{ "gp", "Guadeloupe", "unknown", 0xffffffff },
	{ "gq", "Equatorial Guinea", "french,spanish", 0xffffffff },
	{ "gr", "Greece", "greek", 0xffffffff },
	// was a copy of the "ge" (Georgia, the country) language list
	{ "gs", "South Georgia and the South Sandwich Islands", "unknown",
		0xffffffff },
	{ "gt", "Guatemala", "spanish", 0xffffffff },
	{ "gu", "Guam", "unknown", 0xffffffff },
	// was a copy of the "gn" (Guinea) language list
	{ "gw", "Guinea-Bissau", "portuguese", 0xffffffff },
	{ "gy", "Guyana", "english", 0xffffffff },
	{ "hk", "Hong Kong", "unknown", 0xffffffff },
	{ "hm", "Heard Island and McDonald Islands", "unknown", 0xffffffff },
	{ "hn", "Honduras", "spanish", 0xffffffff },
	{ "hr", "Croatia (Hrvatska)", "croatian,italian", 0xffffffff },
	{ "ht", "Haiti", "french,haitian creole", 0xffffffff },
	{ "hu", "Hungary", "hungarian", 0xffffffff },
	{ "id", "Indonesia", "balinese,javanese,indonesian,sundanese", 0xffffffff },
	{ "ie", "Ireland (Éire)", "unknown", 0xffffffff },
	{ "il", "Israel", "arabic,hebrew", 0xffffffff },
	{ "im", "Isle of Man", "unknown", 0xffffffff },
	{ "in", "India", "assamese,bengala,bodo,dogri,english,gujarati,hindi,kannada,"
		"kashmiri,konkani,maithili,malayalam,meitei,marathi,nepali,oriya,"
		"punjabi,sanskrit,santali,sindhi,tamil,telugu,urdu,french,karbi,"
		"bhojpuri,magadhi,maithili,chhattisgarhi,portuguese,pahari,tulu,"
		"garo,khasi,mizo,rajasthani,kokborok,nicobarese", 0xffffffff },
	// was a copy of the "in" (India) language list
	{ "io", "British Indian Ocean Territory", "unknown", 0xffffffff },
	{ "iq", "Iraq", "arabic,kurdish", 0xffffffff },
	{ "ir", "Iran", "persian", 0xffffffff },
	{ "is", "Iceland (Island)", "icelandic", 0xffffffff },
	{ "it", "Italy", "italian",
		//"albanian,catalan,croatian,franco-provençal,french,"
		// "friulian,german,greek,italian,ladin,occitan,sardinian,slovenian",
		0xffffffff },
	{ "je", "Jersey", "unknown", 0xffffffff },
	{ "jm", "Jamaica", "english", 0xffffffff },
	{ "jo", "Jordan", "arabic", 0xffffffff },
	{ "jp", "Japan", "japanese", 0xffffffff },
	{ "ke", "Kenya", "english,swahili", 0xffffffff },
	{ "kg", "Kyrgyzstan", "kirghiz,russian", 0xffffffff },
	{ "kh", "Cambodia (Khmer)", "khmer", 0xffffffff },
	{ "ki", "Kiribati", "english,kiribati", 0xffffffff },
	{ "km", "Comoros", "arabic,comorian,french", 0xffffffff },
	{ "kn", "Saint Kitts and Nevis", "english", 0xffffffff },
	{ "kr", "South Korea", "korean", 0xffffffff },
	{ "kw", "Kuwait", "arabic", 0xffffffff },
	{ "ky", "Cayman Islands", "unknown", 0xffffffff },
	{ "kz", "Kazakhstan", "kazakh,russian", 0xffffffff },
	{ "la", "Laos", "lao,french", 0xffffffff },
	{ "lb", "Lebanon", "arabic", 0xffffffff },
	{ "lc", "Saint Lucia", "english", 0xffffffff },
	{ "li", "Liechtenstein", "german", 0xffffffff },
	{ "lk", "Sri Lanka", "sinhala,tamil", 0xffffffff },
	{ "lr", "Liberia", "english", 0xffffffff },
	{ "ls", "Lesotho", "english,sotho", 0xffffffff },
	{ "lt", "Lithuania", "lithuanian", 0xffffffff },
	{ "lu", "Luxembourg", "french,german,luxembourgish", 0xffffffff },
	{ "lv", "Latvia", "latvian", 0xffffffff },
	{ "ly", "Libya", "arabic", 0xffffffff },
	{ "ma", "Morocco", "arabic", 0xffffffff },
	{ "mc", "Monaco", "french", 0xffffffff },
	{ "md", "Moldova", "gagauz,moldovan,russian,ukrainian", 0xffffffff },
	{ "mg", "Madagascar", "french,malagasy", 0xffffffff },
	{ "mh", "Marshall Islands", "english,marshallese", 0xffffffff },
	{ "mk", "Republic of Macedonia", "unknown", 0xffffffff },
	{ "ml", "Mali", "french", 0xffffffff },
	{ "mm", "Myanmar", "burmese", 0xffffffff },
	{ "mn", "Mongolia", "mongolian", 0xffffffff },
	{ "mo", "Macau", "unknown", 0xffffffff },
	{ "mp", "Northern Mariana Islands", "unknown", 0xffffffff },
	{ "mq", "Martinique", "unknown", 0xffffffff },
	{ "mr", "Mauritania", "arabic,fula,soninke,wolof", 0xffffffff },
	{ "ms", "Montserrat", "unknown", 0xffffffff },
	{ "mt", "Malta", "english,maltese", 0xffffffff },
	{ "mu", "Mauritius", "english,french", 0xffffffff },
	{ "mv", "Maldives", "dhivehi", 0xffffffff },
	{ "mw", "Malawi", "chichewa,english", 0xffffffff },
	{ "mx", "Mexico", "spanish", 0xffffffff },
	{ "my", "Malaysia", "malay", 0xffffffff },
	{ "mz", "Mozambique", "portuguese", 0xffffffff },
	{ "na", "Namibia", "english", 0xffffffff },
	{ "nc", "New Caledonia", "unknown", 0xffffffff },
	{ "ne", "Niger", "french", 0xffffffff },
	{ "nf", "Norfolk Island", "unknown", 0xffffffff },
	// was "french" (copied from "ne"); the official language is english
	{ "ng", "Nigeria", "english", 0xffffffff },
	{ "ni", "Nicaragua", "spanish", 0xffffffff },
	{ "nl", "Netherlands", "dutch,frisian", 0xffffffff },
	{ "no", "Norway", "norwegian,norwegian,sami", 0xffffffff },
	{ "np", "Nepal", "nepali", 0xffffffff },
	{ "nr", "Nauru", "english,nauruan", 0xffffffff },
	{ "nu", "Niue", "unknown", 0xffffffff },
	{ "nz", "New Zealand", "english,maori,new zealand sign language,"
		"cook islands maori,niuean,tokelauan", 0xffffffff },
	{ "om", "Oman", "arabic", 0xffffffff },
	{ "pa", "Panama", "spanish", 0xffffffff },
	{ "pe", "Peru", "quechua,aymara,spanish", 0xffffffff },
	{ "pf", "French Polynesia", "unknown", 0xffffffff },
	// was a copy of the "gn" (Guinea) language list
	{ "pg", "Papua New Guinea", "english,tok pisin,hiri motu", 0xffffffff },
	{ "ph", "Philippines", "arabic,bikol,cebuano,english,filipino,"
		"hiligaynon,ilokano,kapampangan,kinaray-a,maranao,"
		"maguindanao,pangasinan,spanish,tagalog,tausug,"
		"waray-waray", 0xffffffff },
	{ "pk", "Pakistan", "english,urdu", 0xffffffff },
	{ "pl", "Poland", "polish", 0xffffffff },
	{ "pm", "Saint-Pierre and Miquelon", "unknown", 0xffffffff },
	{ "pn", "Pitcairn Islands", "unknown", 0xffffffff },
	{ "pr", "Puerto Rico", "unknown", 0xffffffff },
	{ "ps", "Palestinian territories", "unknown", 0xffffffff },
	{ "pt", "Portugal", "portuguese,mirandese", 0xffffffff },
	{ "pw", "Palau", "english,palauan,japanese", 0xffffffff },
	{ "py", "Paraguay", "guaraní,spanish", 0xffffffff },
	{ "qa", "Qatar", "arabic", 0xffffffff },
	{ "re", "Réunion", "unknown", 0xffffffff },
	// was "arabic" (copied from "qa"); the official language is romanian
	{ "ro", "Romania", "romanian", 0xffffffff },
	{ "ru", "Russia", "abaza,adyghe,agul,altay,avar,bashkir,"
		"buryat,chechen,chukchi,chuvash,dargin,dolgan,"
		"erzya,evenk,ingush,kabardian,kalmyk,karachay-balkar,"
		"khakas,khanty,komi-permyak,komi-zyrian,koryak,kumyk,"
		"lak,lezgi,mansi,mari,moksha,nogai,nenets,ossetic,russian,"
		"tabasaran,tatar,tuvin,udmurt,yakut,yiddish", 0xffffffff },
	{ "rw", "Rwanda", "english,french,kinyarwanda", 0xffffffff },
	{ "sa", "Saudi Arabia", "arabic", 0xffffffff },
	{ "sb", "Solomon Islands", "english", 0xffffffff },
	{ "sc", "Seychelles", "english,french,seselwa", 0xffffffff },
	{ "sd", "Sudan", "arabic,english", 0xffffffff },
	{ "se", "Sweden", "swedish,finnish,meänkieli,romani,sami,"
		"yiddish", 0xffffffff },
	{ "sg", "Singapore", "english,malay,mandarin,tamil", 0xffffffff },
	{ "sh", "Saint Helena", "unknown", 0xffffffff },
	{ "si", "Slovenia", "hungarian,italian,slovenian", 0xffffffff },
	{ "sj", "Svalbard and Jan Mayen Islands", "unknown", 0xffffffff },
	{ "sk", "Slovakia", "slovak", 0xffffffff },
	{ "sl", "Sierra Leone", "english", 0xffffffff },
	{ "sm", "San Marino", "italian", 0xffffffff },
	{ "sn", "Senegal", "french,jola-fogny,malinke,mandinka,pulaar,"
		"serer-sine,wolof", 0xffffffff },
	// was "french"; the official languages are somali and arabic
	{ "so", "Somalia", "somali,arabic", 0xffffffff },
	{ "sr", "Suriname", "dutch", 0xffffffff },
	{ "st", "São Tomé and Príncipe", "portuguese",
		0xffffffff },
	{ "su", "former Soviet Union", "unknown", 0xffffffff },
	{ "sv", "El Salvador", "spanish", 0xffffffff },
	{ "sy", "Syria", "arabic,french", 0xffffffff },
	{ "sz", "Swaziland", "english,swazi", 0xffffffff },
	{ "tc", "Turks and Caicos Islands", "unknown", 0xffffffff },
	{ "td", "Chad", "arabic,french", 0xffffffff },
	{ "tf", "French Southern and Antarctic Lands", "unknown", 0xffffffff },
	{ "tg", "Togo", "french", 0xffffffff },
	{ "th", "Thailand", "thai", 0xffffffff },
	{ "tj", "Tajikistan", "tajik", 0xffffffff },
	{ "tk", "Tokelau", "unknown", 0xffffffff },
	{ "tl", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tm", "Turkmenistan", "turkmen", 0xffffffff },
	{ "tn", "Tunisia", "arabic", 0xffffffff },
	{ "to", "Tonga", "english,tongan", 0xffffffff },
	{ "tp", "East Timor", "english,indonesian,portuguese,tetum", 0xffffffff },
	{ "tr", "Turkey", "turkish", 0xffffffff },
	{ "tt", "Trinidad and Tobago", "english", 0xffffffff },
	{ "tv", "Tuvalu", "english,tuvaluan", 0xffffffff },
	{ "tw", "Taiwan, Republic of China", "mandarin", 0xffffffff },
	{ "tz", "Tanzania", "english,swahili", 0xffffffff },
	{ "ua", "Ukraine", "ukrainian", 0xffffffff },
	{ "ug", "Uganda", "english,swahili", 0xffffffff },
	{ "uk", "United Kingdom", "british,cornish,dgèrnésiais,"
		"irish,jèrriais,pitcairnese,scots,scottish gaelic,"
		"welsh", 0xffffffff },
	{ "um", "United States Minor Outlying Islands", "english,carolinian,chamorro,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "us", "United States of America", "english,carolinian,chamorro,english,"
		"hawaiian,samoan,spanish", 0xffffffff },
	{ "uy", "Uruguay", "spanish", 0xffffffff },
	{ "uz", "Uzbekistan", "uzbek", 0xffffffff },
	{ "va", "Vatican City State", "latin", 0xffffffff },
	{ "vc", "Saint Vincent and the Grenadines", "english", 0xffffffff },
	{ "ve", "Venezuela", "spanish", 0xffffffff },
	{ "vg", "British Virgin Islands", "unknown", 0xffffffff },
	{ "vi", "U.S. Virgin Islands", "unknown", 0xffffffff },
	{ "vn", "Vietnam", "vietnamese", 0xffffffff },
	{ "vu", "Vanuatu", "bislama,english,french", 0xffffffff },
	{ "wf", "Wallis and Futuna", "unknown", 0xffffffff },
	{ "ws", "Samoa", "english,samoan", 0xffffffff },
	{ "ye", "Yemen", "arabic", 0xffffffff },
	{ "yt", "Mayotte", "unknown", 0xffffffff },
	{ "yu", "Yugoslavia", "unknown", 0xffffffff },
	{ "za", "South Africa (Zuid-Afrika)", "afrikaans,english,ndebele,"
		"northern sotho,sotho,swazi,tsonga,tswana,venda,xhosa,zulu",
		0xffffffff },
	{ "zm", "Zambia", "english", 0xffffffff },
	{ "zw", "Zimbabwe", "unknown", 0xffffffff },
};
// . maps a language id (the array index) to a category id ("catid")
// . 0 means no catid is associated with that language
// . read by catIdToLang(), langToCatId() and isLangCat()
// . NOTE(review): the catids presumably refer to directory topic ids;
//   confirm against whatever produces the catids passed in by callers
static int s_langToCatId[] = {
0, // langUnknown
0, // langEnglish
476, // langFrench
471, // langSpanish
484, // langRussian
49884, // langJapanese
472, // langChineseTrad
494, // langChineseSimp
493, // langKorean
911729, // langGerman
478, // langDutch
477, // langItalian
503, // langFinnish
485, // langSwedish
487, // langNorwegian
483, // langPortuguese
116289, // langVietnamese
88070, // langArabic
118215, // langHebrew
464465, // langIndonesian
482, // langGreek
501, // langThai
51663, // langHindi
241315, // langBengala
480, // langPolish
173548, // langTagalog
0, // langBritish (Sadly, there are no British, UK, or Australian topics)
0, // langAustralia
0 // langUnknown, end of list
};
// Constructor. Does no work; the word table is filled by loadLists() and
// the TLD table by tldInit().
LangList::LangList ( ) {
}
// Destructor. Resets both tables via reset().
LangList::~LangList ( ) {
reset();
}
// Resets the word-id -> language table and the TLD -> s_tldInfo index table.
// Note that s_numTlds is not touched here.
void LangList::reset ( ) {
m_langTable.reset();
m_tldToCountry.reset();
}
// . returns false and sets errno on error
// . loads language lists into memory
// . looks under the langlist/ directory for langlist.# files
// each number corrisponds to a language
bool LangList::loadLists ( ) {
//log ( LOG_INIT, "lang: Loading Language Lists.");
// init the term table
m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
// loop over the languages and load the files
int32_t listCount = 0;
int32_t dupCount = 0;
int32_t allocSize = 0;
char *buf = NULL;
Words w;
for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ) {
// load the file for reading
char ff[128];
sprintf(ff, "%slanglist/langlist.%"INT32"", g_hostdb.m_dir, i );
int fd = open ( ff, O_RDONLY );
// no language file, don't complain
if ( fd < 0 ) continue;
// get the size
struct stat stats;
stats.st_size = 0;
int status = stat ( ff, &stats );
if ( status != 0 ) {
close(fd);
log ( "lang: Could not stat %s: %s.",
ff, strerror(errno) );
return false;
}
int32_t fileSize = stats.st_size;
// read the file into a buffer
int32_t thisAllocSize = 3 * fileSize;
if(thisAllocSize > allocSize) {
buf = (char*)mrealloc(buf, allocSize, thisAllocSize,
"LangList");
allocSize = thisAllocSize;
}
if ( !buf ) {
close(fd);
log ( "lang: Could not allocate %"INT32" bytes for "
"langlist buffer: %s.",
thisAllocSize, mstrerror(g_errno) );
return false;
}
if ( read ( fd, buf, fileSize ) != fileSize ) {
close(fd);
log ( "lang: Could not read %s: %s.",
ff, strerror(errno) );
return false;
}
close(fd);
// read the words out of the file
// char *p = buf;
// char *pEnd = buf + fileSize;
// *pEnd = '\0';
//UChar* ucBuf = (UChar*)(buf + fileSize);
//int32_t ucBufLen = fileSize * 2;
int32_t wordsInList = 0;
int32_t writtenLen = gbstrlen(buf);
//int32_t writtenLen = ucToUnicode(ucBuf, ucBufLen,
// buf, fileSize,
// "UTF-8", -1,
// TITLEREC_CURRENT_VERSION);
w.reset();
//doubling the written length seems hackish, may
//need to be fixed in ucToUnicode.
if(!w.set (buf ,
fileSize ,
TITLEREC_CURRENT_VERSION,true, false)) {
char *xx = NULL; *xx = 0;
return false;
}
int32_t numWords = w.getNumWords();
for(int32_t j = 0; j < numWords; j++) {
int64_t wordId = w.m_wordIds[j];
if(wordId == 0) continue;
// add it to the table
uint32_t score = m_langTable.getScore(&wordId);
//log(LOG_WARN,
// "lang: Successfully hash %"INT64" from %s dictionary.",
//wordId, getLanguageString(i));
if ( score != (uint32_t)i ) {
if ( score > 0 ) {
dupCount++;
if ( score != 0x7fffffff )
m_langTable.addTerm ( &wordId,
0x7fffffff);
}
else {
m_langTable.addTerm ( &wordId, i );
wordsInList++;
}
}
}
// count the list
listCount++;
if ( wordsInList > 0 )
log ( LOG_DEBUG,
"lang: Successfully Loaded %"INT32" out of %"INT32" (%"INT32" bytes) "
"words from %s dictionary.",
wordsInList, numWords>>1, writtenLen, getLanguageString(i) );
}
// free the buffer
if(buf) mfree ( buf, allocSize, "LangList" );
log ( LOG_INIT, "lang: Successfully Loaded %"INT32" Language Lists and "
"%"INT32" duplicate word hashes.",
listCount, dupCount );
// all good
return true;
}
// . looks a term id up in the table filled by loadLists()
// . on a hit stores the language id in *lang and returns true
// . on a miss (score of 0, or a score that is not a language id)
//   stores 0 in *lang and returns false
bool LangList::lookup ( int64_t termId,
			unsigned char *lang ) {
	const uint32_t hit = m_langTable.getScore(&termId);
	// only scores strictly between 0 and MAX_LANGUAGES name a language
	const bool known = ( hit != 0 && hit < MAX_LANGUAGES );
	*lang = known ? (unsigned char)hit : 0;
	return known;
}
// . returns the country/purpose string for a TLD, or NULL if the TLD is
//   not in s_tldInfo
// . "tld" need not be \0 terminated, tldLen is its length
// . if "tld" contains a dot, only the part after the FIRST dot is looked up
// . builds the lookup table on first use
char* LangList::getCountryFromTld(char* tld, int32_t tldLen) {
	// nothing to look up
	if ( ! tld || tldLen <= 0 ) return NULL;
	// initialize if not already initialized
	if ( s_numTlds == 0 && ! tldInit() ) return NULL;
	// skip everything up to and including the first dot
	for ( int32_t j = 0 ; j < tldLen ; j++ ) {
		if ( tld[j] != '.' ) continue;
		tld    += j + 1;
		tldLen -= j + 1;
		break;
	}
	int32_t index = hash32(tld, tldLen);
	int32_t slot  = m_tldToCountry.getSlot(&index);
	if ( slot < 0 ) return NULL;
	// the stored value is an index into s_tldInfo. treat a missing or
	// out of range value as "not found" rather than dereferencing it.
	int32_t *tip = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	if ( ! tip || *tip < 0 || *tip >= s_numTlds ) return NULL;
	return s_tldInfo[*tip].m_country;
}
// . returns true if language "lang" is plausible for the given TLD
// . returns true whenever nothing can be said: lang is langUnknown, the
//   TLD is not in s_tldInfo, or none of the TLD's listed languages is
//   recognized
// . if "tld" contains a dot, only the part after the FIRST dot is looked up
// . the per-TLD language bit vector is computed lazily on first use
bool LangList::isLangValidForTld(char* tld, int32_t tldLen, unsigned char lang) {
	if ( lang == langUnknown ) return true; //not much we can do here.
	// no tld, no opinion
	if ( ! tld || tldLen <= 0 ) return true;
	// initialize if not already initialized
	if ( s_numTlds == 0 ) tldInit();
	// skip everything up to and including the first dot
	for ( int32_t j = 0 ; j < tldLen ; j++ ) {
		if ( tld[j] != '.' ) continue;
		tld    += j + 1;
		tldLen -= j + 1;
		break;
	}
	int32_t index = hash32(tld, tldLen);
	int32_t slot  = m_tldToCountry.getSlot(&index);
	if ( slot < 0 ) return true;
	int32_t *tip = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
	// deliberate crash, kept from the original
	if ( ! tip ) { char *xx=NULL;*xx=0; }
	TldInfo* t = &s_tldInfo[*tip];
	// it is uninitialized, init on demand
	if ( t->m_languagebv == 0xffffffff ) {
		uint32_t bv = 0;
		for ( int32_t i = 1 ; i <= langTagalog ; i++ ) {
			if ( strstr(t->m_languages,getLanguageString(i)) == NULL )
				continue;
			// set the bit corresponding to lang, using an
			// unsigned shift so the result is well defined
			bv |= (uint32_t)1 << (i-1);
		}
		t->m_languagebv = bv;
	}
	if ( t->m_languagebv == 0 ) return true; //its unknown.
	// . a language whose bit does not fit in the 32-bit vector can never
	//   have been set above, so it is not in the list
	// . shifting by 32 or more would be undefined behavior
	const int32_t bit = (int32_t)lang - 1;
	if ( bit < 0 || bit >= 32 ) return false;
	const uint32_t mask = (uint32_t)1 << bit;
	return ( mask & t->m_languagebv ) != 0;
}
bool LangList::tldInit() {
s_numTlds = sizeof(s_tldInfo) / sizeof(TldInfo);
m_tldToCountry.set(4,4,0,NULL,0,false,0,"tldctrytbl");
for(int32_t i = 0; i < s_numTlds; i++) {
int32_t ndx = hash32n(s_tldInfo[i].m_tld);
if ( ! m_tldToCountry.addKey(&ndx , &i ) ) return false;
}
return true;
}
// . returns the language id whose s_langToCatId entry equals "catid"
// . returns 0 if no entry matches
// . the first matching index wins, so a catid of 0 always yields 0
uint8_t LangList::catIdToLang(uint32_t catid) {
	// element count derived from the array itself, not from an
	// unrelated type that merely happens to have the same size
	const uint32_t count = sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	for ( uint32_t i = 0 ; i < count ; i++ ) {
		if ( catid == (uint32_t)s_langToCatId[i] ) return (uint8_t)i;
	}
	return 0;
}
// . returns the catid stored in s_langToCatId for language "lang"
// . returns 0 (the table's "no catid" value) for a language id that lies
//   beyond the end of the table instead of reading past it
uint32_t LangList::langToCatId(uint8_t lang) {
	const uint32_t count = sizeof(s_langToCatId) / sizeof(s_langToCatId[0]);
	if ( lang >= count ) return 0;
	return s_langToCatId[lang];
}
// . returns the language id whose s_langToCatId entry equals "catid",
//   or langUnknown if there is none
// . only ids below MAX_LANGUAGES are considered
// . MAX_LANGUAGES is defined elsewhere and may exceed the table's length,
//   so the scan is also bounded by the table itself
uint8_t LangList::isLangCat(int catid) {
	const int count = (int)(sizeof(s_langToCatId) / sizeof(s_langToCatId[0]));
	for ( int x = 0 ; x < MAX_LANGUAGES && x < count ; x++ )
		if ( catid == s_langToCatId[x] )
			return (uint8_t)x;
	return(langUnknown);
}