open-source-search-engine/Lang.h
2013-08-02 13:12:24 -07:00

98 lines
2.7 KiB
C

// Matt Wells, copyright Jul 2001
// . language detector
// . TODO: use stopwords in doc to determine the language
#ifndef _LANG_H_
#define _LANG_H_
#define MAX_LANGUAGES 64
// for langs 1-55, exclude translingual
// 64 - 8 is 56, then minus 1 is 55 bits
// translingual is the 31st bit, english is the first bit
#define LANG_BIT_MASK 0x007fffffffffffffLL
#include "Unicode.h"
#include "Iso8859.h"
#include "iana_charset.h"
enum {
langUnknown = 0,
langEnglish = 1,
langFrench = 2,
langSpanish = 3,
langRussian = 4,
langTurkish = 5,
langJapanese = 6,
langChineseTrad = 7, // cantonese
langChineseSimp = 8, // mandarin
langKorean = 9,
langGerman = 10,
langDutch = 11,
langItalian = 12,
langFinnish = 13,
langSwedish = 14,
langNorwegian = 15,
langPortuguese = 16,
langVietnamese = 17,
langArabic = 18,
langHebrew = 19,
langIndonesian = 20,
langGreek = 21,
langThai = 22,
langHindi = 23,
langBengala = 24,
langPolish = 25,
langTagalog = 26,
// added for wiktionary
langLatin = 27,
langEsperanto = 28,
langCatalan = 29,
langBulgarian = 30,
langTranslingual = 31, // used by multiple langs in wiktionary
langSerboCroatian = 32,
langHungarian = 33,
langDanish = 34,
langLithuanian = 35,
langCzech = 36,
langGalician = 37,
langGeorgian = 38,
langScottishGaelic = 39,
langGothic = 40,
langRomanian = 41,
langIrish = 42,
langLatvian = 43,
langArmenian = 44,
langIcelandic = 45,
langAncientGreek = 46,
langManx = 47,
langIdo = 48,
langPersian = 49,
langTelugu = 50,
langVenetian = 51,
langMalgasy = 52,
langKurdish = 53,
langLuxembourgish = 54,
langEstonian = 55
};
uint8_t getLanguageFromName(uint8_t *name);
uint8_t getLangIdFromAbbr ( char *abbr ) ;
char *getLangAbbr ( uint8_t langId ) ;
void languageToString ( unsigned char lang , char *buf );
char* getLanguageString ( unsigned char lang);
char* getNativeLanguageString ( unsigned char lang);
char* getLanguageAbbr ( unsigned char lang);
unsigned char getLanguageCharset ( unsigned char LangId );
bool isAdult( char *s, long slen, char **loc = NULL );
//unsigned char getLanguageFromScript(UChar32 c);
unsigned char getLanguageFromAbbr(char *abbr);
unsigned char getLanguageFromAbbrN(char *abbr);
//unsigned char getLanguageFromUnicodeAbbr(UChar *abbr);
// abbr is now in utf8
unsigned char getLanguageFromUnicodeAbbr(char *abbr);
unsigned char getLanguageFromUserAgent(char *abbr);
unsigned char getLanguageFromCountryCode(char *code);
#endif