open-source-search-engine/UnicodeProperties.h
2013-08-02 13:12:24 -07:00

317 lines
7.8 KiB
C

#ifndef UNICODE_PROPERTIES_H__
#define UNICODE_PROPERTIES_H__
#include <sys/types.h>
#include "UCPropTable.h"
// Character types used throughout this header. The first three are only
// declared when USE_ICU is not defined (presumably ICU's own headers supply
// UChar32 and UChar in that case -- confirm).
#ifndef USE_ICU
// A single code point. NOTE(review): unsigned long is 64 bits wide on LP64
// platforms; ucToLower()/ucToUpper() read table entries through this type, so
// confirm the table entry width matches sizeof(UChar32) on every target.
typedef unsigned long UChar32;
// 16-bit character unit (presumably a UTF-16 code unit -- confirm).
typedef unsigned short UChar;
typedef unsigned char UChar8; // utf-8
#endif
// Bit set of UC_* flags for one code point (see the UC_* defines below).
typedef unsigned short UCProps;
// Script identifier; ucGetScript() returns enum ucScript values in this type.
typedef unsigned char UCScript;
// Table persistence and decomposition-table access. None of these functions
// is defined in this header; the descriptions below are inferred from the
// signatures only -- confirm against the implementation file.

// Presumably fills `table` from the file `filename`. useChecksum defaults to
// false and expectedChecksum to 0.
bool loadUnicodeTable(UCPropTable *table, char *filename, bool useChecksum = false, unsigned long expectedChecksum = 0);
// Presumably writes `table` to the file `filename`.
bool saveUnicodeTable(UCPropTable *table, char *filename);
// JAB: we now have Kompatible and Canonical decomposition tables
// baseDir defaults to NULL in all three load/save calls below.
bool saveKDecompTable(char *baseDir = NULL) ;
bool saveCDecompTable(char *baseDir = NULL) ;
// JAB: we now have Kompatible and Canonical decomposition tables
bool loadDecompTables(char *baseDir = NULL) ;
void resetDecompTables() ;
// Kompatible (KD) decomposition accessors: set/get the decomposition of code
// point c as an array of decompCount code points. fullComp defaults to false
// on set and to NULL (not requested) on get.
bool setKDValue(UChar32 c, UChar32* decomp, long decompCount,
bool fullComp = false);
UChar32 *getKDValue(UChar32 c, long *decompCount, bool *fullComp = NULL);
// Presumably expands c recursively into buf (capacity bufSize entries) and
// returns the number of code points written -- TODO confirm.
long recursiveKDExpand(UChar32 c, UChar32 *buf, long bufSize);
// JAB: we now have Kompatible and Canonical decomposition tables
// Canonical (CD) counterparts of the KD accessors above.
bool setCDValue(UChar32 c, UChar32* decomp, long decompCount,
bool fullComp = false);
UChar32 *getCDValue(UChar32 c, long *decompCount);
long recursiveCDExpand(UChar32 c, UChar32 *buf, long bufSize);
// Per-code-point queries. Unless noted otherwise, each function is defined
// inline near the end of this header and returns false/0 when the relevant
// table has no entry for c.

// The whole UC_* flag word stored for c in g_ucProps.
UCProps ucProperties(UChar32 c);
// UC_ALPHA / UC_DIGIT flag tests.
bool ucIsAlpha(UChar32 c);
bool ucIsDigit(UChar32 c);
// Tests the UC_WORDCHAR flag (not UC_ALPHA or UC_DIGIT).
bool ucIsAlnum(UChar32 c);
// UC_UPPER / UC_LOWER flag tests.
bool ucIsUpper(UChar32 c);
bool ucIsLower(UChar32 c);
// Numeric value of the ASCII digits '0'..'9'; 0 for every other code point.
long ucDigitValue(UChar32 c);
// Case mapping via g_ucLowerMap / g_ucUpperMap; c itself is returned when
// there is no entry or the stored mapping is 0.
UChar32 ucToLower(UChar32 c);
UChar32 ucToUpper(UChar32 c);
// Value stored for c in g_ucCombiningClass.
unsigned char ucCombiningClass(UChar32 c);
// UC_WHITESPACE flag test; is_wspace_uc performs the same test.
bool ucIsWhiteSpace(UChar32 c);
bool is_wspace_uc(UChar32 c);
// UC_IDEOGRAPH flag test.
bool ucIsIdeograph(UChar32 c);
// Both return !ucIsWordChar(c).
bool ucIsPunct(UChar32 c);
bool is_punct_uc(UChar32 c);
// Not defined in this header.
bool ucIsWordChar(UChar32 c);
// UC_IGNORABLE / UC_EXTEND flag tests.
bool ucIsIgnorable(UChar32 c);
bool ucIsExtend(UChar32 c);
// Not defined in this header. Presumably reports whether the len-unit string
// s is already in normalization form NFKC -- TODO confirm.
bool isNFKC(UChar *s, long len);
// Script stored for c in g_ucScripts; ucScriptCommon when there is no entry.
UCScript ucGetScript(UChar32 c);
// Flag bits of a UCProps word, one bit per property. Fourteen of the sixteen
// bits of the unsigned short are in use.

// Parse Properties
#define UC_WORDCHAR   0x0001  // tested by ucIsAlnum()
#define UC_IGNORABLE  0x0002  // tested by ucIsIgnorable()
#define UC_IDEOGRAPH  0x0004  // tested by ucIsIdeograph()
#define UC_HIRAGANA   0x0008
#define UC_KATAKANA   0x0010
#define UC_THAI       0x0020
#define UC_EXTEND     0x0040  // tested by ucIsExtend()
// General Properties
#define UC_ALPHA      0x0080  // tested by ucIsAlpha()
#define UC_DIGIT      0x0100  // tested by ucIsDigit()
#define UC_UPPER      0x0200  // tested by ucIsUpper()
#define UC_LOWER      0x0400  // tested by ucIsLower()
#define UC_WHITESPACE 0x0800  // tested by ucIsWhiteSpace() / is_wspace_uc()
#define UC_NFKC_QC_NO 0x1000
#define UC_COMP_EX    0x2000
// Global lookup tables, defined elsewhere. The inline functions in this
// header index them by code point through UCPropTable::getValue().

// UCProps flag word per code point (read by ucProperties() and the ucIs*()
// functions).
extern UCPropTable g_ucProps;
// Case mappings, read as UChar32 by ucToLower() / ucToUpper().
extern UCPropTable g_ucLowerMap;
extern UCPropTable g_ucUpperMap;
// Not referenced by any inline function in this header.
extern UCPropTable g_ucCategory;
// UCScript value per code point (read by ucGetScript()).
extern UCPropTable g_ucScripts;
// Not referenced by any inline function in this header; presumably the index
// behind getKDValue()/setKDValue() -- confirm.
extern UCPropTable g_ucKDIndex;
// Read by ucCombiningClass().
extern UCPropTable g_ucCombiningClass;
// Presumably one name per enum ucScript value, in enum order -- confirm.
extern char *g_ucScriptNames[];
// Script identifiers, as returned (narrowed to UCScript) by ucGetScript().
// Values are assigned implicitly from 0, so the order of the enumerators is
// significant: do not insert or reorder entries without also updating
// anything that stores or indexes by these numbers (presumably the script
// table data and g_ucScriptNames -- confirm).
enum ucScript {
// Also the value ucGetScript() returns when the table has no entry.
ucScriptCommon = 0,
ucScriptArabic,
ucScriptArmenian,
ucScriptBengali,
ucScriptBopomofo,
ucScriptBraille,
ucScriptBuhid,
ucScriptCanadian_Aboriginal,
ucScriptCherokee,
ucScriptCypriot,
ucScriptCyrillic, // 10 = russian
ucScriptDeseret,
ucScriptDevanagari,
ucScriptEthiopic,
ucScriptGeorgian,
ucScriptGothic, // 15
ucScriptGreek,
ucScriptGujarati,
ucScriptGurmukhi,
ucScriptHan,
ucScriptHangul, // 20
ucScriptHanunoo,
ucScriptHebrew, // 22
ucScriptHiragana,
ucScriptInherited, // 24
ucScriptKannada,
ucScriptKatakana,
ucScriptKatakana_Or_Hiragana,
ucScriptKhmer,
ucScriptLao,
ucScriptLatin, // 30
ucScriptLimbu,
ucScriptLinear_B,
ucScriptMalayalam,
ucScriptMongolian,
ucScriptMyanmar,
ucScriptOgham,
ucScriptOld_Italic,
ucScriptOriya,
ucScriptOsmanya,
ucScriptRunic, // 40
ucScriptShavian,
ucScriptSinhala,
ucScriptSyriac,
ucScriptTagalog,
ucScriptTagbanwa,
ucScriptTai_Le,
ucScriptTamil,
ucScriptTelugu,
ucScriptThaana,
ucScriptThai, // 50
ucScriptTibetan,
ucScriptUgaritic,
ucScriptYi,
// Number of scripts above (54); not a script itself.
ucScriptNumScripts
};
// Single-bit masks, one per property, occupying bits 0 through 29 so that
// any combination can be OR'ed into one 32-bit word. The enumerator names
// appear to follow the property names of Unicode's PropList.txt.
enum UCProperty {
    ucPropASCII_Hex_Digit                    = 0x00000001,
    ucPropBidi_Control                       = 0x00000002,
    ucPropDash                               = 0x00000004,
    ucPropDeprecated                         = 0x00000008,
    ucPropDiacritic                          = 0x00000010,
    ucPropExtender                           = 0x00000020,
    ucPropGrapheme_Link                      = 0x00000040,
    ucPropHex_Digit                          = 0x00000080,
    ucPropHyphen                             = 0x00000100,
    ucPropIDS_Binary_Operator                = 0x00000200,
    ucPropIDS_Trinary_Operator               = 0x00000400,
    ucPropIdeographic                        = 0x00000800,
    ucPropJoin_Control                       = 0x00001000,
    ucPropLogical_Order_Exception            = 0x00002000,
    ucPropNoncharacter_Code_Point            = 0x00004000,
    ucPropOther_Alphabetic                   = 0x00008000,
    ucPropOther_Default_Ignorable_Code_Point = 0x00010000,
    ucPropOther_Grapheme_Extend              = 0x00020000,
    ucPropOther_ID_Start                     = 0x00040000,
    ucPropOther_Lowercase                    = 0x00080000,
    ucPropOther_Math                         = 0x00100000,
    ucPropOther_Uppercase                    = 0x00200000,
    ucPropQuotation_Mark                     = 0x00400000,
    ucPropRadical                            = 0x00800000,
    ucPropSTerm                              = 0x01000000,
    ucPropSoft_Dotted                        = 0x02000000,
    ucPropTerminal_Punctuation               = 0x04000000,
    ucPropUnified_Ideograph                  = 0x08000000,
    ucPropVariation_Selector                 = 0x10000000,
    ucPropWhite_Space                        = 0x20000000
};
// Single-bit masks for bits 0 through 10, combinable with bitwise OR. The
// enumerator names appear to follow the property names of Unicode's
// DerivedCoreProperties.txt.
enum ucDerivedCoreProperties {
    ucDCPropAlphabetic                   = 0x0001,
    ucDCPropDefault_Ignorable_Code_Point = 0x0002,
    ucDCPropGrapheme_Base                = 0x0004,
    ucDCPropGrapheme_Extend              = 0x0008,
    ucDCPropID_Continue                  = 0x0010,
    ucDCPropID_Start                     = 0x0020,
    ucDCPropLowercase                    = 0x0040,
    ucDCPropMath                         = 0x0080,
    ucDCPropUppercase                    = 0x0100,
    ucDCPropXID_Continue                 = 0x0200,
    ucDCPropXID_Start                    = 0x0400
};
// Constants for arithmetic Hangul syllable composition/decomposition. Names
// and values match the SBase/LBase/VBase/TBase and LCount/VCount/TCount/
// NCount/SCount constants given for that algorithm in the Unicode Standard.
enum HangulComposition {
    // First code point of each block.
    ucLBase = 0x1100,
    ucVBase = 0x1161,
    ucTBase = 0x11A7,
    ucSBase = 0xAC00,

    // Number of values in each block.
    ucLCount = 19,
    ucVCount = 21,
    ucTCount = 28,

    // Derived counts.
    ucNCount = ucVCount * ucTCount, // 588
    ucSCount = ucLCount * ucNCount  // 11172
};
// Inline Functions here
// True when the UC_ALPHA bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
inline bool ucIsAlpha(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_ALPHA) != 0;
}
// True when the UC_DIGIT bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
inline bool ucIsDigit(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_DIGIT) != 0;
}
// True when the UC_WORDCHAR bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
// Note that this consults UC_WORDCHAR, not UC_ALPHA or UC_DIGIT.
inline bool ucIsAlnum(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_WORDCHAR) != 0;
}
// True when the UC_UPPER bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
inline bool ucIsUpper(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_UPPER) != 0;
}
// True when the UC_LOWER bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
inline bool ucIsLower(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_LOWER) != 0;
}
// True when the UC_WHITESPACE bit is set in c's g_ucProps entry; false when
// the bit is clear or the table has no entry for c.
inline bool ucIsWhiteSpace(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_WHITESPACE) != 0;
}
// Alternate spelling of ucIsWhiteSpace(): true when the UC_WHITESPACE bit is
// set for c, false when it is clear or g_ucProps has no entry for c.
inline bool is_wspace_uc(UChar32 c) {
    return ucIsWhiteSpace(c);
}
// True when the UC_IDEOGRAPH bit is set in c's g_ucProps entry; false when
// the bit is clear or the table has no entry for c.
inline bool ucIsIdeograph(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_IDEOGRAPH) != 0;
}
// "Punctuation" here means simply "not a word character": the result is the
// negation of ucIsWordChar(c).
inline bool ucIsPunct(UChar32 c) {
    if (ucIsWordChar(c)) return false;
    return true;
}
// Alternate spelling of ucIsPunct(): true exactly when ucIsWordChar(c) is
// false.
inline bool is_punct_uc(UChar32 c) {
    return ucIsPunct(c);
}
// True when the UC_IGNORABLE bit is set in c's g_ucProps entry; false when
// the bit is clear or the table has no entry for c.
inline bool ucIsIgnorable(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_IGNORABLE) != 0;
}
// True when the UC_EXTEND bit is set in c's g_ucProps entry; false when the
// bit is clear or the table has no entry for c.
inline bool ucIsExtend(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    return entry && (*(const UCProps *)entry & UC_EXTEND) != 0;
}
// Looks c up in g_ucLowerMap and returns the stored mapping. c is returned
// unchanged when the table has no entry for it or the stored mapping is 0.
inline UChar32 ucToLower(UChar32 c) {
    const void *entry = g_ucLowerMap.getValue(c);
    if (entry) {
        const UChar32 mapped = *(const UChar32 *)entry;
        if (mapped != 0) return mapped;
    }
    return c;
}
// Looks c up in g_ucUpperMap and returns the stored mapping. c is returned
// unchanged when the table has no entry for it or the stored mapping is 0.
inline UChar32 ucToUpper(UChar32 c) {
    const void *entry = g_ucUpperMap.getValue(c);
    if (entry) {
        const UChar32 mapped = *(const UChar32 *)entry;
        if (mapped != 0) return mapped;
    }
    return c;
}
// Returns the value stored for c in g_ucCombiningClass, or 0 when the table
// has no entry for c.
//
// The entry is read as a single unsigned char, matching the return type.
// Previously it was dereferenced as a UChar32 (unsigned long) and then
// truncated, which reads sizeof(UChar32) bytes from the entry; if entries are
// one byte wide that reads past the entry and picks the wrong byte on
// big-endian hosts. On little-endian hosts the result is the same either way.
// NOTE(review): this assumes g_ucCombiningClass stores one-byte entries, as
// the unsigned char return type suggests -- confirm where the table is
// constructed.
inline unsigned char ucCombiningClass(UChar32 c){
    const void *entry = g_ucCombiningClass.getValue(c);
    if (!entry) return 0;
    return *(const unsigned char *)entry;
}
// Returns the complete UC_* flag word stored for c in g_ucProps, or 0 when
// the table has no entry for c.
inline UCProps ucProperties(UChar32 c) {
    const void *entry = g_ucProps.getValue(c);
    if (entry) return *(const UCProps *)entry;
    return (UCProps)0;
}
// Numeric value (0..9) of an ASCII digit. Every other code point, including
// non-ASCII digits, yields 0, so a result of 0 does not by itself tell '0'
// apart from a non-digit.
inline long ucDigitValue(UChar32 c) {
    if (c < '0' || c > '9') return 0;
    return (long)(c - '0');
}
// Returns the script stored for c in g_ucScripts. Code points without a
// table entry are reported as ucScriptCommon.
inline UCScript ucGetScript(UChar32 c) {
    const void *entry = g_ucScripts.getValue(c);
    if (entry) return *(const UCScript *)entry;
    return ucScriptCommon;
}
#endif