open-source-search-engine/create_ucd_tables.cpp

#include "gb-include.h"

#include "Mem.h"
#include "UCPropTable.h"
#include "Unicode.h"

// Stub hooks that this standalone tool defines itself; each one simply
// reports success.
// NOTE(review): presumably these exist to satisfy references from the shared
// sources this tool is built against -- confirm against the build.
bool mainShutdown(bool urgent);

bool mainShutdown(bool urgent) {
	return true;
}

// JAB: this program has not been run in a long time and required these...
bool closeAll(void*, void(*)(void *)) {
	return true;
}

bool allExit(void) {
	return true;
}


// Reads the text file named by `s` and calls `handler` for each line that
// yields a non-empty first column.  The handler receives, in order: the
// zero-based line number, the array of column strings (fields are split on
// ';' and a '#' starts a comment), and the number of columns.
// Returns false if the file cannot be opened or buffered, true otherwise.
bool loadUnidataProps(char *s, void (*handler)(u_long, char**, u_long));

// Per-file line handlers passed to loadUnidataProps() by main().
void handleUnicodeData(u_long, char **col, u_long colCount);
void handleDerivedCoreProps(u_long, char **col, u_long colCount);
void handleDerivedNormalizationProps(u_long, char **col, u_long colCount);
void handlePropList(u_long, char **col, u_long colCount);
void handleNormalizationTest(u_long, char **col, u_long colCount);
void handleScripts(u_long, char **col, u_long colCount);
// Registers the algorithmically derived Hangul syllable decompositions.
void decomposeHangul();


// static int g_decompCount = 0;
// Number of canonical decompositions registered; incremented by
// handleUnicodeData() and decomposeHangul(), printed by main().
static int g_canonicalDecompCount = 0;
// Number of code points seen on Full_Composition_Exclusion lines;
// incremented by handleDerivedNormalizationProps(), printed by main().
static int g_excludeCount = 0;

// Builds the Unicode lookup tables from the UNIDATA/*.txt files, saves them
// under ucdata/, runs the NormalizationTest.txt checks against the freshly
// built tables, and finally reloads the saved files as a sanity check.
//
// Exit status: 0 when the saved tables reload successfully, 1 on any
// failure (log/hash init, a missing required input file, composition table
// init, or a failed reload).
int main(int argc, char **argv) {
	// Avoid SEGV

	if ( ! g_log.init( "foo.log" )        ) {
		fprintf (stderr,"db: Log file init failed.\n" ); exit( 1 ); }
	// init our table for doing zobrist hashing
	if ( ! hashinit() ) {
		log("db: Failed to init hashtable." ); exit(1); }
	// . hashinit() calls srand() w/ a fixed number
	// . let's mix it up again
	srand ( time(NULL) );

	UCProps props = 0;
	g_ucProps.setValue(0, &props);

	// DerivedNormalizationProps must be read before UnicodeData:
	// handleUnicodeData() tests the UC_COMP_EX bit that
	// handleDerivedNormalizationProps() sets.
	// A missing input must stop the run here; otherwise the incomplete
	// tables would be written over the existing ucdata/*.dat files below.
	if (!loadUnidataProps("UNIDATA/DerivedNormalizationProps.txt",
			      handleDerivedNormalizationProps) ||
	    !loadUnidataProps("UNIDATA/UnicodeData.txt",
			      handleUnicodeData)) {
		fprintf(stderr, "db: Failed to load required UNIDATA file.\n");
		exit(1);
	}

	decomposeHangul(); // set up algorithmic hangul decomps
	printf("%d canonical decompositions\n", g_canonicalDecompCount);
	printf("%d code points excluded\n", g_excludeCount);

	if (!loadUnidataProps("UNIDATA/DerivedCoreProperties.txt",
			      handleDerivedCoreProps) ||
	    !loadUnidataProps("UNIDATA/PropList.txt",
			      handlePropList) ||
	    !loadUnidataProps("UNIDATA/Scripts.txt",
			      handleScripts)) {
		fprintf(stderr, "db: Failed to load required UNIDATA file.\n");
		exit(1);
	}

	printf("lower case map size: %d\n", g_ucLowerMap.getSize());
	saveUnicodeTable(&g_ucLowerMap, "ucdata/lowermap.dat");
	printf("upper case map size: %d\n", g_ucUpperMap.getSize());
	saveUnicodeTable(&g_ucUpperMap, "ucdata/uppermap.dat");
//	printf("categorymap size: %d\n", g_ucCategory.getSize());
//	saveUnicodeTable(&g_ucCategory, "ucdata/categories.dat");
	printf("properties size: %d\n", g_ucProps.getSize());
	saveUnicodeTable(&g_ucProps, "ucdata/properties.dat");
	printf("scripts size: %d\n", g_ucScripts.getSize());
	saveUnicodeTable(&g_ucScripts, "ucdata/scripts.dat");
	printf("combining class size: %d\n", g_ucCombiningClass.getSize());
	saveUnicodeTable(&g_ucCombiningClass, "ucdata/combiningclass.dat");

	// JAB: we now have Kompatible and Canonical decompositions
	saveKDecompTable();
	saveCDecompTable();


	if (!initCompositionTable()) {
		log("Error initializing Full Composition table\n");
		exit(1);
	}
	// The conformance test only reports mismatches; the tables are
	// already saved, so a missing test file is reported but not fatal.
	if (!loadUnidataProps("UNIDATA/NormalizationTest.txt",
			      handleNormalizationTest))
		fprintf(stderr, "db: Normalization test was not run.\n");

	g_mem.printMem();

	if (loadUnicodeTable(&g_ucUpperMap,"ucdata/uppermap.dat") &&
	    loadUnicodeTable(&g_ucLowerMap,"ucdata/lowermap.dat") &&
	    loadUnicodeTable(&g_ucProps,"ucdata/properties.dat") &&
	    loadUnicodeTable(&g_ucCombiningClass,"ucdata/combiningclass.dat") &&
	    loadUnicodeTable(&g_ucScripts,"ucdata/scripts.dat") &&
	    // JAB: we now have Kompatible and Canonical decompositions
	    loadDecompTables()){
		printf("tables reloaded successfully\n\n");

		printf("lower case map size: %d\n", g_ucLowerMap.getSize());
		printf("upper case map size: %d\n", g_ucUpperMap.getSize());
		printf("properties size: %d\n", g_ucProps.getSize());
		printf("scripts size: %d\n", g_ucScripts.getSize());
		printf("Kompat Decomp size: %d\n", g_ucKDIndex.getSize());
		exit(0);
	}

	// Reaching this point means at least one table failed to reload;
	// report it instead of falling off the end with status 0.
	fprintf(stderr, "db: Failed to reload saved unicode tables.\n");
	return 1;
}

void handleUnicodeData(u_long line, char **col, u_long colCount) {

	UChar32 codePoint = strtol(col[0], NULL, 16);

// 	if ((colCount < 14) || (codePoint == 0)){
// 		printf("line %ld: no data (%ld cols)\n", line, colCount);
// 		return;
// 	}
	char *name = col[1];
	char *category = col[2];
	u_char combiningClass = strtol(col[3], NULL, 10);
	char *decompStr = col[5];
	UChar32 ucMapping = strtol(col[12],NULL, 16);
	UChar32 lcMapping = strtol(col[13],NULL, 16);

	// Set general category
	//g_ucCategory.setValue(codePoint, (void*)category);
	UCProps props = ucProperties(codePoint);
	if (category[0] == 'L') props |= UC_ALPHA | UC_WORDCHAR;
	else if (category[0] == 'N') props |= UC_DIGIT | UC_WORDCHAR;
	else if (category[0] == 'Z') props |= UC_WHITESPACE;
	if (props)
		g_ucProps.setValue(codePoint, &props);

	if (lcMapping)
		g_ucLowerMap.setValue(codePoint, (void*)&lcMapping);
	if (ucMapping)
		g_ucUpperMap.setValue(codePoint, (void*)&ucMapping);
	if (combiningClass)
		g_ucCombiningClass.setValue(codePoint, (void*)&combiningClass);

	if (decompStr && decompStr[0]){

		u_char decompCount = 0;
		UChar32 decomp[32];
		bool kompat = false;
		// Get decomposition
		char *p = decompStr;
		int decompLen = gbstrlen(decompStr);
		while (p < decompStr+decompLen) {
			char *pend = p;
			while (*pend && *pend != ' ') pend++;
			*pend = '\0';
			if (p[0] == '<') kompat = true;
			else{
				decomp[decompCount++] = strtol(p, NULL, 16);
			}
			p = pend+1;
		}

//  		printf ("Code Point U+%04lx, %s: %s (%d chars)\n",
//  			codePoint, name, kompat?"(Kompatable)":"", decompCount);
// 		g_decompCount++;
// 		if (decompStr[0] != '<')
		bool fullComp=false;
		if (!kompat && !(props & UC_COMP_EX)) {
			// set up canonical combining table
			g_canonicalDecompCount++;
// 			printf("%4x:", codePoint);
// 			for (int i = 0; i<decompCount;i++)
// 				printf(" %4x", decomp[i]);
// 			printf("\n");
			fullComp = true;
		}
		setKDValue(codePoint, decomp, decompCount, fullComp);
	    	// JAB: we now have Kompatible and Canonical decompositions
		if (!kompat)
			setCDValue(codePoint, decomp, decompCount);
	}
}

void handlePropList(u_long line, char **col, u_long colCount) {
	//printf("Line %ld: ", line);
	//for (u_long i=0;i<colCount;i++)
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);
		//void *p = g_ucProps.getValue(c);
		//if (p) props = *(u_char*)p;
		if (!strncmp(col[1], "Ideographic", 11))
			props |= UC_IDEOGRAPH | UC_WORDCHAR;
		else if (!strncmp(col[1], "Unified_Ideograph", 17))
			props |= UC_IDEOGRAPH | UC_WORDCHAR;
		else if (!strncmp(col[1], "White_Space", 11))
			props |= UC_WHITESPACE;

		if (props)
			g_ucProps.setValue(c, &props);
	}
	//printf("\n");

}

void handleDerivedCoreProps(u_long line, char **col, u_long colCount) {
	//printf("Line %ld: ", line);
	//for (u_long i=0;i<colCount;i++)
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);
		if (!strncmp(col[1], "Alphabetic", 10))
			props |= UC_ALPHA | UC_WORDCHAR;
		else if (!strncmp(col[1], "Default_Ignorable_Code_Point", 28))
			props |= UC_IGNORABLE;
		else if (!strncmp(col[1], "Lowercase", 9))
			props |= UC_LOWER | UC_WORDCHAR;
		else if (!strncmp(col[1], "Uppercase", 9))
			props |= UC_UPPER | UC_WORDCHAR;
		else if (!strncmp(col[1], "Grapheme_Extend", 15))
			props |= UC_WORDCHAR;
		if (props)
			g_ucProps.setValue(c, &props);
// 		if (c == ' ' && (props&UC_WORDCHAR))
// 			printf("Yow: line %ld\n", line);
// 		if (c == 0 && props)
// 			printf("!!!\nHey: line %ld!!!\n\n", line);
	}
	//printf("\n");

}

void handleDerivedNormalizationProps(u_long line, char **col,
				     u_long colCount) {
	//printf("Line %ld: ", line);
	//for (u_long i=0;i<colCount;i++)
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);

		if (!strncmp(col[1], "NFKC_QC", 7))
			props |= UC_NFKC_QC_NO;
		else if (!strncmp(col[1], "Full_Composition_Exclusion", 26)){
			g_excludeCount++;
			props |= UC_COMP_EX;
			//printf("Excluding %4x props: %04x\n", c, props);
		}

		if (props) g_ucProps.setValue(c, &props);
	}
	//printf("\n");

}

void handleScripts(u_long, char **col, u_long colCount){
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		UCProps props = ucProperties(c);
		//void *p = g_ucProps.getValue(c);
		//if (p) props = *(u_char*)p;
		UCScript s = ucScriptCommon;
		for (int j=0; j < ucScriptNumScripts; j++) {
			if (!strcmp(col[1], g_ucScriptNames[j])){
				s = j;
				g_ucScripts.setValue(c, &j);
			}
		}
		if (s == ucScriptThai) props |= UC_THAI;
		else if (s == ucScriptHiragana) props |= UC_HIRAGANA;
		else if (s == ucScriptKatakana) props |= UC_KATAKANA;
		else if (s == ucScriptKatakana_Or_Hiragana)
			props |= UC_KATAKANA|UC_HIRAGANA;
		if (props)
			g_ucProps.setValue(c, &props);
	}

}

void handleNormalizationTest(u_long line, char **col, u_long colCount) {
	//NFKC Test:
	// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NCFK(c4) == NFKC(c5)
	UChar c[5][32]; long len[5];

	if (colCount < 5) {
		//log("Line %ld: only %ld columns!", line, colCount);
		return;
	}
	for (unsigned long i = 0 ; i < 5 ; i++) {
		char *p = col[i];
		int clen = gbstrlen(p);
		UChar *q = c[i];
		while (p < col[i]+clen) {
			char *pend = p;
			while (*pend && *pend != ' ') pend++;
			*pend = '\0';
			UChar32 d = strtol(p, NULL, 16);
			q += utf16Encode(d, q);
			p = pend+1;
		}
		len[i] = q - c[i];

	}
	for (unsigned long i = 0; i < 5 ; i++ ) {
		UChar normString[256];
		long normLen = ucNormalizeNFKD(normString, 256,
					       c[i], len[i]);
		//ucDebug(normString, normLen);
		if (ucStrCmp(normString, normLen, c[4], len[4])){
			printf("Line %ld col %ld: KD Normalization failed: \n bad: \"",
			       line, i+1);
			UChar *p = normString;
			while(p < normString+normLen) {
				UChar32 d = utf16Decode(p, &p);
				ucPutc(d);
			}
			printf("\"\ngood: \"");

			p = c[4];
			while(p < c[4]+len[4]) {
				UChar32 d = utf16Decode(p, &p);
				ucPutc(d);
			}
			printf("\"\n");
			continue;
		}


		normLen = ucNormalizeNFKC(normString, 256,
					       c[i], len[i]);

		if (ucStrCmp(normString, normLen, c[3], len[3])){
			printf("Line %ld col %ld: KC Normalization failed: \n bad: \"",
			       line, i+1);
			UChar *p = normString;
			while(p < normString+normLen) {
				UChar32 d = utf16Decode(p, &p);
				ucPutc(d);
			}
			printf("\"\ngood: \"");

			p = c[3];
			while(p < c[3]+len[3]) {
				UChar32 d = utf16Decode(p, &p);
				ucPutc(d);
			}
			printf("\"\n");
		}
	}
}
// Reads a whole UNIDATA-style text file into memory, splits it into lines
// and columns, and calls `handler` for each line that has data.
//
// Tokenizing rules, per line:
//  - fields are separated by ';';
//  - a '#' ends the field being read and the rest of the line is ignored;
//  - leading spaces and trailing spaces/tabs are stripped from each field;
//  - at most 16 fields are kept, any further ones are dropped.
// The handler is invoked only when the first field is non-empty, with the
// zero-based physical line number, the field array and the field count.
// The field strings point into a buffer that is released before returning,
// so handlers must not keep the pointers.
//
// Returns false (after printing a message) if the file cannot be opened,
// measured or buffered; true otherwise.
bool loadUnidataProps(char *filename,
		      void (*handler)(u_long, char**, u_long)) {
	printf("Loading %s\n", filename);
	FILE *fp = fopen(filename, "r");
	if (!fp) {
		printf("Error opening %s: %s\n",filename, strerror(errno));
		return false;
	}

	// Find the file size.  ftell() reports failure as -1, which must not
	// be turned into a huge size_t.
	long endPos = -1;
	if (fseek(fp, 0, SEEK_END) == 0)
		endPos = ftell(fp);
	if (endPos < 0) {
		printf("Error sizing %s: %s\n", filename, strerror(errno));
		fclose(fp);
		return false;
	}
	size_t fsize = (size_t)endPos;

	// JAB: Mem.h cores on use of malloc()
	char *buf = (char*)mmalloc(fsize+1, "loadUnidataProps");
	if (!buf){
		// size_t is not an int: cast to match the format
		printf("Error allocating %lu bytes for %s\n",
		       (unsigned long)(fsize+1), filename);
		fclose(fp);
		return false;
	}
	rewind(fp);
	size_t nread = fread(buf, 1, fsize, fp);
	//printf("Read %d bytes\n", nread);
	buf[nread] = '\0';
	fclose(fp);

	char *lineStart = buf;
	char *lineEnd = lineStart;
	u_long line = 0;

	while ((lineStart < buf+nread) && *lineStart) {
		while (*lineEnd && *lineEnd != '\n') lineEnd++;

		char *tokStart = lineStart;
		u_long colCount = 0;

		bool lineDone = false;
		// cleared for every line so unused slots never hold tokens
		// left over from a previous line
		char *col[16] = {0};
		const u_long maxCols = sizeof(col) / sizeof(col[0]);

		while (tokStart < lineEnd )  {
			// never write past col[]; extra fields are dropped
			if (colCount >= maxCols) break;

			// skip leading whitespace
			while (*tokStart == ' ') tokStart++;
			char *tokEnd = tokStart;

			while (tokEnd < lineEnd &&
			       *tokEnd != ';' &&
			       *tokEnd != '#')tokEnd++;


			if ( *tokEnd == '#' )
				lineDone = true;
			// terminate the token, then blank out trailing
			// spaces/tabs
			char *trim = tokEnd-1;
			*tokEnd++ = '\0';
			while (trim > tokStart &&
			       (*trim == ' ' ||
				*trim == '\t')) {
				*trim-- = '\0';

			}

			//printf("Line %ld col %ld Token: '%s'\n",
			//       line, col, tokStart);
			col[colCount] = tokStart;

			tokStart = tokEnd;
			colCount++;
			if (lineDone) break;
		}
		//if (col != 14)printf("uh oh: %ld\n", col);
		//eol:
		if (colCount && col[0][0] != 0){
			handler(line, col, colCount);

		}
		// skip newline
		lineEnd++;
		lineStart = lineEnd;
		line++;
	}
	// NOTE(review): buf comes from mmalloc() but is released with plain
	// free(); confirm whether Mem.h expects a matching release call for
	// mmalloc'd memory.
	free(buf);
	return true;
}

void decomposeHangul() {
	for (UChar32 sIndex = 0; sIndex < ucSCount ; sIndex++) {
		int tIndex = sIndex % ucTCount;
		int first, second;
		if (tIndex != 0) { // triple
			first = (int)(ucSBase + sIndex - tIndex);
			second = (int) (ucTBase + tIndex);
		}
		else {
			first = (int) (ucLBase + sIndex / ucNCount);
			second = (int) (ucVBase + (sIndex % ucNCount)
					/ ucTCount);
		}
		int value = sIndex + ucSBase ;
		//printf("value: %4x, first: %4x second %4x\n",
		//       value, first, second);
		UChar32 decomp[2];
		decomp[0] = first;
		decomp[1] = second;
		g_canonicalDecompCount++;
		setKDValue(value, decomp, 2, true);
	    	// JAB: we now have Kompatible and Canonical decompositions
		setCDValue(value, decomp, 2);
	}
}