Nominatim/module/nominatim.c

/**
 * SPDX-License-Identifier: GPL-2.0-only
 *
 * This file is part of Nominatim. (https://nominatim.org)
 *
 * Copyright (C) 2022 by the Nominatim developer community.
 * For a full list of authors see the git log.
 */
#include "postgres.h"
#include "fmgr.h"
#include "mb/pg_wchar.h"
#include <utfasciitable.h>

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

// SQL-callable entry points; both are registered with PG_FUNCTION_INFO_V1 further down in this file.
Datum transliteration( PG_FUNCTION_ARGS );
Datum gettokenstring( PG_FUNCTION_ARGS );
// Internal string helpers called by gettokenstring. They operate in place on a
// zero-terminated buffer. The unnamed last argument of str_replace is the
// "isspace" flag (see the comment on its definition).
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
void str_dupspaces(char* buffer);

/*
 * transliteration(text) -> text
 *
 * Decodes the UTF-8 argument into an array of code points and then emits, for
 * each code point, the replacement bytes found in the UTFASCII table. The
 * UTFASCIILOOKUP array maps a code point to the offset of its entry in
 * UTFASCII; offset 0 means "no entry". An entry is one length byte followed
 * by that many replacement bytes.
 *
 * Produces no output for: code points >= 65536, 5- and 6-byte sequences,
 * bytes that are not a valid lead byte, and code points without an entry.
 * A multi-byte sequence that is cut off by the end of the input stops the
 * decoding at that point.
 *
 * Returns SQL NULL for a NULL argument and raises an error when the database
 * encoding is not UTF8.
 */
PG_FUNCTION_INFO_V1( transliteration );
Datum
transliteration( PG_FUNCTION_ARGS )
{
	static char *replacements = UTFASCII;
	static uint16 entry_offset[65536] = UTFASCIILOOKUP;

	text *input;
	unsigned char *in;
	int remaining;

	unsigned int b1, b2, b3, b4;
	unsigned int *codepoints;
	unsigned int *cp;

	text *output;
	unsigned char *out;
	int output_len;
	char *entry;
	int count;
	int i;

	if (GetDatabaseEncoding() != PG_UTF8)
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("requires UTF8 database encoding")));
	}

	if (PG_ARGISNULL(0))
	{
		PG_RETURN_NULL();
	}

	// Input string: payload pointer and payload length in bytes
	input = PG_GETARG_TEXT_P(0);
	in = (unsigned char *)VARDATA(input);
	remaining = VARSIZE(input) - VARHDRSZ;

	// One slot per input byte is always enough, plus one for the 0 terminator
	codepoints = (unsigned int *)palloc((remaining+1)*sizeof(int));
	cp = codepoints;

	// Decoder modelled on pg_utf2wchar_with_len from wchar.c.
	// PostgreSQL strings are not zero terminated, so the byte count drives the loop.
	while (remaining > 0)
	{
		unsigned char lead = in[0];

		if ((lead & 0x80) == 0)
		{
			// plain ASCII byte
			*cp = lead;
			cp++;
			in += 1;
			remaining -= 1;
		}
		else if ((lead & 0xe0) == 0xc0)
		{
			if (remaining < 2) break;
			b1 = in[0] & 0x1f;
			b2 = in[1] & 0x3f;
			*cp = (b1 << 6) | b2;
			// keep the slot only if the value fits the lookup table
			if (*cp < 65536) cp++;
			in += 2;
			remaining -= 2;
		}
		else if ((lead & 0xf0) == 0xe0)
		{
			if (remaining < 3) break;
			b1 = in[0] & 0x0f;
			b2 = in[1] & 0x3f;
			b3 = in[2] & 0x3f;
			*cp = (b1 << 12) | (b2 << 6) | b3;
			if (*cp < 65536) cp++;
			in += 3;
			remaining -= 3;
		}
		else if ((lead & 0xf8) == 0xf0)
		{
			if (remaining < 4) break;
			b1 = in[0] & 0x07;
			b2 = in[1] & 0x3f;
			b3 = in[2] & 0x3f;
			b4 = in[3] & 0x3f;
			*cp = (b1 << 18) | (b2 << 12) | (b3 << 6) | b4;
			if (*cp < 65536) cp++;
			in += 4;
			remaining -= 4;
		}
		else if ((lead & 0xfc) == 0xf8)
		{
			// the table has nothing beyond 4-byte sequences: skip all 5 bytes
			if (remaining < 5) break;
			in += 5;
			remaining -= 5;
		}
		else if ((lead & 0xfe) == 0xfc)
		{
			// the table has nothing beyond 4-byte sequences: skip all 6 bytes
			if (remaining < 6) break;
			in += 6;
			remaining -= 6;
		}
		else
		{
			// not a valid lead byte: assume length 1 and silently drop it
			in += 1;
			remaining -= 1;
		}
	}
	*cp = 0;

	// First pass: add up the length bytes of all entries that will be emitted
	output_len = 0;
	for (cp = codepoints; *cp; cp++)
	{
		if (entry_offset[*cp] > 0)
		{
			output_len += replacements[entry_offset[*cp]];
		}
	}

	// Allocate the result and fill in its varlena header
	output = (text *)palloc(output_len + VARHDRSZ);
	SET_VARSIZE(output, output_len + VARHDRSZ);
	out = (unsigned char *)VARDATA(output);

	// Second pass: copy the replacement bytes; code points without an entry
	// are skipped without any warning
	for (cp = codepoints; *cp; cp++)
	{
		if (entry_offset[*cp] > 0)
		{
			entry = replacements + entry_offset[*cp];
			count = entry[0];
			for (i = 1; i <= count; i++)
			{
				*out = entry[i];
				out++;
			}
		}
	}

	pfree(codepoints);

	PG_RETURN_TEXT_P(output);
}

// Replace occurrences of `from` with `to` inside the zero-terminated `buffer`, in place.
//
//   buffer   zero-terminated string; the caller must provide enough room for growth
//   len      in/out: adjusted by (tolen - fromlen) for every replacement made
//   changes  in/out: incremented once per replacement made
//   from     search string, fromlen = its length without terminator
//   to       replacement string, tolen = its length without terminator
//   isspace  Set isspace=1 if the replacement _only_ adds a space before the
//            search string.  I.e. to == " " + from. Matches at the very start
//            of the buffer or already preceded by a space are then left alone.
//
// After each match the search resumes one character past the match position,
// so text written by a replacement is only partially rescanned within one call.
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
	char *p;

	// An empty search string would match everywhere and run off the end of the buffer
	if (fromlen <= 0) return;

	// Search string is too long to be present
	if (fromlen > *len) return;

	p = strstr(buffer, from);
	while(p)
	{
		if (!isspace || (p > buffer && *(p-1) != ' '))
		{
			(*changes)++;
			// Shift exactly the tail that follows the match (terminator included).
			// Measuring it with strlen keeps the move inside the string even when
			// *len is out of date, and never touches bytes behind the terminator.
			if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
			memcpy(p, to, tolen);
			*len += tolen - fromlen;
		}
		p = strstr(p+1, from);
	}
}

// Collapse every run of consecutive spaces in the zero-terminated `buffer`
// into a single space. Works in place; the string can only get shorter.
void str_dupspaces(char* buffer)
{
	char *src;
	char *dst = buffer;
	int prev_was_space = 0;

	for (src = buffer; *src != '\0'; src++)
	{
		int is_space = (*src == ' ');

		// second or later space of a run: drop it
		if (is_space && prev_was_space) continue;

		*dst++ = *src;
		prev_was_space = is_space;
	}
	*dst = '\0';
}

PG_FUNCTION_INFO_V1( gettokenstring );
Datum
gettokenstring( PG_FUNCTION_ARGS )
{
	text *source;
	unsigned char *sourcedata;
	int sourcedatalength;

	char * buffer;
	int len;
	int changes;

	text *result;

	if (GetDatabaseEncoding() != PG_UTF8) 
	{
		ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                         errmsg("requires UTF8 database encoding")));
	}

	if (PG_ARGISNULL(0))
	{
		PG_RETURN_NULL();
	}

	// The original string
	source = PG_GETARG_TEXT_P(0);
	sourcedata = (unsigned char *)VARDATA(source);
	sourcedatalength = VARSIZE(source) - VARHDRSZ;

	// Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
	buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
	memcpy(buffer+1, sourcedata, sourcedatalength);
	buffer[0] = 32;
	buffer[sourcedatalength+1] = 32;
	buffer[sourcedatalength+2] = 0;
	len = sourcedatalength+3;

	changes = 1;
	str_dupspaces(buffer);
	while(changes)
	{
		changes = 0;
		#include <tokenstringreplacements.inc>
		str_dupspaces(buffer);
	}

	// 'and' in various languages
	str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
	str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
	str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);

	// 'the' (and similar)
	str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
	str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
	str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
	str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
	str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);

	// german
	str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
	str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
	str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
	str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
	str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
	str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);

	// russian
	str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
	str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);

	// allocate & create the result
	len--;// Drop the terminating zero
	result = (text *)palloc(len + VARHDRSZ);
	SET_VARSIZE(result, len + VARHDRSZ);
	memcpy(VARDATA(result), buffer, len);

	pfree(buffer);

	PG_RETURN_TEXT_P(result);
}
add consistent SPDX copyright headers 2022-01-03 18:23:58 +03:00			`/**`
			`* SPDX-License-Identifier: GPL-2.0-only`
			`*`
			`* This file is part of Nominatim. (https://nominatim.org)`
			`*`
			`* Copyright (C) 2022 by the Nominatim developer community.`
			`* For a full list of authors see the git log.`
			`*/`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`#include "postgres.h"`
			`#include "fmgr.h"`
			`#include "mb/pg_wchar.h"`
			`#include <utfasciitable.h>`

			`#ifdef PG_MODULE_MAGIC`
			`PG_MODULE_MAGIC;`
			`#endif`

			`Datum transliteration( PG_FUNCTION_ARGS );`
			`Datum gettokenstring( PG_FUNCTION_ARGS );`
			`void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);`
			`void str_dupspaces(char* buffer);`

			`PG_FUNCTION_INFO_V1( transliteration );`
			`Datum`
			`transliteration( PG_FUNCTION_ARGS )`
			`{`
			`static char * ascii = UTFASCII;`
			`static uint16 asciilookup[65536] = UTFASCIILOOKUP;`
			`char * asciipos;`

			`text *source;`
			`unsigned char *sourcedata;`
			`int sourcedatalength;`

			`unsigned int c1,c2,c3,c4;`
			`unsigned int * wchardata;`
			`unsigned int * wchardatastart;`

			`text *result;`
			`unsigned char *resultdata;`
			`int resultdatalength;`
			`int iLen;`

			`if (GetDatabaseEncoding() != PG_UTF8)`
			`{`
			`ereport(ERROR,`
			`(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),`
			`errmsg("requires UTF8 database encoding")));`
			`}`

			`if (PG_ARGISNULL(0))`
			`{`
			`PG_RETURN_NULL();`
			`}`

			`// The original string`
			`source = PG_GETARG_TEXT_P(0);`
			`sourcedata = (unsigned char *)VARDATA(source);`
			`sourcedatalength = VARSIZE(source) - VARHDRSZ;`

			`// Intermediate wchar version of string`
			`wchardatastart = wchardata = (unsigned int )palloc((sourcedatalength+1)sizeof(int));`

			`// Based on pg_utf2wchar_with_len from wchar.c`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`// Postgresql strings are not zero terminalted`
			`while (sourcedatalength > 0)`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`{`
			`if ((*sourcedata & 0x80) == 0)`
			`{`
			`wchardata = sourcedata++;`
			`wchardata++;`
			`sourcedatalength--;`
			`}`
			`else if ((*sourcedata & 0xe0) == 0xc0)`
			`{`
			`if (sourcedatalength < 2) break;`
			`c1 = *sourcedata++ & 0x1f;`
			`c2 = *sourcedata++ & 0x3f;`
			`*wchardata = (c1 << 6) \| c2;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`if (*wchardata < 65536) wchardata++;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`sourcedatalength -= 2;`
			`}`
			`else if ((*sourcedata & 0xf0) == 0xe0)`
			`{`
			`if (sourcedatalength < 3) break;`
			`c1 = *sourcedata++ & 0x0f;`
			`c2 = *sourcedata++ & 0x3f;`
			`c3 = *sourcedata++ & 0x3f;`
			`*wchardata = (c1 << 12) \| (c2 << 6) \| c3;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`if (*wchardata < 65536) wchardata++;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`sourcedatalength -= 3;`
			`}`
			`else if ((*sourcedata & 0xf8) == 0xf0)`
			`{`
			`if (sourcedatalength < 4) break;`
			`c1 = *sourcedata++ & 0x07;`
			`c2 = *sourcedata++ & 0x3f;`
			`c3 = *sourcedata++ & 0x3f;`
			`c4 = *sourcedata++ & 0x3f;`
			`*wchardata = (c1 << 18) \| (c2 << 12) \| (c3 << 6) \| c4;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`if (*wchardata < 65536) wchardata++;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`sourcedatalength -= 4;`
			`}`
			`else if ((*sourcedata & 0xfc) == 0xf8)`
			`{`
			`// table does not extend beyond 4 char long, just skip`
			`if (sourcedatalength < 5) break;`
			`sourcedatalength -= 5;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`sourcedata += 5;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`}`
			`else if ((*sourcedata & 0xfe) == 0xfc)`
			`{`
			`// table does not extend beyond 4 char long, just skip`
			`if (sourcedatalength < 6) break;`
			`sourcedatalength -= 6;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`sourcedata += 6;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`}`
			`else`
			`{`
			`// assume lenngth 1, silently drop bogus characters`
			`sourcedatalength--;`
Don't let character walk outside translation table 2011-03-07 15:44:04 +03:00			`sourcedata += 1;`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`}`
			`}`
			`*wchardata = 0;`

			`// calc the length of transliteration string`
			`resultdatalength = 0;`
			`wchardata = wchardatastart;`
			`while(*wchardata)`
			`{`
			`if ((asciilookup + wchardata) > 0) resultdatalength += (ascii + (asciilookup + *wchardata));`
			`wchardata++;`
			`}`

			`// allocate & create the result`
			`result = (text *)palloc(resultdatalength + VARHDRSZ);`
			`SET_VARSIZE(result, resultdatalength + VARHDRSZ);`
			`resultdata = (unsigned char *)VARDATA(result);`

			`wchardata = wchardatastart;`
			`while(*wchardata)`
			`{`
			`if ((asciilookup + wchardata) > 0)`
			`{`
			`asciipos = ascii + (asciilookup + wchardata);`
			`for(iLen = *asciipos; iLen > 0; iLen--)`
			`{`
			`asciipos++;`
			`resultdata = asciipos;`
			`resultdata++;`
			`}`
			`}`
disable warning about missing chars 2013-02-28 00:01:20 +04:00			`/*else`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`{`
			`ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),`
			`errmsg( "missing char: %i\n", *wchardata )));`

disable warning about missing chars 2013-02-28 00:01:20 +04:00			`}*/`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`wchardata++;`
			`}`

			`pfree(wchardatastart);`

			`PG_RETURN_TEXT_P(result);`
			`}`

Avoid reading outside buffer Current str_replace code will read outside buffer if `isspace` and `from` occurs at the start of `buffer` 2018-02-15 21:02:59 +03:00			`// Set isspace=1 if the replacement _only_ adds a space before the search string. I.e. to == " " + from`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)`
			`{`
			`char *p;`

Avoid reading outside buffer Current str_replace code will read outside buffer if `isspace` and `from` occurs at the start of `buffer` 2018-02-15 21:02:59 +03:00			`// Search string is too long to be present`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`if (fromlen > *len) return;`

			`p = strstr(buffer, from);`
			`while(p)`
			`{`
Update nominatim.c 2018-02-26 02:07:53 +03:00			`if (!isspace \|\| (p > buffer && *(p-1) != ' '))`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`{`
			`(*changes)++;`
			`if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);`
			`memcpy(p, to, tolen);`
			`*len += tolen - fromlen;`
			`}`
			`p = strstr(p+1, from);`
			`}`
			`}`

			`void str_dupspaces(char* buffer)`
			`{`
			`char *out;`
			`int wasspace;`

			`out = buffer;`
			`wasspace = 0;`
			`while(*buffer)`
			`{`
			`if (wasspace && *buffer != ' ') wasspace = 0;`
			`if (!wasspace)`
			`{`
			`out = buffer;`
			`out++;`
			`wasspace = (*buffer == ' ');`
			`}`
			`buffer++;`
			`}`
			`*out = 0;`
			`}`

			`PG_FUNCTION_INFO_V1( gettokenstring );`
			`Datum`
			`gettokenstring( PG_FUNCTION_ARGS )`
			`{`
			`text *source;`
			`unsigned char *sourcedata;`
			`int sourcedatalength;`

			`char * buffer;`
			`int len;`
			`int changes;`

			`text *result;`

			`if (GetDatabaseEncoding() != PG_UTF8)`
			`{`
			`ereport(ERROR,`
			`(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),`
			`errmsg("requires UTF8 database encoding")));`
			`}`

			`if (PG_ARGISNULL(0))`
			`{`
			`PG_RETURN_NULL();`
			`}`

			`// The original string`
			`source = PG_GETARG_TEXT_P(0);`
			`sourcedata = (unsigned char *)VARDATA(source);`
			`sourcedatalength = VARSIZE(source) - VARHDRSZ;`

Avoid reading outside buffer Current str_replace code will read outside buffer if `isspace` and `from` occurs at the start of `buffer` 2018-02-15 21:02:59 +03:00			`// Buffer for doing the replace in - string could get slightly longer (double is massive overkill)`
nominatim refactoring 2010-10-24 03:12:37 +04:00			`buffer = (char )palloc((sourcedatalength2)*sizeof(char));`
			`memcpy(buffer+1, sourcedata, sourcedatalength);`
			`buffer[0] = 32;`
			`buffer[sourcedatalength+1] = 32;`
			`buffer[sourcedatalength+2] = 0;`
			`len = sourcedatalength+3;`

			`changes = 1;`
			`str_dupspaces(buffer);`
			`while(changes)`
			`{`
			`changes = 0;`
			`#include <tokenstringreplacements.inc>`
			`str_dupspaces(buffer);`
			`}`

			`// 'and' in various languages`
			`str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);`

			`// 'the' (and similar)`
			`str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);`
			`str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);`

			`// german`
			`str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);`
			`str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);`
			`str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);`
			`str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);`
			`str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);`
			`str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);`

			`// russian`
			`str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);`
			`str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);`

			`// allocate & create the result`
			`len--;// Drop the terminating zero`
			`result = (text *)palloc(len + VARHDRSZ);`
			`SET_VARSIZE(result, len + VARHDRSZ);`
			`memcpy(VARDATA(result), buffer, len);`

			`pfree(buffer);`

			`PG_RETURN_TEXT_P(result);`
			`}`