mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-27 15:05:28 +03:00
300 lines
8.6 KiB
C
300 lines
8.6 KiB
C
/**
 * SPDX-License-Identifier: GPL-2.0-only
 *
 * This file is part of Nominatim. (https://nominatim.org)
 *
 * Copyright (C) 2022 by the Nominatim developer community.
 * For a full list of authors see the git log.
 */
|
|
#include "postgres.h"
|
|
#include "fmgr.h"
|
|
#include "mb/pg_wchar.h"
|
|
#include <utfasciitable.h>
|
|
|
|
#ifdef PG_MODULE_MAGIC
|
|
PG_MODULE_MAGIC;
|
|
#endif
|
|
|
|
Datum transliteration( PG_FUNCTION_ARGS );
|
|
Datum gettokenstring( PG_FUNCTION_ARGS );
|
|
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
|
|
void str_dupspaces(char* buffer);
|
|
|
|
PG_FUNCTION_INFO_V1( transliteration );
|
|
Datum
|
|
transliteration( PG_FUNCTION_ARGS )
|
|
{
|
|
static char * ascii = UTFASCII;
|
|
static uint16 asciilookup[65536] = UTFASCIILOOKUP;
|
|
char * asciipos;
|
|
|
|
text *source;
|
|
unsigned char *sourcedata;
|
|
int sourcedatalength;
|
|
|
|
unsigned int c1,c2,c3,c4;
|
|
unsigned int * wchardata;
|
|
unsigned int * wchardatastart;
|
|
|
|
text *result;
|
|
unsigned char *resultdata;
|
|
int resultdatalength;
|
|
int iLen;
|
|
|
|
if (GetDatabaseEncoding() != PG_UTF8)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("requires UTF8 database encoding")));
|
|
}
|
|
|
|
if (PG_ARGISNULL(0))
|
|
{
|
|
PG_RETURN_NULL();
|
|
}
|
|
|
|
// The original string
|
|
source = PG_GETARG_TEXT_P(0);
|
|
sourcedata = (unsigned char *)VARDATA(source);
|
|
sourcedatalength = VARSIZE(source) - VARHDRSZ;
|
|
|
|
// Intermediate wchar version of string
|
|
wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
|
|
|
|
// Based on pg_utf2wchar_with_len from wchar.c
|
|
// Postgresql strings are not zero terminalted
|
|
while (sourcedatalength > 0)
|
|
{
|
|
if ((*sourcedata & 0x80) == 0)
|
|
{
|
|
*wchardata = *sourcedata++;
|
|
wchardata++;
|
|
sourcedatalength--;
|
|
}
|
|
else if ((*sourcedata & 0xe0) == 0xc0)
|
|
{
|
|
if (sourcedatalength < 2) break;
|
|
c1 = *sourcedata++ & 0x1f;
|
|
c2 = *sourcedata++ & 0x3f;
|
|
*wchardata = (c1 << 6) | c2;
|
|
if (*wchardata < 65536) wchardata++;
|
|
sourcedatalength -= 2;
|
|
}
|
|
else if ((*sourcedata & 0xf0) == 0xe0)
|
|
{
|
|
if (sourcedatalength < 3) break;
|
|
c1 = *sourcedata++ & 0x0f;
|
|
c2 = *sourcedata++ & 0x3f;
|
|
c3 = *sourcedata++ & 0x3f;
|
|
*wchardata = (c1 << 12) | (c2 << 6) | c3;
|
|
if (*wchardata < 65536) wchardata++;
|
|
sourcedatalength -= 3;
|
|
}
|
|
else if ((*sourcedata & 0xf8) == 0xf0)
|
|
{
|
|
if (sourcedatalength < 4) break;
|
|
c1 = *sourcedata++ & 0x07;
|
|
c2 = *sourcedata++ & 0x3f;
|
|
c3 = *sourcedata++ & 0x3f;
|
|
c4 = *sourcedata++ & 0x3f;
|
|
*wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
|
|
if (*wchardata < 65536) wchardata++;
|
|
sourcedatalength -= 4;
|
|
}
|
|
else if ((*sourcedata & 0xfc) == 0xf8)
|
|
{
|
|
// table does not extend beyond 4 char long, just skip
|
|
if (sourcedatalength < 5) break;
|
|
sourcedatalength -= 5;
|
|
sourcedata += 5;
|
|
}
|
|
else if ((*sourcedata & 0xfe) == 0xfc)
|
|
{
|
|
// table does not extend beyond 4 char long, just skip
|
|
if (sourcedatalength < 6) break;
|
|
sourcedatalength -= 6;
|
|
sourcedata += 6;
|
|
}
|
|
else
|
|
{
|
|
// assume lenngth 1, silently drop bogus characters
|
|
sourcedatalength--;
|
|
sourcedata += 1;
|
|
}
|
|
}
|
|
*wchardata = 0;
|
|
|
|
// calc the length of transliteration string
|
|
resultdatalength = 0;
|
|
wchardata = wchardatastart;
|
|
while(*wchardata)
|
|
{
|
|
if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
|
|
wchardata++;
|
|
}
|
|
|
|
// allocate & create the result
|
|
result = (text *)palloc(resultdatalength + VARHDRSZ);
|
|
SET_VARSIZE(result, resultdatalength + VARHDRSZ);
|
|
resultdata = (unsigned char *)VARDATA(result);
|
|
|
|
wchardata = wchardatastart;
|
|
while(*wchardata)
|
|
{
|
|
if (*(asciilookup + *wchardata) > 0)
|
|
{
|
|
asciipos = ascii + *(asciilookup + *wchardata);
|
|
for(iLen = *asciipos; iLen > 0; iLen--)
|
|
{
|
|
asciipos++;
|
|
*resultdata = *asciipos;
|
|
resultdata++;
|
|
}
|
|
}
|
|
/*else
|
|
{
|
|
ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
|
|
errmsg( "missing char: %i\n", *wchardata )));
|
|
|
|
}*/
|
|
wchardata++;
|
|
}
|
|
|
|
pfree(wchardatastart);
|
|
|
|
PG_RETURN_TEXT_P(result);
|
|
}
|
|
|
|
/*
 * Replace occurrences of `from` with `to` inside the zero-terminated string
 * `buffer`, in place.
 *
 *  buffer   string to modify; the caller must provide enough room for the
 *           string to grow when tolen > fromlen
 *  len      running length counter kept by the caller; it is only compared
 *           against fromlen for the early exit and adjusted by
 *           (tolen - fromlen) for every replacement made
 *  changes  incremented once for every replacement made
 *  from     zero-terminated search string, fromlen bytes long
 *  to       replacement, tolen bytes are copied (no terminator needed)
 *  isspace  Set isspace=1 if the replacement _only_ adds a space before the
 *           search string. I.e. to == " " + from. Matches that are at the
 *           start of the buffer or already preceded by a space are then
 *           left alone.
 *
 * The search resumes one byte after the start of the previous match, i.e.
 * possibly inside the text that was just written.
 */
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
    char *p;

    // Search string is too long to be present
    if (fromlen > *len) return;

    p = strstr(buffer, from);
    while(p)
    {
        if (!isspace || (p > buffer && *(p-1) != ' '))
        {
            (*changes)++;
            // Shift exactly the tail behind the match, terminator included.
            // Deriving the count from *len would move bytes that lie beyond
            // the end of the string.
            if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
            memcpy(p, to, tolen);
            *len += tolen - fromlen;
        }
        p = strstr(p+1, from);
    }
}
|
|
|
|
/*
 * Collapse every run of consecutive space characters in the zero-terminated
 * string `buffer` into a single space. The string is rewritten in place and
 * can only get shorter.
 */
void str_dupspaces(char* buffer)
{
    const char *rd;
    char *wr = buffer;
    int prev_was_space = 0;

    for (rd = buffer; *rd != '\0'; rd++)
    {
        int is_space = (*rd == ' ');

        // Second and later spaces of a run are dropped.
        if (is_space && prev_was_space) continue;

        *wr++ = *rd;
        prev_was_space = is_space;
    }
    *wr = '\0';
}
|
|
|
|
PG_FUNCTION_INFO_V1( gettokenstring );
|
|
Datum
|
|
gettokenstring( PG_FUNCTION_ARGS )
|
|
{
|
|
text *source;
|
|
unsigned char *sourcedata;
|
|
int sourcedatalength;
|
|
|
|
char * buffer;
|
|
int len;
|
|
int changes;
|
|
|
|
text *result;
|
|
|
|
if (GetDatabaseEncoding() != PG_UTF8)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("requires UTF8 database encoding")));
|
|
}
|
|
|
|
if (PG_ARGISNULL(0))
|
|
{
|
|
PG_RETURN_NULL();
|
|
}
|
|
|
|
// The original string
|
|
source = PG_GETARG_TEXT_P(0);
|
|
sourcedata = (unsigned char *)VARDATA(source);
|
|
sourcedatalength = VARSIZE(source) - VARHDRSZ;
|
|
|
|
// Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
|
|
buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
|
|
memcpy(buffer+1, sourcedata, sourcedatalength);
|
|
buffer[0] = 32;
|
|
buffer[sourcedatalength+1] = 32;
|
|
buffer[sourcedatalength+2] = 0;
|
|
len = sourcedatalength+3;
|
|
|
|
changes = 1;
|
|
str_dupspaces(buffer);
|
|
while(changes)
|
|
{
|
|
changes = 0;
|
|
#include <tokenstringreplacements.inc>
|
|
str_dupspaces(buffer);
|
|
}
|
|
|
|
// 'and' in various languages
|
|
str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
|
|
|
|
// 'the' (and similar)
|
|
str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
|
|
str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
|
|
|
|
// german
|
|
str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
|
|
str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
|
|
str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
|
|
str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
|
|
str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
|
|
str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
|
|
|
|
// russian
|
|
str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
|
|
str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
|
|
|
|
// allocate & create the result
|
|
len--;// Drop the terminating zero
|
|
result = (text *)palloc(len + VARHDRSZ);
|
|
SET_VARSIZE(result, len + VARHDRSZ);
|
|
memcpy(VARDATA(result), buffer, len);
|
|
|
|
pfree(buffer);
|
|
|
|
PG_RETURN_TEXT_P(result);
|
|
}
|
|
|