add framework for analysing housenumbers

This lays the groundwork for adding variants for housenumbers.
When analysis is enabled, the 'word' field in the word table
is used as usual, so that variants can be created. Only one
analyser is allowed, and it must have the fixed name
'@housenumber'.
This commit is contained in:
Sarah Hoffmann 2022-02-16 11:15:43 +01:00
parent b8c544cc98
commit a6903651fc
4 changed files with 59 additions and 12 deletions

View File

@ -157,7 +157,8 @@ class Tokenizer
$sSQL = 'SELECT word_id, word_token, type, word,';
$sSQL .= " info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
$sSQL .= " info->>'count' as count";
$sSQL .= " info->>'count' as count,";
$sSQL .= " info->>'lookup' as lookup";
$sSQL .= ' FROM word WHERE word_token in (';
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@ -179,7 +180,8 @@ class Tokenizer
}
break;
case 'H': // house number tokens
$oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
$sLookup = $aWord['lookup'] ?? $aWord['word_token'];
$oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup));
break;
case 'P': // postcode tokens
// Postcodes are not normalized, so they may have content

View File

@ -200,3 +200,26 @@ BEGIN
END;
$$
LANGUAGE plpgsql;
-- Return the word id for an analyzed housenumber, creating the word
-- entries first when none exist yet.
--
--   norm_term    - term stored in the 'word' column; also used to find
--                  already existing entries of type 'H'.
--   lookup_terms - one row is inserted per element, with the element
--                  used as 'word_token'.
--
-- All rows created by a single call share one id drawn from 'seq_word'
-- and carry the first element of lookup_terms under the 'lookup' key of
-- the 'info' column. When matching entries already exist, the smallest
-- word_id among them is returned and nothing is inserted.
CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
  RETURNS INTEGER
  AS $$
DECLARE
  hnr_id INTEGER;
BEGIN
  -- min() yields NULL when no row matches, which signals "not yet known".
  hnr_id := (SELECT min(w.word_id)
               FROM word w
              WHERE w.type = 'H' AND w.word = norm_term);

  IF hnr_id IS NOT NULL THEN
    RETURN hnr_id;
  END IF;

  -- Unknown housenumber: reserve a fresh id and add one row per lookup term.
  hnr_id := nextval('seq_word');

  INSERT INTO word (word_id, word_token, type, word, info)
    SELECT hnr_id,
           t.variant,
           'H',
           norm_term,
           json_build_object('lookup', lookup_terms[1])
      FROM unnest(lookup_terms) AS t(variant);

  RETURN hnr_id;
END;
$$
LANGUAGE plpgsql;

View File

@ -28,6 +28,10 @@ CREATE INDEX idx_word_postcodes ON word
-- Partial index on the word column, restricted to rows of type 'W'.
CREATE INDEX idx_word_full_word
  ON word USING btree (word) {{db.tablespace.address_index}}
  WHERE type = 'W';

-- Used when inserting analyzed housenumbers (exclude old-style entries).
-- Only rows of type 'H' that have a value in the word column are indexed.
CREATE INDEX idx_word_housenumbers
  ON word USING btree (word) {{db.tablespace.address_index}}
  WHERE type = 'H' AND word IS NOT NULL;

GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

View File

@ -485,18 +485,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" Normalize the housenumber and return the word token and the
canonical form.
"""
norm_name = self._search_normalized(hnr.name)
if not norm_name:
return None, None
analyzer = self.token_analysis.analysis.get('@housenumber')
result = None, None
token = self._cache.housenumbers.get(norm_name)
if token is None:
with self.conn.cursor() as cur:
cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
token = cur.fetchone()[0]
self._cache.housenumbers[norm_name] = token
if analyzer is None:
# When no custom analyzer is set, simply normalize and transliterate
norm_name = self._search_normalized(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
result = cur.fetchone()[0], norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
norm_name = analyzer.normalize(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
variants = analyzer.get_variants_ascii(norm_name)
if variants:
with self.conn.cursor() as cur:
cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
(norm_name, list(variants)))
result = cur.fetchone()[0], variants[0]
self._cache.housenumbers[norm_name] = result
return token, norm_name
return result
def _compute_partial_tokens(self, name):