Mirror of https://github.com/osm-search/Nominatim.git
(synced 2024-12-23 21:14:11 +03:00)

Commit ca7b46511d — "introduce and use analyzer for postcodes"
Parent commit: 18864afa8a
@ -223,3 +223,26 @@ BEGIN
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
-- Make sure a postcode word entry exists in the word table.
--
-- postcode      the normalized postcode (possibly with an '@variant' suffix)
-- lookup_terms  transliterated search terms to register for the postcode
--
-- Returns TRUE when the postcode was already present, FALSE when the
-- lookup terms were newly inserted.
CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
  RETURNS BOOLEAN
  AS $$
BEGIN
  -- EXISTS lets the planner stop at the first matching row instead of
  -- counting every entry for the postcode.
  IF EXISTS (SELECT * FROM word WHERE word = postcode and type = 'P') THEN
    RETURN TRUE;
  END IF;

  -- postcodes don't need word ids
  INSERT INTO word (word_token, type, word)
    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;

  RETURN FALSE;
END;
$$
LANGUAGE plpgsql;
|
||||
|
||||
|
@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from textwrap import dedent
|
||||
|
||||
from nominatim.db.connection import connect
|
||||
@ -473,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||
def _process_place_address(self, token_info, address):
|
||||
for item in address:
|
||||
if item.kind == 'postcode':
|
||||
self._add_postcode(item.name)
|
||||
token_info.set_postcode(self._add_postcode(item))
|
||||
elif item.kind == 'housenumber':
|
||||
token_info.add_housenumber(*self._compute_housenumber_token(item))
|
||||
elif item.kind == 'street':
|
||||
@ -605,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||
return full_tokens, partial_tokens
|
||||
|
||||
|
||||
def _add_postcode(self, item):
    """ Make sure the normalized postcode is present in the word table.

        Returns the token word for the postcode (the normalized name,
        with the variant hint appended after '@' when one is set) or
        None when the postcode cannot be turned into a searchable term.
    """
    analyzer = self.token_analysis.get_analyzer('@postcode')

    if analyzer is None:
        # No dedicated postcode analyzer configured: fall back to
        # plain uppercasing and skip variant generation.
        postcode_name = item.name.strip().upper()
        variant_base = None
    else:
        postcode_name = analyzer.normalize(item.name)
        variant_base = item.get_attr("variant")

    # The stored word carries the variant hint so that the same postcode
    # with different lookup variants is cached and saved separately.
    if variant_base is not None:
        postcode = f'{postcode_name}@{variant_base}'
    else:
        postcode = postcode_name

    if postcode not in self._cache.postcodes:
        term = self._search_normalized(postcode_name)
        if not term:
            return None

        variants = {term}
        if analyzer is not None and variant_base is not None:
            variants.update(analyzer.get_variants_ascii(variant_base))

        with self.conn.cursor() as cur:
            cur.execute("SELECT create_postcode_word(%s, %s)",
                        (postcode, list(variants)))
        self._cache.postcodes.add(postcode)

    # The caller feeds this into token_info.set_postcode(), so the token
    # word must be returned even when it was already cached.
    return postcode
|
||||
|
||||
|
||||
class _TokenInfo:
|
||||
@ -637,6 +646,7 @@ class _TokenInfo:
|
||||
self.street_tokens = set()
|
||||
self.place_tokens = set()
|
||||
self.address_tokens = {}
|
||||
self.postcode = None
|
||||
|
||||
|
||||
@staticmethod
|
||||
@ -701,6 +711,11 @@ class _TokenInfo:
|
||||
if partials:
|
||||
self.address_tokens[key] = self._mk_array(partials)
|
||||
|
||||
def set_postcode(self, postcode):
    """ Remember the given postcode token for this place.
    """
    self.postcode = postcode
|
||||
|
||||
|
||||
class _TokenCache:
|
||||
""" Cache for token information to avoid repeated database queries.
|
||||
|
@ -98,7 +98,7 @@ class _PostcodeSanitizer:
|
||||
obj.address.pop(pos)
|
||||
else:
|
||||
postcode.name = formatted[0]
|
||||
postcode.set_attr('lookup', formatted[1])
|
||||
postcode.set_attr('variant', formatted[1])
|
||||
|
||||
|
||||
def scan(self, postcode, country):
|
||||
|
54
nominatim/tokenizer/token_analysis/postcodes.py
Normal file
54
nominatim/tokenizer/token_analysis/postcodes.py
Normal file
@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Specialized processor for postcodes. Supports a 'lookup' variant of the
|
||||
token, which produces variants with optional spaces.
|
||||
"""
|
||||
|
||||
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
|
||||
|
||||
### Configuration section
|
||||
|
||||
def configure(rules, normalization_rules): # pylint: disable=W0613
    """ Set up the analysis configuration.

        There are currently no configurable options; the behaviour is
        entirely hard-coded, so no configuration object is produced.
    """
    return None
|
||||
|
||||
### Analysis section
|
||||
|
||||
def create(normalizer, transliterator, config): # pylint: disable=W0613
    """ Instantiate a new token analyser for this module.
    """
    analysis = PostcodeTokenAnalysis(normalizer, transliterator)
    return analysis
|
||||
|
||||
class PostcodeTokenAnalysis:
    """ Special normalization and variant generation for postcodes.

        Supports a 'lookup' variant of the token which produces
        spelling variants with and without the internal space.
        (The previous docstring wrongly described housenumber handling —
        copy-paste from the housenumber analyzer.)
    """
    def __init__(self, norm, trans):
        self.norm = norm
        self.trans = trans

        # Each space in the postcode may optionally be dropped in a variant.
        self.mutator = MutationVariantGenerator(' ', (' ', ''))


    def normalize(self, name):
        """ Return the standard form of the postcode.
        """
        return name.strip().upper()


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized postcode.

            The official form creates one variant. If a 'lookup version' is
            given, then it will create variants with optional spaces.
        """
        # Postcodes follow their own transliteration rules.
        # Make sure at this point, that the terms are normalized in a way
        # that they are searchable with the standard transliteration rules.
        return [self.trans.transliterate(term) for term in
                self.mutator.generate([self.norm.transliterate(norm_name)])]
|
@ -34,7 +34,7 @@ sanitizers:
|
||||
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
|
||||
- step: clean-postcodes
|
||||
convert-to-address: yes
|
||||
default-pattern: [A-Z0-9- ]{3,12}
|
||||
default-pattern: "[A-Z0-9- ]{3,12}"
|
||||
- step: split-name-list
|
||||
- step: strip-brace-terms
|
||||
- step: tag-analyzer-by-language
|
||||
@ -46,6 +46,8 @@ token-analysis:
|
||||
- analyzer: generic
|
||||
- id: "@housenumber"
|
||||
analyzer: housenumbers
|
||||
- id: "@postcode"
|
||||
analyzer: postcodes
|
||||
- id: bg
|
||||
analyzer: generic
|
||||
mode: variant-only
|
||||
|
Loading…
Reference in New Issue
Block a user