From 8080625747dc7e87bc510d2af0d3edf5d551a6d0 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 12 May 2022 11:43:47 +0200 Subject: [PATCH] remove postcodes from countries that don't have them The postcodes will only be removed as a 'computed postcode'; they are still searchable for the given object. --- .pylintrc | 2 +- nominatim/tokenizer/sanitizers/config.py | 14 ++++++++++++++ .../sanitizers/tag_analyzer_by_language.py | 3 +-- nominatim/tools/country_info.py | 14 ++++++++++++-- settings/icu_tokenizer.yaml | 2 ++ test/bdd/db/import/postcodes.feature | 16 +++++++++++++++- 6 files changed, 45 insertions(+), 6 deletions(-) diff --git a/.pylintrc b/.pylintrc index fef53872..52d9fcf9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use -good-names=i,x,y,fd,db +good-names=i,x,y,fd,db,cc diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py index ecfcacbe..ce5ce1eb 100644 --- a/nominatim/tokenizer/sanitizers/config.py +++ b/nominatim/tokenizer/sanitizers/config.py @@ -44,6 +44,20 @@ class SanitizerConfig(UserDict): return values + def get_bool(self, param, default=None): + """ Extract a configuration parameter as a boolean. + The parameter must be one of the yaml boolean values or a + user error will be raised. If `default` is given, then the parameter + may also be missing or empty. 
+ """ + value = self.data.get(param, default) + + if not isinstance(value, bool): + raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.") + + return value + + def get_delimiter(self, default=',;'): """ Return the 'delimiter' parameter in the configuration as a compiled regular expression that can be used to split the names on the diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py index 7898b1c6..9a99d127 100644 --- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py +++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py @@ -48,8 +48,7 @@ class _AnalyzerByLanguage: self.deflangs = {} if use_defaults in ('mono', 'all'): - for ccode, prop in country_info.iterate(): - clangs = prop['languages'] + for ccode, clangs in country_info.iterate('languages'): if len(clangs) == 1 or use_defaults == 'all': if self.whitelist: self.deflangs[ccode] = [l for l in clangs if l in self.whitelist] diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py index 0ad00171..d754b4dd 100644 --- a/nominatim/tools/country_info.py +++ b/nominatim/tools/country_info.py @@ -84,10 +84,20 @@ def setup_country_config(config): _COUNTRY_INFO.load(config) -def iterate(): +def iterate(prop=None): """ Iterate over country code and properties. + + When `prop` is None, all countries are returned with their complete + set of properties. + + If `prop` is given, then only countries are returned where the + given property is set. The second item of the tuple contains only + the content of the given property. 
""" - return _COUNTRY_INFO.items() + if prop is None: + return _COUNTRY_INFO.items() + + return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p) def setup_country_tables(dsn, sql_dir, ignore_partitions=False): diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index cd9c0d6d..544bd81d 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -32,6 +32,8 @@ sanitizers: - streetnumber convert-to-name: - (\A|.*,)[^\d,]{3,}(,.*|\Z) + - step: clean-postcodes + convert-to-address: yes - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 15beab57..50afa7cc 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -246,4 +246,18 @@ Feature: Import of postcodes | 12 445 4 | ca | 25 | 11 | | A1:BC10 | ca | 25 | 11 | - + Scenario: Postcodes outside all countries are not added to the postcode and word table + Given the places + | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | + | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | + And the places + | osm | class | type | name | geometry | + | N1 | place | hamlet | Null Island | 0 0 | + When importing + Then location_postcode contains exactly + | country | postcode | geometry | + And there are no word tokens for postcodes 01982 + When sending search query "111, 01982 Null Island" + Then results contain + | osm | display_name | + | N34 | 111, Null Island, 01982 |