add sanitizer for TIGER tags

Currently only takes over cleaning the tiger:county data. This was done by the import until now.
2024-11-22 21:28:10 +03:00 · 2022-11-22 17:10:21 +01:00 · 2022-11-22 17:10:21 +01:00 · fd3dec8efe
commit fd3dec8efe
parent 55ee08f42b
5 changed files with 100 additions and 1 deletions
--- a/.pylintrc
+++ b/.pylintrc
@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing
 #   typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
-good-names=i,x,y,fd,db,cc
+good-names=i,x,y,m,fd,db,cc
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim.
    rendering:
        heading_level: 6
 ##### clean-tiger-tags
 ::: nominatim.tokenizer.sanitizers.clean_tiger_tags
    selection:
        members: False
    rendering:
        heading_level: 6
 #### Token Analysis
--- a/nominatim/tokenizer/sanitizers/clean_tiger_tags.py
+++ b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py
@ -0,0 +1,46 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Sanitizer that preprocesses tags from the TIGER import.
 It makes the following changes:
 * remove state reference from tiger:county
 """
 from typing import Callable
 import re
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 # Pattern for '<name>, XX' where XX is two uppercase ASCII letters;
 # group 1 captures <name> (everything before the last ', XX').
 COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
 def _clean_tiger_county(obj: ProcessInfo) -> None:
    """ Normalise the first tiger:county entry found in the address.
        A trailing state reference is stripped from the name, so that
        'Hamilton, AL' becomes 'Hamilton'. Names that do not end in such
        a reference are kept unchanged. In either case the entry is
        relabelled to kind 'county' with suffix 'tiger'.
        Only the first matching entry is processed; nothing happens when
        the place has no address or no matching entry.
    """
    for entry in obj.address or ():
        # Skip everything that is not a tiger:county tag.
        if entry.kind != 'tiger' or entry.suffix != 'county':
            continue
        state_ref = COUNTY_MATCH.fullmatch(entry.name)
        if state_ref is not None:
            entry.name = state_ref.group(1)
        # The split left kind and suffix reversed, so swap them here.
        entry.kind, entry.suffix = 'county', 'tiger'
        # Only the first matching entry is handled.
        break
 def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that cleans up TIGER tags.
        The configuration argument is not used. The returned callable is
        `_clean_tiger_county`.
    """
    return _clean_tiger_county
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@ -35,6 +35,7 @@ sanitizers:
    - step: clean-postcodes
      convert-to-address: yes
      default-pattern: "[A-Z0-9- ]{3,12}"
    - step: clean-tiger-tags
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
--- a/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py
+++ b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py
@ -0,0 +1,43 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tests for the sanitizer that cleans up TIGER tags.
 """
 import pytest
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 class TestCleanTigerTags:
    """ Checks for the 'clean-tiger-tags' sanitizer step, run via PlaceSanitizer.
    """
    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        """ Store the def_config fixture for use when building the sanitizer.
        """
        self.config = def_config
    def run_sanitizer_on(self, addr):
        """ Run the sanitizer over a place with the given address tags.
            Returns the sorted list of (name, kind, suffix) tuples of the
            resulting address entries.
        """
        place = PlaceInfo({'address': addr})
        steps = [{'step': 'clean-tiger-tags'}]
        sanitizer = PlaceSanitizer(steps, self.config)
        _, outaddr = sanitizer.process_names(place)
        return sorted((p.name, p.kind, p.suffix) for p in outaddr)
    @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'),
                                                ('Little, Borough, CA', 'Little, Borough')])
    def test_well_formatted(self, inname, outname):
        """ A trailing ', XX' state reference is removed from the county name.
        """
        expected = [(outname, 'county', 'tiger')]
        assert self.run_sanitizer_on({'tiger:county': inname}) == expected
    @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', ''))
    def test_badly_formatted(self, name):
        """ Names without a state reference keep their name but are relabelled.
        """
        expected = [(name, 'county', 'tiger')]
        assert self.run_sanitizer_on({'tiger:county': name}) == expected
    def test_unmatched(self):
        """ Other tiger tags are passed through untouched.
        """
        expected = [('US', 'tiger', 'country')]
        assert self.run_sanitizer_on({'tiger:country': 'US'}) == expected