mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-22 21:28:10 +03:00
add sanitizer for TIGER tags
Currently only takes over cleaning the tiger:county data. This was done by the import until now.
This commit is contained in:
parent
55ee08f42b
commit
fd3dec8efe
@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing
|
|||||||
# typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
|
# typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
|
||||||
disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
|
disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
|
||||||
|
|
||||||
good-names=i,x,y,fd,db,cc
|
good-names=i,x,y,m,fd,db,cc
|
||||||
|
@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim.
|
|||||||
rendering:
|
rendering:
|
||||||
heading_level: 6
|
heading_level: 6
|
||||||
|
|
||||||
|
##### clean-tiger-tags
|
||||||
|
|
||||||
|
::: nominatim.tokenizer.sanitizers.clean_tiger_tags
|
||||||
|
selection:
|
||||||
|
members: False
|
||||||
|
rendering:
|
||||||
|
heading_level: 6
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#### Token Analysis
|
#### Token Analysis
|
||||||
|
|
||||||
|
46
nominatim/tokenizer/sanitizers/clean_tiger_tags.py
Normal file
46
nominatim/tokenizer/sanitizers/clean_tiger_tags.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Sanitizer that preprocesses tags from the TIGER import.
|
||||||
|
|
||||||
|
It makes the following changes:
|
||||||
|
|
||||||
|
* remove state reference from tiger:county
|
||||||
|
"""
|
||||||
|
from typing import Callable
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
|
# Pattern for a name ending in ', ' plus two upper-case ASCII letters.
# When applied with fullmatch(), group 1 holds everything before that
# final comma.
COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
|
||||||
|
|
||||||
|
def _clean_tiger_county(obj: ProcessInfo) -> None:
    """ Strip the trailing state code from the tiger:county address tag.

        A value such as 'Hamilton, AL' is shortened to 'Hamilton'. When the
        value does not end in a comma followed by a two-letter upper-case
        code, the name is kept unchanged. Processing stops after the first
        tiger:county item.
    """
    for entry in obj.address or ():
        if entry.kind != 'tiger' or entry.suffix != 'county':
            continue

        found = COUNTY_MATCH.fullmatch(entry.name)
        if found is not None:
            entry.name = found.group(1)

        # Kind and suffix arrive reversed from the split, so swap them.
        entry.kind, entry.suffix = 'county', 'tiger'
        break
|
||||||
|
|
||||||
|
|
||||||
|
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that cleans up TIGER address tags.

        The configuration argument is not used.
    """
    return _clean_tiger_county
|
@ -35,6 +35,7 @@ sanitizers:
|
|||||||
- step: clean-postcodes
|
- step: clean-postcodes
|
||||||
convert-to-address: yes
|
convert-to-address: yes
|
||||||
default-pattern: "[A-Z0-9- ]{3,12}"
|
default-pattern: "[A-Z0-9- ]{3,12}"
|
||||||
|
- step: clean-tiger-tags
|
||||||
- step: split-name-list
|
- step: split-name-list
|
||||||
- step: strip-brace-terms
|
- step: strip-brace-terms
|
||||||
- step: tag-analyzer-by-language
|
- step: tag-analyzer-by-language
|
||||||
|
43
test/python/tokenizer/sanitizers/test_clean_tiger_tags.py
Normal file
43
test/python/tokenizer/sanitizers/test_clean_tiger_tags.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for the sanitizer that cleans up TIGER tags.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||||
|
from nominatim.data.place_info import PlaceInfo
|
||||||
|
|
||||||
|
class TestCleanTigerTags:
    """ Checks for the 'clean-tiger-tags' sanitizer step.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        # Keep the default configuration around for building the sanitizer.
        self.config = def_config


    def run_sanitizer_on(self, addr):
        """ Run the sanitizer over the given address tags and return the
            resulting address items as a sorted list of
            (name, kind, suffix) tuples.
        """
        place = PlaceInfo({'address': addr})
        sanitizer = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config)
        _, outaddr = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix) for p in outaddr)


    @pytest.mark.parametrize('inname,outname', [
        ('Hamilton, AL', 'Hamilton'),
        ('Little, Borough, CA', 'Little, Borough'),
    ])
    def test_well_formatted(self, inname, outname):
        result = self.run_sanitizer_on({'tiger:county': inname})

        assert result == [(outname, 'county', 'tiger')]


    @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', ''))
    def test_badly_formatted(self, name):
        result = self.run_sanitizer_on({'tiger:county': name})

        assert result == [(name, 'county', 'tiger')]


    def test_unmatched(self):
        result = self.run_sanitizer_on({'tiger:country': 'US'})

        assert result == [('US', 'tiger', 'country')]
|
Loading…
Reference in New Issue
Block a user