add japanese sanitizer

This commit is contained in:
miku0 2023-07-26 07:54:58 +00:00
parent 261e0cfd5a
commit 0722495434
5 changed files with 253 additions and 0 deletions

View File

@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
rendering:
heading_level: 6
#### tag-japanese
::: nominatim.tokenizer.sanitizers.tag_japanese
selection:
members: False
rendering:
heading_level: 6
#### Token Analysis
Token analyzers take a full name and transform it into one or more normalized

View File

@ -0,0 +1,150 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber,
and quarter and neighbourhood into a single place.
"""
from typing import Callable
from typing import List
from typing import Optional
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_name import PlaceName
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create the sanitizer callable for this step.

        The configuration argument is ignored; the module-level
        `tag_japanese` function is handed back unchanged.
    """
    sanitizer = tag_japanese
    return sanitizer
# Translation table from single Kanji digits to Arabic digits, built once
# at import time. Both common spellings of zero are covered.
_KANJI_DIGIT_TABLE = str.maketrans({
    '零': '0',
    '〇': '0',
    '一': '1',
    '二': '2',
    '三': '3',
    '四': '4',
    '五': '5',
    '六': '6',
    '七': '7',
    '八': '8',
    '九': '9',
})


def convert_kanji_sequence_to_number(sequence: str) -> str:
    """ Convert Kanji digits in a string to Arabic digits.

        Every Kanji digit (zero to nine) is replaced by the corresponding
        Arabic digit; all other characters are copied through unchanged
        and keep their position.

        Only a digit-by-digit replacement is done. Positional numerals
        such as the Kanji for ten, hundred or thousand are not
        interpreted and are left as they are.
    """
    return sequence.translate(_KANJI_DIGIT_TABLE)
def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
    """ Recombine the tag of housenumber by using housenumber and blocknumber.

        When both values are set, a single 'housenumber' entry of the form
        '<blocknumber>-<housenumber>' is appended to `new_address`. When
        only one of them is set, that value is used on its own. When
        neither is set, nothing is appended.

        The list is modified in place and also returned.
    """
    # Block number comes first; missing or empty parts are skipped.
    parts = [part for part in (tmp_blocknumber, tmp_housenumber) if part]
    if parts:
        new_address.append(
            PlaceName(kind='housenumber', name='-'.join(parts), suffix='')
        )
    return new_address
def reconbine_place(
    new_address: List[PlaceName],
    tmp_neighbourhood: Optional[str],
    tmp_quarter: Optional[str]
) -> List[PlaceName]:
    """ Recombine the tag of place by using neighbourhood and quarter.

        When both values are set, a single 'place' entry is appended to
        `new_address` whose name is the quarter directly followed by the
        neighbourhood (no separator). When only one of them is set, that
        value is used on its own. When neither is set, nothing is appended.

        The list is modified in place and also returned.
    """
    # Quarter comes first; missing or empty parts are skipped.
    parts = [part for part in (tmp_quarter, tmp_neighbourhood) if part]
    if parts:
        new_address.append(
            PlaceName(kind='place', name=''.join(parts), suffix='')
        )
    return new_address
def tag_japanese(obj: ProcessInfo) -> None:
    """ Rewrite the address parts of a Japanese place.

        Places whose country code is not 'jp' are left untouched.
        For Japanese places, Kanji digits in all names and address parts
        are converted first. Then housenumber and block_number are merged
        into one 'housenumber' entry, and neighbourhood and quarter are
        merged into one 'place' entry. All other address parts are kept.
    """
    if obj.place.country_code != 'jp':
        return

    for name_entry in obj.names:
        name_entry.name = convert_kanji_sequence_to_number(name_entry.name)

    housenumber = None
    blocknumber = None
    neighbourhood = None
    quarter = None
    kept = []

    for part in obj.address:
        part.name = convert_kanji_sequence_to_number(part.name)
        kind = part.kind
        if kind == 'housenumber':
            housenumber = part.name
        elif kind == 'block_number':
            blocknumber = part.name
        elif kind == 'neighbourhood':
            neighbourhood = part.name
        elif kind == 'quarter':
            quarter = part.name
        else:
            # Not one of the four merged kinds: pass through as is.
            kept.append(part)

    kept = reconbine_housenumber(kept, housenumber, blocknumber)
    kept = reconbine_place(kept, neighbourhood, quarter)

    obj.address = [part for part in kept if part.name is not None]

View File

@ -45,6 +45,7 @@ sanitizers:
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
- step: tag-japanese
token-analysis:
- analyzer: generic
- id: "@housenumber"

View File

@ -0,0 +1,29 @@
@DB
Feature: Searches in Japan
Tests specifically for searches of Japanese addresses and for searches in the Japanese language.
Scenario: A block house-number is parented to the neighbourhood
Given the grid with origin JP
| 1 | | | | 2 |
| | 3 | | | |
| | | 9 | | |
| | | | 6 | |
And the places
| osm | class | type | name | geometry |
| W1 | highway | residential | | 1,2 |
And the places
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry |
| N3 | amenity | restaurant | 2 | 6 | 2 | 3 |
And the places
| osm | class | type | name | geometry |
| N9 | place | neighbourhood | 2 | 9 |
And the places
| osm | class | type | name | geometry |
| N6 | place | quarter | | 6 |
When importing
Then placex contains
| object | parent_place_id |
| N3 | N9 |
When sending search query "2 6-2"
Then results contain
| osm |
| N3 |

View File

@ -0,0 +1,65 @@
from nominatim.data.place_info import PlaceInfo
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from typing import Mapping, Optional, List
import pytest
class TestTagJapanese:
    """ Tests for the tag-japanese sanitizer step.

        Each test feeds a set of address tags for a place in Japan through
        the sanitizer and compares the sorted (name, kind) pairs of the
        resulting address parts.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """ Run the sanitizer on a place with the given address tags.

            `type` is accepted for call compatibility but is not used.
            Returns the sorted list of (name, kind) tuples of the
            sanitized address parts.
        """
        place = PlaceInfo({
            'address': kwargs,
            'country_code': 'jp'
        })
        sanitizer_args = {'step': 'tag-japanese'}
        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
        return sorted((p.name, p.kind) for p in address)

    def test_on_address(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        res = self.run_sanitizer_on('address', housenumber='2')
        assert res == [('2', 'housenumber')]

    def test_blocknumber(self):
        res = self.run_sanitizer_on('address', block_number='6')
        assert res == [('6', 'housenumber')]

    def test_neighbourhood(self):
        res = self.run_sanitizer_on('address', neighbourhood='8')
        assert res == [('8', 'place')]

    def test_quarter(self):
        res = self.run_sanitizer_on('address', quarter='kase')
        assert res == [('kase', 'place')]

    def test_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
        assert res == [('6-2', 'housenumber')]

    # The next two tests need names of their own: reusing
    # test_housenumber_blocknumber would replace the method above in the
    # class namespace, so it would never be collected.
    def test_housenumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8')
        assert res == [('2', 'housenumber'), ('8', 'place')]

    def test_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
        assert res == [('6', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8', quarter='kase')
        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]

    def test_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', neighbourhood='8', quarter='kase')
        assert res == [('kase8', 'place')]