From 0722495434e039b7b716206974d0076db8813ca6 Mon Sep 17 00:00:00 2001 From: miku0 Date: Wed, 26 Jul 2023 07:54:58 +0000 Subject: [PATCH] add japanese sanitizer --- docs/customize/Tokenizers.md | 8 + .../tokenizer/sanitizers/tag_japanese.py | 150 ++++++++++++++++++ settings/icu_tokenizer.yaml | 1 + test/bdd/db/query/japanese.feature | 29 ++++ .../tokenizer/sanitizers/test_tag_japanese.py | 65 ++++++++ 5 files changed, 253 insertions(+) create mode 100644 nominatim/tokenizer/sanitizers/tag_japanese.py create mode 100644 test/bdd/db/query/japanese.feature create mode 100644 test/python/tokenizer/sanitizers/test_tag_japanese.py diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index 11c27e38..6199ea42 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +#### tag-japanese + +::: nominatim.tokenizer.sanitizers.tag_japanese + selection: + members: False + rendering: + heading_level: 6 + #### Token Analysis Token analyzers take a full name and transform it into one or more normalized diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py new file mode 100644 index 00000000..81d3d5b3 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This sanitizer maps OSM data to Japanese block addresses. +It replaces blocknumber and housenumber with housenumber, +and quarter and neighbourhood with place. 
from typing import Optional


def create(_: 'SanitizerConfig') -> Callable[['ProcessInfo'], None]:
    """ Set up the sanitizer.

        The configuration argument is ignored; the processing function
        `tag_japanese` is returned as is.
    """
    return tag_japanese


# Translation table for the positional Kanji digits.
_KANJI_DIGITS = str.maketrans('零一二三四五六七八九', '0123456789')

# Address kinds that are folded into a combined 'housenumber' or 'place' entry.
_MERGED_KINDS = ('housenumber', 'block_number', 'neighbourhood', 'quarter')


def convert_kanji_sequence_to_number(sequence: str) -> str:
    """ Convert Kanji digits in the given string to Arabic digits.

        Every character from 零 to 九 is replaced by its digit 0-9. All
        other characters are kept unchanged and in place. Multiplier
        characters such as 十 are not part of the mapping and are therefore
        left untouched.
    """
    return sequence.translate(_KANJI_DIGITS)


def _append_combined(address: 'List[PlaceName]', kind: str,
                     parts: 'List[Optional[str]]', separator: str) -> 'List[PlaceName]':
    """ Append a single PlaceName of type `kind` to `address`. Its name is
        made of the non-empty `parts` joined with `separator`. Nothing is
        appended when all parts are empty or None.
    """
    name = separator.join(part for part in parts if part)
    if name:
        address.append(PlaceName(kind=kind, name=name, suffix=''))
    return address


def reconbine_housenumber(
    new_address: 'List[PlaceName]',
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> 'List[PlaceName]':
    """ Recombine the tag of housenumber by using housenumber and blocknumber.

        When both values are present, the new housenumber is
        '<blocknumber>-<housenumber>'. Otherwise the value that is present
        is used on its own. The entry is appended to `new_address`, which is
        modified in place and returned.
    """
    return _append_combined(new_address, 'housenumber',
                            [tmp_blocknumber, tmp_housenumber], '-')


def reconbine_place(
    new_address: 'List[PlaceName]',
    tmp_neighbourhood: Optional[str],
    tmp_quarter: Optional[str]
) -> 'List[PlaceName]':
    """ Recombine the tag of place by using neighbourhood and quarter.

        When both values are present, the new place is the quarter directly
        followed by the neighbourhood, without a separator. Otherwise the
        value that is present is used on its own. The entry is appended to
        `new_address`, which is modified in place and returned.
    """
    return _append_combined(new_address, 'place',
                            [tmp_quarter, tmp_neighbourhood], '')


def tag_japanese(obj: 'ProcessInfo') -> None:
    """ Recombine kind of address.

        Places with a country code other than 'jp' are left untouched.
        For Japanese places, Kanji digits are converted to Arabic digits in
        all names and address parts. The address kinds 'housenumber' and
        'block_number' are then merged into one 'housenumber' entry and
        'neighbourhood' and 'quarter' into one 'place' entry. All other
        address parts are kept.
    """
    if obj.place.country_code != 'jp':
        return

    for item in obj.names:
        item.name = convert_kanji_sequence_to_number(item.name)

    # When a kind appears more than once, the last occurrence wins.
    merged = {}
    new_address = []
    for item in obj.address:
        item.name = convert_kanji_sequence_to_number(item.name)
        if item.kind in _MERGED_KINDS:
            merged[item.kind] = item.name
        else:
            new_address.append(item)

    reconbine_housenumber(new_address, merged.get('housenumber'), merged.get('block_number'))
    reconbine_place(new_address, merged.get('neighbourhood'), merged.get('quarter'))

    obj.address = [item for item in new_address if item.name is not None]
class TestTagJapanese:
    """ Tests for the 'tag-japanese' sanitizer step.

        Each test runs the sanitizer over a place with country code 'jp'
        and compares the resulting address parts as a sorted list of
        (name, kind) tuples.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """ Run the sanitizer with `kwargs` as the address tags of the place
            and return the sorted (name, kind) tuples of the address result.

            The `type` argument is currently unused: the tags are always
            placed under 'address'.
        """
        place = PlaceInfo({
            'address': kwargs,
            'country_code': 'jp'
        })
        sanitizer_args = {'step': 'tag-japanese'}
        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
        return sorted((p.name, p.kind) for p in address)

    def test_on_address(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        res = self.run_sanitizer_on('address', housenumber='2')
        assert res == [('2', 'housenumber')]

    def test_blocknumber(self):
        res = self.run_sanitizer_on('address', block_number='6')
        assert res == [('6', 'housenumber')]

    def test_neighbourhood(self):
        res = self.run_sanitizer_on('address', neighbourhood='8')
        assert res == [('8', 'place')]

    def test_quarter(self):
        res = self.run_sanitizer_on('address', quarter='kase')
        assert res == [('kase', 'place')]

    def test_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
        assert res == [('6-2', 'housenumber')]

    # The next two tests need names of their own. Reusing the name of the
    # test above would overwrite it in the class body, so that only the last
    # definition would be run.
    def test_housenumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8')
        assert res == [('2', 'housenumber'), ('8', 'place')]

    def test_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
        assert res == [('6', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8', quarter='kase')
        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]

    def test_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', neighbourhood='8', quarter='kase')
        assert res == [('kase8', 'place')]