mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-24 05:22:15 +03:00
Merge pull request #3122 from miku0/sanitizer-final
Adds sanitizer for Japanese addresses to correspond to block address
This commit is contained in:
commit
252fe42612
@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
|
||||
rendering:
|
||||
heading_level: 6
|
||||
|
||||
#### tag-japanese
|
||||
|
||||
::: nominatim.tokenizer.sanitizers.tag_japanese
|
||||
selection:
|
||||
members: False
|
||||
rendering:
|
||||
heading_level: 6
|
||||
|
||||
#### Token Analysis
|
||||
|
||||
Token analyzers take a full name and transform it into one or more normalized
|
||||
|
117
nominatim/tokenizer/sanitizers/tag_japanese.py
Normal file
117
nominatim/tokenizer/sanitizers/tag_japanese.py
Normal file
@ -0,0 +1,117 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
This sanitizer maps OSM data to Japanese block addresses.
|
||||
It combines block_number and housenumber into a single housenumber,
|
||||
and quarter and neighbourhood into a single place.
|
||||
"""
|
||||
|
||||
|
||||
from typing import Callable
|
||||
from typing import List, Optional
|
||||
|
||||
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||
from nominatim.data.place_name import PlaceName
|
||||
|
||||
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """Create the sanitizer function for Japanese block addresses.

    The configuration argument is ignored; the callable handed back is
    always `tag_japanese`.
    """
    sanitizer = tag_japanese
    return sanitizer
|
||||
|
||||
def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
    """ Append a single 'housenumber' entry built from block and house number.

        When both values are set, the entry is named
        '<blocknumber>-<housenumber>'. When only one of them is set, that
        value is used unchanged. When neither is set (None or empty),
        nothing is appended.

        The list is modified in place and also returned.
    """
    combined: Optional[str]
    if tmp_blocknumber and tmp_housenumber:
        combined = f'{tmp_blocknumber}-{tmp_housenumber}'
    else:
        # At most one of the two is usable here; the block number is
        # checked first, mirroring the order of the fallbacks.
        combined = tmp_blocknumber or tmp_housenumber

    if combined:
        new_address.append(PlaceName(kind='housenumber', name=combined, suffix=''))

    return new_address
|
||||
|
||||
def reconbine_place(
    new_address: List[PlaceName],
    tmp_neighbourhood: Optional[str],
    tmp_quarter: Optional[str]
) -> List[PlaceName]:
    """ Append a single 'place' entry built from quarter and neighbourhood.

        When both values are set, the entry is named with the quarter
        directly followed by the neighbourhood (no separator). When only
        one of them is set, that value is used unchanged. When neither is
        set (None or empty), nothing is appended.

        The list is modified in place and also returned.
    """
    combined: Optional[str]
    if tmp_neighbourhood and tmp_quarter:
        combined = f'{tmp_quarter}{tmp_neighbourhood}'
    else:
        # At most one of the two is usable here; the neighbourhood is
        # checked first, mirroring the order of the fallbacks.
        combined = tmp_neighbourhood or tmp_quarter

    if combined:
        new_address.append(PlaceName(kind='place', name=combined, suffix=''))

    return new_address
|
||||
def tag_japanese(obj: ProcessInfo) -> None:
    """Rewrite the address parts of a Japanese place as a block address.

    Places whose country code is not 'jp' are left untouched. For all
    others, the 'housenumber', 'block_number', 'neighbourhood' and
    'quarter' items are taken out of the address and replaced by at most
    one combined 'housenumber' item and at most one combined 'place'
    item, appended after the remaining items. If a kind occurs more than
    once, the last occurrence is the one that is used.
    """
    if obj.place.country_code != 'jp':
        return

    # Names of the address kinds that get merged, keyed by kind.
    collected = dict.fromkeys(('housenumber', 'block_number',
                               'neighbourhood', 'quarter'))

    remaining = []
    for entry in obj.address:
        if entry.kind in collected:
            collected[entry.kind] = entry.name
        else:
            remaining.append(entry)

    remaining = reconbine_housenumber(remaining,
                                      collected['housenumber'],
                                      collected['block_number'])
    remaining = reconbine_place(remaining,
                                collected['neighbourhood'],
                                collected['quarter'])

    obj.address = [entry for entry in remaining if entry.name is not None]
|
@ -1,14 +1,14 @@
|
||||
- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0"
|
||||
- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1"
|
||||
- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2"
|
||||
- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3"
|
||||
- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4"
|
||||
- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5"
|
||||
- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6"
|
||||
- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7"
|
||||
- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8"
|
||||
- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9"
|
||||
- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'"
|
||||
- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰零] > 0"
|
||||
- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵一] > 1"
|
||||
- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶二] > 2"
|
||||
- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷三] > 3"
|
||||
- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸四] > 4"
|
||||
- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹五] > 5"
|
||||
- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺六] > 6"
|
||||
- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻七] > 7"
|
||||
- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼八] > 8"
|
||||
- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽九] > 9"
|
||||
- "[𑜺⑩⑽⒑❿➉➓⓾十] > '10'"
|
||||
- "[⑪⑾⒒⓫] > '11'"
|
||||
- "[⑫⑿⒓⓬] > '12'"
|
||||
- "[⑬⒀⒔⓭] > '13'"
|
||||
|
@ -45,6 +45,7 @@ sanitizers:
|
||||
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
|
||||
use-defaults: all
|
||||
mode: append
|
||||
- step: tag-japanese
|
||||
token-analysis:
|
||||
- analyzer: generic
|
||||
- id: "@housenumber"
|
||||
|
30
test/bdd/db/query/japanese.feature
Normal file
30
test/bdd/db/query/japanese.feature
Normal file
@ -0,0 +1,30 @@
|
||||
@DB
|
||||
Feature: Searches in Japan
|
||||
Tests specifically for searches of Japanese addresses and searches in the Japanese language.
|
||||
@fail-legacy
|
||||
Scenario: A block house-number is parented to the neighbourhood
|
||||
Given the grid with origin JP
|
||||
| 1 | | | | 2 |
|
||||
| | 3 | | | |
|
||||
| | | 9 | | |
|
||||
| | | | 6 | |
|
||||
And the places
|
||||
| osm | class | type | name | geometry |
|
||||
| W1 | highway | residential | 雉子橋通り | 1,2 |
|
||||
And the places
|
||||
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry |
|
||||
| N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 |
|
||||
And the places
|
||||
| osm | class | type | name | geometry |
|
||||
| N9 | place | neighbourhood | 2丁目 | 9 |
|
||||
And the places
|
||||
| osm | class | type | name | geometry |
|
||||
| N6 | place | quarter | 加瀬 | 6 |
|
||||
When importing
|
||||
Then placex contains
|
||||
| object | parent_place_id |
|
||||
| N3 | N9 |
|
||||
When sending search query "2丁目 6-2"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N3 |
|
80
test/python/tokenizer/sanitizers/test_tag_japanese.py
Normal file
80
test/python/tokenizer/sanitizers/test_tag_japanese.py
Normal file
@ -0,0 +1,80 @@
|
||||
from nominatim.data.place_info import PlaceInfo
|
||||
from nominatim.data.place_name import PlaceName
|
||||
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||
from typing import Mapping, Optional, List
|
||||
import pytest
|
||||
|
||||
class TestTagJapanese:
    """Tests for the 'tag-japanese' sanitizer step.

    Every test runs the sanitizer on a place with country code 'jp' and
    compares the sorted (name, kind) pairs of the resulting address.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        # Keep the configuration fixture around for building the sanitizer.
        self.config = def_config

    def run_sanitizer_on(self, _type, **kwargs):
        """Run the sanitizer on a Japanese place with the given address tags.

        The first positional argument is accepted for compatibility with
        the call sites but is not used. Returns the sorted list of
        (name, kind) tuples of the sanitized address.
        """
        place = PlaceInfo({
            'address': kwargs,
            'country_code': 'jp'
        })
        sanitizer_args = {'step': 'tag-japanese'}
        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
        tmp_list = [(p.name, p.kind) for p in address]
        return sorted(tmp_list)

    def test_on_address(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        res = self.run_sanitizer_on('address', housenumber='2')
        assert res == [('2', 'housenumber')]

    def test_blocknumber(self):
        res = self.run_sanitizer_on('address', block_number='6')
        assert res == [('6', 'housenumber')]

    def test_neighbourhood(self):
        res = self.run_sanitizer_on('address', neighbourhood='8')
        assert res == [('8', 'place')]

    def test_quarter(self):
        res = self.run_sanitizer_on('address', quarter='kase')
        assert res == [('kase', 'place')]

    def test_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
        assert res == [('6-2', 'housenumber')]

    def test_quarter_neighbourhood(self):
        res = self.run_sanitizer_on('address', quarter='kase', neighbourhood='8')
        assert res == [('kase8', 'place')]

    def test_blocknumber_housenumber_quarter(self):
        res = self.run_sanitizer_on('address', block_number='6', housenumber='2',
                                    quarter='kase')
        assert res == [('6-2', 'housenumber'), ('kase', 'place')]

    def test_blocknumber_housenumber_neighbourhood(self):
        # No quarter is passed here, so the name only lists the tags used.
        res = self.run_sanitizer_on('address', block_number='6', housenumber='2',
                                    neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('8', 'place')]

    def test_blocknumber_quarter_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', quarter='kase',
                                    neighbourhood='8')
        assert res == [('6', 'housenumber'), ('kase8', 'place')]

    def test_blocknumber_quarter(self):
        res = self.run_sanitizer_on('address', block_number='6', quarter='kase')
        assert res == [('6', 'housenumber'), ('kase', 'place')]

    def test_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
        assert res == [('6', 'housenumber'), ('8', 'place')]

    def test_housenumber_quarter_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', quarter='kase',
                                    neighbourhood='8')
        assert res == [('2', 'housenumber'), ('kase8', 'place')]

    def test_housenumber_quarter(self):
        res = self.run_sanitizer_on('address', housenumber='2', quarter='kase')
        assert res == [('2', 'housenumber'), ('kase', 'place')]

    def test_housenumber_blocknumber_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', block_number='6', housenumber='2',
                                    quarter='kase', neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]
|
Loading…
Reference in New Issue
Block a user