mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-10-27 11:42:46 +03:00
77 lines
2.8 KiB
Python
77 lines
2.8 KiB
Python
# SPDX-License-Identifier: GPL-2.0-only
|
|
#
|
|
# This file is part of Nominatim. (https://nominatim.org)
|
|
#
|
|
# Copyright (C) 2022 by the Nominatim developer community.
|
|
# For a full list of authors see the git log.
|
|
"""
|
|
Module containing the SPWikiLoader class.
|
|
"""
|
|
import re
|
|
import logging
|
|
from collections.abc import Iterator
|
|
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
|
|
from nominatim.tools.exec_utils import get_url
|
|
|
|
# Logger shared by this module; getLogger() without a name returns the root logger.
LOG = logging.getLogger()
|
|
class SPWikiLoader(Iterator):
    """
    Handles loading of special phrases from the wiki.

    The loader is an iterator: each step takes the next pending language,
    downloads the matching special-phrases wiki export and returns the
    phrases parsed from it. Iteration stops once every language is consumed.
    """

    def __init__(self, config, languages=None):
        """
        Set up the loader.

        `config` must expose a `LANGUAGES` attribute (comma-separated
        language codes, may be empty). `languages` optionally overrides
        the configured languages; when it is empty or None the languages
        come from the configuration or the built-in default list.
        """
        super().__init__()
        self.config = config
        # Compile the pattern once here; it is reused for every wiki page.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        # Copy the caller's sequence because __next__ consumes the list.
        self.languages = self._load_languages() if not languages else list(languages)

    def __next__(self):
        """
        Load and parse the wiki page of the next pending language.

        Returns the set of phrases produced by parse_xml().
        Raises StopIteration when no language is left.
        """
        if not self.languages:
            raise StopIteration

        lang = self.languages.pop(0)
        loaded_xml = self._get_wiki_content(lang)
        LOG.warning('Importing phrases for lang: %s...', lang)
        return self.parse_xml(loaded_xml)

    def parse_xml(self, xml):
        """
        Parses XML content and extracts special phrases from it.

        Return a set of SpecialPhrase, one per table row matched in `xml`.
        The set is empty when nothing matches.
        """
        # One match will be of format [label, class, type, operator, plural];
        # the trailing plural flag is not passed on to SpecialPhrase.
        return {
            SpecialPhrase(match[0], match[1], match[2], match[3])
            for match in self.occurence_pattern.findall(xml)
        }

    def _load_languages(self):
        """
        Get list of all languages from env config file
        or default if there is no languages configured.
        The system will extract special phrases only from all specified languages.

        Whitespace around configured entries is removed and empty entries
        are dropped, so a value like "en, de," yields ['en', 'de']. If no
        usable entry remains, the default list is returned.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

        configured = self.config.LANGUAGES
        if not configured:
            return default_languages

        # Each entry ends up verbatim in a wiki URL, so stray blanks or
        # empty items would produce requests for non-existent pages.
        stripped = (entry.strip() for entry in configured.split(','))
        languages = [entry for entry in stripped if entry]
        return languages or default_languages

    @staticmethod
    def _get_wiki_content(lang):
        """
        Request and return the wiki page's content
        corresponding to special phrases for a given lang.
        Requested URL Example :
            https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        """
        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
              + lang.upper()
        return get_url(url)
|