convert special phrase loaders to generators

Generators simplify the code quite a bit compared to the previous
Iterator approach.
This commit is contained in:
Sarah Hoffmann 2022-05-30 14:12:46 +02:00
parent 042e314589
commit cce0e5ea38
6 changed files with 76 additions and 117 deletions

View File

@ -11,43 +11,31 @@
""" """
import csv import csv
import os import os
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.errors import UsageError from nominatim.errors import UsageError
class SPCsvLoader(Iterator): class SPCsvLoader:
""" """
Handles loading of special phrases from external csv file. Handles loading of special phrases from external csv file.
""" """
def __init__(self, csv_path): def __init__(self, csv_path):
super().__init__() super().__init__()
self.csv_path = csv_path self.csv_path = csv_path
self.has_been_read = False
def __next__(self):
if self.has_been_read:
raise StopIteration()
self.has_been_read = True def generate_phrases(self):
self.check_csv_validity() """ Open and parse the given csv file.
return self.parse_csv()
def parse_csv(self):
"""
Open and parse the given csv file.
Create the corresponding SpecialPhrases. Create the corresponding SpecialPhrases.
""" """
phrases = set() self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd: with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',') reader = csv.DictReader(fd, delimiter=',')
for row in reader: for row in reader:
phrases.add( yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
)
return phrases
def check_csv_validity(self):
def _check_csv_validity(self):
""" """
Check that the csv file has the right extension. Check that the csv file has the right extension.
""" """

View File

@ -62,8 +62,7 @@ class SPImporter():
# Store pairs of class/type for further processing # Store pairs of class/type for further processing
class_type_pairs = set() class_type_pairs = set()
for loaded_phrases in self.sp_loader: for phrase in self.sp_loader.generate_phrases():
for phrase in loaded_phrases:
result = self._process_phrase(phrase) result = self._process_phrase(phrase)
if result: if result:
class_type_pairs.add(result) class_type_pairs.add(result)

View File

@ -9,12 +9,24 @@
""" """
import re import re
import logging import logging
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.exec_utils import get_url from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger() LOG = logging.getLogger()
class SPWikiLoader(Iterator):
def _get_wiki_content(lang):
    """
        Fetch the special phrases wiki page for the language `lang`
        and return its content as delivered by get_url().

        The language code is upper-cased and appended to the export URL,
        so 'en' requests
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base_url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
    return get_url(base_url + lang.upper())
class SPWikiLoader:
""" """
Handles loading of special phrases from the wiki. Handles loading of special phrases from the wiki.
""" """
@ -27,28 +39,21 @@ class SPWikiLoader(Iterator):
) )
self._load_languages() self._load_languages()
def __next__(self):
if not self.languages:
raise StopIteration
lang = self.languages.pop(0) def generate_phrases(self):
loaded_xml = self._get_wiki_content(lang) """ Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang) LOG.warning('Importing phrases for lang: %s...', lang)
return self.parse_xml(loaded_xml) loaded_xml = _get_wiki_content(lang)
def parse_xml(self, xml):
"""
Parses XML content and extracts special phrases from it.
Return a list of SpecialPhrase.
"""
# One match will be of format [label, class, type, operator, plural] # One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml) matches = self.occurence_pattern.findall(loaded_xml)
returned_phrases = set()
for match in matches: for match in matches:
returned_phrases.add( yield SpecialPhrase(match[0], match[1], match[2], match[3])
SpecialPhrase(match[0], match[1], match[2], match[3])
)
return returned_phrases
def _load_languages(self): def _load_languages(self):
""" """
@ -64,15 +69,3 @@ class SPWikiLoader(Iterator):
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
@staticmethod
def _get_wiki_content(lang):
    """
        Download the special phrases wiki export page for `lang`.

        The page name is the upper-cased language code, for example
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        Returns whatever get_url() returns for that URL.
    """
    page_name = lang.upper()
    return get_url(
        'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
        + page_name)

View File

@ -187,8 +187,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
table_factory('place_classtype_amenity_animal_shelter') table_factory('place_classtype_amenity_animal_shelter')
table_factory('place_classtype_wrongclass_wrongtype') table_factory('place_classtype_wrongclass_wrongtype')
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content', monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
lambda self, lang: xml_wiki_content) lambda lang: xml_wiki_content)
tokenizer = tokenizer_mock() tokenizer = tokenizer_mock()
sp_importer.import_phrases(tokenizer, should_replace) sp_importer.import_phrases(tokenizer, should_replace)

View File

@ -12,50 +12,6 @@ import pytest
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader
def test_parse_csv(sp_csv_loader):
    """
        The result of parse_csv() must pass check_phrases_content().
    """
    parsed = sp_csv_loader.parse_csv()

    assert check_phrases_content(parsed)
def test_next(sp_csv_loader):
    """
        The first item produced by iterating the loader must be a
        collection of phrases that passes check_phrases_content().
    """
    first_batch = next(sp_csv_loader)

    assert check_phrases_content(first_batch)
def test_check_csv_validity(sp_csv_loader):
    """
        Test method check_csv_validity()
        It should not raise for a path ending in .csv and should raise
        a UsageError when a file with a different extension is given.
    """
    sp_csv_loader.csv_path = 'test.csv'
    # Must pass silently for the accepted extension.
    sp_csv_loader.check_csv_validity()

    sp_csv_loader.csv_path = 'test.wrong'
    with pytest.raises(UsageError):
        # No assert here: the call itself is expected to raise, so its
        # return value is never evaluated.
        sp_csv_loader.check_csv_validity()
def check_phrases_content(phrases):
    """
        Return True when `phrases` holds more than one entry and
        includes both the 'Billboard' and the 'Zip Lines' sample
        phrases with their expected class, type and '-' operator.
        Return False otherwise.
    """
    if len(phrases) <= 1:
        return False

    def _has_phrase(label, cls, typ):
        # Attribute checks stay chained with `and` so that later
        # attributes are only read when the earlier ones match.
        return any(item.p_label == label
                   and item.p_class == cls
                   and item.p_type == typ
                   and item.p_operator == '-' for item in phrases)

    if not _has_phrase('Billboard', 'advertising', 'billboard'):
        return False

    return _has_phrase('Zip Lines', 'aerialway', 'zip_line')
@pytest.fixture @pytest.fixture
def sp_csv_loader(src_dir): def sp_csv_loader(src_dir):
""" """
@ -64,3 +20,35 @@ def sp_csv_loader(src_dir):
csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve() csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve()
loader = SPCsvLoader(csv_path) loader = SPCsvLoader(csv_path)
return loader return loader
def test_generate_phrases(sp_csv_loader):
    """
        Test method generate_phrases()
        It should yield 41 distinct phrases, among them the
        'Billboard' and 'Zip Lines' sample entries.
    """
    phrases = list(sp_csv_loader.generate_phrases())

    def _has_phrase(label, cls, typ):
        return any(item.p_label == label
                   and item.p_class == cls
                   and item.p_type == typ
                   and item.p_operator == '-' for item in phrases)

    assert len(phrases) == 41
    # No phrase may be produced twice.
    assert len(set(phrases)) == 41

    assert _has_phrase('Billboard', 'advertising', 'billboard')
    assert _has_phrase('Zip Lines', 'aerialway', 'zip_line')
def test_invalid_cvs_file():
    """
        Requesting the first phrase from a loader created for a file
        without .csv extension must raise a UsageError whose message
        matches 'not a csv file'.
    """
    wrong_loader = SPCsvLoader('test.wrong')

    with pytest.raises(UsageError, match='not a csv file'):
        # The error is expected when the first item is requested.
        next(wrong_loader.generate_phrases())

View File

@ -26,27 +26,18 @@ def sp_wiki_loader(monkeypatch, def_config, xml_wiki_content):
""" """
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en') monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
loader = SPWikiLoader(def_config) loader = SPWikiLoader(def_config)
monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content', monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
lambda self, lang: xml_wiki_content) lambda lang: xml_wiki_content)
return loader return loader
def test_parse_xml(sp_wiki_loader, xml_wiki_content): def test_generate_phrases(sp_wiki_loader):
"""
Test method parse_xml()
Should return the right SpecialPhrase objects.
"""
phrases = sp_wiki_loader.parse_xml(xml_wiki_content)
check_phrases_content(phrases)
def test_next(sp_wiki_loader):
""" """
Test objects returned from the next() method. Test objects returned from the next() method.
It should return all SpecialPhrases objects of It should return all SpecialPhrases objects of
the 'en' special phrases. the 'en' special phrases.
""" """
phrases = next(sp_wiki_loader) phrases = list(sp_wiki_loader.generate_phrases())
check_phrases_content(phrases) check_phrases_content(phrases)
def check_phrases_content(phrases): def check_phrases_content(phrases):