Refactoring loading of external special phrases and importation process by introducing SPLoader and SPWikiLoader

This commit is contained in:
AntoJvlt 2021-05-10 21:48:11 +02:00
parent 40cb17d299
commit 00959fac57
9 changed files with 226 additions and 190 deletions

View File

@ -2,7 +2,10 @@
Implementation of the 'special-phrases' command.
import logging
from import SpecialPhrasesImporter
from nominatim.errors import UsageError
from pathlib import Path
from import SPWikiLoader
from import SPImporter
from nominatim.db.connection import connect
LOG = logging.getLogger()
@ -21,16 +24,23 @@ class ImportSpecialPhrases:
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database.')
group.add_argument('--csv-file', metavar='FILE',
help='CSV file containing phrases to import.')
def run(args):
from ..tokenizer import factory as tokenizer_factory
if args.import_from_wiki:
LOG.warning('Special phrases importation starting')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as db_connection:
args.config, args.phplib_dir, db_connection
args.config, args.phplib_dir, db_connection, SPWikiLoader(args.config)
if args.csv_file:
if not Path(args.csv_file).is_file():
LOG.fatal("CSV file '%s' does not exist.", args.csv_file)
raise UsageError('Cannot access file.')
return 0

View File

@ -3,4 +3,6 @@ Module with functions for importing, updating Nominatim databases
as well as general maintenance helpers.
from import SpecialPhrasesImporter
from import SPImporter
from import SPWikiLoader
from import SpecialPhrase

View File

@ -12,10 +12,9 @@ class SpecialPhrasesImporterStatistics():
process of special phrases.
def __init__(self):
def _set_global_values_to_0(self):
def _intialize_values(self):
Set all counts for the global
import to 0.
@ -23,22 +22,14 @@ class SpecialPhrasesImporterStatistics():
self.tables_created = 0
self.tables_deleted = 0
self.tables_ignored = 0
self.global_phrases_invalid = 0
def _set_lang_values_to_0(self):
Set all counts for the current
lang to 0.
self.lang_phrases_invalid = 0
self.invalids = 0
def notify_one_phrase_invalid(self):
Add +1 to the count of invalid entries
fetched from the wiki.
self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1
self.invalids += 1
def notify_one_table_created(self):
@ -58,7 +49,6 @@ class SpecialPhrasesImporterStatistics():
self.tables_ignored += 1
def notify_import_done(self):
Print stats for the whole import process
@ -66,8 +56,8 @@ class SpecialPhrasesImporterStatistics():
"""'====================================================================')'Final statistics of the import:')'- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0:'- %s phrases were invalid.', self.invalids)
if self.invalids > 0:' Those invalid phrases have been skipped.')'- %s tables were ignored as they already exist on the database',
@ -76,26 +66,8 @@ class SpecialPhrasesImporterStatistics():
if self.tables_deleted > 0:' They were deleted as they are not valid anymore.')
if self.global_phrases_invalid > 0:
if self.invalids > 0:
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
def notify_current_lang_done(self, lang):
Print stats for the current lang
and then reset lang values.
"""'====================================================================')'Statistics for the import of %s:', lang)'- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0:' Those invalid phrases have been skipped.')'====================================================================')
if self.lang_phrases_invalid > 0:
LOG.warning('%s phrases were invalid and have been skipped for the import of lang %s.',
self.lang_phrases_invalid, lang)

View File

@ -1,5 +1,11 @@
Functions to import special phrases into the database.
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
The phrases already present in the database which are not
valid anymore are removed.
import logging
import os
@ -10,27 +16,24 @@ import subprocess
import json
from psycopg2.sql import Identifier, Literal, SQL
from import get_url
from nominatim.errors import UsageError
from import SpecialPhrasesImporterStatistics
LOG = logging.getLogger()
class SpecialPhrasesImporter():
class SPImporter():
# pylint: disable-msg=too-many-instance-attributes
Class handling the process of special phrases importations.
Class handling the process of special phrases importations into the database.
Takes an SPLoader which loads the phrases from an external source.
def __init__(self, config, phplib_dir, db_connection) -> None:
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.db_connection = db_connection
def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
self.config = config
self.phplib_dir = phplib_dir
self.db_connection = db_connection
self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists()
#Compile the regex here to increase performances.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator)
@ -39,27 +42,22 @@ class SpecialPhrasesImporter():
#special phrases class/type on the wiki.
self.table_phrases_to_delete = set()
def import_from_wiki(self, tokenizer, languages=None):
def import_phrases(self, tokenizer):
Iterate through all specified languages and
extract corresponding special phrases from the wiki.
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.')
LOG.warning('Special phrases importation starting')
#Get all languages to process.
languages = self._load_languages() if not languages else languages
#Store pairs of class/type for further processing
class_type_pairs = set()
for lang in languages:
LOG.warning('Importing phrases for lang: %s...', lang)
wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
for loaded_phrases in self.sp_loader:
for phrase in loaded_phrases:
result = self._process_phrase(phrase)
if result:
@ -101,89 +99,48 @@ class SpecialPhrasesImporter():
settings = json.load(json_settings)
return settings['blackList'], settings['whiteList']
def _load_languages(self):
Get list of all languages from env config file
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
default_languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
def _get_wiki_content(lang):
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
url = '' + lang.upper() # pylint: disable=line-too-long
return get_url(url)
def _check_sanity(self, lang, phrase_class, phrase_type):
def _check_sanity(self, phrase):
Check sanity of given inputs in case somebody added garbage in the wiki.
If a bad class/type is detected the system will exit with an error.
type_matchs = self.sanity_check_pattern.findall(phrase_type)
class_matchs = self.sanity_check_pattern.findall(phrase_class)
class_matchs = self.sanity_check_pattern.findall(phrase.p_class)
type_matchs = self.sanity_check_pattern.findall(phrase.p_type)
if not class_matchs or not type_matchs:
LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
lang, phrase_class, phrase_type)
LOG.warning("Bad class/type: %s=%s. It will not be imported",
phrase.p_class, phrase.p_type)
return False
return True
def _process_xml_content(self, xml_content, lang):
def _process_phrase(self, phrase):
Process given xml content by extracting matching patterns.
Matching patterns are processed there and returned in a
set of class/type pairs.
Processes the given phrase by checking black and white list
and sanity.
Return the class/type pair corresponding to the phrase.
#One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml_content)
#Store pairs of class/type for further processing
class_type_pairs = set()
for match in matches:
phrase_label = match[0].strip()
phrase_class = match[1].strip()
phrase_type = match[2].strip()
phrase_operator = match[3].strip()
#Needed if some operator in the wiki are not written in english
phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
#hack around a bug where building=yes was imported with quotes into the wiki
phrase_type = re.sub(r'\"|"', '', phrase_type)
#blacklisting: disallow certain class/type combinations
if (
phrase.p_class in self.black_list.keys() and
phrase.p_type in self.black_list[phrase.p_class]
): return None
#blacklisting: disallow certain class/type combinations
if (
phrase_class in self.black_list.keys() and
phrase_type in self.black_list[phrase_class]
#whitelisting: if class is in whitelist, allow only tags in the list
if (
phrase_class in self.white_list.keys() and
phrase_type not in self.white_list[phrase_class]
#whitelisting: if class is in whitelist, allow only tags in the list
if (
phrase.p_class in self.white_list.keys() and
phrase.p_type not in self.white_list[phrase.p_class]
): return None
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type):
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(phrase):
return None
class_type_pairs.add((phrase_class, phrase_type))
self.word_phrases.add((phrase.p_label, phrase.p_class,
phrase.p_type, phrase.p_operator))
self.word_phrases.add((phrase_label, phrase_class,
phrase_type, phrase_operator))
return class_type_pairs
return set({(phrase.p_class, phrase.p_type)})
def _create_place_classtype_table_and_indexes(self, class_type_pairs):

View File

@ -0,0 +1,16 @@
Module containing the SPLoader class.
from abc import ABC, abstractmethod
class SPLoader(ABC):
    """
        Base class for special phrases loaders.
        Handle the loading of special phrases from external sources.

        A loader is its own iterator: iterating over it repeatedly
        calls __next__() until StopIteration is raised.
    """
    def __iter__(self):
        """
            Return the loader itself so it can be used directly in a for loop.
        """
        return self

    @abstractmethod
    def __next__(self):
        """
            Return the next batch of loaded special phrases.

            Concrete loaders must override this method and raise
            StopIteration once there is nothing left to load.
        """

View File

@ -0,0 +1,71 @@
Module containing the SPWikiLoader class.
import logging
import re
from import SpecialPhrase
from import SPLoader
from import get_url
LOG = logging.getLogger()
class SPWikiLoader(SPLoader):
    """
        Handles loading of special phrases from the wiki.

        Each iteration step consumes one language from the list of
        languages and returns the special phrases parsed from the
        wiki page of that language.
    """
    def __init__(self, config, languages=None):
        """
            Create a loader for the given languages.

            If no languages are given (None or an empty list), they are
            taken from the configuration or from a built-in default list.
            Raises TypeError if 'languages' is neither None nor a list.
        """
        if languages is not None and not isinstance(languages, list):
            raise TypeError('The \'languages\' parameter should be of type list.')
        self.config = config
        # Compile the regex once here instead of on every parsed page.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        # Work on a copy: __next__() pops entries and must not
        # empty the list owned by the caller.
        self.languages = list(languages) if languages else self._load_languages()

    def __next__(self):
        """
            Load and parse the wiki page of the next pending language.

            Raises StopIteration when all languages have been processed.
        """
        if not self.languages:
            raise StopIteration

        lang = self.languages.pop(0)
        loaded_xml = SPWikiLoader._get_wiki_content(lang)
        LOG.warning('Importing phrases for lang: %s...', lang)
        return self.parse_xml(loaded_xml)

    def parse_xml(self, xml):
        """
            Parses XML content and extracts special phrases from it.
            Return a set of SpecialPhrase.
        """
        # One match will be of format [label, class, type, operator, plural]
        matches = self.occurence_pattern.findall(xml)
        return {
            SpecialPhrase(match[0], match[1], match[2], match[3])
            for match in matches
        }

    def _load_languages(self):
        """
            Get list of all languages from env config file
            or default if there is no languages configured.
            The system will extract special phrases only from all specified languages.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
        return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages

    @staticmethod
    def _get_wiki_content(lang):
        """
            Request and return the wiki page's content
            corresponding to special phrases for a given lang.
        """
        # NOTE(review): the base URL literal is empty in this view, so only
        # the upper-cased language code is requested - confirm the wiki
        # export URL prefix before relying on this.
        url = '' + lang.upper() # pylint: disable=line-too-long
        return get_url(url)

View File

@ -0,0 +1,19 @@
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the process of load and importation.
import re
class SpecialPhrase():
    """
        Model representing a special phrase.

        All values are stored with surrounding whitespace removed.
        The operator is normalised to 'near', 'in' or '-'.
    """
    def __init__(self, p_label, p_class, p_type, p_operator):
        self.p_label = p_label.strip()
        self.p_class = p_class.strip()
        # Hack around a bug where building=yes was imported with quotes into the wiki
        # NOTE(review): both alternatives of this pattern match a plain
        # double quote - confirm whether one was meant to be an HTML entity.
        self.p_type = re.sub(r'\"|"', '', p_type.strip())
        # Needed if some operator in the wiki are not written in english.
        # Strip first, otherwise a padded value such as 'near ' would be
        # treated as unknown and replaced by '-'.
        p_operator = p_operator.strip()
        self.p_operator = p_operator if p_operator in ('near', 'in') else '-'

View File

@ -256,7 +256,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
assert rank_mock.called == do_ranks
def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_from_wiki')
call_nominatim('special-phrases', '--import-from-wiki')

View File

@ -1,49 +1,51 @@
Tests for import special phrases methods
of the class SpecialPhrasesImporter.
of the class SPImporter.
from import SpecialPhrase
from import SPWikiLoader
from nominatim.errors import UsageError
from pathlib import Path
import tempfile
from shutil import copyfile
import pytest
from import SpecialPhrasesImporter
from import SPImporter
TEST_BASE_DIR = Path(__file__) / '..' / '..'
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
def test_fetch_existing_place_classtype_tables(sp_importer, temp_db_cursor):
Check for the fetch_existing_place_classtype_tables() method.
It should return the table just created.
temp_db_cursor.execute('CREATE TABLE place_classtype_testclasstypetable()')
contained_table = special_phrases_importer.table_phrases_to_delete.pop()
contained_table = sp_importer.table_phrases_to_delete.pop()
assert contained_table == 'place_classtype_testclasstypetable'
def test_check_sanity_class(special_phrases_importer):
def test_check_sanity_class(sp_importer):
Check for _check_sanity() method.
If a wrong class or type is given, an UsageError should raise.
If a good class and type are given, nothing special happens.
assert not special_phrases_importer._check_sanity('en', '', 'type')
assert not special_phrases_importer._check_sanity('en', 'class', '')
assert special_phrases_importer._check_sanity('en', 'class', 'type')
assert not sp_importer._check_sanity(SpecialPhrase('en', '', 'type', ''))
assert not sp_importer._check_sanity(SpecialPhrase('en', 'class', '', ''))
def test_load_white_and_black_lists(special_phrases_importer):
assert sp_importer._check_sanity(SpecialPhrase('en', 'class', 'type', ''))
def test_load_white_and_black_lists(sp_importer):
Test that _load_white_and_black_lists() well return
black list and white list and that they are of dict type.
black_list, white_list = special_phrases_importer._load_white_and_black_lists()
black_list, white_list = sp_importer._load_white_and_black_lists()
assert isinstance(black_list, dict) and isinstance(white_list, dict)
def test_convert_php_settings(special_phrases_importer):
def test_convert_php_settings(sp_importer):
Test that _convert_php_settings_if_needed() convert the given
php file to a json file.
@ -53,19 +55,19 @@ def test_convert_php_settings(special_phrases_importer):
with tempfile.TemporaryDirectory() as temp_dir:
temp_settings = (Path(temp_dir) / 'phrase_settings.php').resolve()
copyfile(php_file, temp_settings)
assert (Path(temp_dir) / 'phrase_settings.json').is_file()
def test_convert_settings_wrong_file(special_phrases_importer):
def test_convert_settings_wrong_file(sp_importer):
Test that _convert_php_settings_if_needed() raise an exception
if the given file is not a valid file.
with pytest.raises(UsageError, match='random_file is not a valid file.'):
def test_convert_settings_json_already_exist(special_phrases_importer):
def test_convert_settings_json_already_exist(sp_importer):
Test that if we give to '_convert_php_settings_if_needed' a php file path
and that the corresponding json file already exists, it is returned.
@ -73,22 +75,22 @@ def test_convert_settings_json_already_exist(special_phrases_importer):
php_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.php').resolve()
json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()
returned = special_phrases_importer._convert_php_settings_if_needed(php_file)
returned = sp_importer._convert_php_settings_if_needed(php_file)
assert returned == json_file
def test_convert_settings_giving_json(special_phrases_importer):
def test_convert_settings_giving_json(sp_importer):
Test that if we give to '_convert_php_settings_if_needed' a json file path
the same path is directly returned
json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()
returned = special_phrases_importer._convert_php_settings_if_needed(json_file)
returned = sp_importer._convert_php_settings_if_needed(json_file)
assert returned == json_file
def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
def test_create_place_classtype_indexes(temp_db_conn, sp_importer):
Test that _create_place_classtype_indexes() create the
place_id index and centroid index on the right place_class_type table.
@ -101,24 +103,24 @@ def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
temp_db_cursor.execute("CREATE EXTENSION postgis;")
temp_db_cursor.execute('CREATE TABLE {}(place_id BIGINT, centroid GEOMETRY)'.format(table_name))
special_phrases_importer._create_place_classtype_indexes('', phrase_class, phrase_type)
sp_importer._create_place_classtype_indexes('', phrase_class, phrase_type)
assert check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type)
def test_create_place_classtype_table(temp_db_conn, placex_table, special_phrases_importer):
def test_create_place_classtype_table(temp_db_conn, placex_table, sp_importer):
Test that _create_place_classtype_table() create
the right place_classtype table.
phrase_class = 'class'
phrase_type = 'type'
special_phrases_importer._create_place_classtype_table('', phrase_class, phrase_type)
sp_importer._create_place_classtype_table('', phrase_class, phrase_type)
assert check_table_exist(temp_db_conn, phrase_class, phrase_type)
def test_grant_access_to_web_user(temp_db_conn, def_config, special_phrases_importer):
def test_grant_access_to_web_user(temp_db_conn, def_config, sp_importer):
Test that _grant_access_to_webuser() give
Test that _grant_access_to_webuser() give
right access to the web user.
phrase_class = 'class'
@ -128,13 +130,13 @@ def test_grant_access_to_web_user(temp_db_conn, def_config, special_phrases_impo
with temp_db_conn.cursor() as temp_db_cursor:
temp_db_cursor.execute('CREATE TABLE {}()'.format(table_name))
special_phrases_importer._grant_access_to_webuser(phrase_class, phrase_type)
sp_importer._grant_access_to_webuser(phrase_class, phrase_type)
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, phrase_class, phrase_type)
def test_create_place_classtype_table_and_indexes(
temp_db_conn, def_config, placex_table,
Test that _create_place_classtype_table_and_indexes()
create the right place_classtype tables and place_id indexes
@ -143,28 +145,14 @@ def test_create_place_classtype_table_and_indexes(
pairs = set([('class1', 'type1'), ('class2', 'type2')])
for pair in pairs:
assert check_table_exist(temp_db_conn, pair[0], pair[1])
assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])
def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
Test that _process_xml_content() process the given xml content right
by executing the right SQL functions for amenities and
by returning the right set of pairs.
class_test = 'aerialway'
type_test = 'zip_line'
#Converted output set to a dict for easy assert further.
results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))
assert results[class_test] and type_test in results.values()
def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
def test_remove_non_existent_tables_from_db(sp_importer, default_phrases,
Check for the remove_non_existent_phrases_from_db() method.
@ -177,7 +165,7 @@ def test_remove_non_existent_tables_from_db(special_phrases_importer, default_ph
be deleted.
with temp_db_conn.cursor() as temp_db_cursor:
special_phrases_importer.table_phrases_to_delete = {
sp_importer.table_phrases_to_delete = {
@ -188,7 +176,7 @@ def test_remove_non_existent_tables_from_db(special_phrases_importer, default_ph
AND table_name like 'place_classtype_%';
tables_result = temp_db_cursor.fetchall()
@ -196,13 +184,13 @@ def test_remove_non_existent_tables_from_db(special_phrases_importer, default_ph
tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, sp_importer,
placex_table, tokenizer_mock):
Check that the main import_from_wiki() method is well executed.
It should create the place_classtype table, the place_id and centroid indexes,
grant access to the web user and execute the SQL functions for amenities.
It should also update the database well by deleting or preserving existing entries
It should also update the database well by deleting or preserving existing entries
of the database.
#Add some data to the database before execution in order to test
@ -211,10 +199,10 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
CREATE TABLE place_classtype_amenity_animal_shelter();
CREATE TABLE place_classtype_wrongclass_wrongtype();""")
monkeypatch.setattr('', mock_get_wiki_content)
monkeypatch.setattr('', mock_get_wiki_content)
tokenizer = tokenizer_mock()
special_phrases_importer.import_from_wiki(tokenizer, ['en'])
assert len(tokenizer.analyser_cache['special_phrases']) == 18
@ -256,7 +244,7 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
assert not temp_db_cursor.fetchone()
def mock_get_wiki_content(lang):
def mock_get_wiki_content(self, lang):
Mock the _get_wiki_content() method to return
static xml test file content.
@ -315,11 +303,12 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
def sp_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
Return an instance of SpecialPhrasesImporter.
Return an instance of SPImporter.
return SpecialPhrasesImporter(def_config, temp_phplib_dir_with_migration, temp_db_conn)
loader = SPWikiLoader(def_config, ['en'])
return SPImporter(def_config, temp_phplib_dir_with_migration, temp_db_conn, loader)
def temp_phplib_dir_with_migration():