From 4e1e166c6a3f4b83c114eb22babca0a4eee3f96e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 10 May 2022 23:00:18 +0200 Subject: [PATCH 1/7] add a function to return a formatted version Replaces the various repeated format strings throughout the code. --- .pylintrc | 2 +- nominatim/cli.py | 4 ++-- nominatim/clicmd/setup.py | 5 ++--- nominatim/tools/exec_utils.py | 4 ++-- nominatim/tools/migration.py | 7 +++---- nominatim/tools/refresh.py | 13 ++++++------- nominatim/version.py | 8 ++++++++ 7 files changed, 24 insertions(+), 19 deletions(-) diff --git a/.pylintrc b/.pylintrc index d5c4514f..89135138 100644 --- a/.pylintrc +++ b/.pylintrc @@ -11,6 +11,6 @@ ignored-modules=icu,datrie # 'with' statements. ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict -disable=too-few-public-methods,duplicate-code,too-many-ancestors +disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value good-names=i,x,y,fd,db diff --git a/nominatim/cli.py b/nominatim/cli.py index 01eb6119..f911023b 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -60,9 +60,9 @@ class CommandlineParser: def nominatim_version_text(): """ Program name and version number as string """ - text = 'Nominatim version %s.%s.%s.%s' % version.NOMINATIM_VERSION + text = f'Nominatim version {version.version_str()}' if version.GIT_COMMIT_HASH is not None: - text += ' (%s)' % version.GIT_COMMIT_HASH + text += f' ({version.GIT_COMMIT_HASH})' return text def add_subcommand(self, name, cmd): diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index e822cbe5..7968fbce 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -14,7 +14,7 @@ import psutil from nominatim.db.connection import connect from nominatim.db import status, properties -from nominatim.version import NOMINATIM_VERSION +from nominatim.version import version_str # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 @@ -213,5 +213,4 @@ class SetupAll: except Exception as exc: # pylint: disable=broad-except LOG.error('Cannot determine date of database: %s', exc) - properties.set_property(conn, 'database_version', - '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION)) + properties.set_property(conn, 'database_version', version_str()) diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index b06b8533..db2a4c2e 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -12,7 +12,7 @@ import subprocess import urllib.request as urlrequest from urllib.parse import urlencode -from nominatim.version import NOMINATIM_VERSION +from nominatim.version import version_str from nominatim.db.connection import get_pg_env LOG = logging.getLogger() @@ -150,7 +150,7 @@ def run_osm2pgsql(options): def get_url(url): """ Get the contents from the given URL and return it as a UTF-8 string. """ - headers = {"User-Agent": "Nominatim/{0[0]}.{0[1]}.{0[2]}-{0[3]}".format(NOMINATIM_VERSION)} + headers = {"User-Agent": f"Nominatim/{version_str()}"} try: with urlrequest.urlopen(urlrequest.Request(url, headers=headers)) as response: diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index 76726e8c..290016ad 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -11,7 +11,7 @@ import logging from nominatim.db import properties from nominatim.db.connection import connect -from nominatim.version import NOMINATIM_VERSION +from nominatim.version import NOMINATIM_VERSION, version_str from nominatim.tools import refresh from nominatim.tokenizer import factory as tokenizer_factory from nominatim.errors import UsageError @@ -47,7 +47,7 @@ def migrate(config, paths): for version, func in _MIGRATION_FUNCTIONS: if db_version <= version: LOG.warning("Runnning: %s (%s)", func.__doc__.split('\n', 1)[0], - '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version)) + '{}.{}.{}-{}'.format(*version)) kwargs = dict(conn=conn, config=config, paths=paths) func(**kwargs) conn.commit() @@ -59,8 +59,7 @@ def migrate(config, paths): tokenizer = tokenizer_factory.get_tokenizer_for_db(config) tokenizer.update_sql_functions(config) - properties.set_property(conn, 'database_version', - '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION)) + properties.set_property(conn, 'database_version', version_str()) conn.commit() diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index aacc622b..b9a926f2 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -15,7 +15,7 @@ from psycopg2 import sql as pysql from nominatim.db.utils import execute_file from nominatim.db.sql_preprocessor import SQLPreprocessor -from nominatim.version import NOMINATIM_VERSION +from nominatim.version import version_str LOG = logging.getLogger() @@ -186,16 +186,15 @@ def setup_website(basedir, config, conn): LOG.info('Creating website directory.') basedir.mkdir() - template = dedent("""\ + template = dedent(f"""\ Date: Wed, 11 May 2022 08:59:28 +0200 Subject: [PATCH 2/7] pylint: avoid explicit use of format() function Use psycopg2 SQL formatters for SQL and formatted string literals everywhere else. --- nominatim/clicmd/replication.py | 5 ++- nominatim/clicmd/setup.py | 10 +++--- nominatim/config.py | 2 +- nominatim/db/connection.py | 2 +- nominatim/db/sql_preprocessor.py | 6 ++-- nominatim/db/status.py | 2 +- nominatim/indexer/runners.py | 4 +-- nominatim/tokenizer/legacy_tokenizer.py | 14 ++++----- .../token_analysis/config_variants.py | 4 +-- nominatim/tools/database_import.py | 2 +- nominatim/tools/exec_utils.py | 6 ++-- nominatim/tools/migration.py | 18 ++++++----- nominatim/tools/postcodes.py | 4 +-- nominatim/tools/refresh.py | 9 ++++-- .../tools/special_phrases/sp_csv_loader.py | 2 +- .../tools/special_phrases/sp_importer.py | 31 ++++++++++--------- nominatim/version.py | 4 +-- 17 files changed, 66 insertions(+), 59 deletions(-) diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py index 849a0e49..9d946304 100644 --- a/nominatim/clicmd/replication.py +++ b/nominatim/clicmd/replication.py @@ -21,7 +21,7 @@ LOG = logging.getLogger() # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 # Using non-top-level imports to make pyosmium optional for replication only. -# pylint: disable=E0012,C0415 +# pylint: disable=C0415 class UpdateReplication: """\ @@ -96,8 +96,7 @@ class UpdateReplication: end = dt.datetime.now(dt.timezone.utc) LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.", round_time((start_index or end) - start_import), - "Indexing: {} ".format(round_time(end - start_index)) - if start_index else '', + f"Indexing: {round_time(end - start_index)} " if start_index else '', round_time(end - start_import), round_time(end - batchdate)) diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 7968fbce..b643c5ba 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -19,7 +19,7 @@ from nominatim.version import version_str # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 # Using non-top-level imports to avoid eventually unused imports. -# pylint: disable=E0012,C0415 +# pylint: disable=C0415 LOG = logging.getLogger() @@ -194,10 +194,10 @@ class SetupAll: LOG.warning('Creating support index') if tablespace: tablespace = 'TABLESPACE ' + tablespace - cur.execute("""CREATE INDEX idx_placex_pendingsector - ON placex USING BTREE (rank_address,geometry_sector) - {} WHERE indexed_status > 0 - """.format(tablespace)) + cur.execute(f"""CREATE INDEX idx_placex_pendingsector + ON placex USING BTREE (rank_address,geometry_sector) + {tablespace} WHERE indexed_status > 0 + """) conn.commit() diff --git a/nominatim/config.py b/nominatim/config.py index a3f91055..ef261079 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -144,7 +144,7 @@ class Configuration: style = self.__getattr__('IMPORT_STYLE') if style in ('admin', 'street', 'address', 'full', 'extratags'): - return self.config_dir / 'import-{}.style'.format(style) + return self.config_dir / f'import-{style}.style' return self.find_config_file('', 'IMPORT_STYLE') diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 45bc173d..90194e63 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -163,7 +163,7 @@ def connect(dsn): ctxmgr.connection = conn return ctxmgr except psycopg2.OperationalError as err: - raise UsageError("Cannot connect to database: {}".format(err)) from err + raise UsageError(f"Cannot connect to database: {err}") from err # Translation from PG connection string parameters to PG environment variables. diff --git a/nominatim/db/sql_preprocessor.py b/nominatim/db/sql_preprocessor.py index 10d93666..4de53886 100644 --- a/nominatim/db/sql_preprocessor.py +++ b/nominatim/db/sql_preprocessor.py @@ -39,10 +39,10 @@ def _setup_tablespace_sql(config): out = {} for subset in ('ADDRESS', 'SEARCH', 'AUX'): for kind in ('DATA', 'INDEX'): - tspace = getattr(config, 'TABLESPACE_{}_{}'.format(subset, kind)) + tspace = getattr(config, f'TABLESPACE_{subset}_{kind}') if tspace: - tspace = 'TABLESPACE "{}"'.format(tspace) - out['{}_{}'.format(subset.lower(), kind.lower())] = tspace + tspace = f'TABLESPACE "{tspace}"' + out[f'{subset.lower()}_{kind.lower()}'] = tspace return out diff --git a/nominatim/db/status.py b/nominatim/db/status.py index 12b24a83..d31196b3 100644 --- a/nominatim/db/status.py +++ b/nominatim/db/status.py @@ -34,7 +34,7 @@ def compute_database_date(conn): LOG.info("Using node id %d for timestamp lookup", osmid) # Get the node from the API to find the timestamp when it was created. - node_url = 'https://www.openstreetmap.org/api/0.6/node/{}/1'.format(osmid) + node_url = f'https://www.openstreetmap.org/api/0.6/node/{osmid}/1' data = get_url(node_url) match = re.search(r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"', data) diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index ac7a0015..9a30ffe6 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -66,7 +66,7 @@ class RankRunner(AbstractPlacexRunner): """ def name(self): - return "rank {}".format(self.rank) + return f"rank {self.rank}" def sql_count_objects(self): return pysql.SQL("""SELECT count(*) FROM placex @@ -86,7 +86,7 @@ class BoundaryRunner(AbstractPlacexRunner): """ def name(self): - return "boundaries rank {}".format(self.rank) + return f"boundaries rank {self.rank}" def sql_count_objects(self): return pysql.SQL("""SELECT count(*) FROM placex diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 97ce6d16..299e2fde 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -74,10 +74,10 @@ def _check_module(module_dir, conn): with conn.cursor() as cur: try: cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) - RETURNS text AS '{}/nominatim.so', 'transliteration' + RETURNS text AS %s, 'transliteration' LANGUAGE c IMMUTABLE STRICT; DROP FUNCTION nominatim_test_import_func(text) - """.format(module_dir)) + """, (f'{module_dir}/nominatim.so', )) except psycopg2.DatabaseError as err: LOG.fatal("Error accessing database module: %s", err) raise UsageError("Database module cannot be accessed.") from err @@ -250,12 +250,12 @@ class LegacyTokenizer(AbstractTokenizer): php_file = self.data_dir / "tokenizer.php" if not php_file.exists() or overwrite: - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ ', rule) if len(parts) != 4: - raise UsageError("Syntax error in variant rule: " + rule) + raise UsageError(f"Syntax error in variant rule: {rule}") decompose = parts[1] is None src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')] @@ -89,7 +89,7 @@ class _VariantMaker: name = name.strip() match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) if match is None or (match.group(1) == '~' and match.group(3) == '~'): - raise UsageError("Invalid variant word descriptor '{}'".format(name)) + raise UsageError(f"Invalid variant word descriptor '{name}'") norm_name = self.norm.transliterate(match.group(2)).strip() if not norm_name: return None diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index caec9035..50938c19 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -234,7 +234,7 @@ def create_search_indices(conn, config, drop=False): bad_indices = [row[0] for row in list(cur)] for idx in bad_indices: LOG.info("Drop invalid index %s.", idx) - cur.execute('DROP INDEX "{}"'.format(idx)) + cur.execute(pysql.SQL('DROP INDEX {}').format(pysql.Identifier(idx))) conn.commit() sql = SQLPreprocessor(conn, config) diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index db2a4c2e..a81a8d6b 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -55,10 +55,10 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None, query_string = urlencode(params or {}) env = dict(QUERY_STRING=query_string, - SCRIPT_NAME='/{}.php'.format(endpoint), - REQUEST_URI='/{}.php?{}'.format(endpoint, query_string), + SCRIPT_NAME=f'/{endpoint}.php', + REQUEST_URI=f'/{endpoint}.php?{query_string}', CONTEXT_DOCUMENT_ROOT=webdir, - SCRIPT_FILENAME='{}/{}.php'.format(webdir, endpoint), + SCRIPT_FILENAME=f'{webdir}/{endpoint}.php', HTTP_HOST='localhost', HTTP_USER_AGENT='nominatim-tool', REMOTE_ADDR='0.0.0.0', diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index 290016ad..8eaad2f7 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -9,6 +9,8 @@ Functions for database migration to newer software versions. """ import logging +from psycopg2 import sql as pysql + from nominatim.db import properties from nominatim.db.connection import connect from nominatim.version import NOMINATIM_VERSION, version_str @@ -47,7 +49,7 @@ def migrate(config, paths): for version, func in _MIGRATION_FUNCTIONS: if db_version <= version: LOG.warning("Runnning: %s (%s)", func.__doc__.split('\n', 1)[0], - '{}.{}.{}-{}'.format(*version)) + version_str(version)) kwargs = dict(conn=conn, config=config, paths=paths) func(**kwargs) conn.commit() @@ -124,11 +126,12 @@ def add_nominatim_property_table(conn, config, **_): """ if not conn.table_exists('nominatim_properties'): with conn.cursor() as cur: - cur.execute("""CREATE TABLE nominatim_properties ( - property TEXT, - value TEXT); - GRANT SELECT ON TABLE nominatim_properties TO "{}"; - """.format(config.DATABASE_WEBUSER)) + cur.execute(pysql.SQL("""CREATE TABLE nominatim_properties ( + property TEXT, + value TEXT); + GRANT SELECT ON TABLE nominatim_properties TO {}; + """) + .format(pysql.Identifier(config.DATABASE_WEBUSER))) @_migration(3, 6, 0, 0) def change_housenumber_transliteration(conn, **_): @@ -193,7 +196,8 @@ def install_legacy_tokenizer(conn, config, **_): and column_name = 'token_info'""", (table, )) if has_column == 0: - cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table)) + cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB') + .format(pysql.Identifier(table))) tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False, module_name='legacy') diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index adc58ec5..cd778e5f 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -136,13 +136,13 @@ class _CountryPostcodesCollector: def _open_external(self, project_dir): - fname = project_dir / '{}_postcodes.csv'.format(self.country) + fname = project_dir / f'{self.country}_postcodes.csv' if fname.is_file(): LOG.info("Using external postcode file '%s'.", fname) return open(fname, 'r') - fname = project_dir / '{}_postcodes.csv.gz'.format(self.country) + fname = project_dir / f'{self.country}_postcodes.csv.gz' if fname.is_file(): LOG.info("Using external postcode file '%s'.", fname) diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index b9a926f2..ca6f5c62 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -52,16 +52,19 @@ def load_address_levels(conn, table, levels): with conn.cursor() as cur: cur.drop_table(table) - cur.execute("""CREATE TABLE {} (country_code varchar(2), + cur.execute(pysql.SQL("""CREATE TABLE {} ( + country_code varchar(2), class TEXT, type TEXT, rank_search SMALLINT, - rank_address SMALLINT)""".format(table)) + rank_address SMALLINT)""") + .format(pysql.Identifier(table))) cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s") .format(pysql.Identifier(table)), rows) - cur.execute('CREATE UNIQUE INDEX ON {} (country_code, class, type)'.format(table)) + cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)') + .format(pysql.Identifier(table))) conn.commit() diff --git a/nominatim/tools/special_phrases/sp_csv_loader.py b/nominatim/tools/special_phrases/sp_csv_loader.py index a32a4b39..2a67687f 100644 --- a/nominatim/tools/special_phrases/sp_csv_loader.py +++ b/nominatim/tools/special_phrases/sp_csv_loader.py @@ -54,4 +54,4 @@ class SPCsvLoader(Iterator): _, extension = os.path.splitext(self.csv_path) if extension != '.csv': - raise UsageError('The file {} is not a csv file.'.format(self.csv_path)) + raise UsageError(f'The file {self.csv_path} is not a csv file.') diff --git a/nominatim/tools/special_phrases/sp_importer.py b/nominatim/tools/special_phrases/sp_importer.py index 195f8387..9eefaa19 100644 --- a/nominatim/tools/special_phrases/sp_importer.py +++ b/nominatim/tools/special_phrases/sp_importer.py @@ -16,7 +16,7 @@ import logging import re -from psycopg2.sql import Identifier, Literal, SQL +from psycopg2.sql import Identifier, SQL from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics LOG = logging.getLogger() @@ -195,35 +195,36 @@ class SPImporter(): """ table_name = _classtype_table(phrase_class, phrase_type) with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(""" - CREATE TABLE IF NOT EXISTS {{}} {} - AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex - WHERE class = {{}} AND type = {{}}""".format(sql_tablespace)) - .format(Identifier(table_name), Literal(phrase_class), - Literal(phrase_type))) + db_cursor.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS + SELECT place_id AS place_id, + st_centroid(geometry) AS centroid + FROM placex + WHERE class = %s AND type = %s""") + .format(Identifier(table_name), SQL(sql_tablespace)), + (phrase_class, phrase_type)) def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): """ Create indexes on centroid and place_id for the place_classtype table. """ - index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type) + index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_' base_table = _classtype_table(phrase_class, phrase_type) # Index on centroid if not self.db_connection.index_exists(index_prefix + 'centroid'): with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(""" - CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace)) - .format(Identifier(index_prefix + 'centroid'), - Identifier(base_table)), sql_tablespace) + db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}") + .format(Identifier(index_prefix + 'centroid'), + Identifier(base_table), + SQL(sql_tablespace))) # Index on place_id if not self.db_connection.index_exists(index_prefix + 'place_id'): with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL( - """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace)) + db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}") .format(Identifier(index_prefix + 'place_id'), - Identifier(base_table))) + Identifier(base_table), + SQL(sql_tablespace))) def _grant_access_to_webuser(self, phrase_class, phrase_type): diff --git a/nominatim/version.py b/nominatim/version.py index d4f69da1..88d42af9 100644 --- a/nominatim/version.py +++ b/nominatim/version.py @@ -37,8 +37,8 @@ GIT_COMMIT_HASH = None # pylint: disable=consider-using-f-string -def version_str(): +def version_str(version=NOMINATIM_VERSION): """ Return a human-readable string of the version. """ - return '{}.{}.{}-{}'.format(*NOMINATIM_VERSION) + return '{}.{}.{}-{}'.format(*version) From ae6b029543f7cacd9a491ae5965981a610ca424e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 09:06:32 +0200 Subject: [PATCH 3/7] remove redundant 'u' prefixes for unicode strings --- nominatim/db/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 87b0545f..7d58a668 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -67,9 +67,9 @@ def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None) # List of characters that need to be quoted for the copy command. -_SQL_TRANSLATION = {ord(u'\\'): u'\\\\', - ord(u'\t'): u'\\t', - ord(u'\n'): u'\\n'} +_SQL_TRANSLATION = {ord('\\'): '\\\\', + ord('\t'): '\\t', + ord('\n'): '\\n'} class CopyBuffer: From 5d5f40a82f07e7e2a341435aeb45d8619d252525 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 09:45:15 +0200 Subject: [PATCH 4/7] use context management when processing Tiger data --- nominatim/tools/tiger_data.py | 107 ++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index 8610880f..6e37df5e 100644 --- a/nominatim/tools/tiger_data.py +++ b/nominatim/tools/tiger_data.py @@ -21,33 +21,57 @@ from nominatim.indexer.place_info import PlaceInfo LOG = logging.getLogger() - -def handle_tarfile_or_directory(data_dir): - """ Handles tarfile or directory for importing tiger data +class TigerInput: + """ Context manager that goes through Tiger input files which may + either be in a directory or gzipped together in a tar file. """ - tar = None - if data_dir.endswith('.tar.gz'): - try: - tar = tarfile.open(data_dir) - except tarfile.ReadError as err: - LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir) - raise UsageError("Cannot open Tiger data file.") from err + def __init__(self, data_dir): + self.tar_handle = None + self.files = [] - csv_files = [i for i in tar.getmembers() if i.name.endswith('.csv')] - LOG.warning("Found %d CSV files in tarfile with path %s", len(csv_files), data_dir) - if not csv_files: - LOG.warning("Tiger data import selected but no files in tarfile's path %s", data_dir) - return None, None - else: - files = os.listdir(data_dir) - csv_files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')] - LOG.warning("Found %d CSV files in path %s", len(csv_files), data_dir) - if not csv_files: - LOG.warning("Tiger data import selected but no files found in path %s", data_dir) - return None, None + if data_dir.endswith('.tar.gz'): + try: + self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with + except tarfile.ReadError as err: + LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir) + raise UsageError("Cannot open Tiger data file.") from err - return csv_files, tar + self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')] + LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir) + else: + files = os.listdir(data_dir) + self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')] + LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir) + + if not self.files: + LOG.warning("Tiger data import selected but no files found at %s", data_dir) + + + def __enter__(self): + return self + + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.tar_handle: + self.tar_handle.close() + self.tar_handle = None + + + def next_file(self): + """ Return a file handle to the next file to be processed. + Raises an IndexError if there is no file left. + """ + fname = self.files.pop(0) + + if self.tar_handle is not None: + return io.TextIOWrapper(self.tar_handle.extractfile(fname)) + + return open(fname, encoding='utf-8') + + + def __len__(self): + return len(self.files) def handle_threaded_sql_statements(pool, fd, analyzer): @@ -79,34 +103,27 @@ def add_tiger_data(data_dir, config, threads, tokenizer): """ Import tiger data from directory or tar file `data dir`. """ dsn = config.get_libpq_dsn() - files, tar = handle_tarfile_or_directory(data_dir) - if not files: - return + with TigerInput(data_dir) as tar: + if not tar: + return - with connect(dsn) as conn: - sql = SQLPreprocessor(conn, config) - sql.run_sql_file(conn, 'tiger_import_start.sql') + with connect(dsn) as conn: + sql = SQLPreprocessor(conn, config) + sql.run_sql_file(conn, 'tiger_import_start.sql') - # Reading files and then for each file line handling - # sql_query in chunks. - place_threads = max(1, threads - 1) + # Reading files and then for each file line handling + # sql_query in chunks. + place_threads = max(1, threads - 1) - with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool: - with tokenizer.name_analyzer() as analyzer: - for fname in files: - if not tar: - fd = open(fname) - else: - fd = io.TextIOWrapper(tar.extractfile(fname)) + with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool: + with tokenizer.name_analyzer() as analyzer: + while tar: + with tar.next_file() as fd: + handle_threaded_sql_statements(pool, fd, analyzer) - handle_threaded_sql_statements(pool, fd, analyzer) + print('\n') - fd.close() - - if tar: - tar.close() - print('\n') LOG.warning("Creating indexes on Tiger data") with connect(dsn) as conn: sql = SQLPreprocessor(conn, config) From 7f7a7df3a2dfb7ae97bafb2c2e83209a87b75f24 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 10:22:14 +0200 Subject: [PATCH 5/7] solve assorted issue with newer pylint versions Includes more use of 'with', adding encodings to open statements and a couple of issues with parameter renaming. --- nominatim/db/connection.py | 3 +- nominatim/db/utils.py | 34 +++++++++---------- nominatim/tools/postcodes.py | 4 +-- .../tools/special_phrases/sp_csv_loader.py | 4 +-- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 90194e63..c60bcfdd 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -25,7 +25,8 @@ class _Cursor(psycopg2.extras.DictCursor): execution functions. """ - def execute(self, query, args=None): # pylint: disable=W0221 + # pylint: disable=arguments-renamed,arguments-differ + def execute(self, query, args=None): """ Query execution that logs the SQL query when debugging is enabled. """ LOG.debug(self.mogrify(query, args).decode('utf-8')) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 7d58a668..b859afa8 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -40,27 +40,27 @@ def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None) cmd.extend(('-v', 'ON_ERROR_STOP=1')) if not LOG.isEnabledFor(logging.INFO): cmd.append('--quiet') - proc = subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) - try: - if not LOG.isEnabledFor(logging.INFO): - proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) + with subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) as proc: + try: + if not LOG.isEnabledFor(logging.INFO): + proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8')) - if pre_code: - proc.stdin.write((pre_code + ';').encode('utf-8')) + if pre_code: + proc.stdin.write((pre_code + ';').encode('utf-8')) - if fname.suffix == '.gz': - with gzip.open(str(fname), 'rb') as fdesc: - remain = _pipe_to_proc(proc, fdesc) - else: - with fname.open('rb') as fdesc: - remain = _pipe_to_proc(proc, fdesc) + if fname.suffix == '.gz': + with gzip.open(str(fname), 'rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) + else: + with fname.open('rb') as fdesc: + remain = _pipe_to_proc(proc, fdesc) - if remain == 0 and post_code: - proc.stdin.write((';' + post_code).encode('utf-8')) - finally: - proc.stdin.close() - ret = proc.wait() + if remain == 0 and post_code: + proc.stdin.write((';' + post_code).encode('utf-8')) + finally: + proc.stdin.close() + ret = proc.wait() if ret != 0 or remain > 0: raise UsageError("Failed to execute SQL file.") diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index cd778e5f..2b7027e7 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -36,7 +36,7 @@ class _CountryPostcodesCollector: def __init__(self, country): self.country = country - self.collected = dict() + self.collected = {} def add(self, postcode, x, y): @@ -140,7 +140,7 @@ class _CountryPostcodesCollector: if fname.is_file(): LOG.info("Using external postcode file '%s'.", fname) - return open(fname, 'r') + return open(fname, 'r', encoding='utf-8') fname = project_dir / f'{self.country}_postcodes.csv.gz' diff --git a/nominatim/tools/special_phrases/sp_csv_loader.py b/nominatim/tools/special_phrases/sp_csv_loader.py index 2a67687f..55a9d8d0 100644 --- a/nominatim/tools/special_phrases/sp_csv_loader.py +++ b/nominatim/tools/special_phrases/sp_csv_loader.py @@ -39,8 +39,8 @@ class SPCsvLoader(Iterator): """ phrases = set() - with open(self.csv_path) as file: - reader = csv.DictReader(file, delimiter=',') + with open(self.csv_path, encoding='utf-8') as fd: + reader = csv.DictReader(fd, delimiter=',') for row in reader: phrases.add( SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator']) From d14a585cc92c1558c23ca86855474f14fd711cb2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 10:25:00 +0200 Subject: [PATCH 6/7] pylint: disable no-self-use check This checker encourages bad behaviour (namely changing the static status of a function during inheritence) and will be made optional in upcoming versions of pylint. --- .pylintrc | 2 +- nominatim/tokenizer/icu_tokenizer.py | 3 +-- nominatim/tokenizer/legacy_tokenizer.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.pylintrc b/.pylintrc index 89135138..fef53872 100644 --- a/.pylintrc +++ b/.pylintrc @@ -11,6 +11,6 @@ ignored-modules=icu,datrie # 'with' statements. ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict -disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value +disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use good-names=i,x,y,fd,db diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 9c7138ce..bf5544ed 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -278,8 +278,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()] - @staticmethod - def normalize_postcode(postcode): + def normalize_postcode(self, postcode): """ Convert the postcode to a standardized form. This function must yield exactly the same result as the SQL function diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 299e2fde..7b78b22a 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -337,8 +337,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer): return self.normalizer.transliterate(phrase) - @staticmethod - def normalize_postcode(postcode): + def normalize_postcode(self, postcode): """ Convert the postcode to a standardized form. This function must yield exactly the same result as the SQL function From 3ba975466c166e9d9eceee34189d22c1561e69c1 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 10:36:09 +0200 Subject: [PATCH 7/7] fix spacing Some versions of pylint are oddly picky. --- nominatim/tools/migration.py | 5 ++-- nominatim/tools/refresh.py | 6 ++--- .../tools/special_phrases/sp_importer.py | 25 ++++++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index 8eaad2f7..28a14455 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -130,8 +130,7 @@ def add_nominatim_property_table(conn, config, **_): property TEXT, value TEXT); GRANT SELECT ON TABLE nominatim_properties TO {}; - """) - .format(pysql.Identifier(config.DATABASE_WEBUSER))) + """).format(pysql.Identifier(config.DATABASE_WEBUSER))) @_migration(3, 6, 0, 0) def change_housenumber_transliteration(conn, **_): @@ -197,7 +196,7 @@ def install_legacy_tokenizer(conn, config, **_): (table, )) if has_column == 0: cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB') - .format(pysql.Identifier(table))) + .format(pysql.Identifier(table))) tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False, module_name='legacy') diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index ca6f5c62..561bcf83 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -57,14 +57,14 @@ def load_address_levels(conn, table, levels): class TEXT, type TEXT, rank_search SMALLINT, - rank_address SMALLINT)""") - .format(pysql.Identifier(table))) + rank_address SMALLINT) + """).format(pysql.Identifier(table))) cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s") .format(pysql.Identifier(table)), rows) cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)') - .format(pysql.Identifier(table))) + .format(pysql.Identifier(table))) conn.commit() diff --git a/nominatim/tools/special_phrases/sp_importer.py b/nominatim/tools/special_phrases/sp_importer.py index 9eefaa19..8137142b 100644 --- a/nominatim/tools/special_phrases/sp_importer.py +++ b/nominatim/tools/special_phrases/sp_importer.py @@ -191,17 +191,18 @@ class SPImporter(): def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type): """ - Create table place_classtype of the given phrase_class/phrase_type if doesn't exit. + Create table place_classtype of the given phrase_class/phrase_type + if doesn't exit. """ table_name = _classtype_table(phrase_class, phrase_type) - with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS - SELECT place_id AS place_id, - st_centroid(geometry) AS centroid - FROM placex - WHERE class = %s AND type = %s""") - .format(Identifier(table_name), SQL(sql_tablespace)), - (phrase_class, phrase_type)) + with self.db_connection.cursor() as cur: + cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS + SELECT place_id AS place_id, + st_centroid(geometry) AS centroid + FROM placex + WHERE class = %s AND type = %s + """).format(Identifier(table_name), SQL(sql_tablespace)), + (phrase_class, phrase_type)) def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): @@ -214,9 +215,9 @@ class SPImporter(): if not self.db_connection.index_exists(index_prefix + 'centroid'): with self.db_connection.cursor() as db_cursor: db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}") - .format(Identifier(index_prefix + 'centroid'), - Identifier(base_table), - SQL(sql_tablespace))) + .format(Identifier(index_prefix + 'centroid'), + Identifier(base_table), + SQL(sql_tablespace))) # Index on place_id if not self.db_connection.index_exists(index_prefix + 'place_id'):