adapt tests to new TIGER CSV format

This commit is contained in:
Sarah Hoffmann 2021-05-13 23:37:51 +02:00
parent 35efe3b41c
commit 7d621389ee
4 changed files with 6331 additions and 6250 deletions

View File

@ -12,7 +12,7 @@ import psycopg2.extras
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.db.async_connection import WorkerPool from nominatim.db.async_connection import WorkerPool
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
LOG = logging.getLogger() LOG = logging.getLogger()
@ -23,7 +23,12 @@ def handle_tarfile_or_directory(data_dir):
tar = None tar = None
if data_dir.endswith('.tar.gz'): if data_dir.endswith('.tar.gz'):
tar = tarfile.open(data_dir) try:
tar = tarfile.open(data_dir)
except tarfile.ReadError as err:
LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
raise UsageError("Cannot open Tiger data file.") from err
csv_files = [i for i in tar.getmembers() if i.name.endswith('.csv')] csv_files = [i for i in tar.getmembers() if i.name.endswith('.csv')]
LOG.warning("Found %d CSV files in tarfile with path %s", len(csv_files), data_dir) LOG.warning("Found %d CSV files in tarfile with path %s", len(csv_files), data_dir)
if not csv_files: if not csv_files:

View File

@ -2,60 +2,137 @@
Test for tiger data function Test for tiger data function
""" """
from pathlib import Path from pathlib import Path
from textwrap import dedent
import pytest import pytest
import tarfile import tarfile
from nominatim.tools import tiger_data, database_import from nominatim.tools import tiger_data, database_import
from nominatim.errors import UsageError
class MockTigerTable:
def __init__(self, conn):
self.conn = conn
with conn.cursor() as cur:
cur.execute("""CREATE TABLE tiger (linegeo GEOMETRY,
start INTEGER,
stop INTEGER,
interpol TEXT,
token_info JSONB,
postcode TEXT)""")
def count(self):
with self.conn.cursor() as cur:
return cur.scalar("SELECT count(*) FROM tiger")
def row(self):
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM tiger LIMIT 1")
return cur.fetchone()
@pytest.fixture
def tiger_table(def_config, temp_db_conn, sql_preprocessor,
temp_db_with_extensions, tmp_path):
def_config.lib_dir.sql = tmp_path / 'sql'
def_config.lib_dir.sql.mkdir()
(def_config.lib_dir.sql / 'tiger_import_start.sql').write_text(
"""CREATE OR REPLACE FUNCTION tiger_line_import(linegeo GEOMETRY, start INTEGER,
stop INTEGER, interpol TEXT,
token_info JSONB, postcode TEXT)
RETURNS INTEGER AS $$
INSERT INTO tiger VALUES(linegeo, start, stop, interpol, token_info, postcode) RETURNING 1
$$ LANGUAGE SQL;""")
(def_config.lib_dir.sql / 'tiger_import_finish.sql').write_text(
"""DROP FUNCTION tiger_line_import (linegeo GEOMETRY, in_startnumber INTEGER,
in_endnumber INTEGER, interpolationtype TEXT,
token_info JSONB, in_postcode TEXT);""")
return MockTigerTable(temp_db_conn)
@pytest.fixture
def csv_factory(tmp_path):
def _mk_file(fname, hnr_from=1, hnr_to=9, interpol='odd', street='Main St',
city='Newtown', state='AL', postcode='12345',
geometry='LINESTRING(-86.466995 32.428956,-86.466923 32.428933)'):
(tmp_path / (fname + '.csv')).write_text(dedent("""\
from;to;interpolation;street;city;state;postcode;geometry
{};{};{};{};{};{};{};{}
""".format(hnr_from, hnr_to, interpol, street, city, state,
postcode, geometry)))
return _mk_file
@pytest.mark.parametrize("threads", (1, 5)) @pytest.mark.parametrize("threads", (1, 5))
def test_add_tiger_data(def_config, tmp_path, sql_preprocessor, def test_add_tiger_data(def_config, src_dir, tiger_table, tokenizer_mock, threads):
temp_db_cursor, threads, temp_db_with_extensions): tiger_data.add_tiger_data(str(src_dir / 'test' / 'testdb' / 'tiger'),
temp_db_cursor.execute('CREATE TABLE place (id INT)') def_config, threads, tokenizer_mock())
sqlfile = tmp_path / '1010.sql'
sqlfile.write_text("""INSERT INTO place values (1);
INSERT INTO non_existant_table values (1);""")
tiger_data.add_tiger_data(str(tmp_path), def_config, threads)
assert temp_db_cursor.table_rows('place') == 1 assert tiger_table.count() == 6213
@pytest.mark.parametrize("threads", (1, 5)) def test_add_tiger_data_no_files(def_config, tiger_table, tokenizer_mock,
def test_add_tiger_data_bad_file(def_config, tmp_path, sql_preprocessor, tmp_path):
temp_db_cursor, threads, temp_db_with_extensions): tiger_data.add_tiger_data(str(tmp_path), def_config, 1, tokenizer_mock())
temp_db_cursor.execute('CREATE TABLE place (id INT)')
sqlfile = tmp_path / '1010.txt' assert tiger_table.count() == 0
def test_add_tiger_data_bad_file(def_config, tiger_table, tokenizer_mock,
tmp_path):
sqlfile = tmp_path / '1010.csv'
sqlfile.write_text("""Random text""") sqlfile.write_text("""Random text""")
tiger_data.add_tiger_data(str(tmp_path), def_config, threads)
assert temp_db_cursor.table_rows('place') == 0 tiger_data.add_tiger_data(str(tmp_path), def_config, 1, tokenizer_mock())
assert tiger_table.count() == 0
def test_add_tiger_data_hnr_nan(def_config, tiger_table, tokenizer_mock,
csv_factory, tmp_path):
csv_factory('file1', hnr_from=99)
csv_factory('file2', hnr_from='L12')
csv_factory('file3', hnr_to='12.4')
tiger_data.add_tiger_data(str(tmp_path), def_config, 1, tokenizer_mock())
assert tiger_table.count() == 1
assert tiger_table.row()['start'] == 99
@pytest.mark.parametrize("threads", (1, 5)) @pytest.mark.parametrize("threads", (1, 5))
def test_add_tiger_data_tarfile(def_config, tmp_path, temp_db_cursor, def test_add_tiger_data_tarfile(def_config, tiger_table, tokenizer_mock,
threads, temp_db_with_extensions, sql_preprocessor): tmp_path, src_dir, threads):
temp_db_cursor.execute('CREATE TABLE place (id INT)')
sqlfile = tmp_path / '1010.sql'
sqlfile.write_text("""INSERT INTO place values (1);
INSERT INTO non_existant_table values (1);""")
tar = tarfile.open(str(tmp_path / 'sample.tar.gz'), "w:gz") tar = tarfile.open(str(tmp_path / 'sample.tar.gz'), "w:gz")
tar.add(sqlfile) tar.add(str(src_dir / 'test' / 'testdb' / 'tiger' / '01001.csv'))
tar.close() tar.close()
tiger_data.add_tiger_data(str(tmp_path / 'sample.tar.gz'), def_config, threads)
assert temp_db_cursor.table_rows('place') == 1 tiger_data.add_tiger_data(str(tmp_path / 'sample.tar.gz'), def_config, 1,
tokenizer_mock())
assert tiger_table.count() == 6213
@pytest.mark.parametrize("threads", (1, 5)) def test_add_tiger_data_bad_tarfile(def_config, tiger_table, tokenizer_mock,
def test_add_tiger_data_bad_tarfile(def_config, tmp_path, temp_db_cursor, threads, tmp_path):
temp_db_with_extensions, sql_preprocessor): tarfile = tmp_path / 'sample.tar.gz'
temp_db_cursor.execute('CREATE TABLE place (id INT)') tarfile.write_text("""Random text""")
sqlfile = tmp_path / '1010.txt'
sqlfile.write_text("""Random text""") with pytest.raises(UsageError):
tiger_data.add_tiger_data(str(tarfile), def_config, 1, tokenizer_mock())
def test_add_tiger_data_empty_tarfile(def_config, tiger_table, tokenizer_mock,
tmp_path, src_dir):
tar = tarfile.open(str(tmp_path / 'sample.tar.gz'), "w:gz") tar = tarfile.open(str(tmp_path / 'sample.tar.gz'), "w:gz")
tar.add(sqlfile) tar.add(__file__)
tar.close() tar.close()
tiger_data.add_tiger_data(str(tmp_path / 'sample.tar.gz'), def_config, threads)
assert temp_db_cursor.table_rows('place') == 0 tiger_data.add_tiger_data(str(tmp_path / 'sample.tar.gz'), def_config, 1,
tokenizer_mock())
assert tiger_table.count() == 0

6214
test/testdb/tiger/01001.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff