port address level computation to Python

Also adds simple tests for correct table creation.
Sarah Hoffmann 2021-01-23 17:25:14 +01:00
parent e6c2842b66
commit 94fa7162be
13 changed files with 268 additions and 171 deletions
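For orientation, the Python replacement for the PHP AddressLevelParser removed below can be called directly. A minimal sketch, assuming a source checkout with the shipped settings/address-levels.json; the connection parameters are placeholders:

# Usage sketch only; the DSN is a placeholder. load_address_levels_from_file
# drops and recreates the address_levels table and commits the transaction.
from pathlib import Path

import psycopg2

from nominatim.tools.refresh import load_address_levels_from_file

conn = psycopg2.connect(dbname='nominatim')
load_address_levels_from_file(conn, Path('settings/address-levels.json'))
conn.close()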

View File

@@ -4,7 +4,6 @@
require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/setup_functions.php');
require_once(CONST_LibDir.'/setup/SetupClass.php');
require_once(CONST_LibDir.'/setup/AddressLevelParser.php');
ini_set('memory_limit', '800M');
@@ -275,10 +274,7 @@ if ($aResult['index']) {
}
if ($aResult['update-address-levels']) {
$sAddressLevelConfig = getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json');
echo 'Updating address levels from '.$sAddressLevelConfig.".\n";
$oAlParser = new \Nominatim\Setup\AddressLevelParser($sAddressLevelConfig);
$oAlParser->createTable($oDB, 'address_levels');
(clone($oNominatimCmd))->addParams('refresh', '--address-levels')->run();
}
if ($aResult['recompute-importance']) {

View File

@@ -1,98 +0,0 @@
<?php
namespace Nominatim\Setup;
/**
* Parses an address level description.
*/
class AddressLevelParser
{
private $aLevels;
public function __construct($sDescriptionFile)
{
$sJson = file_get_contents($sDescriptionFile);
$this->aLevels = json_decode($sJson, true);
if (!$this->aLevels) {
switch (json_last_error()) {
case JSON_ERROR_NONE:
break;
case JSON_ERROR_DEPTH:
fail('JSON error - Maximum stack depth exceeded');
break;
case JSON_ERROR_STATE_MISMATCH:
fail('JSON error - Underflow or the modes mismatch');
break;
case JSON_ERROR_CTRL_CHAR:
fail('JSON error - Unexpected control character found');
break;
case JSON_ERROR_SYNTAX:
fail('JSON error - Syntax error, malformed JSON');
break;
case JSON_ERROR_UTF8:
fail('JSON error - Malformed UTF-8 characters, possibly incorrectly encoded');
break;
default:
fail('JSON error - Unknown error');
break;
}
}
}
/**
* Dump the description into a database table.
*
* @param object $oDB Database connection to use.
* @param string $sTable Name of table to create.
*
* @return null
*
* A new table is created. Any previously existing table is dropped.
* The table has the following columns:
* country, class, type, rank_search, rank_address.
*/
public function createTable($oDB, $sTable)
{
$oDB->exec('DROP TABLE IF EXISTS '.$sTable);
$sSql = 'CREATE TABLE '.$sTable;
$sSql .= '(country_code varchar(2), class TEXT, type TEXT,';
$sSql .= ' rank_search SMALLINT, rank_address SMALLINT)';
$oDB->exec($sSql);
$sSql = 'CREATE UNIQUE INDEX ON '.$sTable.' (country_code, class, type)';
$oDB->exec($sSql);
$sSql = 'INSERT INTO '.$sTable.' VALUES ';
foreach ($this->aLevels as $aLevel) {
$aCountries = array();
if (isset($aLevel['countries'])) {
foreach ($aLevel['countries'] as $sCountry) {
$aCountries[$sCountry] = $oDB->getDBQuoted($sCountry);
}
} else {
$aCountries['NULL'] = 'NULL';
}
foreach ($aLevel['tags'] as $sKey => $aValues) {
foreach ($aValues as $sValue => $mRanks) {
$aFields = array(
$oDB->getDBQuoted($sKey),
$sValue ? $oDB->getDBQuoted($sValue) : 'NULL'
);
if (is_array($mRanks)) {
$aFields[] = (string) $mRanks[0];
$aFields[] = (string) $mRanks[1];
} else {
$aFields[] = (string) $mRanks;
$aFields[] = (string) $mRanks;
}
$sLine = ','.join(',', $aFields).'),';
foreach ($aCountries as $sCountries) {
$sSql .= '('.$sCountries.$sLine;
}
}
}
}
$oDB->exec(rtrim($sSql, ','));
}
}

View File

@@ -2,7 +2,6 @@
namespace Nominatim\Setup;
require_once(CONST_LibDir.'/setup/AddressLevelParser.php');
require_once(CONST_LibDir.'/Shell.php');
class SetupFunctions
@@ -19,6 +18,7 @@ class SetupFunctions
protected $bNoPartitions;
protected $bDrop;
protected $oDB = null;
protected $oNominatimCmd;
public function __construct(array $aCMDResult)
{
@@ -81,6 +81,14 @@ class SetupFunctions
}
$this->bDrop = isset($aCMDResult['drop']) && $aCMDResult['drop'];
$this->oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'));
if ($this->bQuiet) {
$this->oNominatimCmd->addParams('--quiet');
}
if ($this->bVerbose) {
$this->oNominatimCmd->addParams('--verbose');
}
}
public function createDB()
@@ -256,8 +264,7 @@ class SetupFunctions
$this->dropTable('search_name');
}
$oAlParser = new AddressLevelParser(getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json'));
$oAlParser->createTable($this->db(), 'address_levels');
(clone($this->oNominatimCmd))->addParams('refresh', '--address-levels')->run();
}
public function createTableTriggers()
@@ -549,19 +556,10 @@ class SetupFunctions
{
$this->checkModulePresence(); // raises exception on failure
$oBaseCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')))
->addParams('index');
if ($this->bQuiet) {
$oBaseCmd->addParams('-q');
}
if ($this->bVerbose) {
$oBaseCmd->addParams('-v');
}
$oBaseCmd = (clone $this->oNominatimCmd)->addParams('index');
info('Index ranks 0 - 4');
$oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4);
echo $oCmd->escapedCmd();
$iStatus = $oCmd->run();
if ($iStatus != 0) {

View File

@@ -372,33 +372,38 @@ class UpdateRefresh:
def run(args):
import nominatim.tools.refresh
with psycopg2.connect(args.config.get_libpq_dsn()) as conn:
if args.postcodes:
LOG.warning("Update postcodes centroid")
nominatim.tools.refresh.update_postcodes(conn, args.data_dir)
if args.word_counts:
LOG.warning('Recompute frequency of full-word search terms')
nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir)
if args.address_levels:
run_legacy_script('update.php', '--update-address-levels',
nominatim_env=args, throw_on_fail=True)
if args.functions:
params = ['setup.php', '--create-functions', '--create-partition-functions']
if args.diffs:
params.append('--enable-diff-updates')
if args.enable_debug_statements:
params.append('--enable-debug-statements')
run_legacy_script(*params, nominatim_env=args, throw_on_fail=True)
if args.wiki_data:
run_legacy_script('setup.php', '--import-wikipedia-articles',
nominatim_env=args, throw_on_fail=True)
# Attention: importance MUST come after wiki data import.
if args.importance:
run_legacy_script('update.php', '--recompute-importance',
nominatim_env=args, throw_on_fail=True)
if args.website:
run_legacy_script('setup.php', '--setup-website',
nominatim_env=args, throw_on_fail=True)
conn = psycopg2.connect(args.config.get_libpq_dsn())
if args.postcodes:
LOG.warning("Update postcodes centroid")
nominatim.tools.refresh.update_postcodes(conn, args.data_dir)
if args.word_counts:
LOG.warning('Recompute frequency of full-word search terms')
nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir)
if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
LOG.warning('Updating address levels from %s', cfg)
nominatim.tools.refresh.load_address_levels_from_file(conn, cfg)
if args.functions:
params = ['setup.php', '--create-functions', '--create-partition-functions']
if args.diffs:
params.append('--enable-diff-updates')
if args.enable_debug_statements:
params.append('--enable-debug-statements')
run_legacy_script(*params, nominatim_env=args, throw_on_fail=True)
if args.wiki_data:
run_legacy_script('setup.php', '--import-wikipedia-articles',
nominatim_env=args, throw_on_fail=True)
# Attention: importance MUST come after wiki data import.
if args.importance:
run_legacy_script('update.php', '--recompute-importance',
nominatim_env=args, throw_on_fail=True)
if args.website:
run_legacy_script('setup.php', '--setup-website',
nominatim_env=args, throw_on_fail=True)
conn.close()
return 0
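Note that the PHP entry points above no longer read the address-level configuration themselves; they clone the prepared Shell command, so the call they issue boils down to the CLI invocation below (illustrative; the tool name comes from NOMINATIM_TOOL and --quiet/--verbose are prepended when configured):

nominatim refresh --address-levels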

View File

@@ -24,6 +24,13 @@ class Configuration:
if project_dir is not None:
self._config.update(dotenv_values(str((project_dir / '.env').resolve())))
# Add defaults for variables that are left empty to set the default.
# They may still be overwritten by environment variables.
if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']:
self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \
str(config_dir / 'address-levels.json')
def __getattr__(self, name):
name = 'NOMINATIM_' + name
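A short sketch of how the new default resolves through Configuration; the constructor arguments follow the def_config fixture added further down, and the value may still be overridden via the environment or a project .env file:

from pathlib import Path

from nominatim.config import Configuration

# Assumes a source checkout; 'settings' is the config_dir, as in the tests.
config = Configuration(None, Path('settings').resolve())
print(config.ADDRESS_LEVEL_CONFIG)  # <config_dir>/address-levels.json unless overridden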

View File

@@ -9,3 +9,4 @@ def execute_file(conn, fname):
sql = fdesc.read()
with conn.cursor() as cur:
cur.execute(sql)
conn.commit()
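A hedged usage sketch for execute_file; the DSN and file name are placeholders, and as of this commit the helper commits after running the statements:

from pathlib import Path

import psycopg2

from nominatim.db.utils import execute_file

conn = psycopg2.connect(dbname='nominatim')  # placeholder connection
execute_file(conn, Path('words_from_search_name.sql'))  # executes the file, then commits
conn.close()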

View File

@@ -1,6 +1,10 @@
"""
Functions for bringing auxiliary data in the database up-to-date.
"""
import json
from psycopg2.extras import execute_values
from ..db.utils import execute_file
def update_postcodes(conn, datadir):
@@ -14,3 +18,54 @@ def recompute_word_counts(conn, datadir):
""" Compute the frequency of full-word search terms.
"""
execute_file(conn, datadir / 'sql' / 'words_from_search_name.sql')
def _add_address_level_rows_from_entry(rows, entry):
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a
PostgreSQL table and adds these lines to `rows`.
"""
countries = entry.get('countries') or (None, )
for key, values in entry['tags'].items():
for value, ranks in values.items():
if isinstance(ranks, list):
rank_search, rank_address = ranks
else:
rank_search = rank_address = ranks
if not value:
value = None
for country in countries:
rows.append((country, key, value, rank_search, rank_address))
def load_address_levels(conn, table, levels):
""" Replace the `address_levels` table with the contents of `levels'.
A new table is created any previously existing table is dropped.
The table has the following columns:
country, class, type, rank_search, rank_address
"""
rows = []
for entry in levels:
_add_address_level_rows_from_entry(rows, entry)
with conn.cursor() as cur:
cur.execute('DROP TABLE IF EXISTS {}'.format(table))
cur.execute("""CREATE TABLE {} (country_code varchar(2),
class TEXT,
type TEXT,
rank_search SMALLINT,
rank_address SMALLINT)""".format(table))
execute_values(cur, "INSERT INTO {} VALUES %s".format(table), rows)
cur.execute('CREATE UNIQUE INDEX ON {} (country_code, class, type)'.format(table))
conn.commit()
def load_address_levels_from_file(conn, config_file):
""" Replace the `address_levels` table with the contents of the config
file.
"""
with config_file.open('r') as fdesc:
load_address_levels(conn, 'address_levels', json.load(fdesc))
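To illustrate the flattening performed by _add_address_level_rows_from_entry above, a small sketch whose entry mirrors the shapes exercised by the new tests below:

from nominatim.tools.refresh import _add_address_level_rows_from_entry

rows = []
_add_address_level_rows_from_entry(rows, {
    'countries': ['de'],
    'tags': {'place': {'village': 15, 'town': [14, 13]}},
})
# rows == [('de', 'place', 'village', 15, 15),
#          ('de', 'place', 'town', 14, 13)]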

View File

@@ -2,13 +2,43 @@ import sys
from pathlib import Path
import psycopg2
import psycopg2.extras
import pytest
SRC_DIR = Path(__file__) / '..' / '..' / '..'
# always test against the source
sys.path.insert(0, str((Path(__file__) / '..' / '..' / '..').resolve()))
sys.path.insert(0, str(SRC_DIR.resolve()))
from nominatim.config import Configuration
class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution
short-cuts that simplify writing assertions.
"""
def scalar(self, sql, params=None):
""" Execute a query with a single return value and return this value.
Raises an assertion when not exactly one row is returned.
"""
self.execute(sql, params)
assert self.rowcount == 1
return self.fetchone()[0]
def row_set(self, sql, params=None):
""" Execute a query and return the result as a set of tuples.
"""
self.execute(sql, params)
if self.rowcount == 1:
return set(tuple(self.fetchone()))
return set((tuple(row) for row in self))
@pytest.fixture
def temp_db(monkeypatch):
""" Create an empty database for the test. The database name is also
exported into NOMINATIM_DATABASE_DSN.
"""
name = 'test_nominatim_python_unittest'
with psycopg2.connect(database='postgres') as conn:
conn.set_isolation_level(0)
@@ -24,3 +54,29 @@ def temp_db(monkeypatch):
conn.set_isolation_level(0)
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
@pytest.fixture
def temp_db_conn(temp_db):
""" Connection to the test database.
"""
conn = psycopg2.connect(database=temp_db)
yield conn
conn.close()
@pytest.fixture
def temp_db_cursor(temp_db):
""" Connection and cursor towards the test database. The connection will
be in auto-commit mode.
"""
conn = psycopg2.connect('dbname=' + temp_db)
conn.set_isolation_level(0)
with conn.cursor(cursor_factory=_TestingCursor) as cur:
yield cur
conn.close()
@pytest.fixture
def def_config():
return Configuration(None, SRC_DIR.resolve() / 'settings')
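A sketch of a test that combines the new fixtures and cursor helpers defined above; the table name is made up for illustration:

def test_cursor_helpers(temp_db_cursor):
    # temp_db_cursor runs in auto-commit mode, so no explicit commit is needed.
    temp_db_cursor.execute('CREATE TABLE example (id INT)')
    temp_db_cursor.execute('INSERT INTO example VALUES (1), (2)')

    assert temp_db_cursor.scalar('SELECT count(*) FROM example') == 2
    assert temp_db_cursor.row_set('SELECT id FROM example') == {(1,), (2,)}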

View File

@@ -84,10 +84,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
(['--boundaries-only'], 1, 0),
(['--no-boundaries'], 0, 1),
(['--boundaries-only', '--no-boundaries'], 0, 0)])
def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks):
with psycopg2.connect(database=temp_db) as conn:
with conn.cursor() as cur:
cur.execute("CREATE TABLE import_status (indexed bool)")
def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks):
temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
bnd_mock = MockParamCapture()
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', bnd_mock)
rank_mock = MockParamCapture()
@@ -100,7 +98,6 @@ def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks):
@pytest.mark.parametrize("command,params", [
('address-levels', ('update.php', '--update-address-levels')),
('functions', ('setup.php',)),
('wiki-data', ('setup.php', '--import-wikipedia-articles')),
('importance', ('update.php', '--recompute-importance')),
@@ -116,6 +113,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params):
@pytest.mark.parametrize("command,func", [
('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
])
def test_refresh_command(monkeypatch, command, func):
func_mock = MockParamCapture()

View File

@@ -6,28 +6,25 @@ import pytest
import nominatim.db.utils as db_utils
def test_execute_file_success(temp_db, tmp_path):
def test_execute_file_success(temp_db_conn, tmp_path):
tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')
with psycopg2.connect('dbname=' + temp_db) as conn:
db_utils.execute_file(conn, tmpfile)
db_utils.execute_file(temp_db_conn, tmpfile)
with conn.cursor() as cur:
cur.execute('SELECT * FROM test')
with temp_db_conn.cursor() as cur:
cur.execute('SELECT * FROM test')
assert cur.rowcount == 1
assert cur.fetchone()[0] == 56
assert cur.rowcount == 1
assert cur.fetchone()[0] == 56
def test_execute_file_bad_file(temp_db, tmp_path):
with psycopg2.connect('dbname=' + temp_db) as conn:
with pytest.raises(FileNotFoundError):
db_utils.execute_file(conn, tmp_path / 'test2.sql')
def test_execute_file_bad_file(temp_db_conn, tmp_path):
with pytest.raises(FileNotFoundError):
db_utils.execute_file(temp_db_conn, tmp_path / 'test2.sql')
def test_execute_file_bad_sql(temp_db, tmp_path):
def test_execute_file_bad_sql(temp_db_conn, tmp_path):
tmpfile = tmp_path / 'test.sql'
tmpfile.write_text('CREATE STABLE test (id INT)')
with psycopg2.connect('dbname=' + temp_db) as conn:
with pytest.raises(psycopg2.ProgrammingError):
db_utils.execute_file(conn, tmpfile)
with pytest.raises(psycopg2.ProgrammingError):
db_utils.execute_file(temp_db_conn, tmpfile)

View File

@@ -82,10 +82,8 @@ class IndexerTestDB:
@pytest.fixture
def test_db(temp_db):
conn = psycopg2.connect(database=temp_db)
yield IndexerTestDB(conn)
conn.close()
def test_db(temp_db_conn):
yield IndexerTestDB(temp_db_conn)
@pytest.mark.parametrize("threads", [1, 15])

View File

@@ -7,7 +7,6 @@ import tempfile
import pytest
from nominatim.config import Configuration
import nominatim.tools.exec_utils as exec_utils
@pytest.fixture
@@ -18,9 +17,9 @@ def tmp_phplib_dir():
yield Path(phpdir)
@pytest.fixture
def nominatim_env(tmp_phplib_dir):
def nominatim_env(tmp_phplib_dir, def_config):
class _NominatimEnv:
config = Configuration(None, Path(__file__) / '..' / '..' / '..' / 'settings')
config = def_config
phplib_dir = tmp_phplib_dir
data_dir = Path('data')
project_dir = Path('.')

View File

@@ -0,0 +1,85 @@
"""
Tests for function for importing address ranks.
"""
import json
import pytest
from pathlib import Path
from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file
def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config):
load_address_levels_from_file(temp_db_conn, Path(def_config.ADDRESS_LEVEL_CONFIG))
assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0
def test_load_ranks_from_file(temp_db_conn, temp_db_cursor, tmp_path):
test_file = tmp_path / 'test_levels.json'
test_file.write_text('[{"tags":{"place":{"sea":2}}}]')
load_address_levels_from_file(temp_db_conn, test_file)
assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0
def test_load_ranks_from_broken_file(temp_db_conn, tmp_path):
test_file = tmp_path / 'test_levels.json'
test_file.write_text('[{"tags":"place":{"sea":2}}}]')
with pytest.raises(json.decoder.JSONDecodeError):
load_address_levels_from_file(temp_db_conn, test_file)
def test_load_ranks_country(temp_db_conn, temp_db_cursor):
load_address_levels(temp_db_conn, 'levels',
[{"tags": {"place": {"village": 14}}},
{"countries": ['de'],
"tags": {"place": {"village": 15}}},
{"countries": ['uk', 'us' ],
"tags": {"place": {"village": 16}}}
])
assert temp_db_cursor.row_set('SELECT * FROM levels') == \
set([(None, 'place', 'village', 14, 14),
('de', 'place', 'village', 15, 15),
('uk', 'place', 'village', 16, 16),
('us', 'place', 'village', 16, 16),
])
def test_load_ranks_default_value(temp_db_conn, temp_db_cursor):
load_address_levels(temp_db_conn, 'levels',
[{"tags": {"boundary": {"": 28}}},
{"countries": ['hu'],
"tags": {"boundary": {"": 29}}}
])
assert temp_db_cursor.row_set('SELECT * FROM levels') == \
set([(None, 'boundary', None, 28, 28),
('hu', 'boundary', None, 29, 29),
])
def test_load_ranks_multiple_keys(temp_db_conn, temp_db_cursor):
load_address_levels(temp_db_conn, 'levels',
[{"tags":
{"place": {"city": 14},
"boundary": {"administrative2" : 4}}
}])
assert temp_db_cursor.row_set('SELECT * FROM levels') == \
set([(None, 'place', 'city', 14, 14),
(None, 'boundary', 'administrative2', 4, 4),
])
def test_load_ranks_address(temp_db_conn, temp_db_cursor):
load_address_levels(temp_db_conn, 'levels',
[{"tags":
{"place": {"city": 14,
"town" : [14, 13]}}
}])
assert temp_db_cursor.row_set('SELECT * FROM levels') == \
set([(None, 'place', 'city', 14, 14),
(None, 'place', 'town', 14, 13),
])