move warm script to python code

This commit is contained in:
Sarah Hoffmann 2023-07-16 20:12:53 +02:00
parent 261e0cfd5a
commit faeee7528f
6 changed files with 53 additions and 133 deletions

View File

@ -1,115 +0,0 @@
<?php
/**
* SPDX-License-Identifier: GPL-2.0-only
*
* This file is part of Nominatim. (https://nominatim.org)
*
* Copyright (C) 2022 by the Nominatim developer community.
* For a full list of authors see the git log.
*/
// Bootstrap for the cache-warming command line tool: load the PHP library,
// parse the command line, export the project settings as constants and
// open the database connection used by the warm-up sections below.
@define('CONST_LibDir', dirname(dirname(__FILE__)));

require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/log.php');
require_once(CONST_LibDir.'/PlaceLookup.php');
require_once(CONST_LibDir.'/ReverseGeocode.php');

ini_set('memory_limit', '800M');

$aCMDOptions = array(
    'Tools to warm nominatim db',
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
    array('reverse-only', '', 0, 1, 0, 0, 'bool', 'Warm reverse only'),
    array('search-only', '', 0, 1, 0, 0, 'bool', 'Warm search only'),
    array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
);
// The parsed options end up in $aResult; the rest of the script reads
// its switches from that array.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aResult, true, true);

// Read the project directory from the array that was actually filled by
// getCmdOpt(). Falls back to the current directory when the option is absent.
loadSettings($aResult['project-dir'] ?? getcwd());

// Export the settings as constants. '@' silences the notice for constants
// that have already been defined.
@define('CONST_Database_DSN', getSetting('DATABASE_DSN'));
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
@define('CONST_Log_File', getSetting('LOG_FILE', false));
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
// NOTE(review): CONST_InstallDir is not defined in this script; presumably
// it is set while loading the settings - confirm.
@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
@define('CONST_Search_WithinCountries', getSetting('SEARCH_WITHIN_COUNTRIES', false));

// Loaded only after the constants above have been defined.
require_once(CONST_LibDir.'/Geocode.php');

$oDB = new Nominatim\DB();
$oDB->connect();

$bVerbose = $aResult['verbose'];
/**
 * Report the outcome of a single warm-up query on stdout.
 *
 * In quiet mode a single '.' is printed per query. In verbose mode the
 * 'langaddress' of the first result is printed on its own line, or
 * '<not found>' when there is no result.
 *
 * @param array|null $aResults Results of the query, may be empty or null.
 * @param bool       $bVerbose Print the result instead of a progress dot.
 *
 * @return void
 */
function print_results($aResults, $bVerbose)
{
    // Quiet mode: only show progress.
    if (!$bVerbose) {
        echo '.';
        return;
    }

    $bHasResult = $aResults && count($aResults);
    echo $bHasResult ? $aResults[0]['langaddress']."\n" : "<not found>\n";
}
// Reverse warm-up: run 1000 reverse lookups on random coordinates.
// Skipped when only search warming was requested.
if (!$aResult['search-only']) {
    $oReverseGeocode = new Nominatim\ReverseGeocode($oDB);
    $oReverseGeocode->setZoom(20);

    $oPlaceLookup = new Nominatim\PlaceLookup($oDB);
    $oPlaceLookup->setIncludeAddressDetails(true);
    $oPlaceLookup->setLanguagePreference(array('en'));

    // In verbose mode every query gets its own line below the header.
    echo 'Warm reverse: '.($bVerbose ? "\n" : '');

    $iRemaining = 1000;
    while ($iRemaining-- > 0) {
        // Random position with two decimal places of precision.
        $fLat = rand(-9000, 9000) / 100;
        $fLon = rand(-18000, 18000) / 100;
        if ($bVerbose) {
            echo "$fLat, $fLon = ";
        }

        $oLookup = $oReverseGeocode->lookup($fLat, $fLon);
        if ($oLookup) {
            // Fetch the full details for the place that was found.
            $aSearchResults = $oPlaceLookup->lookup(array($oLookup->iId => $oLookup));
        } else {
            $aSearchResults = null;
        }

        print_results($aSearchResults, $bVerbose);
    }

    echo "\n";
}
// Search warm-up: run a search for each of the 1000 most frequent words
// reported by the tokenizer. Skipped when only reverse warming was requested.
if (!$aResult['reverse-only']) {
    $oGeocode = new Nominatim\Geocode($oDB);

    echo 'Warm search: ';
    if ($bVerbose) {
        echo "\n";
    }

    // The tokenizer supplies the query terms; no hand-written SQL on the
    // word table is needed here.
    $oTokenizer = new \Nominatim\Tokenizer($oDB);
    $aWords = $oTokenizer->mostFrequentWords(1000);

    foreach ($aWords as $sWord) {
        if ($bVerbose) {
            echo "$sWord = ";
        }

        $oGeocode->setLanguagePreference(array('en'));
        $oGeocode->setQuery($sWord);
        $aSearchResults = $oGeocode->lookup();
        print_results($aSearchResults, $bVerbose);
    }

    echo "\n";
}

View File

@ -9,9 +9,11 @@ Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random
from nominatim.tools.exec_utils import run_legacy_script
from nominatim.db.connection import connect
from nominatim.clicmd.args import NominatimArgs
import nominatim.api as napi
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@ -81,11 +83,25 @@ class AdminFuncs:
return 1
def _warm(self, args: NominatimArgs) -> int:
    """ Warm the database caches by sending typical queries through
        the Python API.

        `args.target` restricts what is warmed: 'reverse' runs only
        reverse lookups, 'search' runs only searches. Any other value
        runs both.

        Returns 0 as exit code of the command.
    """
    LOG.warning('Warming database caches')

    api = napi.NominatimAPI(args.project_dir)

    try:
        # Reverse lookups are part of every target except 'search'.
        if args.target != 'search':
            for _ in range(1000):
                # NOTE(review): the point is built as (-90..90, -180..180);
                # confirm that this matches the coordinate order expected
                # by api.reverse().
                api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
                            address_details=True)

        # Searches are part of every target except 'reverse'.
        if args.target != 'reverse':
            from ..tokenizer import factory as tokenizer_factory

            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            # The connection is only needed to collect the words. The
            # searches themselves go through the API object.
            with connect(args.config.get_libpq_dsn()) as conn:
                words = tokenizer.most_frequent_words(conn, 1000)

            for word in words:
                api.search(word)
    finally:
        # Release the resources held by the API object, also on errors.
        api.close()

    return 0

View File

@ -13,6 +13,7 @@ from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim.config import Configuration
from nominatim.db.connection import Connection
from nominatim.data.place_info import PlaceInfo
from nominatim.typing import Protocol
@ -233,6 +234,13 @@ class AbstractTokenizer(ABC):
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Return a list of the `num` most frequent full words
        in the database.

        Arguments:
            conn: Open database connection that is used to look up
                  the words.
            num: Maximum number of words to return.

        Returns:
            The words as plain strings.
    """
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.

View File

@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Return a list of the `num` most frequent full words
        in the database.

        The counts of all rows of type 'W' in the word table are summed
        up per word and the words with the highest sums are returned.
        Only the part of each word before the first '@' is returned.

        Arguments:
            conn: Open database connection that is used for the query.
            num: Maximum number of words to return.
    """
    with conn.cursor() as cur:
        # `->>` extracts the count as text, which can be cast to int on
        # every PostgreSQL version. Casting the jsonb value returned by
        # `->` directly to int is only possible from PostgreSQL 11 on.
        cur.execute("""SELECT word, sum((info->>'count')::int) as count
                         FROM word WHERE type = 'W'
                         GROUP BY word
                         ORDER BY count DESC LIMIT %s""", (num,))
        return [row[0].split('@')[0] for row in cur]
def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""

View File

@ -256,6 +256,16 @@ class LegacyTokenizer(AbstractTokenizer):
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Return the `num` most frequent full words of the database
        as a list of strings.

        The words are read from the word table through `conn`, ordered
        by descending `search_name_count`. Rows without a word are
        ignored.
    """
    query = (" SELECT word FROM word WHERE word is not null\n"
             "ORDER BY search_name_count DESC LIMIT %s")

    with conn.cursor() as cur:
        cur.execute(query, (num,))
        return [row[0] for row in cur]
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""

View File

@ -19,17 +19,6 @@ import nominatim.tools.migration
import nominatim.clicmd.admin
@pytest.mark.parametrize("params", [
    ('--warm', ),
    ('--warm', '--reverse-only'),
    ('--warm', '--search-only'),
])
def test_admin_command_legacy(cli_call, mock_func_factory, params):
    """ Each variant of `admin --warm` must exit with 0 and leave the
        call counter of the replaced `run_legacy_script` at 1.
    """
    legacy_runner = mock_func_factory(nominatim.clicmd.admin, 'run_legacy_script')

    exit_code = cli_call('admin', *params)

    assert exit_code == 0
    assert legacy_runner.called == 1
def test_admin_command_check_database(cli_call, mock_func_factory):
mock = mock_func_factory(nominatim.tools.check_database, 'check_database')