Merge pull request #2305 from lonvia/tokenizer

Factor out normalization into a separate module
Sarah Hoffmann 2021-05-03 09:15:34 +02:00 committed by GitHub
commit 8b1a509442
61 changed files with 3032 additions and 2012 deletions

View File

@@ -8,12 +8,14 @@
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');

 class Geocode
 {
     protected $oDB;
     protected $oPlaceLookup;
+    protected $oTokenizer;

     protected $aLangPrefOrder = array();

@@ -41,23 +43,12 @@ class Geocode
     protected $sQuery = false;
     protected $aStructuredQuery = false;

-    protected $oNormalizer = null;
-
     public function __construct(&$oDB)
     {
         $this->oDB =& $oDB;
         $this->oPlaceLookup = new PlaceLookup($this->oDB);
-        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
-    }
-
-    private function normTerm($sTerm)
-    {
-        if ($this->oNormalizer === null) {
-            return $sTerm;
-        }
-
-        return $this->oNormalizer->transliterate($sTerm);
+        $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
     }

     public function setLanguagePreference($aLangPref)

@@ -510,12 +501,10 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
+        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);

         Debug::newSection('Query Preprocessing');

-        $sNormQuery = $this->normTerm($this->sQuery);
-        Debug::printVar('Normalized query', $sNormQuery);
-
         $sLanguagePrefArraySQL = $this->oDB->getArraySQL(
             $this->oDB->getDBQuotedList($this->aLangPrefOrder)
         );

@@ -569,108 +558,55 @@ class Geocode
         }

         if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
-            $sSpecialTerm = pg_escape_string($sSpecialTerm);
-            $sToken = $this->oDB->getOne(
-                'SELECT make_standard_name(:term)',
-                array(':term' => $sSpecialTerm),
-                'Cannot decode query. Wrong encoding?'
-            );
-            $sSQL = 'SELECT class, type FROM word ';
-            $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')';
-            $sSQL .= ' AND class is not null AND class not in (\'place\')';
-            Debug::printSQL($sSQL);
-            $aSearchWords = $this->oDB->getAll($sSQL);
-            $aNewSearches = array();
-            foreach ($aSearches as $oSearch) {
-                foreach ($aSearchWords as $aSearchTerm) {
-                    $oNewSearch = clone $oSearch;
-                    $oNewSearch->setPoiSearch(
-                        Operator::TYPE,
-                        $aSearchTerm['class'],
-                        $aSearchTerm['type']
-                    );
-                    $aNewSearches[] = $oNewSearch;
+            $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+            if (!empty($aTokens)) {
+                $aNewSearches = array();
+                foreach ($aSearches as $oSearch) {
+                    foreach ($aTokens as $oToken) {
+                        $oNewSearch = clone $oSearch;
+                        $oNewSearch->setPoiSearch(
+                            $oToken->iOperator,
+                            $oToken->sClass,
+                            $oToken->sType
+                        );
+                        $aNewSearches[] = $oNewSearch;
+                    }
                 }
+                $aSearches = $aNewSearches;
             }
-            $aSearches = $aNewSearches;
         }

         // Split query into phrases
         // Commas are used to reduce the search space by indicating where phrases split
+        $aPhrases = array();
         if ($this->aStructuredQuery) {
-            $aInPhrases = $this->aStructuredQuery;
+            foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+                $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+            }
         } else {
-            $aInPhrases = explode(',', $sQuery);
+            foreach (explode(',', $sQuery) as $sPhrase) {
+                $aPhrases[] = new Phrase($sPhrase, '');
+            }
         }

         Debug::printDebugArray('Search context', $oCtx);
         Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
-        Debug::printVar('Final query phrases', $aInPhrases);

-        // Convert each phrase to standard form
-        // Create a list of standard words
-        // Get all 'sets' of words
-        // Generate a complete list of all
         Debug::newSection('Tokenization');
-        $aTokens = array();
-        $aPhrases = array();
-        foreach ($aInPhrases as $iPhrase => $sPhrase) {
-            $sPhrase = $this->oDB->getOne(
-                'SELECT make_standard_name(:phrase)',
-                array(':phrase' => $sPhrase),
-                'Cannot normalize query string (is it a UTF-8 string?)'
-            );
-            if (trim($sPhrase)) {
-                $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
-                $oPhrase->addTokens($aTokens);
-                $aPhrases[] = $oPhrase;
-            }
-        }
-
-        Debug::printVar('Tokens', $aTokens);
-
-        $oValidTokens = new TokenList();
-
-        if (!empty($aTokens)) {
-            $oValidTokens->addTokensFromDB(
-                $this->oDB,
-                $aTokens,
-                $this->aCountryCodes,
-                $sNormQuery,
-                $this->oNormalizer
-            );
-
-            if ($oValidTokens->count() > 0) {
-                $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
-            }
-
-            // Try more interpretations for Tokens that could not be matched.
-            foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
-                        // US ZIP+4 codes - merge in the 5-digit ZIP code
-                        $oValidTokens->addToken(
-                            $sToken,
-                            new Token\Postcode(null, $aData[1], 'us')
-                        );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
-                        // Unknown single word token with a number.
-                        // Assume it is a house number.
-                        $oValidTokens->addToken(
-                            $sToken,
-                            new Token\HouseNumber(null, trim($sToken))
-                        );
-                    }
-                }
-            }
-        }
+        $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
+
+        $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
+
+        $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+            return $oPhrase->getWordSets() !== null;
+        });

         // Any words that have failed completely?
         // TODO: suggestions

         Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
-        foreach ($aPhrases as $oPhrase) {
-            $oPhrase->computeWordSets($oValidTokens);
-        }
-
         Debug::printDebugTable('Phrases', $aPhrases);

         Debug::newSection('Search candidates');
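Taken together, the special-term lookup and the per-phrase normalization loop collapse into calls on the new tokenizer object. A condensed sketch of the new flow (illustration only, not part of the diff; assumes a connected Nominatim\DB in $oDB, and the query string is invented):

require_once(CONST_TokenizerDir.'/tokenizer.php');

$oTokenizer = new \Nominatim\Tokenizer($oDB);
$oTokenizer->setCountryRestriction(array('gb'));

// One Phrase per comma-separated part of the query; normalization now
// happens inside the tokenizer instead of in Geocode.
$aPhrases = array();
foreach (explode(',', '10 downing street, london') as $sPhrase) {
    $aPhrases[] = new Phrase($sPhrase, '');
}

$oValidTokens = $oTokenizer->extractTokensFromPhrases($aPhrases);
// $oValidTokens is a TokenList; each Phrase now carries its own word sets,
// and phrases without any known token are filtered out by the caller.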

View File

@@ -16,8 +16,6 @@ class Phrase
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
-    // Space-separated words of the phrase.
-    private $aWords;
     // Possible segmentations of the phrase.
     private $aWordSets;

@@ -38,7 +36,14 @@ class Phrase
     {
         $this->sPhrase = trim($sPhrase);
         $this->sPhraseType = $sPhraseType;
-        $this->aWords = explode(' ', $this->sPhrase);
+    }
+
+    /**
+     * Get the original phrase string.
+     */
+    public function getPhrase()
+    {
+        return $this->sPhrase;
     }

     /**

@@ -63,30 +68,6 @@ class Phrase
         return $this->aWordSets;
     }

-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    public function addTokens(&$aTokens)
-    {
-        $iNumWords = count($this->aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $this->aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$this->aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
-
     /**
      * Invert the set of possible segmentations.
      *

@@ -99,21 +80,27 @@ class Phrase
         }
     }

-    public function computeWordSets($oTokens)
+    public function computeWordSets($aWords, $oTokens)
     {
-        $iNumWords = count($this->aWords);
+        $iNumWords = count($aWords);
+
+        if ($iNumWords == 0) {
+            $this->aWordSets = null;
+            return;
+        }
+
         // Caches the word set for the partial phrase up to word i.
         $aSetCache = array_fill(0, $iNumWords, array());

         // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($this->aWords[0])) {
-            $aSetCache[0][] = array($this->aWords[0]);
+        if ($oTokens->containsAny($aWords[0])) {
+            $aSetCache[0][] = array($aWords[0]);
         }

         // Now do the next elements using what we already have.
         for ($i = 1; $i < $iNumWords; $i++) {
             for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
                 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                     $aPartial = array($sPartial);
                     foreach ($aSetCache[$j - 1] as $aSet) {

@@ -136,7 +123,7 @@ class Phrase
         }

         // finally the current full phrase
-        $sPartial = $this->aWords[0].' '.$sPartial;
+        $sPartial = $aWords[0].' '.$sPartial;
         if ($oTokens->containsAny($sPartial)) {
             $aSetCache[$i][] = array($sPartial);
         }

@@ -153,7 +140,6 @@ class Phrase
         return array(
                 'Type' => $this->sPhraseType,
                 'Phrase' => $this->sPhrase,
-                'Words' => $this->aWords,
                 'WordSets' => $this->aWordSets
               );
     }
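The segmentation algorithm itself is untouched; only its input changes. The word list now comes from the tokenizer's normalized output rather than from the phrase's own explode() result. For illustration (StubTokens is a hypothetical stand-in for TokenList, not part of the diff; it accepts every term so all segmentations survive):

class StubTokens
{
    public function containsAny($sTerm)
    {
        return true; // pretend every partial phrase is a known token
    }
}

$oPhrase = new \Nominatim\Phrase('new york city', '');
$oPhrase->computeWordSets(array('new', 'york', 'city'), new StubTokens());
// Word sets now include: [new york city], [new york|city],
// [new|york city] and [new|york|city].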

View File

@@ -2,6 +2,8 @@

 namespace Nominatim;

+require_once(CONST_TokenizerDir.'/tokenizer.php');
+
 use Exception;

 class Status

@@ -25,24 +27,8 @@ class Status
             throw new Exception('Database connection failed', 700);
         }

-        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
-        if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
-        }
-
-        if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
-        }
-
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
-        $sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
-        $iWordID = $this->oDB->getOne($sSQL);
-        if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
-        }
-        if (!$iWordID) {
-            throw new Exception('No value', 704);
-        }
+        $oTokenizer = new \Nominatim\Tokenizer($this->oDB);
+        $oTokenizer->checkStatus();
     }

     public function dataDate()
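Error codes 701-704 thus move out of Status and into Tokenizer::checkStatus(); code 700 (database connection) stays here. A minimal sketch of the new check (illustration only):

$oDB = new Nominatim\DB();
$oDB->connect();

$oTokenizer = new \Nominatim\Tokenizer($oDB);
$oTokenizer->checkStatus(); // throws 701/702 if the module call fails,
                            // 703/704 on word-table problems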
public function dataDate() public function dataDate()

View File

@@ -95,88 +95,6 @@ class TokenList
         return $ids;
     }

-    /**
-     * Add token information from the word table in the database.
-     *
-     * @param object   $oDB           Nominatim::DB instance.
-     * @param string[] $aTokens       List of tokens to look up in the database.
-     * @param string[] $aCountryCodes List of country restrictions.
-     * @param string   $sNormQuery    Normalized query string.
-     * @param object   $oNormalizer   Normalizer function to use on tokens.
-     *
-     * @return void
-     */
-    public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
-    {
-        // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
-        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
-        $sSQL .= ' FROM word WHERE word_token in (';
-        $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
-
-        Debug::printSQL($sSQL);
-        $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
-
-        foreach ($aDBWords as $aWord) {
-            $oToken = null;
-            $iId = (int) $aWord['word_id'];
-
-            if ($aWord['class']) {
-                // Special terms need to appear in their normalized form.
-                if ($aWord['word']) {
-                    $sNormWord = $aWord['word'];
-                    if ($oNormalizer != null) {
-                        $sNormWord = $oNormalizer->transliterate($aWord['word']);
-                    }
-                    if (strpos($sNormQuery, $sNormWord) === false) {
-                        continue;
-                    }
-                }
-
-                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
-                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
-                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
-                    if ($aWord['word']
-                        && pg_escape_string($aWord['word']) == $aWord['word']
-                    ) {
-                        $oToken = new Token\Postcode(
-                            $iId,
-                            $aWord['word'],
-                            $aWord['country_code']
-                        );
-                    }
-                } else {
-                    // near and in operator the same at the moment
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
-                    );
-                }
-            } elseif ($aWord['country_code']) {
-                // Filter country tokens that do not match restricted countries.
-                if (!$aCountryCodes
-                    || in_array($aWord['country_code'], $aCountryCodes)
-                ) {
-                    $oToken = new Token\Country($iId, $aWord['country_code']);
-                }
-            } else {
-                $oToken = new Token\Word(
-                    $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
-                );
-            }
-
-            if ($oToken) {
-                $this->addToken($aWord['word_token'], $oToken);
-            }
-        }
-    }
-
     /**
      * Add a new token for the given word.
      *

View File

@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));

 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');

 ini_set('memory_limit', '800M');

@@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB;
 $oDB->connect();

View File

@@ -3,7 +3,6 @@

 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');

@@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB();
 $oDB->connect();

View File

@@ -0,0 +1,254 @@
<?php

namespace Nominatim;

class Tokenizer
{
    private $oDB;

    private $oNormalizer = null;
    private $aCountryRestriction = null;

    public function __construct(&$oDB)
    {
        $this->oDB =& $oDB;
        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
    }

    public function checkStatus()
    {
        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
        if ($sStandardWord === false) {
            throw new \Exception('Module failed', 701);
        }

        if ($sStandardWord != 'a') {
            throw new \Exception('Module call failed', 702);
        }

        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
        $iWordID = $this->oDB->getOne($sSQL);
        if ($iWordID === false) {
            throw new \Exception('Query failed', 703);
        }
        if (!$iWordID) {
            throw new \Exception('No value', 704);
        }
    }

    public function setCountryRestriction($aCountries)
    {
        $this->aCountryRestriction = $aCountries;
    }

    public function normalizeString($sTerm)
    {
        if ($this->oNormalizer === null) {
            return $sTerm;
        }

        return $this->oNormalizer->transliterate($sTerm);
    }

    public function tokensForSpecialTerm($sTerm)
    {
        $aResults = array();

        $sSQL = 'SELECT word_id, class, type FROM word ';
        $sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)';
        $sSQL .= ' AND class is not null AND class not in (\'place\')';

        Debug::printVar('Term', $sTerm);
        Debug::printSQL($sSQL);
        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));

        Debug::printVar('Results', $aSearchWords);

        foreach ($aSearchWords as $aSearchTerm) {
            $aResults[] = new \Nominatim\Token\SpecialTerm(
                $aSearchTerm['word_id'],
                $aSearchTerm['class'],
                $aSearchTerm['type'],
                \Nominatim\Operator::TYPE
            );
        }

        Debug::printVar('Special term tokens', $aResults);

        return $aResults;
    }

    public function extractTokensFromPhrases(&$aPhrases)
    {
        // First get the normalized version of all phrases.
        $sNormQuery = '';
        $sSQL = 'SELECT ';
        $aParams = array();
        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
            $sSQL .= 'make_standard_name(:'.$iPhrase.') as p'.$iPhrase.',';
            $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
        }
        $sSQL = substr($sSQL, 0, -1);

        Debug::printSQL($sSQL);
        Debug::printVar('SQL parameters', $aParams);

        $aNormPhrases = $this->oDB->getRow($sSQL, $aParams);

        Debug::printVar('SQL result', $aNormPhrases);

        // Now compute all possible tokens.
        $aWordLists = array();
        $aTokens = array();
        foreach ($aNormPhrases as $sTitle => $sPhrase) {
            if (strlen($sPhrase) > 0) {
                $aWords = explode(' ', $sPhrase);
                Tokenizer::addTokens($aTokens, $aWords);
                $aWordLists[] = $aWords;
            } else {
                $aWordLists[] = array();
            }
        }

        Debug::printVar('Tokens', $aTokens);
        Debug::printVar('WordLists', $aWordLists);

        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
        }

        return $oValidTokens;
    }

    private function computeValidTokens($aTokens, $sNormQuery)
    {
        $oValidTokens = new TokenList();

        if (!empty($aTokens)) {
            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);

            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                        // US ZIP+4 codes - merge in the 5-digit ZIP code
                        $oValidTokens->addToken(
                            $sToken,
                            new Token\Postcode(null, $aData[1], 'us')
                        );
                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
                        // Unknown single word token with a number.
                        // Assume it is a house number.
                        $oValidTokens->addToken(
                            $sToken,
                            new Token\HouseNumber(null, trim($sToken))
                        );
                    }
                }
            }
        }

        return $oValidTokens;
    }

    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
    {
        // Check which tokens we have, get the ID numbers.
        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
        $sSQL .= ' FROM word WHERE word_token in (';
        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';

        Debug::printSQL($sSQL);

        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');

        foreach ($aDBWords as $aWord) {
            $oToken = null;
            $iId = (int) $aWord['word_id'];

            if ($aWord['class']) {
                // Special terms need to appear in their normalized form.
                // (postcodes are not normalized in the word table)
                $sNormWord = $this->normalizeString($aWord['word']);
                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
                    continue;
                }

                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
                    if ($aWord['word']
                        && pg_escape_string($aWord['word']) == $aWord['word']
                    ) {
                        $oToken = new Token\Postcode(
                            $iId,
                            $aWord['word'],
                            $aWord['country_code']
                        );
                    }
                } else {
                    // near and in operator the same at the moment
                    $oToken = new Token\SpecialTerm(
                        $iId,
                        $aWord['class'],
                        $aWord['type'],
                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
                    );
                }
            } elseif ($aWord['country_code']) {
                // Filter country tokens that do not match restricted countries.
                if (!$this->aCountryRestriction
                    || in_array($aWord['country_code'], $this->aCountryRestriction)
                ) {
                    $oToken = new Token\Country($iId, $aWord['country_code']);
                }
            } else {
                $oToken = new Token\Word(
                    $iId,
                    $aWord['word_token'][0] != ' ',
                    (int) $aWord['count'],
                    substr_count($aWord['word_token'], ' ')
                );
            }

            if ($oToken) {
                $oValidTokens->addToken($aWord['word_token'], $oToken);
            }
        }
    }

    /**
     * Add the tokens from the given word list to the given list of tokens.
     *
     * @param string[] $aTokens List of tokens to append to.
     * @param string[] $aWords  Words of the normalized phrase.
     *
     * @return void
     */
    private static function addTokens(&$aTokens, $aWords)
    {
        $iNumWords = count($aWords);

        for ($i = 0; $i < $iNumWords; $i++) {
            $sPhrase = $aWords[$i];
            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
            $aTokens[$sPhrase] = $sPhrase;

            for ($j = $i + 1; $j < $iNumWords; $j++) {
                $sPhrase .= ' '.$aWords[$j];
                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
                $aTokens[$sPhrase] = $sPhrase;
            }
        }
    }
}
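The private addTokens() helper reproduces the expansion previously done in Phrase::addTokens(): every contiguous word sequence is registered twice, once with the leading-space marker that the word table uses for full words and once without it for partial words. Illustration of the generated lookup keys (not part of the diff):

// Equivalent expansion for the word list ['main', 'st']:
$aTokens = array();
foreach (array('main', 'main st', 'st') as $sSequence) {
    $aTokens[' '.$sSequence] = ' '.$sSequence; // full-word variant
    $aTokens[$sSequence] = $sSequence;         // partial-word variant
}
// keys: ' main', 'main', ' main st', 'main st', ' st', 'st'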

View File

@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}

View File

@@ -12,39 +12,47 @@
 $$
 LANGUAGE plpgsql IMMUTABLE;

+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
-                                                    place TEXT, partition SMALLINT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+                                                    partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-  waynodes BIGINT[];
   location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);

   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex

@@ -147,15 +155,15 @@
   NEW.interpolationtype = NEW.address->'interpolation';

   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
-                                                 NEW.address->'place',
+  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
+                                                 token_addr_place_match_tokens(NEW.token_info),
                                                  NEW.partition, place_centroid, NEW.linegeo);

-  IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
-  ELSE
-    interpol_postcode := NULL;
-  END IF;
+  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
+
+  NEW.token_info := token_strip_info(NEW.token_info);
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
+  END IF;

@@ -202,12 +210,13 @@
       -- determine postcode
       postcode := coalesce(interpol_postcode,
-                           prevnode.address->'postcode',
-                           nextnode.address->'postcode',
+                           token_normalized_postcode(prevnode.address->'postcode'),
+                           token_normalized_postcode(nextnode.address->'postcode'),
                            postcode);

       IF postcode is NULL THEN
-        SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+        SELECT token_normalized_postcode(placex.postcode)
+          FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
       END IF;
       IF postcode is NULL THEN
         postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);

@@ -217,7 +226,7 @@
       NEW.startnumber := startnumber;
       NEW.endnumber := endnumber;
       NEW.linegeo := sectiongeo;
-      NEW.postcode := upper(trim(postcode));
+      NEW.postcode := postcode;
     ELSE
       insert into location_property_osmline
              (linegeo, partition, osm_id, parent_place_id,
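The new get_interpolation_address() only inherits an address when the interpolation way carries nothing except the interpolation tag itself, and it marks the result with an '_inherited' key so the trigger can later strip the borrowed tags again. A hedged probe from PHP (the way id is invented; requires the hstore extension Nominatim already uses):

$sAddress = $oDB->getOne(
    "SELECT get_interpolation_address(hstore('interpolation', 'odd'), :way)",
    array(':way' => 123456)
);
// Either the node-derived address plus an '_inherited' key, or the
// input hstore unchanged if no suitable address node lies on the way.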

View File

@@ -1,525 +0,0 @@
-- Functions for term normalisation and access to the 'word' table.
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
AS $$
DECLARE
o TEXT;
BEGIN
o := public.gettokenstring(public.transliteration(name));
RETURN trim(substr(o,1,length(o)));
END;
$$
LANGUAGE plpgsql IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
count INTEGER;
BEGIN
lookup_token := trim(lookup_word);
SELECT min(word_id), max(search_name_count) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id, count;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > get_maxwordfreq() THEN
return_word_id := NULL;
END IF;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Create housenumber tokens from an OSM addr:housenumber.
-- The housenumber is split at comma and semicolon as necessary.
-- The function returns the normalized form of the housenumber suitable
-- for comparison.
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
RETURNS TEXT
AS $$
DECLARE
normtext TEXT;
BEGIN
SELECT array_to_string(array_agg(trans), ';')
INTO normtext
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
return normtext;
END;
$$ LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='house'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
'place', 'house', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
lookup_word TEXT;
return_word_id INTEGER;
BEGIN
lookup_word := upper(trim(postcode));
lookup_token := ' ' || make_standard_name(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = lookup_word
and class='place' and type='postcode'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
'place', 'postcode', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and country_code=lookup_country_code
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
null, null, lookup_country_code, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
lookup_class text, lookup_type text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
normalized_word TEXT,
lookup_class text,
lookup_type text,
op text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type and operator = op
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0, op);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
nospace_lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
null, null, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
BEGIN
RETURN getorcreate_name_id(lookup_word, '');
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and lookup its word ids (partial words).
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
words TEXT[];
id INTEGER;
return_word_id INTEGER[];
word_ids INTEGER[];
j INTEGER;
BEGIN
words := string_to_array(make_standard_name(lookup_word), ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
SELECT array_agg(word_id) INTO word_ids
FROM word
WHERE word_token = words[j] and class is null and type is null;
IF word_ids IS NULL THEN
id := nextval('seq_word');
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
return_word_id := return_word_id || id;
ELSE
return_word_id := array_merge(return_word_id, word_ids);
END IF;
END IF;
END LOOP;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and look up its name ids (full words).
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
return_word_ids INTEGER[];
BEGIN
lookup_token := ' '|| make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_ids;
RETURN return_word_ids;
END;
$$
LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
RETURNS VOID
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_country(s, country_code);
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_country(s, country_code);
END IF;
END LOOP;
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_name_id(s, item.value);
IF not(ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(item.value, '市$', '');
IF s != item.value THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, item.value);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
RETURNS INTEGER
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
i INTEGER;
j INTEGER;
BEGIN
s := make_standard_name(src);
w := getorcreate_name_id(s, src);
w := getorcreate_word_id(s);
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w := getorcreate_word_id(words[j]);
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(src, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
END IF;
END LOOP;
END IF;
s := regexp_replace(src, '市$', '');
IF s != src THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, src);
END IF;
END IF;
RETURN 1;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
in_partition SMALLINT,
parent_place_id BIGINT,
address HSTORE,
country TEXT,
housenumber TEXT,
initial_name_vector INTEGER[],
geometry GEOMETRY,
OUT name_vector INTEGER[],
OUT nameaddress_vector INTEGER[])
AS $$
DECLARE
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
addr_item RECORD;
parent_address_place_ids BIGINT[];
filtered_address HSTORE;
BEGIN
nameaddress_vector := '{}'::INTEGER[];
SELECT s.name_vector, s.nameaddress_vector
INTO parent_name_vector, parent_address_vector
FROM search_name s
WHERE s.place_id = parent_place_id;
-- Find all address tags that don't appear in the parent search names.
SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
FROM (SELECT skeys(address) as k, svals(address) as v) a
WHERE not addr_ids_from_name(v) && parent_address_vector
AND k not in ('country', 'street', 'place', 'postcode',
'housenumber', 'streetnumber', 'conscriptionnumber');
-- Compute all search terms from the addr: tags.
IF filtered_address IS NOT NULL THEN
FOR addr_item IN
SELECT * FROM
get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
LOOP
IF addr_item.place_id is null THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
CONTINUE;
END IF;
IF parent_address_place_ids is null THEN
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
FROM place_addressline
WHERE place_id = parent_place_id;
END IF;
IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
isaddress, distance, cached_rank_address)
VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
true, addr_item.distance, addr_item.rank_address);
END IF;
END LOOP;
END IF;
name_vector := initial_name_vector;
-- Check if the parent covers all address terms.
-- If not, create a search name entry with the house number as the name.
  -- This is unusual for the search_name table but prevents the place
  -- from being returned when we only search for the street/place.
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
name_vector := array_merge(name_vector,
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
END IF;
IF not address ? 'street' and address ? 'place' THEN
addr_place_ids := addr_ids_from_name(address->'place');
IF not addr_place_ids <@ parent_name_vector THEN
-- make sure addr:place terms are always searchable
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms.
IF housenumber is not null THEN
name_vector := array_merge(name_vector,
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
END IF;
END IF;
END IF;
-- Cheating here by not recomputing all terms but simply using the ones
-- from the parent object.
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
END;
$$
LANGUAGE plpgsql;
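Everything above disappears from the database layer. For orientation, this is the round trip that the PHP code used to make into these functions, and that the new tokenizer module now encapsulates (the sample phrase is invented):

// Pre-PR pattern, removed from Geocode.php by this commit:
$sNorm = $oDB->getOne(
    'SELECT make_standard_name(:phrase)',
    array(':phrase' => 'Hauptstraße 5'),
    'Cannot normalize query string (is it a UTF-8 string?)'
);
// Post-PR, the same normalization happens behind
// Tokenizer::extractTokensFromPhrases().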

View File

@@ -63,54 +63,36 @@
 END
 $$
 LANGUAGE plpgsql STABLE;

-CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
-                                                    feature GEOMETRY,
-                                                    address HSTORE, country TEXT)
-  RETURNS SETOF nearfeaturecentr
+CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
+                                             from_rank SMALLINT, to_rank SMALLINT,
+                                             extent FLOAT, tokens INT[])
+  RETURNS nearfeaturecentr
   AS $$
 DECLARE
   r nearfeaturecentr%rowtype;
-  item RECORD;
 BEGIN
-  FOR item IN
-    SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
-      (SELECT skeys(address) as key, svals(address) as name) x
-  LOOP
-    IF item.from_rank is null THEN
-      CONTINUE;
-    END IF;
-
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id, keywords, rank_address, rank_search,
            min(ST_Distance(feature, centroid)) as distance,
           isguess, postcode, centroid INTO r
      FROM location_area_large_{{ partition }}
-      WHERE geometry && ST_Expand(feature, item.extent)
-            AND rank_address between item.from_rank and item.to_rank
-            AND word_ids_from_name(item.name) && keywords
+      WHERE geometry && ST_Expand(feature, extent)
+            AND rank_address between from_rank and to_rank
+            AND tokens && keywords
      GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
      ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
-    IF r.place_id is null THEN
-      -- If we cannot find a place for the term, just return the
-      -- search term for the given name. That ensures that the address
-      -- element can still be searched for, even though it will not be
-      -- displayed.
-      RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
-                      null, null, null, null)::nearfeaturecentr;
-    ELSE
-      RETURN NEXT r;
-    END IF;
-    CONTINUE;
+    RETURN r;
   END IF;
 {% endfor %}

-    RAISE EXCEPTION 'Unknown partition %', in_partition;
-  END LOOP;
+  RAISE EXCEPTION 'Unknown partition %', in_partition;
 END;
 $$
 LANGUAGE plpgsql STABLE;

 create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
 DECLARE
 BEGIN
View File

@@ -1,5 +1,84 @@
-- Trigger functions for the placex table.
-- Retrieve the data needed by the indexer for updating the place.
--
-- Return parameters:
-- name list of names
-- address list of address tags, either from the object or a surrounding
-- building
-- country_feature If the place is a country feature, this contains the
-- country code, otherwise it is null.
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR)
AS $$
BEGIN
-- For POI nodes, check if the address should be derived from a surrounding
-- building.
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
address := p.address;
ELSE
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT placex.address || hstore('_inherited', '') INTO address
FROM placex
WHERE ST_Covers(geometry, p.centroid)
and geometry && p.centroid
and placex.address is not null
and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
LIMIT 1;
END IF;
address := address - '_unlisted_place'::TEXT;
name := p.name;
country_feature := CASE WHEN p.admin_level = 2
and p.class = 'boundary' and p.type = 'administrative'
and p.osm_type = 'R'
THEN p.country_code
ELSE null
END;
END;
$$
LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
poi_osm_id BIGINT)
RETURNS BIGINT
AS $$
DECLARE
location RECORD;
parent RECORD;
BEGIN
FOR location IN
SELECT members FROM planet_osm_rels
WHERE parts @> ARRAY[poi_osm_id]
and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
and tags @> ARRAY['associatedStreet']
LOOP
FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
IF location.members[i+1] = 'street' THEN
FOR parent IN
SELECT place_id from placex
WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
and name is not null
and rank_search between 26 and 27
LOOP
RETURN parent.place_id;
END LOOP;
END IF;
END LOOP;
END LOOP;
RETURN NULL;
END;
$$
LANGUAGE plpgsql STABLE;
-- Find the parent road of a POI.
--
-- \returns Place ID of parent object or NULL if none
@@ -10,118 +89,89 @@
 CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street TEXT,
-                                               addr_place TEXT,
-                                               fallback BOOL = true)
+                                               addr_street INTEGER[],
+                                               addr_place INTEGER[],
+                                               is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT DEFAULT NULL;
   location RECORD;
-  parent RECORD;
 BEGIN
 {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}

   -- Is this object part of an associatedStreet relation?
-  FOR location IN
-    SELECT members FROM planet_osm_rels
-    WHERE parts @> ARRAY[poi_osm_id]
-          and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
-          and tags @> ARRAY['associatedStreet']
-  LOOP
-    FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
-      IF location.members[i+1] = 'street' THEN
-        FOR parent IN
-          SELECT place_id from placex
-           WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
-             and name is not null
-             and rank_search between 26 and 27
-        LOOP
-          RETURN parent.place_id;
-        END LOOP;
-      END IF;
-    END LOOP;
-  END LOOP;
+  parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);

-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             poi_partition, bbox);
-  IF parent_place_id is not null THEN
-    RETURN parent_place_id;
+  IF parent_place_id is null THEN
+    parent_place_id := find_parent_for_address(addr_street, addr_place,
+                                               poi_partition, bbox);
   END IF;

-  IF poi_osm_type = 'N' THEN
+  IF parent_place_id is null and poi_osm_type = 'N' THEN
     -- Is this node part of an interpolation?
-    FOR parent IN
+    FOR location IN
       SELECT q.parent_place_id
         FROM location_property_osmline q, planet_osm_ways x
       WHERE q.linegeo && bbox and x.id = q.osm_id
             and poi_osm_id = any(x.nodes)
       LIMIT 1
     LOOP
-      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
-      RETURN parent.parent_place_id;
+      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
+      RETURN location.parent_place_id;
     END LOOP;

-    -- Is this node part of any other way?
     FOR location IN
       SELECT p.place_id, p.osm_id, p.rank_search, p.address,
              coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
         FROM placex p, planet_osm_ways w
       WHERE p.osm_type = 'W' and p.rank_search >= 26
             and p.geometry && bbox
             and w.id = p.osm_id and poi_osm_id = any(w.nodes)
     LOOP
       {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}

       -- Way IS a road then we are on it - that must be our road
       IF location.rank_search < 28 THEN
         {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
-        return location.place_id;
+        RETURN location.place_id;
       END IF;

-      SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
-                                 location.centroid,
-                                 location.address->'street',
-                                 location.address->'place',
-                                 false)
-        INTO parent_place_id;
-      IF parent_place_id is not null THEN
-        RETURN parent_place_id;
-      END IF;
+      parent_place_id := find_associated_street('W', location.osm_id);
     END LOOP;
   END IF;

-  IF fallback THEN
-    IF addr_street is null and addr_place is not null THEN
+  IF parent_place_id is NULL THEN
+    IF is_place_addr THEN
       -- The address is attached to a place we don't know.
       -- Instead simply use the containing area with the largest rank.
       FOR location IN
         SELECT place_id FROM placex
          WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
                AND rank_address between 5 and 25
         ORDER BY rank_address desc
       LOOP
         RETURN location.place_id;
       END LOOP;
     ELSEIF ST_Area(bbox) < 0.005 THEN
       -- for smaller features get the nearest road
       SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
       {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
     ELSE
       -- for larger features simply find the area with the largest rank that
       -- contains the bbox, only use addressable features
       FOR location IN
         SELECT place_id FROM placex
          WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
                AND rank_address between 5 and 25
         ORDER BY rank_address desc
       LOOP
         RETURN location.place_id;
       END LOOP;
     END IF;
   END IF;

   RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
@@ -240,6 +290,101 @@
 $$
 LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
in_partition SMALLINT,
parent_place_id BIGINT,
is_place_addr BOOLEAN,
country TEXT,
token_info JSONB,
geometry GEOMETRY,
OUT name_vector INTEGER[],
OUT nameaddress_vector INTEGER[])
AS $$
DECLARE
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
hnr_vector INTEGER[];
addr_item RECORD;
addr_place RECORD;
parent_address_place_ids BIGINT[];
BEGIN
nameaddress_vector := '{}'::INTEGER[];
SELECT s.name_vector, s.nameaddress_vector
INTO parent_name_vector, parent_address_vector
FROM search_name s
WHERE s.place_id = parent_place_id;
FOR addr_item IN
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)
WHERE not search_tokens <@ parent_address_vector
LOOP
addr_place := get_address_place(in_partition, geometry,
addr_item.from_rank, addr_item.to_rank,
addr_item.extent, addr_item.match_tokens);
IF addr_place is null THEN
-- No place found in OSM that matches. Make it at least searchable.
nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
ELSE
IF parent_address_place_ids is null THEN
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
FROM place_addressline
WHERE place_id = parent_place_id;
END IF;
-- If the parent already lists the place in place_address line, then we
-- are done. Otherwise, add its own place_address line.
IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
isaddress, distance, cached_rank_address)
VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
true, addr_place.distance, addr_place.rank_address);
END IF;
END IF;
END LOOP;
name_vector := token_get_name_search_tokens(token_info);
-- Check if the parent covers all address terms.
-- If not, create a search name entry with the house number as the name.
  -- This is unusual for the search_name table but prevents the place
  -- from being returned when we only search for the street/place.
hnr_vector := token_get_housenumber_search_tokens(token_info);
IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
name_vector := array_merge(name_vector, hnr_vector);
END IF;
IF is_place_addr THEN
addr_place_ids := token_addr_place_search_tokens(token_info);
IF not addr_place_ids <@ parent_name_vector THEN
-- make sure addr:place terms are always searchable
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms.
IF hnr_vector is not null THEN
name_vector := array_merge(name_vector, addr_place_ids);
END IF;
END IF;
END IF;
-- Cheating here by not recomputing all terms but simply using the ones
-- from the parent object.
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
END;
$$
LANGUAGE plpgsql;
-- Insert address of a place into the place_addressline table.
--
-- \param obj_place_id Place_id of the place to compute the address for.

@@ -260,7 +405,7 @@
 CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
                                                partition SMALLINT,
                                                maxrank SMALLINT,
-                                               address HSTORE,
+                                               token_info JSONB,
                                                geometry GEOMETRY,
                                                country TEXT,
                                                OUT parent_place_id BIGINT,

@@ -275,7 +420,8 @@ DECLARE
   current_node_area GEOMETRY := NULL;

   parent_place_rank INT := 0;
-  addr_place_ids BIGINT[];
+  addr_place_ids BIGINT[] := '{}'::int[];
+  new_address_vector INT[];
   location RECORD;
 BEGIN

@@ -285,16 +431,21 @@ BEGIN
   address_havelevel := array_fill(false, ARRAY[maxrank]);

   FOR location IN
-    SELECT * FROM get_places_for_addr_tags(partition, geometry,
-                                           address, country)
-    ORDER BY rank_address, distance, isguess desc
+    SELECT (get_address_place(partition, geometry, from_rank, to_rank,
+                              extent, match_tokens)).*, search_tokens
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+              FROM token_get_address_tokens(token_info)) x
+     ORDER BY rank_address, distance, isguess desc
   LOOP
-    {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector,
-                                        location.keywords::int[]);
-    {% endif %}
+    IF location.place_id is null THEN
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      {% endif %}
+    ELSE
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
+      {% endif %}

-    IF location.place_id is not null THEN
       location_isaddress := not address_havelevel[location.rank_address];
       IF not address_havelevel[location.rank_address] THEN
         address_havelevel[location.rank_address] := true;

@@ -309,13 +460,13 @@ BEGIN
         VALUES (obj_place_id, location.place_id, not location.isguess,
                 true, location.distance, location.rank_address);

-        addr_place_ids := array_append(addr_place_ids, location.place_id);
+        addr_place_ids := addr_place_ids || location.place_id;
       END IF;
     END LOOP;

   FOR location IN
     SELECT * FROM getNearFeatures(partition, geometry, maxrank)
-    WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
+    WHERE not addr_place_ids @> ARRAY[place_id]
     ORDER BY rank_address, isguess asc,
              distance *
                CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2

@@ -397,10 +548,11 @@ BEGIN
     NEW.place_id := nextval('seq_place');
     NEW.indexed_status := 1; --STATUS_NEW

-    NEW.country_code := lower(get_country_code(NEW.geometry));
+    NEW.centroid := ST_PointOnSurface(NEW.geometry);
+    NEW.country_code := lower(get_country_code(NEW.centroid));

     NEW.partition := get_partition(NEW.country_code);
-    NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
+    NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);

     IF NEW.osm_type = 'X' THEN
       -- E'X'ternal records should already be in the right format so do nothing
-- E'X'ternal records should already be in the right format so do nothing -- E'X'ternal records should already be in the right format so do nothing
@ -522,8 +674,8 @@ DECLARE
parent_address_level SMALLINT; parent_address_level SMALLINT;
place_address_level SMALLINT; place_address_level SMALLINT;
addr_street TEXT; addr_street INTEGER[];
addr_place TEXT; addr_place INTEGER[];
max_rank SMALLINT; max_rank SMALLINT;
@ -531,12 +683,11 @@ DECLARE
nameaddress_vector INTEGER[]; nameaddress_vector INTEGER[];
addr_nameaddress_vector INTEGER[]; addr_nameaddress_vector INTEGER[];
inherited_address HSTORE;
linked_node_id BIGINT; linked_node_id BIGINT;
linked_importance FLOAT; linked_importance FLOAT;
linked_wikipedia TEXT; linked_wikipedia TEXT;
is_place_address BOOLEAN;
result BOOLEAN; result BOOLEAN;
BEGIN BEGIN
-- deferred delete -- deferred delete
@ -566,9 +717,9 @@ BEGIN
-- update not necessary for osmline, cause linked_place_id does not exist -- update not necessary for osmline, cause linked_place_id does not exist
NEW.extratags := NEW.extratags - 'linked_place'::TEXT; NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
NEW.address := NEW.address - '_unlisted_place'::TEXT;
IF NEW.linked_place_id is not null THEN IF NEW.linked_place_id is not null THEN
NEW.token_info := null;
{% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %} {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
RETURN NEW; RETURN NEW;
END IF; END IF;
@ -579,13 +730,34 @@ BEGIN
-- imported as place=postcode. That's why relations are allowed to pass here. -- imported as place=postcode. That's why relations are allowed to pass here.
-- This can go away in a couple of versions. -- This can go away in a couple of versions.
IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
NEW.token_info := null;
RETURN NEW; RETURN NEW;
END IF; END IF;
-- Speed up searches - just use the centroid of the feature -- Compute a preliminary centroid.
-- cheaper but less acurate
NEW.centroid := ST_PointOnSurface(NEW.geometry); NEW.centroid := ST_PointOnSurface(NEW.geometry);
{% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
-- recalculate country and partition
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-- for countries, believe the mapped country code,
-- so that we remain in the right partition if the boundaries
-- suddenly expand.
NEW.country_code := lower(NEW.address->'country');
NEW.partition := get_partition(lower(NEW.country_code));
IF NEW.partition = 0 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
NEW.partition := get_partition(NEW.country_code);
END IF;
ELSE
IF NEW.rank_search >= 4 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
ELSE
NEW.country_code := NULL;
END IF;
NEW.partition := get_partition(NEW.country_code);
END IF;
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-- recompute the ranks, they might change when linking changes -- recompute the ranks, they might change when linking changes
SELECT * INTO NEW.rank_search, NEW.rank_address SELECT * INTO NEW.rank_search, NEW.rank_address
@ -665,54 +837,12 @@ BEGIN
parent_address_level := 3; parent_address_level := 3;
END IF; END IF;
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %} NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-- housenumber is a computed field, so start with an empty value addr_street := token_addr_street_match_tokens(NEW.token_info);
NEW.housenumber := NULL; addr_place := token_addr_place_match_tokens(NEW.token_info);
IF NEW.address is not NULL THEN
IF NEW.address ? 'conscriptionnumber' THEN
IF NEW.address ? 'streetnumber' THEN
NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
ELSE
NEW.housenumber := NEW.address->'conscriptionnumber';
END IF;
ELSEIF NEW.address ? 'streetnumber' THEN
NEW.housenumber := NEW.address->'streetnumber';
ELSEIF NEW.address ? 'housenumber' THEN
NEW.housenumber := NEW.address->'housenumber';
END IF;
NEW.housenumber := create_housenumber_id(NEW.housenumber);
addr_street := NEW.address->'street';
addr_place := NEW.address->'place';
IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
i := getorcreate_postcode_id(NEW.address->'postcode');
END IF;
END IF;
NEW.postcode := null; NEW.postcode := null;
-- recalculate country and partition
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-- for countries, believe the mapped country code,
-- so that we remain in the right partition if the boundaries
-- suddenly expand.
NEW.country_code := lower(NEW.address->'country');
NEW.partition := get_partition(lower(NEW.country_code));
IF NEW.partition = 0 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
NEW.partition := get_partition(NEW.country_code);
END IF;
ELSE
IF NEW.rank_search >= 4 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
ELSE
NEW.country_code := NULL;
END IF;
NEW.partition := get_partition(NEW.country_code);
END IF;
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-- waterway ways are linked when they are part of a relation and have the same class/type -- waterway ways are linked when they are part of a relation and have the same class/type
IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[] FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
@ -749,33 +879,14 @@ BEGIN
{% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %} {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
NEW.parent_place_id := null; NEW.parent_place_id := null;
is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
-- if we have a POI and there is no address information,
-- see if we can get it from a surrounding building
inherited_address := ''::HSTORE;
IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
AND NEW.housenumber IS NULL THEN
FOR location IN
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT address from placex where ST_Covers(geometry, NEW.centroid)
and geometry && NEW.centroid
and (address ? 'housenumber' or address ? 'street' or address ? 'place')
and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
limit 1
LOOP
NEW.housenumber := location.address->'housenumber';
addr_street := location.address->'street';
addr_place := location.address->'place';
inherited_address := location.address;
END LOOP;
END IF;
-- We have to find our parent road. -- We have to find our parent road.
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
NEW.partition, NEW.partition,
ST_Envelope(NEW.geometry), ST_Envelope(NEW.geometry),
addr_street, addr_place); addr_street, addr_place,
is_place_address);
-- If we found the road take a shortcut here. -- If we found the road take a shortcut here.
-- Otherwise fall back to the full address getting method below. -- Otherwise fall back to the full address getting method below.
@ -785,12 +896,12 @@ BEGIN
SELECT p.country_code, p.postcode, p.name FROM placex p SELECT p.country_code, p.postcode, p.name FROM placex p
WHERE p.place_id = NEW.parent_place_id INTO location; WHERE p.place_id = NEW.parent_place_id INTO location;
IF addr_street is null and addr_place is not null THEN IF is_place_address THEN
-- Check if the addr:place tag is part of the parent name -- Check if the addr:place tag is part of the parent name
SELECT count(*) INTO i SELECT count(*) INTO i
FROM svals(location.name) AS pname WHERE pname = addr_place; FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
IF i = 0 THEN IF i = 0 THEN
NEW.address = NEW.address || hstore('_unlisted_place', addr_place); NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
END IF; END IF;
END IF; END IF;
@ -798,39 +909,21 @@ BEGIN
{% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %} {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
-- determine postcode -- determine postcode
IF NEW.address is not null AND NEW.address ? 'postcode' THEN NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
NEW.postcode = upper(trim(NEW.address->'postcode')); location.postcode,
ELSE get_nearest_postcode(NEW.country_code, NEW.geometry));
NEW.postcode := location.postcode;
END IF;
IF NEW.postcode is null THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF;
IF NEW.name is not NULL THEN IF NEW.name is not NULL THEN
NEW.name := add_default_place_name(NEW.country_code, NEW.name); NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := make_keywords(NEW.name);
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
name_vector, NEW.rank_search, NEW.rank_address,
upper(trim(NEW.address->'postcode')), NEW.geometry,
NEW.centroid);
{% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
END IF;
END IF; END IF;
{% if not db.reverse_only %} {% if not db.reverse_only %}
IF array_length(name_vector, 1) is not NULL IF NEW.name is not NULL OR NEW.address is not NULL THEN
OR inherited_address is not NULL OR NEW.address is not NULL
THEN
SELECT * INTO name_vector, nameaddress_vector SELECT * INTO name_vector, nameaddress_vector
FROM create_poi_search_terms(NEW.place_id, FROM create_poi_search_terms(NEW.place_id,
NEW.partition, NEW.parent_place_id, NEW.partition, NEW.parent_place_id,
inherited_address || NEW.address, is_place_address, NEW.country_code,
NEW.country_code, NEW.housenumber, NEW.token_info, NEW.centroid);
name_vector, NEW.centroid);
IF array_length(name_vector, 1) is not NULL THEN IF array_length(name_vector, 1) is not NULL THEN
INSERT INTO search_name (place_id, search_rank, address_rank, INSERT INTO search_name (place_id, search_rank, address_rank,
@ -844,6 +937,17 @@ BEGIN
END IF; END IF;
{% endif %} {% endif %}
NEW.token_info := token_strip_info(NEW.token_info);
-- If the address was inherited from a surrounding building,
-- do not add it permanently to the table.
IF NEW.address ? '_inherited' THEN
IF NEW.address ? '_unlisted_place' THEN
NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
ELSE
NEW.address := null;
END IF;
END IF;
RETURN NEW; RETURN NEW;
END IF; END IF;
@ -914,19 +1018,11 @@ BEGIN
END IF; END IF;
END IF; END IF;
-- Initialise the name vector using our name
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := make_keywords(NEW.name);
-- make sure all names are in the word table
IF NEW.admin_level = 2 IF NEW.admin_level = 2
AND NEW.class = 'boundary' AND NEW.type = 'administrative' AND NEW.class = 'boundary' AND NEW.type = 'administrative'
AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R' AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
THEN THEN
PERFORM create_country(NEW.name, lower(NEW.country_code)); -- Update the list of country names. Adding an additional sanity
{% if debug %}RAISE WARNING 'Country names updated';{% endif %}
-- Also update the list of country names. Adding an additional sanity
-- check here: make sure the country does overlap with the area where -- check here: make sure the country does overlap with the area where
-- we expect it to be as per static country grid. -- we expect it to be as per static country grid.
FOR location IN FOR location IN
@ -959,29 +1055,28 @@ BEGIN
ELSEIF NEW.rank_address > 25 THEN ELSEIF NEW.rank_address > 25 THEN
max_rank := 25; max_rank := 25;
ELSE ELSE
max_rank = NEW.rank_address; max_rank := NEW.rank_address;
END IF; END IF;
SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank, SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
NEW.address, geom, NEW.country_code) NEW.token_info, geom, NEW.country_code)
INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector; INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
{% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %} {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
IF NEW.address is not null AND NEW.address ? 'postcode' NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
AND NEW.address->'postcode' not similar to '%(,|;)%' THEN NEW.postcode);
NEW.postcode := upper(trim(NEW.address->'postcode'));
END IF;
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF;
-- if we have a name add this to the name search table -- if we have a name add this to the name search table
IF NEW.name IS NOT NULL THEN IF NEW.name IS NOT NULL THEN
-- Initialise the name vector using our name
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := token_get_name_search_tokens(NEW.token_info);
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid); result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
name_vector, NEW.rank_search, NEW.rank_address,
NEW.postcode, NEW.geometry, NEW.centroid);
{% if debug %}RAISE WARNING 'added to location (full)';{% endif %} {% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
END IF; END IF;
@ -990,8 +1085,11 @@ BEGIN
{% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %} {% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
END IF; END IF;
result := insertSearchName(NEW.partition, NEW.place_id, name_vector, IF NEW.rank_address between 16 and 27 THEN
NEW.rank_search, NEW.rank_address, NEW.geometry); result := insertSearchName(NEW.partition, NEW.place_id,
token_get_name_match_tokens(NEW.token_info),
NEW.rank_search, NEW.rank_address, NEW.geometry);
END IF;
{% if debug %}RAISE WARNING 'added to search name (full)';{% endif %} {% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
{% if not db.reverse_only %} {% if not db.reverse_only %}
@ -1002,11 +1100,15 @@ BEGIN
NEW.importance, NEW.country_code, name_vector, NEW.importance, NEW.country_code, name_vector,
nameaddress_vector, NEW.centroid); nameaddress_vector, NEW.centroid);
{% endif %} {% endif %}
END IF;
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF; END IF;
{% if debug %}RAISE WARNING 'place update % % finsihed.', NEW.osm_type, NEW.osm_id;{% endif %} {% if debug %}RAISE WARNING 'place update % % finsihed.', NEW.osm_type, NEW.osm_id;{% endif %}
NEW.token_info := token_strip_info(NEW.token_info);
RETURN NEW; RETURN NEW;
END; END;
$$ $$

View File

@@ -221,37 +221,30 @@ LANGUAGE plpgsql STABLE;
-- \param centroid Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
+CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
                                                   partition SMALLINT,
                                                   centroid GEOMETRY)
  RETURNS BIGINT
  AS $$
DECLARE
  parent_place_id BIGINT;
-  word_ids INTEGER[];
BEGIN
  IF street is not null THEN
    -- Check for addr:street attributes
    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    word_ids := word_ids_from_name(street);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
    END IF;
  END IF;
  -- Check for addr:place attributes.
  IF place is not null THEN
-    word_ids := word_ids_from_name(place);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
    END IF;
  END IF;

View File

@@ -1,9 +1,6 @@
-- Indices used only during search and update.
-- These indices are created only after the indexing process is done.
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
-  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
  ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};

View File

@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
);
GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";

-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;

drop table IF EXISTS location_area CASCADE;
CREATE TABLE location_area (
  place_id BIGINT,
@@ -109,6 +93,7 @@ CREATE TABLE location_property_osmline (
  linegeo GEOMETRY,
  interpolationtype TEXT,
  address HSTORE,
+  token_info JSONB, -- custom column for tokenizer use only
  postcode TEXT,
  country_code VARCHAR(2)
){{db.tablespace.search_data}};
@@ -158,6 +143,7 @@ CREATE TABLE placex (
  indexed_status SMALLINT,
  LIKE place INCLUDING CONSTRAINTS,
  wikipedia TEXT, -- calculated wikipedia article name (language:title)
+  token_info JSONB, -- custom column for tokenizer use only
  country_code varchar(2),
  housenumber TEXT,
  postcode TEXT,
@@ -168,6 +154,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
+CREATE INDEX idx_placex_geometry_buildings ON placex
+  USING GIST (geometry) {{db.tablespace.search_index}}
+  WHERE address is not null and rank_search = 30
+        and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
CREATE INDEX idx_placex_geometry_placenode ON placex
  USING GIST (geometry) {{db.tablespace.search_index}}
  WHERE osm_type = 'N' and rank_search < 26
@@ -178,7 +168,6 @@ DROP SEQUENCE IF EXISTS seq_place;
CREATE SEQUENCE seq_place start 1;
GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;

View File

@@ -0,0 +1,399 @@
-- Get tokens used for searching the given place.
--
-- These are the tokens that will be saved in the search_name table.
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Get tokens for matching the place name against others.
--
-- This should usually be restricted to full name tokens.
CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber tokens applicable for the place.
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'hnr_tokens')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber in the form that it can be matched during search.
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
RETURNS TEXT
AS $$
SELECT info->>'hnr';
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'street')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_match')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_search')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
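-- For illustration only: the accessors above assume a token_info layout
-- along these lines (the integer ids are made up; in a real database they
-- come from the word table). Array values are Postgres array literals
-- wrapped in JSON strings, because the accessors extract them with ->>
-- before casting:
--
--   {"names": "{1,2,3}",
--    "hnr_tokens": "{4}", "hnr": "4",
--    "street": "{5,6}",
--    "place_match": "{7}", "place_search": "{7,8}",
--    "addr": {"city": ["{9,10}", "{9}"]}}
--
-- With such a value, token_get_name_search_tokens(info) returns {1,2,3}
-- and token_get_address_tokens(info) returns one row: (city, {9}, {9,10}).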
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT
AS $$
  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
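-- The behaviour follows directly from the definition above, for example:
--
--   SELECT token_normalized_postcode(' 12045 ');      --> '12045'
--   SELECT token_normalized_postcode('12045;12059');  --> NULL (multi-value)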
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
AS $$
SELECT NULL::JSONB;
$$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ----------------------------------------------
-- Functions for term normalisation and access to the 'word' table.
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
AS $$
DECLARE
o TEXT;
BEGIN
o := public.gettokenstring(public.transliteration(name));
RETURN trim(substr(o,1,length(o)));
END;
$$
LANGUAGE plpgsql IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
count INTEGER;
BEGIN
lookup_token := trim(lookup_word);
SELECT min(word_id), max(search_name_count) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id, count;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > {{ max_word_freq }} THEN
return_word_id := NULL;
END IF;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Create housenumber tokens from an OSM addr:housenumber.
-- The housenumber is split at commas and semicolons as necessary.
-- The function returns the normalized form of the housenumber suitable
-- for comparison.
CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
OUT tokens TEXT,
OUT normtext TEXT)
AS $$
BEGIN
SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
INTO normtext, tokens
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
FROM (SELECT make_standard_name(h) as lookup_word
FROM unnest(housenumbers) h) x) y;
END;
$$ LANGUAGE plpgsql STABLE STRICT;
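-- A sketch of the expected result (illustrative only: the token ids depend
-- on the current content of the word table, and the example assumes the
-- standard-name transliteration leaves these values unchanged):
--
--   SELECT * FROM create_housenumbers(ARRAY['34', '36a']);
--   -->  tokens: '{101,102}',  normtext: '34;36a'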
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='house'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
'place', 'house', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
RETURNS BOOLEAN
AS $$
DECLARE
r RECORD;
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || make_standard_name(postcode);
FOR r IN
SELECT word_id FROM word
WHERE word_token = lookup_token and word = postcode
and class='place' and type='postcode'
LOOP
RETURN false;
END LOOP;
INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
'place', 'postcode', null, 0);
RETURN true;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
nospace_lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
null, null, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and lookup its word ids (partial words).
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
words TEXT[];
id INTEGER;
return_word_id INTEGER[];
word_ids INTEGER[];
j INTEGER;
BEGIN
words := string_to_array(make_standard_name(lookup_word), ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
SELECT array_agg(word_id) INTO word_ids
FROM word
WHERE word_token = words[j] and class is null and type is null;
IF word_ids IS NULL THEN
id := nextval('seq_word');
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
return_word_id := return_word_id || id;
ELSE
return_word_id := array_merge(return_word_id, word_ids);
END IF;
END IF;
END LOOP;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and look up its name ids (full words).
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
return_word_ids INTEGER[];
BEGIN
lookup_token := ' '|| make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_ids;
RETURN return_word_ids;
END;
$$
LANGUAGE plpgsql STABLE STRICT;
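-- To illustrate the difference between the two lookups above (ids made up,
-- assuming make_standard_name yields 'rue de la paix'):
--
--   SELECT word_ids_from_name('Rue de la Paix');
--   --> id of the single full term ' rue de la paix', e.g. {2001}
--   SELECT addr_ids_from_name('Rue de la Paix');
--   --> ids of the partial words 'rue', 'de', 'la', 'paix', e.g. {301,302,303,304}
--
-- Note that addr_ids_from_name creates missing partial words on the fly,
-- while word_ids_from_name only looks up existing full names.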
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_name_id(s, item.value);
IF not(ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(item.value, '市$', '');
IF s != item.value THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, item.value);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$
LANGUAGE plpgsql;
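-- To make the loop above concrete: for a name value 'Berlin, Germany'
-- (assuming make_standard_name yields 'berlin germany') the result collects
--   * the full-name token for ' berlin germany' (getorcreate_name_id),
--   * the word token for the whole standardized string, unless too common,
--   * word tokens for the individual words 'berlin' and 'germany',
--   * word tokens for the standardized comma-separated variants
--     'Berlin' and 'Germany',
--   * and, for names ending in the CJK city suffix 市, an extra full-name
--     token for the name with the suffix stripped.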
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
RETURNS INTEGER
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
i INTEGER;
j INTEGER;
BEGIN
s := make_standard_name(src);
w := getorcreate_name_id(s, src);
w := getorcreate_word_id(s);
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w := getorcreate_word_id(words[j]);
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(src, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
END IF;
END LOOP;
END IF;
s := regexp_replace(src, '市$', '');
IF s != src THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, src);
END IF;
END IF;
RETURN 1;
END;
$$
LANGUAGE plpgsql;

View File

@@ -0,0 +1,2 @@
CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
ON word USING BTREE (word_id) {{db.tablespace.search_index}};

View File

@@ -0,0 +1,19 @@
DROP TABLE IF EXISTS word;
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
word text,
class text,
type text,
country_code varchar(2),
search_name_count INTEGER,
operator TEXT
) {{db.tablespace.search_data}};
CREATE INDEX idx_word_word_token ON word
USING BTREE (word_token) {{db.tablespace.search_index}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
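-- Illustrative rows only (ids made up): the tokenizer functions store full
-- names with a leading blank in word_token and partial words without one;
-- special tokens carry a class/type:
--
--   word_id | word_token | word   | class | type     | operator
--   --------+------------+--------+-------+----------+---------
--       101 | ' berlin'  | Berlin |       |          |
--       102 | berlin     |        |       |          |
--       103 | ' 12045'   | 12045  | place | postcode |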

View File

@@ -32,8 +32,11 @@ class UpdateIndex:
    @staticmethod
    def run(args):
        from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory

-        indexer = Indexer(args.config.get_libpq_dsn(),
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                          args.threads or psutil.cpu_count() or 1)

        if not args.no_boundaries:

View File

@@ -46,6 +46,7 @@ class UpdateRefresh:
    @staticmethod
    def run(args):
        from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory

        if args.postcodes:
            LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config,
                                         args.diffs, args.enable_debug_statements)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+            tokenizer.update_sql_functions(args.config)

        if args.wiki_data:
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH

View File

@@ -83,6 +83,7 @@ class UpdateReplication:
    def _update(args):
        from ..tools import replication
        from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory

        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
@@ -106,6 +107,8 @@ class UpdateReplication:
            raise UsageError("Bad argument '--no-index'.")
        recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
        while True:
            with connect(args.config.get_libpq_dsn()) as conn:
                start = dt.datetime.now(dt.timezone.utc)
@@ -116,7 +119,7 @@ class UpdateReplication:
                if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                    index_start = dt.datetime.now(dt.timezone.utc)
-                    indexer = Indexer(args.config.get_libpq_dsn(),
+                    indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                      args.threads or 1)
                    indexer.index_boundaries(0, 30)
                    indexer.index_by_rank(0, 30)

View File

@@ -56,6 +56,7 @@ class SetupAll:
        from ..tools import refresh
        from ..indexer.indexer import Indexer
        from ..tools import postcodes
+        from ..tokenizer import factory as tokenizer_factory

        if args.osm_file and not Path(args.osm_file).is_file():
            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
@@ -67,12 +68,6 @@ class SetupAll:
                                            args.no_partitions,
                                            rouser=args.config.DATABASE_WEBUSER)

-            LOG.warning('Installing database module')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.install_module(args.module_dir, args.project_dir,
-                                               args.config.DATABASE_MODULE_PATH,
-                                               conn=conn)
-
            LOG.warning('Importing OSM data file')
            database_import.import_osm_data(Path(args.osm_file),
                                            args.osm2pgsql_options(0, 1),
@@ -105,22 +100,31 @@ class SetupAll:
        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                      args.threads or psutil.cpu_count() or 1)

+        LOG.warning("Setting up tokenizer")
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            if args.continue_at is not None and args.continue_at != 'load-data':
                with connect(args.config.get_libpq_dsn()) as conn:
                    SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
            LOG.warning('Indexing places')
-            indexer = Indexer(args.config.get_libpq_dsn(),
+            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                              args.threads or psutil.cpu_count() or 1)
            indexer.index_full(analyse=not args.index_noanalyse)
@@ -129,7 +133,9 @@ class SetupAll:
                database_import.create_search_indices(conn, args.config,
                                                      drop=args.no_updates)
                LOG.warning('Create search index for default country names.')
-                database_import.create_country_names(conn, args.config)
+                database_import.create_country_names(conn, tokenizer,
+                                                     args.config.LANGUAGES)
+        tokenizer.finalize_import(args.config)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)

View File

@@ -9,6 +9,8 @@ LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
+# Using non-top-level imports to avoid potentially unused imports.
+# pylint: disable=E0012,C0415

class ImportSpecialPhrases:
    """\
@@ -22,10 +24,13 @@ class ImportSpecialPhrases:
    @staticmethod
    def run(args):
+        from ..tokenizer import factory as tokenizer_factory
+
        if args.import_from_wiki:
            LOG.warning('Special phrases import starting')
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            with connect(args.config.get_libpq_dsn()) as db_connection:
                SpecialPhrasesImporter(
                    args.config, args.phplib_dir, db_connection
-                ).import_from_wiki()
+                ).import_from_wiki(tokenizer)
        return 0

View File

@@ -48,14 +48,14 @@ class DBConnection:
    """ A single non-blocking database connection.
    """

-    def __init__(self, dsn):
+    def __init__(self, dsn, cursor_factory=None):
        self.current_query = None
        self.current_params = None
        self.dsn = dsn

        self.conn = None
        self.cursor = None
-        self.connect()
+        self.connect(cursor_factory=cursor_factory)

    def close(self):
        """ Close all open connections. Does not wait for pending requests.
@@ -66,7 +66,7 @@ class DBConnection:
            self.conn = None

-    def connect(self):
+    def connect(self, cursor_factory=None):
        """ (Re)connect to the database. Creates an asynchronous connection
            with JIT and parallel processing disabled. If a connection was
            already open, it is closed and a new connection established.
@@ -79,7 +79,7 @@ class DBConnection:
        self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
        self.wait()

-        self.cursor = self.conn.cursor()
+        self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
        # Disable JIT and parallel workers as they are known to cause problems.
        # Update pg_settings instead of using SET because it does not yield
        # errors on older versions of Postgres where the settings are not

View File

@@ -89,8 +89,6 @@ class SQLPreprocessor:
        self.env.globals['db'] = db_info
        self.env.globals['sql'] = _setup_postgres_sql(conn)
        self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())

    def run_sql_file(self, conn, name, **kwargs):

View File

@@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database.
"""
import logging
import select
+import time
+
+import psycopg2.extras

from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
@@ -11,6 +14,73 @@ from nominatim.db.connection import connect
LOG = logging.getLogger()

+class PlaceFetcher:
+    """ Asynchronous connection that fetches place details for processing.
+    """
+    def __init__(self, dsn, setup_conn):
+        self.wait_time = 0
+        self.current_ids = None
+        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+
+        with setup_conn.cursor() as cur:
+            # The hstore OIDs must be fetched manually here because
+            # register_hstore cannot fetch them on the asynchronous
+            # connection used below.
+            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
+            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
+
+        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
+                                        array_oid=hstore_array_oid)
+
+    def close(self):
+        """ Close the underlying asynchronous connection.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+    def fetch_next_batch(self, cur, runner):
+        """ Send a request for the next batch of places.
+            If details for the places are required, they will be fetched
+            asynchronously.
+
+            Returns true if there is still data available.
+        """
+        ids = cur.fetchmany(100)
+
+        if not ids:
+            self.current_ids = None
+            return False
+
+        if hasattr(runner, 'get_place_details'):
+            runner.get_place_details(self.conn, ids)
+            self.current_ids = []
+        else:
+            self.current_ids = ids
+
+        return True
+
+    def get_batch(self):
+        """ Get the next batch of data, previously requested with
+            `fetch_next_batch`.
+        """
+        if self.current_ids is not None and not self.current_ids:
+            tstart = time.time()
+            self.conn.wait()
+            self.wait_time += time.time() - tstart
+            self.current_ids = self.conn.cursor.fetchall()
+
+        return self.current_ids
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.wait()
+        self.close()
+
+
class WorkerPool:
    """ A pool of asynchronous database connections.
@@ -21,6 +91,7 @@ class WorkerPool:
    def __init__(self, dsn, pool_size):
        self.threads = [DBConnection(dsn) for _ in range(pool_size)]
        self.free_workers = self._yield_free_worker()
+        self.wait_time = 0

    def finish_all(self):
@@ -64,7 +135,9 @@ class WorkerPool:
                ready = self.threads
                command_stat = 0
            else:
+                tstart = time.time()
                _, ready, _ = select.select([], self.threads, [])
+                self.wait_time += time.time() - tstart

    def __enter__(self):
@@ -72,6 +145,7 @@ class WorkerPool:

    def __exit__(self, exc_type, exc_value, traceback):
+        self.finish_all()
        self.close()

@@ -79,8 +153,9 @@ class Indexer:
    """ Main indexing routine.
    """

-    def __init__(self, dsn, num_threads):
+    def __init__(self, dsn, tokenizer, num_threads):
        self.dsn = dsn
+        self.tokenizer = tokenizer
        self.num_threads = num_threads
@@ -123,8 +198,9 @@ class Indexer:
        LOG.warning("Starting indexing boundaries using %s threads",
                    self.num_threads)

-        for rank in range(max(minrank, 4), min(maxrank, 26)):
-            self._index(runners.BoundaryRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(minrank, 4), min(maxrank, 26)):
+                self._index(runners.BoundaryRunner(rank, analyzer))

    def index_by_rank(self, minrank, maxrank):
        """ Index all entries of placex in the given rank range (inclusive)
@@ -137,15 +213,16 @@ class Indexer:
        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                    minrank, maxrank, self.num_threads)

-        for rank in range(max(1, minrank), maxrank):
-            self._index(runners.RankRunner(rank))
-
-        if maxrank == 30:
-            self._index(runners.RankRunner(0))
-            self._index(runners.InterpolationRunner(), 20)
-            self._index(runners.RankRunner(30), 20)
-        else:
-            self._index(runners.RankRunner(maxrank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(1, minrank), maxrank):
+                self._index(runners.RankRunner(rank, analyzer))
+
+            if maxrank == 30:
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(analyzer), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
+            else:
+                self._index(runners.RankRunner(maxrank, analyzer))

    def index_postcodes(self):
@@ -173,6 +250,7 @@ class Indexer:
        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)

        with connect(self.dsn) as conn:
+            psycopg2.extras.register_hstore(conn)
            with conn.cursor() as cur:
                total_tuples = cur.scalar(runner.sql_count_objects())
                LOG.debug("Total number of rows: %i", total_tuples)
@@ -185,19 +263,24 @@ class Indexer:
            with conn.cursor(name='places') as cur:
                cur.execute(runner.sql_get_objects())

-                with WorkerPool(self.dsn, self.num_threads) as pool:
-                    while True:
-                        places = [p[0] for p in cur.fetchmany(batch)]
-                        if not places:
-                            break
-
-                        LOG.debug("Processing places: %s", str(places))
-
-                        worker = pool.next_free_worker()
-                        worker.perform(runner.sql_index_place(places))
-                        progress.add(len(places))
+                with PlaceFetcher(self.dsn, conn) as fetcher:
+                    with WorkerPool(self.dsn, self.num_threads) as pool:
+                        has_more = fetcher.fetch_next_batch(cur, runner)
+                        while has_more:
+                            places = fetcher.get_batch()
+
+                            # asynchronously get the next batch
+                            has_more = fetcher.fetch_next_batch(cur, runner)
+
+                            # and insert the current batch
+                            for idx in range(0, len(places), batch):
+                                part = places[idx:idx+batch]
+                                LOG.debug("Processing places: %s", str(part))
+                                runner.index_places(pool.next_free_worker(), part)
+                                progress.add(len(part))

-                    pool.finish_all()
+                LOG.info("Wait time: fetcher: %.2fs,  pool: %.2fs",
+                         fetcher.wait_time, pool.wait_time)

            conn.commit()

View File

@@ -2,15 +2,52 @@
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
+import functools
+
+import psycopg2.extras

# pylint: disable=C0111

-class RankRunner:
+class AbstractPlacexRunner:
+    """ Returns SQL commands for indexing of the placex table.
+    """
+    SELECT_SQL = 'SELECT place_id FROM placex'
+
+    def __init__(self, rank, analyzer):
+        self.rank = rank
+        self.analyzer = analyzer
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+                          FROM placex WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
+
+class RankRunner(AbstractPlacexRunner):
    """ Returns SQL commands for indexing one rank within the placex table.
    """

-    def __init__(self, rank):
-        self.rank = rank
-
    def name(self):
        return "rank {}".format(self.rank)
@@ -20,24 +57,16 @@ class RankRunner:
               """.format(self.rank)

    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_address = {}
-                  ORDER BY geometry_sector""".format(self.rank)
+        return """{} WHERE indexed_status > 0 and rank_address = {}
+                     ORDER BY geometry_sector
+               """.format(self.SELECT_SQL, self.rank)

-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))

-class BoundaryRunner:
+class BoundaryRunner(AbstractPlacexRunner):
    """ Returns SQL commands for indexing the administrative boundaries
        of a certain rank.
    """

-    def __init__(self, rank):
-        self.rank = rank
-
    def name(self):
        return "boundaries rank {}".format(self.rank)
@@ -49,16 +78,10 @@ class BoundaryRunner:
               """.format(self.rank)

    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_search = {}
-                        and class = 'boundary' and type = 'administrative'
-                  ORDER BY partition, admin_level
-               """.format(self.rank)
+        return """{} WHERE indexed_status > 0 and rank_search = {}
+                     and class = 'boundary' and type = 'administrative'
+                     ORDER BY partition, admin_level
+               """.format(self.SELECT_SQL, self.rank)

-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))

class InterpolationRunner:
@@ -66,6 +89,10 @@ class InterpolationRunner:
        location_property_osmline.
    """

+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
    @staticmethod
    def name():
        return "interpolation lines (location_property_osmline)"
@@ -77,15 +104,37 @@ class InterpolationRunner:

    @staticmethod
    def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id
+                  FROM location_property_osmline
                  WHERE indexed_status > 0
                  ORDER BY geometry_sector"""

    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_property_osmline
-                  SET indexed_status = 0 WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
+                          FROM location_property_osmline WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)


class PostcodeRunner:
@@ -107,7 +156,7 @@ class PostcodeRunner:
                  ORDER BY country_code, postcode"""

    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_postcode SET indexed_status = 0
-                  WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def index_places(worker, ids):
+        worker.perform(""" UPDATE location_postcode SET indexed_status = 0
+                           WHERE place_id IN ({})
+                       """.format(','.join((str(i[0]) for i in ids))))

View File

View File

@ -0,0 +1,88 @@
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
import logging
import importlib
from ..errors import UsageError
from ..db import properties
from ..db.connection import connect
LOG = logging.getLogger()
def _import_tokenizer(name):
""" Load the tokenizer.py module from project directory.
"""
try:
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
except ModuleNotFoundError as exp:
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found') from exp
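The module name is derived from the tokenizer name by convention. A hedged sketch of the resolution for the default setting:

    module = _import_tokenizer('legacy')     # -> nominatim.tokenizer.legacy_tokenizer
    # Every tokenizer module is expected to expose a create() factory;
    # dsn and data directory are prepared by create_tokenizer() below.
    tokenizer = module.create(dsn, basedir)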
def create_tokenizer(config, init_db=True, module_name=None):
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory
of the project directory and the tokenizer loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config):
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
raise UsageError('Cannot initialize tokenizer.')
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project()
return tokenizer
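Taken together, a typical lifecycle looks roughly as follows; `config` is assumed to be a Configuration object with project_dir and TOKENIZER set (a sketch, not part of the diff):

    # During the one-off import: choose the tokenizer and persist the choice.
    tokenizer = create_tokenizer(config)

    # On every later run (indexing, updates, checks): restore the stored tokenizer.
    tokenizer = get_tokenizer_for_db(config)
    with tokenizer.name_analyzer() as analyzer:
        token_info = analyzer.process_place({'name': {'name': 'Main Street'}})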

View File

@ -0,0 +1,540 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path, src_dir, module_dir):
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return module_dir
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return module_dir
def _check_module(module_dir, conn):
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(module_dir))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.normalization = None
def init_new_db(self, config, init_db=True):
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def finalize_import(self, config):
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config):
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self):
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with statement, the caller must make sure to
call the close() function before the analyzer is destroyed.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def _install_php(self, config):
""" Install the php script for the tokenizer.
"""
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
<?php
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
""".format(config)))
def _init_db_tables(self, config):
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config):
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer:
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn, normalizer):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
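The rule string stored under DBCFG_NORMALIZATION originates from NOMINATIM_TERM_NORMALIZATION. A sketch of the effect with an illustrative rule set (not the shipped default):

    from icu import Transliterator

    # Illustrative rules: decompose, strip accents, lowercase, recompose.
    rules = ":: NFD; :: [:Nonspacing Mark:] Remove; :: Lower; :: NFC;"
    norm = Transliterator.createFromRules("phrase normalizer", rules)
    print(norm.transliterate("Bäcker Straße"))   # -> backer straße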
def add_postcodes_from_db(self):
""" Add postcodes from the location_postcode table to the word table.
"""
with self.conn.cursor() as cur:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM (SELECT distinct(postcode) as pc
FROM location_postcode) x""")
def update_special_phrases(self, phrases):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
psycopg2.extras.execute_values(
cur,
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete:
psycopg2.extras.execute_values(
cur,
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
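The new index is computed purely as set arithmetic over (word, class, type, operator) tuples, so unchanged phrases cause no database writes. A small illustration with made-up phrases:

    norm_phrases = {('pub', 'amenity', 'pub', '-'), ('hotel', 'tourism', 'hotel', 'in')}
    existing_phrases = {('pub', 'amenity', 'pub', '-'), ('zoo', 'tourism', 'zoo', '-')}

    to_add = norm_phrases - existing_phrases      # {('hotel', 'tourism', 'hotel', 'in')}
    to_delete = existing_phrases - norm_phrases   # {('zoo', 'tourism', 'zoo', '-')}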
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s) n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, names, country_code))
def process_place(self, place):
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
if names:
token_info.add_names(self.conn, names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), list(names.values()))
address = place.get('address')
if address:
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
self._add_postcode(value)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
return token_info.data
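For orientation, a sketch of how an address dict is dispatched by process_place(); the key names follow the OSM addr:* conventions handled above, the values are made up:

    place = {
        'name': {'name': 'Green Moss'},          # -> token_info.add_names()
        'address': {
            'postcode': '45001',                 # -> self._add_postcode()
            'housenumber': '23;25',              # -> token_info.add_housenumbers()
            'street': 'Rose Street',             # -> token_info.add_street()
            'city': 'Walltown',                  # -> token_info.add_address_terms()
        },
    }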
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
def _create_postcode_from_db(pcode):
with self.conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
if re.search(r'[:,;]', postcode) is None:
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
self.cache = cache
self.data = {}
def add_names(self, conn, names):
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
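House numbers may arrive as separate tags or as a single ';'- or ','-separated value; both forms collapse into one deduplicated list before create_housenumbers() is called. For example:

    import re

    hnrs = ['4;6', '8, 8']
    simple_list = []
    for hnr in hnrs:
        simple_list.extend(x.strip() for x in re.split(r'[;,]', hnr))
    # simple_list == ['4', '6', '8', '8'] and is deduplicated
    # (in arbitrary order) before being sent to the database.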
def add_street(self, conn, street):
""" Add addr:street match terms.
"""
def _get_street(name):
with conn.cursor() as cur:
return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
self.data['street'] = self.cache.streets.get(street, _get_street)
def add_place(self, conn, place):
""" Add addr:place search and match terms.
"""
def _get_place(name):
with conn.cursor() as cur:
cur.execute("""SELECT (addr_ids_from_name(%s)
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
word_ids_from_name(%s)::text""",
(name, name, name))
return cur.fetchone()
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn, terms):
""" Add additional address terms.
"""
def _get_address_term(name):
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cur.fetchone()
tokens = {}
for key, value in terms:
tokens[key] = self.cache.address_terms.get(value, _get_address_term)
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize=128, init_data=None):
self.data = init_data or OrderedDict()
self.maxsize = maxsize
if init_data is not None and len(init_data) > maxsize:
self.maxsize = len(init_data)
def get(self, key, generator):
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
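Usage sketch for the cache: the generator is only consulted on a miss, and once maxsize is reached the least recently used entry is evicted:

    cache = _LRU(maxsize=2)
    cache.get('a', str.upper)   # miss: calls str.upper('a'), stores 'A'
    cache.get('a', str.upper)   # hit: no generator call
    cache.get('b', str.upper)   # miss: stores 'B'
    cache.get('c', str.upper)   # miss: evicts 'a', stores 'C'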
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up house numbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
# Get postcodes that are already saved
postcodes = OrderedDict()
with conn.cursor() as cur:
cur.execute("""SELECT word FROM word
WHERE class ='place' and type = 'postcode'""")
for row in cur:
postcodes[row[0]] = None
self.postcodes = _LRU(maxsize=32, init_data=postcodes)
def get_housenumber(self, number):
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)

View File

@ -4,10 +4,9 @@ Collection of functions that check if the database is complete and functional.
from enum import Enum from enum import Enum
from textwrap import dedent from textwrap import dedent
import psycopg2
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tokenizer import factory as tokenizer_factory
CHECKLIST = [] CHECKLIST = []
@ -78,8 +77,7 @@ def check_database(config):
def _get_indexes(conn): def _get_indexes(conn):
indexes = ['idx_word_word_id', indexes = ['idx_place_addressline_address_place_id',
'idx_place_addressline_address_place_id',
'idx_placex_rank_search', 'idx_placex_rank_search',
'idx_placex_rank_address', 'idx_placex_rank_address',
'idx_placex_parent_place_id', 'idx_placex_parent_place_id',
@ -149,7 +147,7 @@ def check_placex_table(conn, config):
@_check(hint="""placex table has no data. Did the import finish sucessfully?""") @_check(hint="""placex table has no data. Did the import finish sucessfully?""")
def check_placex_size(conn, config): # pylint: disable=W0613 def check_placex_size(conn, _):
""" Checking for placex content """ Checking for placex content
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
@ -158,38 +156,30 @@ def check_placex_size(conn, config): # pylint: disable=W0613
return CheckState.OK if cnt > 0 else CheckState.FATAL return CheckState.OK if cnt > 0 else CheckState.FATAL
@_check(hint="""\ @_check(hint="""{msg}""")
The Postgresql extension nominatim.so was not correctly loaded. def check_tokenizer(_, config):
""" Checking that tokenizer works
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
""")
def check_module(conn, config): # pylint: disable=W0613
""" Checking that nominatim.so module is installed
""" """
with conn.cursor() as cur: try:
try: tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
out = cur.scalar("SELECT make_standard_name('a')") except UsageError:
except psycopg2.ProgrammingError as err: return CheckState.FAIL, dict(msg="""\
return CheckState.FAIL, dict(error=str(err)) Cannot load tokenizer. Did the import finish successfully?""")
if out != 'a': result = tokenizer.check_database()
return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
if result is None:
return CheckState.OK return CheckState.OK
return CheckState.FAIL, dict(msg=result)
@_check(hint="""\ @_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed. The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd} To index the remaining entries, run: {index_cmd}
""") """)
def check_indexing(conn, config): # pylint: disable=W0613 def check_indexing(conn, _):
""" Checking indexing status """ Checking indexing status
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
@ -198,7 +188,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
if cnt == 0: if cnt == 0:
return CheckState.OK return CheckState.OK
if conn.index_exists('idx_word_word_id'): if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update. # Likely just an interrupted update.
index_cmd = 'nominatim index' index_cmd = 'nominatim index'
else: else:
@ -214,7 +204,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
Rerun the index creation with: nominatim import --continue db-postprocess Rerun the index creation with: nominatim import --continue db-postprocess
""") """)
def check_database_indexes(conn, config): # pylint: disable=W0613 def check_database_indexes(conn, _):
""" Checking that database indexes are complete """ Checking that database indexes are complete
""" """
missing = [] missing = []
@ -236,7 +226,7 @@ def check_database_indexes(conn, config): # pylint: disable=W0613
Invalid indexes: Invalid indexes:
{indexes} {indexes}
""") """)
def check_database_index_valid(conn, config): # pylint: disable=W0613 def check_database_index_valid(conn, _):
""" Checking that all database indexes are valid """ Checking that all database indexes are valid
""" """
with conn.cursor() as cur: with conn.cursor() as cur:

View File

@ -5,11 +5,10 @@ import logging
import os import os
import selectors import selectors
import subprocess import subprocess
import shutil
from pathlib import Path from pathlib import Path
import psutil import psutil
import psycopg2 import psycopg2.extras
from nominatim.db.connection import connect, get_pg_env from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
@ -89,49 +88,6 @@ def setup_extensions(conn):
raise UsageError('PostGIS version is too old.') raise UsageError('PostGIS version is too old.')
def install_module(src_dir, project_dir, module_dir, conn=None):
""" Copy the normalization module from src_dir into the project
directory under the '/module' directory. If 'module_dir' is set, then
use the module from there instead and check that it is accessible
for Postgresql.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
If 'conn' is given, then the function also tests if the module
can be accessed via the given database.
"""
if not module_dir:
module_dir = project_dir / 'module'
if not module_dir.exists() or not src_dir.samefile(module_dir):
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
else:
LOG.info('Running from build directory. Leaving database module as is.')
else:
LOG.info("Using custom path for database module at '%s'", module_dir)
if conn is not None:
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(module_dir))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
def import_base_data(dsn, sql_dir, ignore_partitions=False): def import_base_data(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides """ Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist. the background for geocoding. Data is assumed to not yet exist.
@ -205,11 +161,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql') sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None): def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load. """ Truncate all data tables to prepare for a fresh load.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex') cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline') cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area') cur.execute('TRUNCATE location_area')
@ -228,23 +183,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]: for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table) cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None: conn.commit()
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry' _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads): def load_data(dsn, threads):
""" Copy data into the word and placex table. """ Copy data into the word and placex table.
""" """
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector() sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks. # Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1) place_threads = max(1, threads - 1)
@ -306,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
sql.run_sql_file(conn, 'indices.sql', drop=drop) sql.run_sql_file(conn, 'indices.sql', drop=drop)
def create_country_names(conn, config): def create_country_names(conn, tokenizer, languages=None):
""" Create search index for default country names. """ Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
""" """
if languages:
languages = languages.split(',')
def _include_key(key):
return key == 'name' or \
(key.startswith('name:') \
and (not languages or key[5:] in languages))
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""") psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""") cur.execute("""SELECT country_code, name FROM country_name
cur.execute("""SELECT COUNT(*) FROM WHERE country_code is not null""")
(SELECT getorcreate_country(make_standard_name(country_code),
country_code) FROM country_name WHERE country_code is not null) AS x""")
cur.execute("""SELECT COUNT(*) FROM
(SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
FROM country_name WHERE name ? 'name') AS x""")
sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
country_code) FROM (SELECT country_code, skeys(name)
AS k, svals(name) AS v FROM country_name) x WHERE k"""
languages = config.LANGUAGES with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = [code]
if code == 'gb':
names.append('UK')
if code == 'us':
names.append('United States')
# country names (only in languages as provided)
if name:
names.extend((v for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names)
if languages:
sql_statement = "{} IN (".format(sql_statement)
delim = ''
for language in languages.split(','):
sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
delim = ', '
sql_statement = '{})'.format(sql_statement)
else:
sql_statement = "{} LIKE 'name:%'".format(sql_statement)
sql_statement = "{}) v".format(sql_statement)
cur.execute(sql_statement)
conn.commit() conn.commit()
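The language filter only restricts name:XX translations; the plain name tag is always included. A self-contained restatement of _include_key() for languages 'de,en' (illustrative):

    languages = ['de', 'en']

    def _include_key(key):
        return key == 'name' or \
               (key.startswith('name:') and (not languages or key[5:] in languages))

    assert [_include_key(k) for k in ('name', 'name:de', 'name:fr', 'ref')] \
            == [True, True, False, False]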

View File

@ -6,7 +6,8 @@ import logging
from nominatim.db import properties from nominatim.db import properties
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.version import NOMINATIM_VERSION from nominatim.version import NOMINATIM_VERSION
from nominatim.tools import refresh, database_import from nominatim.tools import refresh
from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.errors import UsageError from nominatim.errors import UsageError
LOG = logging.getLogger() LOG = logging.getLogger()
@ -43,11 +44,14 @@ def migrate(config, paths):
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version)) '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
kwargs = dict(conn=conn, config=config, paths=paths) kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs) func(**kwargs)
conn.commit()
has_run_migration = True has_run_migration = True
if has_run_migration: if has_run_migration:
LOG.warning('Updating SQL functions.') LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config) refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version', properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION)) '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@ -108,17 +112,6 @@ def import_status_timestamp_change(conn, **_):
TYPE timestamp with time zone;""") TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def install_database_module_in_project_directory(conn, config, paths, **_):
""" Install database module in project directory.
The database module needs to be present in the project directory
since those were introduced.
"""
database_import.install_module(paths.module_dir, paths.project_dir,
config.DATABASE_MODULE_PATH, conn=conn)
@_migration(3, 5, 0, 99) @_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn, config, **_): def add_nominatim_property_table(conn, config, **_):
""" Add nominatim_property table. """ Add nominatim_property table.
@ -137,6 +130,9 @@ def change_housenumber_transliteration(conn, **_):
The database schema switched from saving raw housenumbers in The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones. placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
@ -173,3 +169,25 @@ def switch_placenode_geometry_index(conn, **_):
and class = 'place' and type != 'postcode' and class = 'place' and type != 'postcode'
and linked_place_id is null""") and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """) cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn, config, **_):
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer.
"""
if properties.get_property(conn, 'tokenizer') is None:
with conn.cursor() as cur:
for table in ('placex', 'location_property_osmline'):
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = 'token_info'""",
(table, ))
if has_column == 0:
cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config)

View File

@ -6,7 +6,7 @@ of artificial postcode centroids.
from nominatim.db.utils import execute_file from nominatim.db.utils import execute_file
from nominatim.db.connection import connect from nominatim.db.connection import connect
def import_postcodes(dsn, project_dir): def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes. """ Set up the initial list of postcodes.
""" """
@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code, SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc, token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry))) ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%' WHERE address ? 'postcode'
and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null AND geometry IS NOT null
GROUP BY country_code, pc GROUP BY country_code, pc
""") """)
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
cur.execute(""" cur.execute("""
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'us', postcode, SELECT nextval('seq_place'), 1, 'us',
token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326) ST_SetSRID(ST_Point(x,y),4326)
FROM us_postcode WHERE postcode NOT IN FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode (SELECT postcode FROM location_postcode
WHERE country_code = 'us') WHERE country_code = 'us')
""") """)
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
cur.execute(""" cur.execute("""
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry SELECT nextval('seq_place'), 1, 'gb',
FROM gb_postcode WHERE postcode NOT IN token_normalized_postcode(postcode), geometry
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode (SELECT postcode FROM location_postcode
WHERE country_code = 'gb') WHERE country_code = 'gb')
""") """)
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
DELETE FROM word WHERE class='place' and type='postcode' DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode) and word NOT IN (SELECT postcode FROM location_postcode)
""") """)
cur.execute("""
SELECT count(getorcreate_postcode_id(v)) FROM
(SELECT distinct(postcode) as v FROM location_postcode) p
""")
conn.commit() conn.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.add_postcodes_from_db()
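token_normalized_postcode() is an SQL helper supplied by the tokenizer; its body is not part of this hunk. A hedged Python model of the behaviour implied by the replaced inline SQL (reject list-like values, otherwise trim and uppercase):

    import re

    def token_normalized_postcode(postcode):
        # Assumption from the replaced SQL and the analyzer code above:
        # postcodes containing separators are treated as invalid.
        if postcode is None or re.search(r'[:,;]', postcode) is not None:
            return None
        return postcode.strip().upper()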

View File

@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
('Default_Language', 'DEFAULT_LANGUAGE', str), ('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool), ('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', str), ('Log_File', 'LOG_FILE', str),
('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool), ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int), ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int), ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool), ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str), ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool), ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str), ('MapIcon_URL', 'MAPICON_URL', str),
) )
@ -175,9 +173,11 @@ def setup_website(basedir, config):
@define('CONST_Debug', $_GET['debug'] ?? false); @define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{0}'); @define('CONST_LibDir', '{0}');
@define('CONST_TokenizerDir', '{2}');
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}'); @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
""".format(config.lib_dir.php, NOMINATIM_VERSION)) """.format(config.lib_dir.php, NOMINATIM_VERSION,
config.project_dir / 'tokenizer'))
for php_name, conf_name, var_type in PHP_CONST_DEFS: for php_name, conf_name, var_type in PHP_CONST_DEFS:
if var_type == bool: if var_type == bool:

View File

@ -24,9 +24,6 @@ class SpecialPhrasesImporterStatistics():
self.tables_deleted = 0 self.tables_deleted = 0
self.tables_ignored = 0 self.tables_ignored = 0
self.global_phrases_invalid = 0 self.global_phrases_invalid = 0
self.global_phrases_added = 0
self.global_phrases_ignored = 0
self.global_phrases_deleted = 0
def _set_lang_values_to_0(self): def _set_lang_values_to_0(self):
""" """
@ -34,8 +31,6 @@ class SpecialPhrasesImporterStatistics():
lang to 0. lang to 0.
""" """
self.lang_phrases_invalid = 0 self.lang_phrases_invalid = 0
self.lang_phrases_added = 0
self.lang_phrases_ignored = 0
def notify_one_phrase_invalid(self): def notify_one_phrase_invalid(self):
""" """
@ -45,29 +40,6 @@ class SpecialPhrasesImporterStatistics():
self.lang_phrases_invalid += 1 self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1 self.global_phrases_invalid += 1
def notify_one_phrase_added(self):
"""
Add +1 to the count of entries
added to the db.
"""
self.lang_phrases_added += 1
self.global_phrases_added += 1
def notify_one_phrase_ignored(self):
"""
Add +1 to the count of ignored
entries as it was already in the db.
"""
self.lang_phrases_ignored += 1
self.global_phrases_ignored += 1
def notify_one_phrase_deleted(self):
"""
Add +1 to the count of phrases deleted
from the database.
"""
self.global_phrases_deleted += 1
def notify_one_table_created(self): def notify_one_table_created(self):
""" """
Add +1 to the count of created tables. Add +1 to the count of created tables.
@ -97,12 +69,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid) LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0: if self.global_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.') LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.global_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.global_phrases_added)
LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
if self.global_phrases_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
LOG.info('- %s tables were ignored as they already exist on the database', LOG.info('- %s tables were ignored as they already exist on the database',
self.tables_ignored) self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created) LOG.info('- %s tables were created', self.tables_created)
@ -126,9 +92,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid) LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0: if self.lang_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.') LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.lang_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
LOG.info('====================================================================') LOG.info('====================================================================')
if self.lang_phrases_invalid > 0: if self.lang_phrases_invalid > 0:

View File

@ -9,7 +9,6 @@ import re
import subprocess import subprocess
import json import json
from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url from nominatim.tools.exec_utils import get_url
@ -33,21 +32,14 @@ class SpecialPhrasesImporter():
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
) )
self.sanity_check_pattern = re.compile(r'^\w+$') self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer", # This set will contain all existing phrases to be added.
self.config.TERM_NORMALIZATION) # It contains tuples with the following format: (label, class, type, operator)
#This set will contain all existing phrases from the word table which self.word_phrases = set()
#no longer exist on the wiki.
#It contains tuples with the following format: (normalized_word, class, type, operator)
self.words_phrases_to_delete = set()
#This set will contain the phrases which still exist from the wiki.
#It is used to prevent duplicates on the wiki by removing them from
#the word_phrases_to_delete only at the end.
self.words_phrases_still_exist = set()
#This set will contain all existing place_classtype tables which don't match any #This set will contain all existing place_classtype tables which don't match any
#special phrases class/type on the wiki. #special phrases class/type on the wiki.
self.table_phrases_to_delete = set() self.table_phrases_to_delete = set()
def import_from_wiki(self, languages=None): def import_from_wiki(self, tokenizer, languages=None):
""" """
Iterate through all specified languages and Iterate through all specified languages and
extract corresponding special phrases from the wiki. extract corresponding special phrases from the wiki.
@ -55,7 +47,6 @@ class SpecialPhrasesImporter():
if languages is not None and not isinstance(languages, list): if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.') raise TypeError('The \'languages\' argument should be of type list.')
self._fetch_existing_words_phrases()
self._fetch_existing_place_classtype_tables() self._fetch_existing_place_classtype_tables()
#Get all languages to process. #Get all languages to process.
@ -71,30 +62,15 @@ class SpecialPhrasesImporter():
self.statistics_handler.notify_current_lang_done(lang) self.statistics_handler.notify_current_lang_done(lang)
self._create_place_classtype_table_and_indexes(class_type_pairs) self._create_place_classtype_table_and_indexes(class_type_pairs)
self._remove_non_existent_phrases_from_db() self._remove_non_existent_tables_from_db()
self.db_connection.commit() self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases)
LOG.warning('Import done.') LOG.warning('Import done.')
self.statistics_handler.notify_import_done() self.statistics_handler.notify_import_done()
def _fetch_existing_words_phrases(self):
"""
Fetch existing special phrases from the word table.
Fill the word_phrases_to_delete set of the class.
"""
#Only extract special phrases terms:
#If class=place and type=house then it is a housenumber term.
#If class=place and type=postcode then it is a postcode term.
word_query = """
SELECT word, class, type, operator FROM word
WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(word_query))
for row in db_cursor:
row[3] = '-' if row[3] is None else row[3]
self.words_phrases_to_delete.add(
(row[0], row[1], row[2], row[3])
)
def _fetch_existing_place_classtype_tables(self): def _fetch_existing_place_classtype_tables(self):
""" """
@ -176,7 +152,6 @@ class SpecialPhrasesImporter():
for match in matches: for match in matches:
phrase_label = match[0].strip() phrase_label = match[0].strip()
normalized_label = self.transliterator.transliterate(phrase_label)
phrase_class = match[1].strip() phrase_class = match[1].strip()
phrase_type = match[2].strip() phrase_type = match[2].strip()
phrase_operator = match[3].strip() phrase_operator = match[3].strip()
@ -198,20 +173,6 @@ class SpecialPhrasesImporter():
): ):
continue continue
#Check if the phrase already exists in the database.
if (
(normalized_label, phrase_class, phrase_type, phrase_operator)
in self.words_phrases_to_delete
):
#Remove this phrase from the ones to delete as it still exist on the wiki.
self.words_phrases_still_exist.add(
(normalized_label, phrase_class, phrase_type, phrase_operator)
)
class_type_pairs.add((phrase_class, phrase_type))
self.statistics_handler.notify_one_phrase_ignored()
#Dont need to add this phrase as it already exists in the word table.
continue
#sanity check, in case somebody added garbage in the wiki #sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type): if not self._check_sanity(lang, phrase_class, phrase_type):
self.statistics_handler.notify_one_phrase_invalid() self.statistics_handler.notify_one_phrase_invalid()
@ -219,35 +180,11 @@ class SpecialPhrasesImporter():
class_type_pairs.add((phrase_class, phrase_type)) class_type_pairs.add((phrase_class, phrase_type))
self._process_amenity( self.word_phrases.add((phrase_label, phrase_class,
phrase_label, normalized_label, phrase_class, phrase_type, phrase_operator))
phrase_type, phrase_operator
)
self.statistics_handler.notify_one_phrase_added()
return class_type_pairs return class_type_pairs
def _process_amenity(self, phrase_label, normalized_label,
phrase_class, phrase_type, phrase_operator):
# pylint: disable-msg=too-many-arguments
"""
Add phrase lookup and corresponding class and
type to the word table based on the operator.
"""
with self.db_connection.cursor() as db_cursor:
if phrase_operator == 'near':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'near')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
elif phrase_operator == 'in':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'in')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
else:
db_cursor.execute("""SELECT getorcreate_amenity(
make_standard_name(%s), %s, %s, %s)""",
(phrase_label, normalized_label, phrase_class, phrase_type))
def _create_place_classtype_table_and_indexes(self, class_type_pairs): def _create_place_classtype_table_and_indexes(self, class_type_pairs):
""" """
@ -339,33 +276,15 @@ class SpecialPhrasesImporter():
.format(Identifier(table_name), .format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER))) Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_phrases_from_db(self): def _remove_non_existent_tables_from_db(self):
""" """
Remove special phrases which no longer exist on the wiki. Remove special phrases which no longer exist on the wiki.
Delete from the word table and delete the place_classtype tables. Delete the place_classtype tables.
""" """
LOG.warning('Cleaning database...') LOG.warning('Cleaning database...')
self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
#Array containing all queries to execute. Contain tuples of format (query, parameters) #Array containing all queries to execute. Contain tuples of format (query, parameters)
queries_parameters = [] queries_parameters = []
#Delete phrases from the word table which are not on the wiki anymore.
for phrase_to_delete in self.words_phrases_to_delete:
self.statistics_handler.notify_one_phrase_deleted()
if phrase_to_delete[3] == '-':
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
queries_parameters.append((query, parameters))
else:
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1],
phrase_to_delete[2], phrase_to_delete[3], )
queries_parameters.append((query, parameters))
#Delete place_classtype tables corresponding to class/type which are not on the wiki anymore #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
for table in self.table_phrases_to_delete: for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted() self.statistics_handler.notify_one_table_deleted()

View File

@ -10,7 +10,7 @@ Version information for Nominatim.
# and must always be increased when there is a change to the database or code # and must always be increased when there is a change to the database or code
# that requires a migration. # that requires a migration.
# Released versions always have a database patch level of 0. # Released versions always have a database patch level of 0.
NOMINATIM_VERSION = (3, 7, 0, 1) NOMINATIM_VERSION = (3, 7, 0, 2)
POSTGRESQL_REQUIRED_VERSION = (9, 3) POSTGRESQL_REQUIRED_VERSION = (9, 3)
POSTGIS_REQUIRED_VERSION = (2, 2) POSTGIS_REQUIRED_VERSION = (2, 2)

View File

@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
# Changing this value requires to run 'nominatim refresh --functions'. # Changing this value requires to run 'nominatim refresh --functions'.
NOMINATIM_DATABASE_MODULE_PATH= NOMINATIM_DATABASE_MODULE_PATH=
# Tokenizer used for normalizing and parsing queries and names.
# The tokenizer is set up during import and cannot be changed afterwards
# without a reimport.
# Currently available tokenizers: legacy
NOMINATIM_TOKENIZER="legacy"
# Number of occurrences of a word before it is considered frequent. # Number of occurrences of a word before it is considered frequent.
# Similar to the concept of stop words. Frequent partial words get ignored # Similar to the concept of stop words. Frequent partial words get ignored
# or handled differently during search. # or handled differently during search.

View File

@ -24,7 +24,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | nameaddress_vector | | object | nameaddress_vector |
| N1 | Rose, Street, Walltown | | N1 | #Rose Street, Walltown |
When searching for "23 Rose Street, Walltown" When searching for "23 Rose Street, Walltown"
Then results contain Then results contain
| osm_type | osm_id | name | | osm_type | osm_id | name |
@ -248,7 +248,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | name_vector | nameaddress_vector | | object | name_vector | nameaddress_vector |
| N1 | #Green Moss | Rose, Street, Walltown | | N1 | #Green Moss | #Rose Street, Walltown |
When searching for "Green Moss, Rose Street, Walltown" When searching for "Green Moss, Rose Street, Walltown"
Then results contain Then results contain
| osm_type | osm_id | name | | osm_type | osm_id | name |
@ -299,7 +299,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | name_vector | nameaddress_vector | | object | name_vector | nameaddress_vector |
| N1 | foo | the road | | N1 | foo | #the road |
Scenario: Some addr: tags are added to address Scenario: Some addr: tags are added to address
Given the scene roads-with-pois Given the scene roads-with-pois

View File

@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
from nominatim import cli from nominatim import cli
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.tools import refresh from nominatim.tools import refresh
from nominatim.tokenizer import factory as tokenizer_factory
from steps.utils import run_script from steps.utils import run_script
class NominatimEnvironment: class NominatimEnvironment:
@ -106,9 +107,19 @@ class NominatimEnvironment:
self.website_dir.cleanup() self.website_dir.cleanup()
self.website_dir = tempfile.TemporaryDirectory() self.website_dir = tempfile.TemporaryDirectory()
cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env) refresh.setup_website(Path(self.website_dir.name) / 'website',
cfg.lib_dir.php = self.src_dir / 'lib-php' self.get_test_config())
refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
def get_test_config(self):
cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
environ=self.test_env)
cfg.set_libdirs(module=self.build_dir / 'module',
osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
php=self.src_dir / 'lib-php',
sql=self.src_dir / 'lib-sql',
data=self.src_dir / 'data')
return cfg
def get_libpq_dsn(self): def get_libpq_dsn(self):
dsn = self.test_env['NOMINATIM_DATABASE_DSN'] dsn = self.test_env['NOMINATIM_DATABASE_DSN']
@ -169,33 +180,41 @@ class NominatimEnvironment:
""" """
self.write_nominatim_config(self.api_test_db) self.write_nominatim_config(self.api_test_db)
if self.api_db_done: if not self.api_db_done:
return self.api_db_done = True
self.api_db_done = True if not self._reuse_or_drop_db(self.api_test_db):
testdata = Path('__file__') / '..' / '..' / 'testdb'
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
if self._reuse_or_drop_db(self.api_test_db): try:
return self.run_nominatim('import', '--osm-file', str(self.api_test_file))
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
testdata = Path('__file__') / '..' / '..' / 'testdb' phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
except:
self.db_drop_database(self.api_test_db)
raise
try: tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
self.run_nominatim('import', '--osm-file', str(self.api_test_file))
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
except:
self.db_drop_database(self.api_test_db)
raise
def setup_unknown_db(self): def setup_unknown_db(self):
""" Setup a test against a non-existing database. """ Setup a test against a non-existing database.
""" """
self.write_nominatim_config('UNKNOWN_DATABASE_NAME') # The tokenizer needs an existing database to function.
# So start with the usual database
class _Context:
db = None
context = _Context()
self.setup_db(context)
tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
# Then drop the DB again
self.teardown_db(context, force_drop=True)
def setup_db(self, context): def setup_db(self, context):
""" Setup a test against a fresh, empty test database. """ Setup a test against a fresh, empty test database.
@ -212,13 +231,13 @@ class NominatimEnvironment:
context.db.autocommit = True context.db.autocommit = True
psycopg2.extras.register_hstore(context.db, globally=False) psycopg2.extras.register_hstore(context.db, globally=False)
def teardown_db(self, context): def teardown_db(self, context, force_drop=False):
""" Remove the test database, if it exists. """ Remove the test database, if it exists.
""" """
if 'db' in context: if hasattr(context, 'db'):
context.db.close() context.db.close()
if not self.keep_scenario_db: if force_drop or not self.keep_scenario_db:
self.db_drop_database(self.test_db) self.db_drop_database(self.test_db)
def _reuse_or_drop_db(self, name): def _reuse_or_drop_db(self, name):

View File

@ -7,6 +7,7 @@ from place_inserter import PlaceColumn
from table_compare import NominatimID, DBRow from table_compare import NominatimID, DBRow
from nominatim.indexer import indexer from nominatim.indexer import indexer
from nominatim.tokenizer import factory as tokenizer_factory
def check_database_integrity(context): def check_database_integrity(context):
""" Check some generic constraints on the tables. """ Check some generic constraints on the tables.
@ -86,6 +87,9 @@ def add_data_to_planet_ways(context):
def import_and_index_data_from_place_table(context): def import_and_index_data_from_place_table(context):
""" Import data previously set up in the place table. """ Import data previously set up in the place table.
""" """
nctx = context.nominatim
tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
context.nominatim.copy_from_place(context.db) context.nominatim.copy_from_place(context.db)
# XXX use tool function as soon as it is ported # XXX use tool function as soon as it is ported
@ -105,7 +109,7 @@ def import_and_index_data_from_place_table(context):
# Call directly as the refresh function does not include postcodes. # Call directly as the refresh function does not include postcodes.
indexer.LOG.setLevel(logging.ERROR) indexer.LOG.setLevel(logging.ERROR)
indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False) indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
check_database_integrity(context) check_database_integrity(context)
@ -230,7 +234,7 @@ def check_search_name_contents(context, exclude):
if exclude: if exclude:
assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1]) assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
else: else:
assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1]) assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
elif name != 'object': elif name != 'object':
assert db_row.contains(name, value), db_row.assert_msg(name, value) assert db_row.contains(name, value), db_row.assert_msg(name, value)

View File

@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testEmptyPhrase() public function testEmptyPhrase()
{ {
$oPhrase = new Phrase('', ''); $oPhrase = new Phrase('', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array(), new TokensFullSet());
$this->assertEquals( $this->assertNull($oPhrase->getWordSets());
array(array('')),
$oPhrase->getWordSets()
);
} }
public function testSingleWordPhrase() public function testSingleWordPhrase()
{ {
$oPhrase = new Phrase('a', ''); $oPhrase = new Phrase('a', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a)', '(a)',
@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMultiWordPhrase() public function testMultiWordPhrase()
{ {
$oPhrase = new Phrase('a b', ''); $oPhrase = new Phrase('a b', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b),(a|b)', '(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c),(a|b c),(a b|c),(a|b|c)', '(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c d', ''); $oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testInverseWordSets() public function testInverseWordSets()
{ {
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals( $this->assertEquals(
@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMaxWordSets() public function testMaxWordSets()
{ {
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); $aWords = array_fill(0, 4, 'a');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); $aWords = array_fill(0, 18, 'a');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(100, count($oPhrase->getWordSets())); $this->assertEquals(100, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(100, count($oPhrase->getWordSets())); $this->assertEquals(100, count($oPhrase->getWordSets()));
@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensShortTerm() public function testPartialTokensShortTerm()
{ {
$oPhrase = new Phrase('a b c d', ''); $oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
$this->assertEquals( $this->assertEquals(
'(a|b c d),(a|b c|d)', '(a|b c d),(a|b c|d)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensLongTerm() public function testPartialTokensLongTerm()
{ {
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); $aWords = array_fill(0, 18, 'a');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a'))); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
$this->assertEquals(80, count($oPhrase->getWordSets())); $this->assertEquals(80, count($oPhrase->getWordSets()));
} }
} }

View File

@ -2,6 +2,8 @@
namespace Nominatim; namespace Nominatim;
@define('CONST_TokenizerDir', dirname(__FILE__));
require_once(CONST_LibDir.'/DB.php'); require_once(CONST_LibDir.'/DB.php');
require_once(CONST_LibDir.'/Status.php'); require_once(CONST_LibDir.'/Status.php');
@ -40,45 +42,6 @@ class StatusTest extends \PHPUnit\Framework\TestCase
$this->assertEquals('No database', $oStatus->status()); $this->assertEquals('No database', $oStatus->status());
} }
public function testModuleFail()
{
$this->expectException(\Exception::class);
$this->expectExceptionMessage('Module call failed');
$this->expectExceptionCode(702);
// stub has getOne method but doesn't return anything
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('connect', 'getOne'))
->getMock();
$oStatus = new Status($oDbStub);
$this->assertNull($oStatus->status());
}
public function testWordIdQueryFail()
{
$this->expectException(\Exception::class);
$this->expectExceptionMessage('No value');
$this->expectExceptionCode(704);
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('connect', 'getOne'))
->getMock();
// return no word_id
$oDbStub->method('getOne')
->will($this->returnCallback(function ($sql) {
if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
}));
$oStatus = new Status($oDbStub);
$this->assertNull($oStatus->status());
}
public function testOK() public function testOK()
{ {
$oDbStub = $this->getMockBuilder(Nominatim\DB::class) $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
@ -100,7 +63,7 @@ class StatusTest extends \PHPUnit\Framework\TestCase
$oDbStub = $this->getMockBuilder(Nominatim\DB::class) $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getOne')) ->setMethods(array('getOne'))
->getMock(); ->getMock();
$oDbStub->method('getOne') $oDbStub->method('getOne')
->willReturn(1519430221); ->willReturn(1519430221);

View File

@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($TL->contains('unknownword')); $this->assertFalse($TL->contains('unknownword'));
$this->assertEquals(array(), $TL->get('unknownword')); $this->assertEquals(array(), $TL->get('unknownword'));
} }
public function testAddress()
{
$this->expectOutputRegex('/<p><tt>/');
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getAll', 'getDBQuotedList'))
->getMock();
$oDbStub->method('getDBQuotedList')
->will($this->returnCallback(function ($aVals) {
return array_map(function ($sVal) {
return "'".$sVal."'";
}, $aVals);
}));
$oDbStub->method('getAll')
->will($this->returnCallback(function ($sql) {
$aResults = array();
if (preg_match('/1051/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '1051',
'class' => 'place',
'type' => 'house'
));
}
if (preg_match('/hauptstr/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'hauptstr',
'class' => 'place',
'type' => 'street',
'operator' => true
));
}
if (preg_match('/64286/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '64286',
'word' => '64286',
'class' => 'place',
'type' => 'postcode'
));
}
if (preg_match('/darmstadt/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'darmstadt',
'count' => 533
));
}
if (preg_match('/alemagne/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'alemagne',
'country_code' => 'de',
));
}
if (preg_match('/mexico/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'mexico',
'country_code' => 'mx',
));
}
return $aResults;
}));
$aCountryCodes = array('de', 'fr');
$sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
$aTokens = explode(' ', $sNormQuery);
$TL = new TokenList;
$TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
$this->assertEquals(5, $TL->count());
$this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
$this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
$this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
$this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
$this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
}
} }

View File

@ -0,0 +1,17 @@
<?php
namespace Nominatim;
class Tokenizer
{
private $oDB;
public function __construct(&$oDB)
{
$this->oDB =& $oDB;
}
public function checkStatus()
{
}
}

View File

@ -1,3 +1,4 @@
import importlib
import itertools import itertools
import sys import sys
from pathlib import Path from pathlib import Path
@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.db import connection from nominatim.db import connection
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.db import properties
import dummy_tokenizer
class _TestingCursor(psycopg2.extras.DictCursor): class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution """ Extension to the DictCursor class that provides execution
@ -117,9 +121,8 @@ def table_factory(temp_db_cursor):
def mk_table(name, definition='id INT', content=None): def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None: if content is not None:
if not isinstance(content, str): psycopg2.extras.execute_values(
content = '),('.join([str(x) for x in content]) temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
return mk_table return mk_table
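With execute_values, the content argument must now be an iterable of row tuples rather than bare values; a minimal usage sketch (table name and values are placeholders):

    table_factory('example_table', 'id INT', ((1, ), (2, ), (3, )))  # one tuple per row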
@ -144,6 +147,11 @@ def tmp_phplib_dir():
yield Path(phpdir) yield Path(phpdir)
@pytest.fixture
def property_table(table_factory):
table_factory('nominatim_properties', 'property TEXT, value TEXT')
@pytest.fixture @pytest.fixture
def status_table(temp_db_conn): def status_table(temp_db_conn):
""" Create an empty version of the status table and """ Create an empty version of the status table and
@ -281,10 +289,29 @@ def osm2pgsql_options(temp_db):
@pytest.fixture @pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.') table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
table_factory('country_name', 'partition INT', (0, 1, 2))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings') cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php', cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
sql=tmp_path, data=SRC_DIR / 'data') sql=tmp_path, data=SRC_DIR / 'data')
return SQLPreprocessor(temp_db_conn, cfg) return SQLPreprocessor(temp_db_conn, cfg)
@pytest.fixture
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
""" Sets up the configuration so that the test dummy tokenizer will be
loaded when the tokenizer factory is used. Also returns a factory
with which a new dummy tokenizer may be created.
"""
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
def _import_dummy(module, *args, **kwargs):
return dummy_tokenizer
monkeypatch.setattr(importlib, "import_module", _import_dummy)
properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
def _create_tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None)
return _create_tokenizer
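The fixture works by intercepting module resolution: once importlib.import_module is patched, whatever tokenizer module the factory asks for resolves to the dummy. A short sketch of the effect while the fixture is active (the dsn is a placeholder):

    import importlib
    mod = importlib.import_module('nominatim.tokenizer.legacy_tokenizer')  # any name works
    tok = mod.create('dbname=placeholder', None)  # -> dummy_tokenizer.DummyTokenizer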

View File

@ -0,0 +1,64 @@
"""
Tokenizer for testing.
"""
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
"""
return DummyTokenizer(dsn, data_dir)
class DummyTokenizer:
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.init_state = None
self.analyser_cache = {}
def init_new_db(self, *args, **kwargs):
assert self.init_state is None
self.init_state = "new"
def init_from_project(self):
assert self.init_state is None
self.init_state = "loaded"
def finalize_import(self, _):
pass
def name_analyzer(self):
return DummyNameAnalyzer(self.analyser_cache)
class DummyNameAnalyzer:
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def __init__(self, cache):
self.analyser_cache = cache
cache['countries'] = []
def close(self):
pass
def add_postcodes_from_db(self):
pass
def update_special_phrases(self, phrases):
self.analyser_cache['special_phrases'] = phrases
def add_country_names(self, code, names):
self.analyser_cache['countries'].append((code, names))
def process_place(self, place):
return {}
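For orientation, a brief sketch of how this stub records calls so tests can assert on them afterwards (dsn and data_dir are placeholders):

    tok = create('dbname=placeholder', None)
    tok.init_new_db()
    with tok.name_analyzer() as analyzer:
        analyzer.update_special_phrases([('bar', 'amenity', 'bar', '-')])
    assert tok.analyser_cache['special_phrases'] == [('bar', 'amenity', 'bar', '-')]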

View File

@ -22,6 +22,7 @@ import nominatim.tools.database_import
import nominatim.tools.freeze import nominatim.tools.freeze
import nominatim.tools.refresh import nominatim.tools.refresh
import nominatim.tools.postcodes import nominatim.tools.postcodes
import nominatim.tokenizer.factory
from mocks import MockParamCapture from mocks import MockParamCapture
@ -56,6 +57,28 @@ def mock_func_factory(monkeypatch):
return get_mock return get_mock
@pytest.fixture
def tokenizer_mock(monkeypatch):
class DummyTokenizer:
def __init__(self, *args, **kwargs):
self.update_sql_functions_called = False
self.finalize_import_called = False
def update_sql_functions(self, *args):
self.update_sql_functions_called = True
def finalize_import(self, *args):
self.finalize_import_called = True
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
lambda *args: tok)
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
lambda *args: tok)
return tok
def test_cli_help(capsys): def test_cli_help(capsys):
""" Running nominatim tool without arguments prints help. """ Running nominatim tool without arguments prints help.
""" """
@ -84,10 +107,9 @@ def test_import_bad_file(temp_db):
assert 1 == call_nominatim('import', '--osm-file', '.') assert 1 == call_nominatim('import', '--osm-file', '.')
def test_import_full(temp_db, mock_func_factory): def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'), mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
mock_func_factory(nominatim.tools.database_import, 'install_module'),
mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'), mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
@ -107,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert 0 == call_nominatim('import', '--osm-file', __file__) assert 0 == call_nominatim('import', '--osm-file', __file__)
assert tokenizer_mock.finalize_import_called
assert cf_mock.called > 1 assert cf_mock.called > 1
@ -114,7 +137,7 @@ def test_import_full(temp_db, mock_func_factory):
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
def test_import_continue_load_data(temp_db, mock_func_factory): def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'load_data'),
@ -127,12 +150,14 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
] ]
assert 0 == call_nominatim('import', '--continue', 'load-data') assert 0 == call_nominatim('import', '--continue', 'load-data')
assert tokenizer_mock.finalize_import_called
for mock in mocks: for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn): def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
temp_db_conn, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@ -153,7 +178,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
assert temp_db_conn.index_exists('idx_placex_pendingsector') assert temp_db_conn.index_exists('idx_placex_pendingsector')
def test_import_continue_postprocess(temp_db, mock_func_factory): def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@ -163,6 +188,8 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
assert 0 == call_nominatim('import', '--continue', 'db-postprocess') assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
assert tokenizer_mock.finalize_import_called
for mock in mocks: for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
@ -217,7 +244,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
(['--boundaries-only'], 1, 0), (['--boundaries-only'], 1, 0),
(['--no-boundaries'], 0, 1), (['--no-boundaries'], 0, 1),
(['--boundaries-only', '--no-boundaries'], 0, 0)]) (['--boundaries-only', '--no-boundaries'], 0, 0)])
def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks): def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
params, do_bnds, do_ranks):
temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)") temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries') bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank') rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
@ -227,7 +255,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
assert bnd_mock.called == do_bnds assert bnd_mock.called == do_bnds
assert rank_mock.called == do_ranks assert rank_mock.called == do_ranks
def test_special_phrases_command(temp_db, mock_func_factory): def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki') func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
call_nominatim('special-phrases', '--import-from-wiki') call_nominatim('special-phrases', '--import-from-wiki')
@ -238,7 +266,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
('postcodes', 'update_postcodes'), ('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'), ('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'), ('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'), ('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'), ('importance', 'recompute_importance'),
('website', 'setup_website'), ('website', 'setup_website'),
@ -250,6 +277,14 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1 assert func_mock.called == 1
def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert 0 == call_nominatim('refresh', '--functions')
assert func_mock.called == 1
assert tokenizer_mock.update_sql_functions_called
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = [] calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',

View File

@ -27,7 +27,29 @@ def call_nominatim(*args):
cli_args=['replication'] + list(args)) cli_args=['replication'] + list(args))
@pytest.fixture @pytest.fixture
def index_mock(monkeypatch): def tokenizer_mock(monkeypatch):
class DummyTokenizer:
def __init__(self, *args, **kwargs):
self.update_sql_functions_called = False
self.finalize_import_called = False
def update_sql_functions(self, *args):
self.update_sql_functions_called = True
def finalize_import(self, *args):
self.finalize_import_called = True
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
lambda *args: tok)
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
lambda *args: tok)
return tok
@pytest.fixture
def index_mock(monkeypatch, tokenizer_mock):
mock = MockParamCapture() mock = MockParamCapture()
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock) monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock) monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@ -52,7 +74,7 @@ def init_status(temp_db_conn, status_table):
@pytest.fixture @pytest.fixture
def update_mock(mock_func_factory, init_status): def update_mock(mock_func_factory, init_status, tokenizer_mock):
return mock_func_factory(nominatim.tools.replication, 'update') return mock_func_factory(nominatim.tools.replication, 'update')
@pytest.mark.parametrize("params,func", [ @pytest.mark.parametrize("params,func", [

View File

@ -24,7 +24,6 @@ def sql_factory(tmp_path):
("'{{db.partitions|join}}'", '012'), ("'{{db.partitions|join}}'", '012'),
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"), ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"), ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
("'{{config.DATABASE_MODULE_PATH}}'", '.')
]) ])
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret): def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
sqlfile = sql_factory("RETURN {};".format(expr)) sqlfile = sql_factory("RETURN {};".format(expr))

View File

@ -6,6 +6,7 @@ import psycopg2
import pytest import pytest
from nominatim.indexer import indexer from nominatim.indexer import indexer
from nominatim.tokenizer import factory
class IndexerTestDB: class IndexerTestDB:
@ -17,6 +18,7 @@ class IndexerTestDB:
self.conn = conn self.conn = conn
self.conn.set_isolation_level(0) self.conn.set_isolation_level(0)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute('CREATE EXTENSION hstore')
cur.execute("""CREATE TABLE placex (place_id BIGINT, cur.execute("""CREATE TABLE placex (place_id BIGINT,
class TEXT, class TEXT,
type TEXT, type TEXT,
@ -26,9 +28,14 @@ class IndexerTestDB:
indexed_date TIMESTAMP, indexed_date TIMESTAMP,
partition SMALLINT, partition SMALLINT,
admin_level SMALLINT, admin_level SMALLINT,
address HSTORE,
token_info JSONB,
geometry_sector INTEGER)""") geometry_sector INTEGER)""")
cur.execute("""CREATE TABLE location_property_osmline ( cur.execute("""CREATE TABLE location_property_osmline (
place_id BIGINT, place_id BIGINT,
osm_id BIGINT,
address HSTORE,
token_info JSONB,
indexed_status SMALLINT, indexed_status SMALLINT,
indexed_date TIMESTAMP, indexed_date TIMESTAMP,
geometry_sector INTEGER)""") geometry_sector INTEGER)""")
@ -46,6 +53,25 @@ class IndexerTestDB:
END IF; END IF;
RETURN NEW; RETURN NEW;
END; $$ LANGUAGE plpgsql;""") END; $$ LANGUAGE plpgsql;""")
cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR)
AS $$
BEGIN
address := p.address;
name := p.address;
END;
$$ LANGUAGE plpgsql STABLE;
""")
cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
RETURNS HSTORE AS $$
BEGIN
RETURN in_address;
END;
$$ LANGUAGE plpgsql STABLE;
""")
for table in ('placex', 'location_property_osmline', 'location_postcode'): for table in ('placex', 'location_property_osmline', 'location_postcode'):
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0} cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
FOR EACH ROW EXECUTE PROCEDURE date_update() FOR EACH ROW EXECUTE PROCEDURE date_update()
@ -76,9 +102,9 @@ class IndexerTestDB:
next_id = next(self.osmline_id) next_id = next(self.osmline_id)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("""INSERT INTO location_property_osmline cur.execute("""INSERT INTO location_property_osmline
(place_id, indexed_status, geometry_sector) (place_id, osm_id, indexed_status, geometry_sector)
VALUES (%s, 1, %s)""", VALUES (%s, %s, 1, %s)""",
(next_id, sector)) (next_id, next_id, sector))
return next_id return next_id
def add_postcode(self, country, postcode): def add_postcode(self, country, postcode):
@ -102,8 +128,14 @@ def test_db(temp_db_conn):
yield IndexerTestDB(temp_db_conn) yield IndexerTestDB(temp_db_conn)
@pytest.fixture
def test_tokenizer(tokenizer_mock, def_config, tmp_path):
def_config.project_dir = tmp_path
return factory.create_tokenizer(def_config)
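All indexer tests below then share the same shape: the tokenizer is passed explicitly to the Indexer constructor, e.g.:

    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 1)
    idx.index_full(analyse=False)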
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_all_by_rank(test_db, threads): def test_index_all_by_rank(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -111,7 +143,7 @@ def test_index_all_by_rank(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(0, 30) idx.index_by_rank(0, 30)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()
@ -142,7 +174,7 @@ def test_index_all_by_rank(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_without_30(test_db, threads): def test_index_partial_without_30(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -150,7 +182,8 @@ def test_index_partial_without_30(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
test_tokenizer, threads)
idx.index_by_rank(4, 15) idx.index_by_rank(4, 15)
assert 19 == test_db.placex_unindexed() assert 19 == test_db.placex_unindexed()
@ -162,7 +195,7 @@ def test_index_partial_without_30(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_with_30(test_db, threads): def test_index_partial_with_30(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -170,7 +203,7 @@ def test_index_partial_with_30(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30) idx.index_by_rank(28, 30)
assert 27 == test_db.placex_unindexed() assert 27 == test_db.placex_unindexed()
@ -181,7 +214,7 @@ def test_index_partial_with_30(test_db, threads):
WHERE indexed_status = 0 AND rank_address between 1 and 27""") WHERE indexed_status = 0 AND rank_address between 1 and 27""")
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_boundaries(test_db, threads): def test_index_boundaries(test_db, threads, test_tokenizer):
for rank in range(4, 10): for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank) test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31): for rank in range(31):
@ -191,7 +224,7 @@ def test_index_boundaries(test_db, threads):
assert 37 == test_db.placex_unindexed() assert 37 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_boundaries(0, 30) idx.index_boundaries(0, 30)
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
@ -203,13 +236,13 @@ def test_index_boundaries(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_postcodes(test_db, threads): def test_index_postcodes(test_db, threads, test_tokenizer):
for postcode in range(1000): for postcode in range(1000):
test_db.add_postcode('de', postcode) test_db.add_postcode('de', postcode)
for postcode in range(32000, 33000): for postcode in range(32000, 33000):
test_db.add_postcode('us', postcode) test_db.add_postcode('us', postcode)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_postcodes() idx.index_postcodes()
assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
@ -217,7 +250,7 @@ def test_index_postcodes(test_db, threads):
@pytest.mark.parametrize("analyse", [True, False]) @pytest.mark.parametrize("analyse", [True, False])
def test_index_full(test_db, analyse): def test_index_full(test_db, analyse, test_tokenizer):
for rank in range(4, 10): for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank) test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31): for rank in range(31):
@ -226,7 +259,7 @@ def test_index_full(test_db, analyse):
for postcode in range(1000): for postcode in range(1000):
test_db.add_postcode('de', postcode) test_db.add_postcode('de', postcode)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
idx.index_full(analyse=analyse) idx.index_full(analyse=analyse)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()
@ -236,13 +269,13 @@ def test_index_full(test_db, analyse):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_reopen_connection(test_db, threads, monkeypatch): def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15) monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
for _ in range(1000): for _ in range(1000):
test_db.add_place(rank_address=30, rank_search=30) test_db.add_place(rank_address=30, rank_search=30)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30) idx.index_by_rank(28, 30)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()

View File

@ -0,0 +1,77 @@
"""
Tests for creating new tokenizers.
"""
import importlib
import pytest
from nominatim.db import properties
from nominatim.tokenizer import factory
from nominatim.errors import UsageError
from dummy_tokenizer import DummyTokenizer
@pytest.fixture
def test_config(def_config, tmp_path):
def_config.project_dir = tmp_path
return def_config
def test_setup_dummy_tokenizer(temp_db_conn, test_config,
tokenizer_mock, property_table):
tokenizer = factory.create_tokenizer(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "new"
assert (test_config.project_dir / 'tokenizer').is_dir()
assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
(test_config.project_dir / 'tokenizer').mkdir()
tokenizer = factory.create_tokenizer(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "new"
def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
(test_config.project_dir / 'tokenizer').write_text("foo")
with pytest.raises(UsageError):
factory.create_tokenizer(test_config)
def test_setup_bad_tokenizer_name(test_config, monkeypatch):
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
with pytest.raises(UsageError):
factory.create_tokenizer(test_config)
def test_load_tokenizer(temp_db_conn, test_config,
tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
tokenizer = factory.get_tokenizer_for_db(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "loaded"
def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
test_config.project_dir = test_config.project_dir / 'foo'
with pytest.raises(UsageError):
factory.get_tokenizer_for_db(test_config)
def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
with pytest.raises(UsageError):
factory.get_tokenizer_for_db(test_config)
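Taken together, these tests pin down the factory round-trip contract; schematically (with the dummy fixture active):

    factory.create_tokenizer(config)            # creates <project_dir>/tokenizer and
                                                # stores the property 'tokenizer' = 'dummy'
    tok = factory.get_tokenizer_for_db(config)  # reads the property back
    assert tok.init_state == 'loaded'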

View File

@ -0,0 +1,298 @@
"""
Test for legacy tokenizer.
"""
import shutil
import pytest
from nominatim.tokenizer import legacy_tokenizer
from nominatim.db import properties
from nominatim.errors import UsageError
@pytest.fixture
def test_config(def_config, tmp_path):
def_config.project_dir = tmp_path / 'project'
def_config.project_dir.mkdir()
module_dir = tmp_path / 'module_src'
module_dir.mkdir()
(module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
def_config.lib_dir.module = module_dir
sqldir = tmp_path / 'sql'
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
(sqldir / 'words.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir
def_config.lib_dir.data = sqldir
return def_config
@pytest.fixture
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
(tmp_path / 'tokenizer').mkdir()
def _maker():
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
return _maker
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
@pytest.fixture
def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
word_table, temp_db_with_extensions, tmp_path):
sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
sql.write_text("""
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
""")
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
tok = tokenizer_factory()
tok.init_new_db(test_config)
monkeypatch.undo()
with tok.name_analyzer() as analyzer:
yield analyzer
@pytest.fixture
def make_standard_name(temp_db_cursor):
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
@pytest.fixture
def create_postcode_id(table_factory, temp_db_cursor):
table_factory('out_postcode_table', 'postcode TEXT')
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
RETURNS BOOLEAN AS $$
INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
$$ LANGUAGE SQL""")
@pytest.fixture
def create_housenumbers(temp_db_cursor):
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
housenumbers TEXT[],
OUT tokens TEXT, OUT normtext TEXT)
AS $$
SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
$$ LANGUAGE SQL""")
@pytest.fixture
def make_keywords(temp_db_cursor, temp_db_with_extensions):
temp_db_cursor.execute(
"""CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
def test_init_new(tokenizer_factory, test_config, monkeypatch,
temp_db_conn, sql_preprocessor):
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
outfile = test_config.project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_init_module_load_failed(tokenizer_factory, test_config,
monkeypatch, temp_db_conn):
tok = tokenizer_factory()
with pytest.raises(UsageError):
tok.init_new_db(test_config)
def test_init_module_custom(tokenizer_factory, test_config,
monkeypatch, tmp_path, sql_preprocessor):
module_dir = (tmp_path / 'custom').resolve()
module_dir.mkdir()
(module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
assert not (test_config.project_dir / 'module').exists()
def test_init_from_project(tokenizer_setup, tokenizer_factory):
tok = tokenizer_factory()
tok.init_from_project()
assert tok.normalization is not None
def test_update_sql_functions(sql_preprocessor, temp_db_conn,
tokenizer_factory, test_config, table_factory,
monkeypatch, temp_db_cursor):
monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
monkeypatch.undo()
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
table_factory('test', 'txt TEXT')
func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
('{{modulepath}}')""")
tok.update_sql_functions(test_config)
test_content = temp_db_cursor.row_set('SELECT * FROM test')
assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.migrate_database(test_config)
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
outfile = test_config.project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_normalize(analyzer):
assert analyzer.normalize('TEsT') == 'test'
def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
create_postcode_id):
table_factory('location_postcode', 'postcode TEXT',
content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
analyzer.add_postcodes_from_db()
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
== set((('1234', ), ('12 34', ), ('AB23',)))
def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
make_standard_name):
analyzer.update_special_phrases([
("König bei", "amenity", "royal", "near"),
("Könige", "amenity", "royal", "-"),
("strasse", "highway", "primary", "in")
])
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
FROM word WHERE class != 'place'""") \
== set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
(' könige', 'könige', 'amenity', 'royal', None),
(' strasse', 'strasse', 'highway', 'primary', 'in')))
def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
make_standard_name):
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', null)""")
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
analyzer.update_special_phrases([])
assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
make_standard_name):
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', null)""")
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
analyzer.update_special_phrases([
('prison', 'amenity', 'prison', 'in'),
('bar', 'highway', 'road', '-'),
('garden', 'leisure', 'garden', 'near')
])
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
FROM word WHERE class != 'place'""") \
== set(((' prison', 'prison', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', None),
(' garden', 'garden', 'leisure', 'garden', 'near')))
def test_process_place_names(analyzer, make_keywords):
info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
assert info['names'] == '{1,2,3}'
@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
info = analyzer.process_place({'address': {'postcode' : pc}})
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
== set(((pc, ),))
@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
pc):
info = analyzer.process_place({'address': {'postcode' : pc}})
assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
info = analyzer.process_place({'address': {'housenumber' : hnr}})
assert info['hnr'] == hnr
assert info['hnr_tokens'].startswith("{")
def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
info = analyzer.process_place({'address': {'housenumber' : '134',
'conscriptionnumber' : '134',
'streetnumber' : '99a'}})
assert set(info['hnr'].split(';')) == set(('134', '99a'))
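The analyzer tests above all go through the context-manager pattern set up in the analyzer fixture; condensed:

    with tok.name_analyzer() as analyzer:
        info = analyzer.process_place({'address': {'housenumber': '134'}})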

View File

@ -43,8 +43,22 @@ def test_check_placex_table_size_bad(temp_db_cursor, temp_db_conn, def_config):
assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
def test_check_module_bad(temp_db_conn, def_config): def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL def_config.project_dir = tmp_path
assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
("Something wrong", chkdb.CheckState.FAIL)])
def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
check_result, state):
class _TestTokenizer:
def check_database(self):
return check_result
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
lambda *a, **k: _TestTokenizer())
assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config): def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):

View File

@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
database_import.setup_extensions(temp_db_conn) database_import.setup_extensions(temp_db_conn)
def test_install_module(tmp_path):
src_dir = tmp_path / 'source'
src_dir.mkdir()
(src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
project_dir = tmp_path / 'project'
project_dir.mkdir()
database_import.install_module(src_dir, project_dir, '')
outfile = project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_install_module_custom(tmp_path):
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
assert not (tmp_path / 'module').exists()
def test_install_module_fail_access(temp_db_conn, tmp_path):
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
database_import.install_module(tmp_path, tmp_path, '',
conn=temp_db_conn)
def test_import_base_data(src_dir, temp_db, temp_db_cursor): def test_import_base_data(src_dir, temp_db, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION hstore') temp_db_cursor.execute('CREATE EXTENSION hstore')
temp_db_cursor.execute('CREATE EXTENSION postgis') temp_db_cursor.execute('CREATE EXTENSION postgis')
@ -171,14 +138,15 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory): def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
tables = ('word', 'placex', 'place_addressline', 'location_area', tables = ('placex', 'place_addressline', 'location_area',
'location_area_country', 'location_area_country',
'location_property_tiger', 'location_property_osmline', 'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23') 'location_postcode', 'search_name', 'location_road_23')
for table in tables: for table in tables:
table_factory(table, content=(1, 2, 3)) table_factory(table, content=((1, ), (2, ), (3, )))
assert temp_db_cursor.table_rows(table) == 3
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23) database_import.truncate_data_tables(temp_db_conn)
for table in tables: for table in tables:
assert temp_db_cursor.table_rows(table) == 0 assert temp_db_cursor.table_rows(table) == 0
@ -196,36 +164,33 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
place_row(osm_type='W', osm_id=342, cls='place', typ='houses', place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)') geom='SRID=4326;LINESTRING(0 0, 10 10)')
database_import.load_data(dsn, src_dir / 'data', threads) database_import.load_data(dsn, threads)
assert temp_db_cursor.table_rows('placex') == 30 assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1 assert temp_db_cursor.table_rows('location_property_osmline') == 1
@pytest.mark.parametrize("languages", (False, True))
def test_create_country_names(temp_db_conn, temp_db_cursor, def_config, @pytest.mark.parametrize("languages", (None, ' fr,en'))
temp_db_with_extensions, monkeypatch, languages): def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages):
table_factory('country_name', 'country_code varchar(2), name hstore',
content=(('us', '"name"=>"us1","name:af"=>"us2"'),
('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
tokenizer = tokenizer_mock()
database_import.create_country_names(temp_db_conn, tokenizer, languages)
assert len(tokenizer.analyser_cache['countries']) == 2
result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
if languages: if languages:
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en') assert result_set == {'us' : set(('us', 'us1', 'United States')),
temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT) 'fr' : set(('fr', 'Fra', 'Fren'))}
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
""")
temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
'"name"=>"us","name:af"=>"us"')""")
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
BEGIN
INSERT INTO word VALUES (lookup_country_code);
RETURN 5;
END;
$$
LANGUAGE plpgsql;
""")
database_import.create_country_names(temp_db_conn, def_config)
if languages:
assert temp_db_cursor.table_rows('word') == 4
else: else:
assert temp_db_cursor.table_rows('word') == 5 assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
'fr' : set(('fr', 'Fra', 'Fren'))}

View File

@ -11,41 +11,6 @@ from nominatim.tools import SpecialPhrasesImporter
TEST_BASE_DIR = Path(__file__) / '..' / '..' TEST_BASE_DIR = Path(__file__) / '..' / '..'
def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
temp_db_cursor):
"""
Check for the fetch_existing_words_phrases() method.
It should return the special phrase terms added to the word
table.
"""
query = """
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
'class', 'type', null, 0, 'near');
"""
temp_db_cursor.execute(query)
assert not special_phrases_importer.words_phrases_to_delete
special_phrases_importer._fetch_existing_words_phrases()
contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
@pytest.mark.parametrize("house_type", ['house', 'postcode'])
def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
house_type, temp_db_cursor):
"""
Check for the fetch_existing_words_phrases() method.
It should return nothing, as the added terms correspond
to housenumber and postcode terms.
"""
query ="""
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
'place', %s, null, 0, 'near');
"""
temp_db_cursor.execute(query, (house_type,))
special_phrases_importer._fetch_existing_words_phrases()
assert not special_phrases_importer.words_phrases_to_delete
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor): def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
""" """
Check for the fetch_existing_place_classtype_tables() method. Check for the fetch_existing_place_classtype_tables() method.
@@ -118,41 +83,11 @@ def test_convert_settings_giving_json(special_phrases_importer):
         the same path is directly returned
     """
     json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()

     returned = special_phrases_importer._convert_php_settings_if_needed(json_file)

     assert returned == json_file

-def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
-                                       temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenityoperator() SQL function and that
-        the 2 differents operators are well handled.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', 'near')
-    special_phrases_importer._process_amenity('', '', '', '', 'in')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
-        results = temp_db_cursor.fetchall()
-
-    assert len(results) == 2
-
-def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
-                                          temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenity() SQL function.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', '')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
-        result = temp_db_cursor.fetchone()
-
-    assert result
-
 def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
     """
         Test that _create_place_classtype_indexes() create the
@@ -215,8 +150,7 @@ def test_create_place_classtype_table_and_indexes(
         assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
         assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])

-def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
-                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
     """
         Test that _process_xml_content() process the given xml content right
        by executing the right SQL functions for amenities and
@@ -228,11 +162,9 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
     #Converted output set to a dict for easy assert further.
     results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))

-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert results[class_test] and type_test in results.values()

-def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
                                              temp_db_conn):
     """
         Check for the remove_non_existent_phrases_from_db() method.
@@ -245,22 +177,10 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
         be deleted.
     """
     with temp_db_conn.cursor() as temp_db_cursor:
-        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
-        to_keep_phrase_tuple = (
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        )
-        special_phrases_importer.words_phrases_to_delete = {
-            to_delete_phrase_tuple,
-            to_keep_phrase_tuple
-        }
-        special_phrases_importer.words_phrases_still_exist = {
-            to_keep_phrase_tuple
-        }
         special_phrases_importer.table_phrases_to_delete = {
             'place_classtype_testclasstypetable_to_delete'
         }

-        query_words = 'SELECT word, class, type, operator FROM word;'
         query_tables = """
             SELECT table_name
             FROM information_schema.tables
@@ -268,21 +188,16 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
             AND table_name like 'place_classtype_%';
         """

-        special_phrases_importer._remove_non_existent_phrases_from_db()
+        special_phrases_importer._remove_non_existent_tables_from_db()

-        temp_db_cursor.execute(query_words)
-        words_result = temp_db_cursor.fetchall()
         temp_db_cursor.execute(query_tables)
         tables_result = temp_db_cursor.fetchall()
-        assert len(words_result) == 1 and words_result[0] == [
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        ]
         assert (len(tables_result) == 1 and
                 tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
                )

-def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
-                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
+                          placex_table, tokenizer_mock):
     """
         Check that the main import_from_wiki() method is well executed.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -294,17 +209,14 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     #what is deleted and what is preserved.
     with temp_db_conn.cursor() as temp_db_cursor:
         temp_db_cursor.execute("""
-            INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
-            'amenity', 'animal_shelter', null, 0, null);
-            INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
-            'wrong_class', 'wrong_type', null, 0, 'near');
             CREATE TABLE place_classtype_amenity_animal_shelter();
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")

     monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
-    special_phrases_importer.import_from_wiki(['en'])
+    tokenizer = tokenizer_mock()
+    special_phrases_importer.import_from_wiki(tokenizer, ['en'])
+
+    assert len(tokenizer.analyser_cache['special_phrases']) == 18

     class_test = 'aerialway'
     type_test = 'zip_line'
@@ -312,22 +224,12 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     assert check_table_exist(temp_db_conn, class_test, type_test)
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
     assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')

     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()

-    #Used to check that the correct phrase already in the word table before is still there.
-    query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
-    queries_tests.add((query_correct_word, True))
-
-    #Used to check if wrong phrase was deleted from the word table of the database.
-    query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
-    queries_tests.add((query_wrong_word, False))
-
     #Used to check that correct place_classtype table already in the datase before is still there.
     query_existing_table = """
         SELECT table_name
@@ -412,24 +314,6 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
         temp_db_conn.index_exists(index_prefix + 'place_id')
     )

-def check_amenities_with_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenityoperator()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
-def check_amenities_without_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenity()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
 @pytest.fixture
 def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
     """
@@ -453,48 +337,7 @@ def temp_phplib_dir_with_migration():
         yield Path(phpdir)

 @pytest.fixture
-def default_phrases(word_table, temp_db_cursor):
+def default_phrases(temp_db_cursor):
     temp_db_cursor.execute("""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
-        'class_exists', 'type_exists', null, 0, 'near');
-
         CREATE TABLE place_classtype_testclasstypetable_to_delete();
         CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
-
-@pytest.fixture
-def make_strandard_name_func(temp_db_cursor):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
-        BEGIN
-            RETURN trim(name); --Basically return only the trimed name for the tests
-        END;
-        $$ LANGUAGE plpgsql IMMUTABLE;""")
-
-@pytest.fixture
-def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                                       lookup_class text, lookup_type text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, 'no_operator');
-        END;
-        $$ LANGUAGE plpgsql""")
-
-@pytest.fixture
-def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE TABLE temp_with_operator(op TEXT);
-
-        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
-                                                               lookup_class text, lookup_type text, op text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, op);
-        END;
-        $$ LANGUAGE plpgsql""")

View File

@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest

 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)

 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                       postcode TEXT,
                       geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)

-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

     assert temp_db_cursor.table_exists('gb_postcode')
     assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0

-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
         VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
         """)
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                              ST_X(geometry), ST_Y(geometry)
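The postcode fixture above replaces the old getorcreate_postcode_id() stub,
which faked an INTEGER word id, with a token_normalized_postcode() pass-through
returning TEXT: token creation now happens in the tokenizer, so the SQL side is
only expected to normalize. A quick sanity check of that stub (a hypothetical
illustration using the fixtures above, not a test from this commit) could read:

    def test_token_normalized_postcode_stub(postcode_table, temp_db_cursor):
        # The fixture's stub simply echoes the postcode back unchanged.
        assert temp_db_cursor.scalar(
            "SELECT token_normalized_postcode('EC1R 3HW')") == 'EC1R 3HW'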

View File

@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
     return tmp_path

 @pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
     return temp_db_conn

View File

@@ -26,6 +26,7 @@ def test_script(envdir):
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir

     refresh.setup_website(envdir, config)

     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',

View File

@@ -1,120 +1,170 @@
-SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
-SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
-SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
-SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
-SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
-SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
-SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
-SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
+CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+                                                    lookup_class text, lookup_type text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and word = normalized_word
+          and class = lookup_class and type = lookup_type
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
+                                                            normalized_word TEXT,
+                                                            lookup_class text,
+                                                            lookup_type text,
+                                                            op text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and word = normalized_word
+          and class = lookup_class and type = lookup_type and operator = op
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0, op);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
+SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
+SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
 CREATE INDEX idx_placex_classtype ON placex (class, type);CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
@@ -175,4 +225,7 @@ CREATE TABLE place_classtype_memorial_war_memorial AS SELECT place_id AS place_i
 CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classtype_memorial_war_memorial USING GIST (centroid);
 CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
 GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
 DROP INDEX idx_placex_classtype;
+DROP FUNCTION test_getorcreate_amenity;
+DROP FUNCTION test_getorcreate_amenityoperator;