Merge pull request #2305 from lonvia/tokenizer

Factor out normalization into a separate module
Sarah Hoffmann 2021-05-03 09:15:34 +02:00 committed by GitHub
commit 8b1a509442
61 changed files with 3032 additions and 2012 deletions

View File

@@ -8,12 +8,14 @@
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');

 class Geocode
 {
     protected $oDB;
     protected $oPlaceLookup;
+    protected $oTokenizer;

     protected $aLangPrefOrder = array();

@@ -41,23 +43,12 @@ class Geocode
     protected $sQuery = false;
     protected $aStructuredQuery = false;

-    protected $oNormalizer = null;
-
     public function __construct(&$oDB)
     {
         $this->oDB =& $oDB;
         $this->oPlaceLookup = new PlaceLookup($this->oDB);
-        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
-    }
-
-    private function normTerm($sTerm)
-    {
-        if ($this->oNormalizer === null) {
-            return $sTerm;
-        }
-
-        return $this->oNormalizer->transliterate($sTerm);
+        $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
     }

     public function setLanguagePreference($aLangPref)

@@ -510,12 +501,10 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
+        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);

         Debug::newSection('Query Preprocessing');

-        $sNormQuery = $this->normTerm($this->sQuery);
-        Debug::printVar('Normalized query', $sNormQuery);
-
         $sLanguagePrefArraySQL = $this->oDB->getArraySQL(
             $this->oDB->getDBQuotedList($this->aLangPrefOrder)
         );

@@ -569,108 +558,55 @@ class Geocode
         }

         if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
-            $sSpecialTerm = pg_escape_string($sSpecialTerm);
-            $sToken = $this->oDB->getOne(
-                'SELECT make_standard_name(:term)',
-                array(':term' => $sSpecialTerm),
-                'Cannot decode query. Wrong encoding?'
-            );
-            $sSQL = 'SELECT class, type FROM word ';
-            $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')';
-            $sSQL .= ' AND class is not null AND class not in (\'place\')';
-            Debug::printSQL($sSQL);
-            $aSearchWords = $this->oDB->getAll($sSQL);
-            $aNewSearches = array();
-            foreach ($aSearches as $oSearch) {
-                foreach ($aSearchWords as $aSearchTerm) {
-                    $oNewSearch = clone $oSearch;
-                    $oNewSearch->setPoiSearch(
-                        Operator::TYPE,
-                        $aSearchTerm['class'],
-                        $aSearchTerm['type']
-                    );
-                    $aNewSearches[] = $oNewSearch;
+            $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+            if (!empty($aTokens)) {
+                $aNewSearches = array();
+                foreach ($aSearches as $oSearch) {
+                    foreach ($aTokens as $oToken) {
+                        $oNewSearch = clone $oSearch;
+                        $oNewSearch->setPoiSearch(
+                            $oToken->iOperator,
+                            $oToken->sClass,
+                            $oToken->sType
+                        );
+                        $aNewSearches[] = $oNewSearch;
+                    }
                 }
+                $aSearches = $aNewSearches;
             }
-            $aSearches = $aNewSearches;
         }

         // Split query into phrases
         // Commas are used to reduce the search space by indicating where phrases split
+        $aPhrases = array();
         if ($this->aStructuredQuery) {
-            $aInPhrases = $this->aStructuredQuery;
+            foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+                $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+            }
         } else {
-            $aInPhrases = explode(',', $sQuery);
+            foreach (explode(',', $sQuery) as $sPhrase) {
+                $aPhrases[] = new Phrase($sPhrase, '');
+            }
         }

         Debug::printDebugArray('Search context', $oCtx);
         Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
-        Debug::printVar('Final query phrases', $aInPhrases);

-        // Convert each phrase to standard form
-        // Create a list of standard words
-        // Get all 'sets' of words
-        // Generate a complete list of all
         Debug::newSection('Tokenization');
-        $aTokens = array();
-        $aPhrases = array();
-        foreach ($aInPhrases as $iPhrase => $sPhrase) {
-            $sPhrase = $this->oDB->getOne(
-                'SELECT make_standard_name(:phrase)',
-                array(':phrase' => $sPhrase),
-                'Cannot normalize query string (is it a UTF-8 string?)'
-            );
-            if (trim($sPhrase)) {
-                $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
-                $oPhrase->addTokens($aTokens);
-                $aPhrases[] = $oPhrase;
-            }
-        }
-
-        Debug::printVar('Tokens', $aTokens);
-
-        $oValidTokens = new TokenList();
-
-        if (!empty($aTokens)) {
-            $oValidTokens->addTokensFromDB(
-                $this->oDB,
-                $aTokens,
-                $this->aCountryCodes,
-                $sNormQuery,
-                $this->oNormalizer
-            );
-
-            if ($oValidTokens->count() > 0) {
-                $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
-            }
-
-            // Try more interpretations for Tokens that could not be matched.
-            foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
-                        // US ZIP+4 codes - merge in the 5-digit ZIP code
-                        $oValidTokens->addToken(
-                            $sToken,
-                            new Token\Postcode(null, $aData[1], 'us')
-                        );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
-                        // Unknown single word token with a number.
-                        // Assume it is a house number.
-                        $oValidTokens->addToken(
-                            $sToken,
-                            new Token\HouseNumber(null, trim($sToken))
-                        );
-                    }
-                }
-            }
-        }
+        $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
+
+        $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
+
+        $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+            return $oPhrase->getWordSets() !== null;
+        });

         // Any words that have failed completely?
         // TODO: suggestions

         Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
-        foreach ($aPhrases as $oPhrase) {
-            $oPhrase->computeWordSets($oValidTokens);
-        }
-
         Debug::printDebugTable('Phrases', $aPhrases);

         Debug::newSection('Search candidates');
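Taken together, the special-term lookup and the per-phrase normalization loop collapse into calls on the new tokenizer object. A condensed sketch of the new flow (illustration only, not part of the diff; assumes a connected Nominatim\DB in $oDB, and the query string is invented):

require_once(CONST_TokenizerDir.'/tokenizer.php');

$oTokenizer = new \Nominatim\Tokenizer($oDB);
$oTokenizer->setCountryRestriction(array('gb'));

// One Phrase per comma-separated part of the query; normalization now
// happens inside the tokenizer instead of in Geocode.
$aPhrases = array();
foreach (explode(',', '10 downing street, london') as $sPhrase) {
    $aPhrases[] = new Phrase($sPhrase, '');
}

$oValidTokens = $oTokenizer->extractTokensFromPhrases($aPhrases);
// $oValidTokens is a TokenList; each Phrase now carries its own word sets,
// and phrases without any known token are filtered out by the caller.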

View File

@@ -16,8 +16,6 @@ class Phrase
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
-    // Space-separated words of the phrase.
-    private $aWords;
     // Possible segmentations of the phrase.
     private $aWordSets;

@@ -38,7 +36,14 @@ class Phrase
     {
         $this->sPhrase = trim($sPhrase);
         $this->sPhraseType = $sPhraseType;
-        $this->aWords = explode(' ', $this->sPhrase);
+    }
+
+    /**
+     * Get the original phrase string.
+     */
+    public function getPhrase()
+    {
+        return $this->sPhrase;
     }

     /**

@@ -63,30 +68,6 @@ class Phrase
         return $this->aWordSets;
     }

-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    public function addTokens(&$aTokens)
-    {
-        $iNumWords = count($this->aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $this->aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$this->aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
-
     /**
      * Invert the set of possible segmentations.
      *

@@ -99,21 +80,27 @@ class Phrase
         }
     }

-    public function computeWordSets($oTokens)
+    public function computeWordSets($aWords, $oTokens)
     {
-        $iNumWords = count($this->aWords);
+        $iNumWords = count($aWords);
+
+        if ($iNumWords == 0) {
+            $this->aWordSets = null;
+            return;
+        }
+
         // Caches the word set for the partial phrase up to word i.
         $aSetCache = array_fill(0, $iNumWords, array());

         // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($this->aWords[0])) {
-            $aSetCache[0][] = array($this->aWords[0]);
+        if ($oTokens->containsAny($aWords[0])) {
+            $aSetCache[0][] = array($aWords[0]);
         }

         // Now do the next elements using what we already have.
         for ($i = 1; $i < $iNumWords; $i++) {
             for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
                 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                     $aPartial = array($sPartial);
                     foreach ($aSetCache[$j - 1] as $aSet) {

@@ -136,7 +123,7 @@ class Phrase
         }

         // finally the current full phrase
-        $sPartial = $this->aWords[0].' '.$sPartial;
+        $sPartial = $aWords[0].' '.$sPartial;
         if ($oTokens->containsAny($sPartial)) {
             $aSetCache[$i][] = array($sPartial);
         }

@@ -153,7 +140,6 @@ class Phrase
         return array(
                 'Type' => $this->sPhraseType,
                 'Phrase' => $this->sPhrase,
-                'Words' => $this->aWords,
                 'WordSets' => $this->aWordSets
               );
     }
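The segmentation algorithm itself is untouched; only its input changes. The word list now comes from the tokenizer's normalized output rather than from the phrase's own explode() result. For illustration (StubTokens is a hypothetical stand-in for TokenList, not part of the diff; it accepts every term so all segmentations survive):

class StubTokens
{
    public function containsAny($sTerm)
    {
        return true; // pretend every partial phrase is a known token
    }
}

$oPhrase = new \Nominatim\Phrase('new york city', '');
$oPhrase->computeWordSets(array('new', 'york', 'city'), new StubTokens());
// Word sets now include: [new york city], [new york|city],
// [new|york city] and [new|york|city].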

View File

@@ -2,6 +2,8 @@

 namespace Nominatim;

+require_once(CONST_TokenizerDir.'/tokenizer.php');
+
 use Exception;

 class Status

@@ -25,24 +27,8 @@ class Status
             throw new Exception('Database connection failed', 700);
         }

-        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
-        if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
-        }
-
-        if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
-        }
-
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
-        $sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
-        $iWordID = $this->oDB->getOne($sSQL);
-        if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
-        }
-        if (!$iWordID) {
-            throw new Exception('No value', 704);
-        }
+        $oTokenizer = new \Nominatim\Tokenizer($this->oDB);
+        $oTokenizer->checkStatus();
     }

     public function dataDate()
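Error codes 701-704 thus move out of Status and into Tokenizer::checkStatus(); code 700 (database connection) stays here. A minimal sketch of the new check (illustration only):

$oDB = new Nominatim\DB();
$oDB->connect();

$oTokenizer = new \Nominatim\Tokenizer($oDB);
$oTokenizer->checkStatus(); // throws 701/702 if the module call fails,
                            // 703/704 on word-table problems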
public function dataDate() public function dataDate()

View File

@@ -95,88 +95,6 @@ class TokenList
         return $ids;
     }

-    /**
-     * Add token information from the word table in the database.
-     *
-     * @param object   $oDB           Nominatim::DB instance.
-     * @param string[] $aTokens       List of tokens to look up in the database.
-     * @param string[] $aCountryCodes List of country restrictions.
-     * @param string   $sNormQuery    Normalized query string.
-     * @param object   $oNormalizer   Normalizer function to use on tokens.
-     *
-     * @return void
-     */
-    public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
-    {
-        // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
-        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
-        $sSQL .= ' FROM word WHERE word_token in (';
-        $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
-
-        Debug::printSQL($sSQL);
-        $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
-
-        foreach ($aDBWords as $aWord) {
-            $oToken = null;
-            $iId = (int) $aWord['word_id'];
-
-            if ($aWord['class']) {
-                // Special terms need to appear in their normalized form.
-                if ($aWord['word']) {
-                    $sNormWord = $aWord['word'];
-                    if ($oNormalizer != null) {
-                        $sNormWord = $oNormalizer->transliterate($aWord['word']);
-                    }
-                    if (strpos($sNormQuery, $sNormWord) === false) {
-                        continue;
-                    }
-                }
-
-                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
-                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
-                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
-                    if ($aWord['word']
-                        && pg_escape_string($aWord['word']) == $aWord['word']
-                    ) {
-                        $oToken = new Token\Postcode(
-                            $iId,
-                            $aWord['word'],
-                            $aWord['country_code']
-                        );
-                    }
-                } else {
-                    // near and in operator the same at the moment
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
-                    );
-                }
-            } elseif ($aWord['country_code']) {
-                // Filter country tokens that do not match restricted countries.
-                if (!$aCountryCodes
-                    || in_array($aWord['country_code'], $aCountryCodes)
-                ) {
-                    $oToken = new Token\Country($iId, $aWord['country_code']);
-                }
-            } else {
-                $oToken = new Token\Word(
-                    $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
-                );
-            }
-
-            if ($oToken) {
-                $this->addToken($aWord['word_token'], $oToken);
-            }
-        }
-    }
-
     /**
      * Add a new token for the given word.
      *

View File

@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));

 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');

 ini_set('memory_limit', '800M');

@@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB;
 $oDB->connect();

View File

@@ -3,7 +3,6 @@

 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');

@@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB();
 $oDB->connect();

View File

@@ -0,0 +1,254 @@
<?php

namespace Nominatim;

class Tokenizer
{
    private $oDB;

    private $oNormalizer = null;
    private $aCountryRestriction = null;

    public function __construct(&$oDB)
    {
        $this->oDB =& $oDB;
        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
    }

    public function checkStatus()
    {
        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
        if ($sStandardWord === false) {
            throw new \Exception('Module failed', 701);
        }

        if ($sStandardWord != 'a') {
            throw new \Exception('Module call failed', 702);
        }

        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
        $iWordID = $this->oDB->getOne($sSQL);
        if ($iWordID === false) {
            throw new \Exception('Query failed', 703);
        }
        if (!$iWordID) {
            throw new \Exception('No value', 704);
        }
    }

    public function setCountryRestriction($aCountries)
    {
        $this->aCountryRestriction = $aCountries;
    }

    public function normalizeString($sTerm)
    {
        if ($this->oNormalizer === null) {
            return $sTerm;
        }

        return $this->oNormalizer->transliterate($sTerm);
    }

    public function tokensForSpecialTerm($sTerm)
    {
        $aResults = array();

        $sSQL = 'SELECT word_id, class, type FROM word ';
        $sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)';
        $sSQL .= ' AND class is not null AND class not in (\'place\')';

        Debug::printVar('Term', $sTerm);
        Debug::printSQL($sSQL);
        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));

        Debug::printVar('Results', $aSearchWords);

        foreach ($aSearchWords as $aSearchTerm) {
            $aResults[] = new \Nominatim\Token\SpecialTerm(
                $aSearchTerm['word_id'],
                $aSearchTerm['class'],
                $aSearchTerm['type'],
                \Nominatim\Operator::TYPE
            );
        }

        Debug::printVar('Special term tokens', $aResults);

        return $aResults;
    }

    public function extractTokensFromPhrases(&$aPhrases)
    {
        // First get the normalized version of all phrases.
        $sNormQuery = '';
        $sSQL = 'SELECT ';
        $aParams = array();
        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
            $sSQL .= 'make_standard_name(:'.$iPhrase.') as p'.$iPhrase.',';
            $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
        }
        $sSQL = substr($sSQL, 0, -1);

        Debug::printSQL($sSQL);
        Debug::printVar('SQL parameters', $aParams);

        $aNormPhrases = $this->oDB->getRow($sSQL, $aParams);

        Debug::printVar('SQL result', $aNormPhrases);

        // Now compute all possible tokens.
        $aWordLists = array();
        $aTokens = array();
        foreach ($aNormPhrases as $sTitle => $sPhrase) {
            if (strlen($sPhrase) > 0) {
                $aWords = explode(' ', $sPhrase);
                Tokenizer::addTokens($aTokens, $aWords);
                $aWordLists[] = $aWords;
            } else {
                $aWordLists[] = array();
            }
        }

        Debug::printVar('Tokens', $aTokens);
        Debug::printVar('WordLists', $aWordLists);

        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
        }

        return $oValidTokens;
    }

    private function computeValidTokens($aTokens, $sNormQuery)
    {
        $oValidTokens = new TokenList();

        if (!empty($aTokens)) {
            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);

            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                        // US ZIP+4 codes - merge in the 5-digit ZIP code
                        $oValidTokens->addToken(
                            $sToken,
                            new Token\Postcode(null, $aData[1], 'us')
                        );
                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
                        // Unknown single word token with a number.
                        // Assume it is a house number.
                        $oValidTokens->addToken(
                            $sToken,
                            new Token\HouseNumber(null, trim($sToken))
                        );
                    }
                }
            }
        }

        return $oValidTokens;
    }

    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
    {
        // Check which tokens we have, get the ID numbers.
        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
        $sSQL .= ' FROM word WHERE word_token in (';
        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';

        Debug::printSQL($sSQL);

        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');

        foreach ($aDBWords as $aWord) {
            $oToken = null;
            $iId = (int) $aWord['word_id'];

            if ($aWord['class']) {
                // Special terms need to appear in their normalized form.
                // (postcodes are not normalized in the word table)
                $sNormWord = $this->normalizeString($aWord['word']);
                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
                    continue;
                }

                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
                    if ($aWord['word']
                        && pg_escape_string($aWord['word']) == $aWord['word']
                    ) {
                        $oToken = new Token\Postcode(
                            $iId,
                            $aWord['word'],
                            $aWord['country_code']
                        );
                    }
                } else {
                    // near and in operator the same at the moment
                    $oToken = new Token\SpecialTerm(
                        $iId,
                        $aWord['class'],
                        $aWord['type'],
                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
                    );
                }
            } elseif ($aWord['country_code']) {
                // Filter country tokens that do not match restricted countries.
                if (!$this->aCountryRestriction
                    || in_array($aWord['country_code'], $this->aCountryRestriction)
                ) {
                    $oToken = new Token\Country($iId, $aWord['country_code']);
                }
            } else {
                $oToken = new Token\Word(
                    $iId,
                    $aWord['word_token'][0] != ' ',
                    (int) $aWord['count'],
                    substr_count($aWord['word_token'], ' ')
                );
            }

            if ($oToken) {
                $oValidTokens->addToken($aWord['word_token'], $oToken);
            }
        }
    }

    /**
     * Add the tokens from the given word list to the given list of tokens.
     *
     * @param string[] $aTokens List of tokens to append to.
     * @param string[] $aWords  Words of the normalized phrase.
     *
     * @return void
     */
    private static function addTokens(&$aTokens, $aWords)
    {
        $iNumWords = count($aWords);

        for ($i = 0; $i < $iNumWords; $i++) {
            $sPhrase = $aWords[$i];
            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
            $aTokens[$sPhrase] = $sPhrase;

            for ($j = $i + 1; $j < $iNumWords; $j++) {
                $sPhrase .= ' '.$aWords[$j];
                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
                $aTokens[$sPhrase] = $sPhrase;
            }
        }
    }
}
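The private addTokens() helper reproduces the expansion previously done in Phrase::addTokens(): every contiguous word sequence is registered twice, once with the leading-space marker that the word table uses for full words and once without it for partial words. Illustration of the generated lookup keys (not part of the diff):

// Equivalent expansion for the word list ['main', 'st']:
$aTokens = array();
foreach (array('main', 'main st', 'st') as $sSequence) {
    $aTokens[' '.$sSequence] = ' '.$sSequence; // full-word variant
    $aTokens[$sSequence] = $sSequence;         // partial-word variant
}
// keys: ' main', 'main', ' main st', 'main st', ' st', 'st'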

View File

@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}

View File

@@ -12,39 +12,47 @@
 $$
 LANGUAGE plpgsql IMMUTABLE;

+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
-                                                    place TEXT, partition SMALLINT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+                                                    partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-  waynodes BIGINT[];
   location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);

   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex

@@ -147,15 +155,15 @@
   NEW.interpolationtype = NEW.address->'interpolation';

   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
-                                                 NEW.address->'place',
+  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
+                                                 token_addr_place_match_tokens(NEW.token_info),
                                                  NEW.partition, place_centroid, NEW.linegeo);

-  IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
-  ELSE
-    interpol_postcode := NULL;
-  END IF;
+  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
+
+  NEW.token_info := token_strip_info(NEW.token_info);
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
+  END IF;

@@ -202,12 +210,13 @@
       -- determine postcode
       postcode := coalesce(interpol_postcode,
-                           prevnode.address->'postcode',
-                           nextnode.address->'postcode',
+                           token_normalized_postcode(prevnode.address->'postcode'),
+                           token_normalized_postcode(nextnode.address->'postcode'),
                            postcode);

       IF postcode is NULL THEN
-        SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+        SELECT token_normalized_postcode(placex.postcode)
+          FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
       END IF;
       IF postcode is NULL THEN
         postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);

@@ -217,7 +226,7 @@
       NEW.startnumber := startnumber;
       NEW.endnumber := endnumber;
       NEW.linegeo := sectiongeo;
-      NEW.postcode := upper(trim(postcode));
+      NEW.postcode := postcode;
     ELSE
       insert into location_property_osmline
              (linegeo, partition, osm_id, parent_place_id,
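The new get_interpolation_address() only inherits an address when the interpolation way carries nothing except the interpolation tag itself, and it marks the result with an '_inherited' key so the trigger can later strip the borrowed tags again. A hedged probe from PHP (the way id is invented; requires the hstore extension Nominatim already uses):

$sAddress = $oDB->getOne(
    "SELECT get_interpolation_address(hstore('interpolation', 'odd'), :way)",
    array(':way' => 123456)
);
// Either the node-derived address plus an '_inherited' key, or the
// input hstore unchanged if no suitable address node lies on the way.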

View File

@@ -1,525 +0,0 @@
-- Functions for term normalisation and access to the 'word' table.
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
AS $$
DECLARE
o TEXT;
BEGIN
o := public.gettokenstring(public.transliteration(name));
RETURN trim(substr(o,1,length(o)));
END;
$$
LANGUAGE plpgsql IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
count INTEGER;
BEGIN
lookup_token := trim(lookup_word);
SELECT min(word_id), max(search_name_count) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id, count;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > get_maxwordfreq() THEN
return_word_id := NULL;
END IF;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Create housenumber tokens from an OSM addr:housenumber.
-- The housenumber is split at comma and semicolon as necessary.
-- The function returns the normalized form of the housenumber suitable
-- for comparison.
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
RETURNS TEXT
AS $$
DECLARE
normtext TEXT;
BEGIN
SELECT array_to_string(array_agg(trans), ';')
INTO normtext
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
return normtext;
END;
$$ LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='house'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
'place', 'house', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
lookup_word TEXT;
return_word_id INTEGER;
BEGIN
lookup_word := upper(trim(postcode));
lookup_token := ' ' || make_standard_name(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = lookup_word
and class='place' and type='postcode'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
'place', 'postcode', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and country_code=lookup_country_code
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
null, null, lookup_country_code, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
lookup_class text, lookup_type text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
normalized_word TEXT,
lookup_class text,
lookup_type text,
op text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type and operator = op
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0, op);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
nospace_lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
null, null, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
BEGIN
RETURN getorcreate_name_id(lookup_word, '');
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and lookup its word ids (partial words).
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
words TEXT[];
id INTEGER;
return_word_id INTEGER[];
word_ids INTEGER[];
j INTEGER;
BEGIN
words := string_to_array(make_standard_name(lookup_word), ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
SELECT array_agg(word_id) INTO word_ids
FROM word
WHERE word_token = words[j] and class is null and type is null;
IF word_ids IS NULL THEN
id := nextval('seq_word');
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
return_word_id := return_word_id || id;
ELSE
return_word_id := array_merge(return_word_id, word_ids);
END IF;
END IF;
END LOOP;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and look up its name ids (full words).
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
return_word_ids INTEGER[];
BEGIN
lookup_token := ' '|| make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_ids;
RETURN return_word_ids;
END;
$$
LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
RETURNS VOID
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_country(s, country_code);
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_country(s, country_code);
END IF;
END LOOP;
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_name_id(s, item.value);
IF not(ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(item.value, '市$', '');
IF s != item.value THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, item.value);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
RETURNS INTEGER
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
i INTEGER;
j INTEGER;
BEGIN
s := make_standard_name(src);
w := getorcreate_name_id(s, src);
w := getorcreate_word_id(s);
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w := getorcreate_word_id(words[j]);
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(src, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
END IF;
END LOOP;
END IF;
s := regexp_replace(src, '市$', '');
IF s != src THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, src);
END IF;
END IF;
RETURN 1;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
in_partition SMALLINT,
parent_place_id BIGINT,
address HSTORE,
country TEXT,
housenumber TEXT,
initial_name_vector INTEGER[],
geometry GEOMETRY,
OUT name_vector INTEGER[],
OUT nameaddress_vector INTEGER[])
AS $$
DECLARE
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
addr_item RECORD;
parent_address_place_ids BIGINT[];
filtered_address HSTORE;
BEGIN
nameaddress_vector := '{}'::INTEGER[];
SELECT s.name_vector, s.nameaddress_vector
INTO parent_name_vector, parent_address_vector
FROM search_name s
WHERE s.place_id = parent_place_id;
-- Find all address tags that don't appear in the parent search names.
SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
FROM (SELECT skeys(address) as k, svals(address) as v) a
WHERE not addr_ids_from_name(v) && parent_address_vector
AND k not in ('country', 'street', 'place', 'postcode',
'housenumber', 'streetnumber', 'conscriptionnumber');
-- Compute all search terms from the addr: tags.
IF filtered_address IS NOT NULL THEN
FOR addr_item IN
SELECT * FROM
get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
LOOP
IF addr_item.place_id is null THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
CONTINUE;
END IF;
IF parent_address_place_ids is null THEN
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
FROM place_addressline
WHERE place_id = parent_place_id;
END IF;
IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
isaddress, distance, cached_rank_address)
VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
true, addr_item.distance, addr_item.rank_address);
END IF;
END LOOP;
END IF;
name_vector := initial_name_vector;
-- Check if the parent covers all address terms.
-- If not, create a search name entry with the house number as the name.
  -- This is unusual for the search_name table but prevents the place
  -- from being returned when we only search for the street/place.
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
name_vector := array_merge(name_vector,
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
END IF;
IF not address ? 'street' and address ? 'place' THEN
addr_place_ids := addr_ids_from_name(address->'place');
IF not addr_place_ids <@ parent_name_vector THEN
-- make sure addr:place terms are always searchable
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms.
IF housenumber is not null THEN
name_vector := array_merge(name_vector,
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
END IF;
END IF;
END IF;
-- Cheating here by not recomputing all terms but simply using the ones
-- from the parent object.
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
END;
$$
LANGUAGE plpgsql;
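Everything above disappears from the database layer. For orientation, this is the round trip that the PHP code used to make into these functions, and that the new tokenizer module now encapsulates (the sample phrase is invented):

// Pre-PR pattern, removed from Geocode.php by this commit:
$sNorm = $oDB->getOne(
    'SELECT make_standard_name(:phrase)',
    array(':phrase' => 'Hauptstraße 5'),
    'Cannot normalize query string (is it a UTF-8 string?)'
);
// Post-PR, the same normalization happens behind
// Tokenizer::extractTokensFromPhrases().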

View File

@@ -63,54 +63,36 @@
 END
 $$
 LANGUAGE plpgsql STABLE;

-CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
-                                                    feature GEOMETRY,
-                                                    address HSTORE, country TEXT)
-  RETURNS SETOF nearfeaturecentr
+CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
+                                             from_rank SMALLINT, to_rank SMALLINT,
+                                             extent FLOAT, tokens INT[])
+  RETURNS nearfeaturecentr
   AS $$
 DECLARE
   r nearfeaturecentr%rowtype;
-  item RECORD;
 BEGIN
-  FOR item IN
-    SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
-      (SELECT skeys(address) as key, svals(address) as name) x
-  LOOP
-    IF item.from_rank is null THEN
-      CONTINUE;
-    END IF;
-
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id, keywords, rank_address, rank_search,
            min(ST_Distance(feature, centroid)) as distance,
           isguess, postcode, centroid INTO r
      FROM location_area_large_{{ partition }}
-      WHERE geometry && ST_Expand(feature, item.extent)
-            AND rank_address between item.from_rank and item.to_rank
-            AND word_ids_from_name(item.name) && keywords
+      WHERE geometry && ST_Expand(feature, extent)
+            AND rank_address between from_rank and to_rank
+            AND tokens && keywords
      GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
      ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
-    IF r.place_id is null THEN
-      -- If we cannot find a place for the term, just return the
-      -- search term for the given name. That ensures that the address
-      -- element can still be searched for, even though it will not be
-      -- displayed.
-      RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
-                      null, null, null, null)::nearfeaturecentr;
-    ELSE
-      RETURN NEXT r;
-    END IF;
-    CONTINUE;
+    RETURN r;
   END IF;
 {% endfor %}

-    RAISE EXCEPTION 'Unknown partition %', in_partition;
-  END LOOP;
+  RAISE EXCEPTION 'Unknown partition %', in_partition;
 END;
 $$
 LANGUAGE plpgsql STABLE;

 create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
 DECLARE
 BEGIN
View File

@@ -1,5 +1,84 @@
-- Trigger functions for the placex table.
-- Retrieve the data needed by the indexer for updating the place.
--
-- Return parameters:
-- name list of names
-- address list of address tags, either from the object or a surrounding
-- building
-- country_feature If the place is a country feature, this contains the
-- country code, otherwise it is null.
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR)
AS $$
BEGIN
-- For POI nodes, check if the address should be derived from a surrounding
-- building.
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
address := p.address;
ELSE
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT placex.address || hstore('_inherited', '') INTO address
FROM placex
WHERE ST_Covers(geometry, p.centroid)
and geometry && p.centroid
and placex.address is not null
and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
LIMIT 1;
END IF;
address := address - '_unlisted_place'::TEXT;
name := p.name;
country_feature := CASE WHEN p.admin_level = 2
and p.class = 'boundary' and p.type = 'administrative'
and p.osm_type = 'R'
THEN p.country_code
ELSE null
END;
END;
$$
LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
poi_osm_id BIGINT)
RETURNS BIGINT
AS $$
DECLARE
location RECORD;
parent RECORD;
BEGIN
FOR location IN
SELECT members FROM planet_osm_rels
WHERE parts @> ARRAY[poi_osm_id]
and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
and tags @> ARRAY['associatedStreet']
LOOP
FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
IF location.members[i+1] = 'street' THEN
FOR parent IN
SELECT place_id from placex
WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
and name is not null
and rank_search between 26 and 27
LOOP
RETURN parent.place_id;
END LOOP;
END IF;
END LOOP;
END LOOP;
RETURN NULL;
END;
$$
LANGUAGE plpgsql STABLE;
-- Find the parent road of a POI.
--
-- \returns Place ID of parent object or NULL if none
@@ -10,118 +89,89 @@
 CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street TEXT,
-                                               addr_place TEXT,
-                                               fallback BOOL = true)
+                                               addr_street INTEGER[],
+                                               addr_place INTEGER[],
+                                               is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT DEFAULT NULL;
   location RECORD;
-  parent RECORD;
 BEGIN
 {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}

   -- Is this object part of an associatedStreet relation?
-  FOR location IN
-    SELECT members FROM planet_osm_rels
-    WHERE parts @> ARRAY[poi_osm_id]
-          and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
-          and tags @> ARRAY['associatedStreet']
-  LOOP
-    FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
-      IF location.members[i+1] = 'street' THEN
-        FOR parent IN
-          SELECT place_id from placex
-           WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
-             and name is not null
-             and rank_search between 26 and 27
-        LOOP
-          RETURN parent.place_id;
-        END LOOP;
-      END IF;
-    END LOOP;
-  END LOOP;
+  parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);

-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             poi_partition, bbox);
-  IF parent_place_id is not null THEN
-    RETURN parent_place_id;
+  IF parent_place_id is null THEN
+    parent_place_id := find_parent_for_address(addr_street, addr_place,
+                                               poi_partition, bbox);
   END IF;

-  IF poi_osm_type = 'N' THEN
+  IF parent_place_id is null and poi_osm_type = 'N' THEN
     -- Is this node part of an interpolation?
-    FOR parent IN
+    FOR location IN
       SELECT q.parent_place_id
         FROM location_property_osmline q, planet_osm_ways x
       WHERE q.linegeo && bbox and x.id = q.osm_id
             and poi_osm_id = any(x.nodes)
       LIMIT 1
     LOOP
-      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
-      RETURN parent.parent_place_id;
+      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
+      RETURN location.parent_place_id;
     END LOOP;

-    -- Is this node part of any other way?
     FOR location IN
       SELECT p.place_id, p.osm_id, p.rank_search, p.address,
              coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
         FROM placex p, planet_osm_ways w
       WHERE p.osm_type = 'W' and p.rank_search >= 26
             and p.geometry && bbox
             and w.id = p.osm_id and poi_osm_id = any(w.nodes)
     LOOP
       {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}

       -- Way IS a road then we are on it - that must be our road
       IF location.rank_search < 28 THEN
         {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
-        return location.place_id;
+        RETURN location.place_id;
       END IF;

-      SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
-                                 location.centroid,
-                                 location.address->'street',
-                                 location.address->'place',
-                                 false)
-        INTO parent_place_id;
-      IF parent_place_id is not null THEN
-        RETURN parent_place_id;
-      END IF;
+      parent_place_id := find_associated_street('W', location.osm_id);
     END LOOP;
   END IF;

-  IF fallback THEN
-    IF addr_street is null and addr_place is not null THEN
+  IF parent_place_id is NULL THEN
+    IF is_place_addr THEN
       -- The address is attached to a place we don't know.
       -- Instead simply use the containing area with the largest rank.
       FOR location IN
         SELECT place_id FROM placex
          WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
                AND rank_address between 5 and 25
         ORDER BY rank_address desc
       LOOP
         RETURN location.place_id;
       END LOOP;
     ELSEIF ST_Area(bbox) < 0.005 THEN
       -- for smaller features get the nearest road
       SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
       {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
     ELSE
       -- for larger features simply find the area with the largest rank that
       -- contains the bbox, only use addressable features
       FOR location IN
         SELECT place_id FROM placex
          WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
                AND rank_address between 5 and 25
         ORDER BY rank_address desc
       LOOP
         RETURN location.place_id;
       END LOOP;
     END IF;
   END IF;

   RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
@@ -240,6 +290,101 @@
 $$
 LANGUAGE plpgsql STABLE;
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
in_partition SMALLINT,
parent_place_id BIGINT,
is_place_addr BOOLEAN,
country TEXT,
token_info JSONB,
geometry GEOMETRY,
OUT name_vector INTEGER[],
OUT nameaddress_vector INTEGER[])
AS $$
DECLARE
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
hnr_vector INTEGER[];
addr_item RECORD;
addr_place RECORD;
parent_address_place_ids BIGINT[];
BEGIN
nameaddress_vector := '{}'::INTEGER[];
SELECT s.name_vector, s.nameaddress_vector
INTO parent_name_vector, parent_address_vector
FROM search_name s
WHERE s.place_id = parent_place_id;
FOR addr_item IN
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
FROM token_get_address_tokens(token_info)
WHERE not search_tokens <@ parent_address_vector
LOOP
addr_place := get_address_place(in_partition, geometry,
addr_item.from_rank, addr_item.to_rank,
addr_item.extent, addr_item.match_tokens);
IF addr_place is null THEN
-- No place found in OSM that matches. Make it at least searchable.
nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
ELSE
IF parent_address_place_ids is null THEN
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
FROM place_addressline
WHERE place_id = parent_place_id;
END IF;
-- If the parent already lists the place in place_address line, then we
-- are done. Otherwise, add its own place_address line.
IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
isaddress, distance, cached_rank_address)
VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
true, addr_place.distance, addr_place.rank_address);
END IF;
END IF;
END LOOP;
name_vector := token_get_name_search_tokens(token_info);
-- Check if the parent covers all address terms.
-- If not, create a search name entry with the house number as the name.
  -- This is unusual for the search_name table but prevents the place
  -- from being returned when we only search for the street/place.
hnr_vector := token_get_housenumber_search_tokens(token_info);
IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
name_vector := array_merge(name_vector, hnr_vector);
END IF;
IF is_place_addr THEN
addr_place_ids := token_addr_place_search_tokens(token_info);
IF not addr_place_ids <@ parent_name_vector THEN
-- make sure addr:place terms are always searchable
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms.
IF hnr_vector is not null THEN
name_vector := array_merge(name_vector, addr_place_ids);
END IF;
END IF;
END IF;
-- Cheating here by not recomputing all terms but simply using the ones
-- from the parent object.
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
END;
$$
LANGUAGE plpgsql;
-- Insert address of a place into the place_addressline table.
--
-- \param obj_place_id Place_id of the place to compute the address for.

@@ -260,7 +405,7 @@
 CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
                                                partition SMALLINT,
                                                maxrank SMALLINT,
-                                               address HSTORE,
+                                               token_info JSONB,
                                                geometry GEOMETRY,
                                                country TEXT,
                                                OUT parent_place_id BIGINT,

@@ -275,7 +420,8 @@ DECLARE
   current_node_area GEOMETRY := NULL;

   parent_place_rank INT := 0;
-  addr_place_ids BIGINT[];
+  addr_place_ids BIGINT[] := '{}'::int[];
+  new_address_vector INT[];
   location RECORD;
 BEGIN

@@ -285,16 +431,21 @@ BEGIN
   address_havelevel := array_fill(false, ARRAY[maxrank]);

   FOR location IN
-    SELECT * FROM get_places_for_addr_tags(partition, geometry,
-                                           address, country)
-    ORDER BY rank_address, distance, isguess desc
+    SELECT (get_address_place(partition, geometry, from_rank, to_rank,
+                              extent, match_tokens)).*, search_tokens
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+              FROM token_get_address_tokens(token_info)) x
+     ORDER BY rank_address, distance, isguess desc
   LOOP
-    {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector,
-                                        location.keywords::int[]);
-    {% endif %}
+    IF location.place_id is null THEN
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      {% endif %}
+    ELSE
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
+      {% endif %}

-    IF location.place_id is not null THEN
       location_isaddress := not address_havelevel[location.rank_address];
       IF not address_havelevel[location.rank_address] THEN
         address_havelevel[location.rank_address] := true;

@@ -309,13 +460,13 @@ BEGIN
         VALUES (obj_place_id, location.place_id, not location.isguess,
                 true, location.distance, location.rank_address);

-        addr_place_ids := array_append(addr_place_ids, location.place_id);
+        addr_place_ids := addr_place_ids || location.place_id;
       END IF;
     END LOOP;

   FOR location IN
     SELECT * FROM getNearFeatures(partition, geometry, maxrank)
-    WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
+    WHERE not addr_place_ids @> ARRAY[place_id]
     ORDER BY rank_address, isguess asc,
              distance *
                CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2

@@ -397,10 +548,11 @@ BEGIN
     NEW.place_id := nextval('seq_place');
     NEW.indexed_status := 1; --STATUS_NEW

-    NEW.country_code := lower(get_country_code(NEW.geometry));
+    NEW.centroid := ST_PointOnSurface(NEW.geometry);
+    NEW.country_code := lower(get_country_code(NEW.centroid));

     NEW.partition := get_partition(NEW.country_code);
-    NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
+    NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);

     IF NEW.osm_type = 'X' THEN
       -- E'X'ternal records should already be in the right format so do nothing
-- E'X'ternal records should already be in the right format so do nothing -- E'X'ternal records should already be in the right format so do nothing
@ -522,8 +674,8 @@ DECLARE
parent_address_level SMALLINT; parent_address_level SMALLINT;
place_address_level SMALLINT; place_address_level SMALLINT;
addr_street TEXT; addr_street INTEGER[];
addr_place TEXT; addr_place INTEGER[];
max_rank SMALLINT; max_rank SMALLINT;
@ -531,12 +683,11 @@ DECLARE
nameaddress_vector INTEGER[]; nameaddress_vector INTEGER[];
addr_nameaddress_vector INTEGER[]; addr_nameaddress_vector INTEGER[];
inherited_address HSTORE;
linked_node_id BIGINT; linked_node_id BIGINT;
linked_importance FLOAT; linked_importance FLOAT;
linked_wikipedia TEXT; linked_wikipedia TEXT;
is_place_address BOOLEAN;
result BOOLEAN; result BOOLEAN;
BEGIN BEGIN
-- deferred delete -- deferred delete
@ -566,9 +717,9 @@ BEGIN
-- update not necessary for osmline, cause linked_place_id does not exist -- update not necessary for osmline, cause linked_place_id does not exist
NEW.extratags := NEW.extratags - 'linked_place'::TEXT; NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
NEW.address := NEW.address - '_unlisted_place'::TEXT;
IF NEW.linked_place_id is not null THEN IF NEW.linked_place_id is not null THEN
NEW.token_info := null;
{% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %} {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
RETURN NEW; RETURN NEW;
END IF; END IF;
@ -579,13 +730,34 @@ BEGIN
-- imported as place=postcode. That's why relations are allowed to pass here. -- imported as place=postcode. That's why relations are allowed to pass here.
-- This can go away in a couple of versions. -- This can go away in a couple of versions.
IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
NEW.token_info := null;
RETURN NEW; RETURN NEW;
END IF; END IF;
-- Speed up searches - just use the centroid of the feature -- Compute a preliminary centroid.
-- cheaper but less acurate
NEW.centroid := ST_PointOnSurface(NEW.geometry); NEW.centroid := ST_PointOnSurface(NEW.geometry);
{% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
-- recalculate country and partition
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-- for countries, believe the mapped country code,
-- so that we remain in the right partition if the boundaries
-- suddenly expand.
NEW.country_code := lower(NEW.address->'country');
NEW.partition := get_partition(lower(NEW.country_code));
IF NEW.partition = 0 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
NEW.partition := get_partition(NEW.country_code);
END IF;
ELSE
IF NEW.rank_search >= 4 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
ELSE
NEW.country_code := NULL;
END IF;
NEW.partition := get_partition(NEW.country_code);
END IF;
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-- recompute the ranks, they might change when linking changes -- recompute the ranks, they might change when linking changes
SELECT * INTO NEW.rank_search, NEW.rank_address SELECT * INTO NEW.rank_search, NEW.rank_address
@ -665,54 +837,12 @@ BEGIN
parent_address_level := 3; parent_address_level := 3;
END IF; END IF;
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %} NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-- housenumber is a computed field, so start with an empty value addr_street := token_addr_street_match_tokens(NEW.token_info);
NEW.housenumber := NULL; addr_place := token_addr_place_match_tokens(NEW.token_info);
IF NEW.address is not NULL THEN
IF NEW.address ? 'conscriptionnumber' THEN
IF NEW.address ? 'streetnumber' THEN
NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
ELSE
NEW.housenumber := NEW.address->'conscriptionnumber';
END IF;
ELSEIF NEW.address ? 'streetnumber' THEN
NEW.housenumber := NEW.address->'streetnumber';
ELSEIF NEW.address ? 'housenumber' THEN
NEW.housenumber := NEW.address->'housenumber';
END IF;
NEW.housenumber := create_housenumber_id(NEW.housenumber);
addr_street := NEW.address->'street';
addr_place := NEW.address->'place';
IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
i := getorcreate_postcode_id(NEW.address->'postcode');
END IF;
END IF;
NEW.postcode := null; NEW.postcode := null;
-- recalculate country and partition
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-- for countries, believe the mapped country code,
-- so that we remain in the right partition if the boundaries
-- suddenly expand.
NEW.country_code := lower(NEW.address->'country');
NEW.partition := get_partition(lower(NEW.country_code));
IF NEW.partition = 0 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
NEW.partition := get_partition(NEW.country_code);
END IF;
ELSE
IF NEW.rank_search >= 4 THEN
NEW.country_code := lower(get_country_code(NEW.centroid));
ELSE
NEW.country_code := NULL;
END IF;
NEW.partition := get_partition(NEW.country_code);
END IF;
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-- waterway ways are linked when they are part of a relation and have the same class/type -- waterway ways are linked when they are part of a relation and have the same class/type
IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[] FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
@ -749,33 +879,14 @@ BEGIN
{% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %} {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
NEW.parent_place_id := null; NEW.parent_place_id := null;
is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
-- if we have a POI and there is no address information,
-- see if we can get it from a surrounding building
inherited_address := ''::HSTORE;
IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
AND NEW.housenumber IS NULL THEN
FOR location IN
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT address from placex where ST_Covers(geometry, NEW.centroid)
and geometry && NEW.centroid
and (address ? 'housenumber' or address ? 'street' or address ? 'place')
and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
limit 1
LOOP
NEW.housenumber := location.address->'housenumber';
addr_street := location.address->'street';
addr_place := location.address->'place';
inherited_address := location.address;
END LOOP;
END IF;
-- We have to find our parent road. -- We have to find our parent road.
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
NEW.partition, NEW.partition,
ST_Envelope(NEW.geometry), ST_Envelope(NEW.geometry),
addr_street, addr_place); addr_street, addr_place,
is_place_address);
-- If we found the road take a shortcut here. -- If we found the road take a shortcut here.
-- Otherwise fall back to the full address getting method below. -- Otherwise fall back to the full address getting method below.
@ -785,12 +896,12 @@ BEGIN
SELECT p.country_code, p.postcode, p.name FROM placex p SELECT p.country_code, p.postcode, p.name FROM placex p
WHERE p.place_id = NEW.parent_place_id INTO location; WHERE p.place_id = NEW.parent_place_id INTO location;
IF addr_street is null and addr_place is not null THEN IF is_place_address THEN
-- Check if the addr:place tag is part of the parent name -- Check if the addr:place tag is part of the parent name
SELECT count(*) INTO i SELECT count(*) INTO i
FROM svals(location.name) AS pname WHERE pname = addr_place; FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
IF i = 0 THEN IF i = 0 THEN
NEW.address = NEW.address || hstore('_unlisted_place', addr_place); NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
END IF; END IF;
END IF; END IF;
@ -798,39 +909,21 @@ BEGIN
{% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %} {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
-- determine postcode -- determine postcode
IF NEW.address is not null AND NEW.address ? 'postcode' THEN NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
NEW.postcode = upper(trim(NEW.address->'postcode')); location.postcode,
ELSE get_nearest_postcode(NEW.country_code, NEW.geometry));
NEW.postcode := location.postcode;
END IF;
IF NEW.postcode is null THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF;
IF NEW.name is not NULL THEN IF NEW.name is not NULL THEN
NEW.name := add_default_place_name(NEW.country_code, NEW.name); NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := make_keywords(NEW.name);
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
name_vector, NEW.rank_search, NEW.rank_address,
upper(trim(NEW.address->'postcode')), NEW.geometry,
NEW.centroid);
{% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
END IF;
END IF; END IF;
{% if not db.reverse_only %} {% if not db.reverse_only %}
IF array_length(name_vector, 1) is not NULL IF NEW.name is not NULL OR NEW.address is not NULL THEN
OR inherited_address is not NULL OR NEW.address is not NULL
THEN
SELECT * INTO name_vector, nameaddress_vector SELECT * INTO name_vector, nameaddress_vector
FROM create_poi_search_terms(NEW.place_id, FROM create_poi_search_terms(NEW.place_id,
NEW.partition, NEW.parent_place_id, NEW.partition, NEW.parent_place_id,
inherited_address || NEW.address, is_place_address, NEW.country_code,
NEW.country_code, NEW.housenumber, NEW.token_info, NEW.centroid);
name_vector, NEW.centroid);
IF array_length(name_vector, 1) is not NULL THEN IF array_length(name_vector, 1) is not NULL THEN
INSERT INTO search_name (place_id, search_rank, address_rank, INSERT INTO search_name (place_id, search_rank, address_rank,
@ -844,6 +937,17 @@ BEGIN
END IF; END IF;
{% endif %} {% endif %}
NEW.token_info := token_strip_info(NEW.token_info);
-- If the address was inherited from a surrounding building,
-- do not add it permanently to the table.
IF NEW.address ? '_inherited' THEN
IF NEW.address ? '_unlisted_place' THEN
NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
ELSE
NEW.address := null;
END IF;
END IF;
RETURN NEW; RETURN NEW;
END IF; END IF;
@ -914,19 +1018,11 @@ BEGIN
END IF; END IF;
END IF; END IF;
-- Initialise the name vector using our name
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := make_keywords(NEW.name);
-- make sure all names are in the word table
IF NEW.admin_level = 2 IF NEW.admin_level = 2
AND NEW.class = 'boundary' AND NEW.type = 'administrative' AND NEW.class = 'boundary' AND NEW.type = 'administrative'
AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R' AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
THEN THEN
PERFORM create_country(NEW.name, lower(NEW.country_code)); -- Update the list of country names. Adding an additional sanity
{% if debug %}RAISE WARNING 'Country names updated';{% endif %}
-- Also update the list of country names. Adding an additional sanity
-- check here: make sure the country does overlap with the area where -- check here: make sure the country does overlap with the area where
-- we expect it to be as per static country grid. -- we expect it to be as per static country grid.
FOR location IN FOR location IN
@ -959,29 +1055,28 @@ BEGIN
ELSEIF NEW.rank_address > 25 THEN ELSEIF NEW.rank_address > 25 THEN
max_rank := 25; max_rank := 25;
ELSE ELSE
max_rank = NEW.rank_address; max_rank := NEW.rank_address;
END IF; END IF;
SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank, SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
NEW.address, geom, NEW.country_code) NEW.token_info, geom, NEW.country_code)
INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector; INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
{% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %} {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
IF NEW.address is not null AND NEW.address ? 'postcode' NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
AND NEW.address->'postcode' not similar to '%(,|;)%' THEN NEW.postcode);
NEW.postcode := upper(trim(NEW.address->'postcode'));
END IF;
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF;
-- if we have a name add this to the name search table -- if we have a name add this to the name search table
IF NEW.name IS NOT NULL THEN IF NEW.name IS NOT NULL THEN
-- Initialise the name vector using our name
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
name_vector := token_get_name_search_tokens(NEW.token_info);
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid); result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
name_vector, NEW.rank_search, NEW.rank_address,
NEW.postcode, NEW.geometry, NEW.centroid);
{% if debug %}RAISE WARNING 'added to location (full)';{% endif %} {% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
END IF; END IF;
@ -990,8 +1085,11 @@ BEGIN
{% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %} {% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
END IF; END IF;
result := insertSearchName(NEW.partition, NEW.place_id, name_vector, IF NEW.rank_address between 16 and 27 THEN
NEW.rank_search, NEW.rank_address, NEW.geometry); result := insertSearchName(NEW.partition, NEW.place_id,
token_get_name_match_tokens(NEW.token_info),
NEW.rank_search, NEW.rank_address, NEW.geometry);
END IF;
{% if debug %}RAISE WARNING 'added to search name (full)';{% endif %} {% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
{% if not db.reverse_only %} {% if not db.reverse_only %}
@ -1002,11 +1100,15 @@ BEGIN
NEW.importance, NEW.country_code, name_vector, NEW.importance, NEW.country_code, name_vector,
nameaddress_vector, NEW.centroid); nameaddress_vector, NEW.centroid);
{% endif %} {% endif %}
END IF;
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF; END IF;
{% if debug %}RAISE WARNING 'place update % % finsihed.', NEW.osm_type, NEW.osm_id;{% endif %} {% if debug %}RAISE WARNING 'place update % % finsihed.', NEW.osm_type, NEW.osm_id;{% endif %}
NEW.token_info := token_strip_info(NEW.token_info);
RETURN NEW; RETURN NEW;
END; END;
$$ $$

View File

@@ -221,37 +221,30 @@ LANGUAGE plpgsql STABLE;
-- \param centroid Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
+CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
                                                   partition SMALLINT,
                                                   centroid GEOMETRY)
  RETURNS BIGINT
  AS $$
DECLARE
  parent_place_id BIGINT;
-  word_ids INTEGER[];
BEGIN
  IF street is not null THEN
    -- Check for addr:street attributes
    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    word_ids := word_ids_from_name(street);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
    END IF;
  END IF;
  -- Check for addr:place attributes.
  IF place is not null THEN
-    word_ids := word_ids_from_name(place);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
    END IF;
  END IF;

View File

@@ -1,9 +1,6 @@
-- Indices used only during search and update.
-- These indices are created only after the indexing process is done.
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
-  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
  ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};

View File

@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
);
GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";

-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;

drop table IF EXISTS location_area CASCADE;
CREATE TABLE location_area (
  place_id BIGINT,
@@ -109,6 +93,7 @@ CREATE TABLE location_property_osmline (
  linegeo GEOMETRY,
  interpolationtype TEXT,
  address HSTORE,
+  token_info JSONB, -- custom column for tokenizer use only
  postcode TEXT,
  country_code VARCHAR(2)
){{db.tablespace.search_data}};
@@ -158,6 +143,7 @@ CREATE TABLE placex (
  indexed_status SMALLINT,
  LIKE place INCLUDING CONSTRAINTS,
  wikipedia TEXT, -- calculated wikipedia article name (language:title)
+  token_info JSONB, -- custom column for tokenizer use only
  country_code varchar(2),
  housenumber TEXT,
  postcode TEXT,
@@ -168,6 +154,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
+CREATE INDEX idx_placex_geometry_buildings ON placex
+  USING GIST (geometry) {{db.tablespace.search_index}}
+  WHERE address is not null and rank_search = 30
+        and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
CREATE INDEX idx_placex_geometry_placenode ON placex
  USING GIST (geometry) {{db.tablespace.search_index}}
  WHERE osm_type = 'N' and rank_search < 26
@@ -178,7 +168,6 @@ DROP SEQUENCE IF EXISTS seq_place;
CREATE SEQUENCE seq_place start 1;
GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;

View File

@@ -0,0 +1,399 @@
-- Get tokens used for searching the given place.
--
-- These are the tokens that will be saved in the search_name table.
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Get tokens for matching the place name against others.
--
-- This should usually be restricted to full name tokens.
CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber tokens applicable for the place.
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'hnr_tokens')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return the housenumber in the form that it can be matched during search.
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
RETURNS TEXT
AS $$
SELECT info->>'hnr';
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'street')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_match')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
SELECT (info->>'place_search')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
key TEXT,
match_tokens INT[],
search_tokens INT[]
);
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
AS $$
SELECT key, (value->>1)::int[] as match_tokens,
(value->>0)::int[] as search_tokens
FROM jsonb_each(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;
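-- For illustration only: the accessors above assume a token_info layout
-- along these lines (the integer ids are made up; in a real database they
-- come from the word table). Array values are Postgres array literals
-- wrapped in JSON strings, because the accessors extract them with ->>
-- before casting:
--
--   {"names": "{1,2,3}",
--    "hnr_tokens": "{4}", "hnr": "4",
--    "street": "{5,6}",
--    "place_match": "{7}", "place_search": "{7,8}",
--    "addr": {"city": ["{9,10}", "{9}"]}}
--
-- With such a value, token_get_name_search_tokens(info) returns {1,2,3}
-- and token_get_address_tokens(info) returns one row: (city, {9}, {9,10}).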
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT
AS $$
  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
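-- The behaviour follows directly from the definition above, for example:
--
--   SELECT token_normalized_postcode(' 12045 ');      --> '12045'
--   SELECT token_normalized_postcode('12045;12059');  --> NULL (multi-value)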
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
AS $$
SELECT NULL::JSONB;
$$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ----------------------------------------------
-- Functions for term normalisation and access to the 'word' table.
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
AS $$
DECLARE
o TEXT;
BEGIN
o := public.gettokenstring(public.transliteration(name));
RETURN trim(substr(o,1,length(o)));
END;
$$
LANGUAGE plpgsql IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
count INTEGER;
BEGIN
lookup_token := trim(lookup_word);
SELECT min(word_id), max(search_name_count) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id, count;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > {{ max_word_freq }} THEN
return_word_id := NULL;
END IF;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Create housenumber tokens from an OSM addr:housenumber.
-- The housenumber is split at commas and semicolons as necessary.
-- The function returns the normalized form of the housenumber suitable
-- for comparison.
CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
OUT tokens TEXT,
OUT normtext TEXT)
AS $$
BEGIN
SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
INTO normtext, tokens
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
FROM (SELECT make_standard_name(h) as lookup_word
FROM unnest(housenumbers) h) x) y;
END;
$$ LANGUAGE plpgsql STABLE STRICT;
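-- A sketch of the expected result (illustrative only: the token ids depend
-- on the current content of the word table, and the example assumes the
-- standard-name transliteration leaves these values unchanged):
--
--   SELECT * FROM create_housenumbers(ARRAY['34', '36a']);
--   -->  tokens: '{101,102}',  normtext: '34;36a'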
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='house'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
'place', 'house', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
RETURNS BOOLEAN
AS $$
DECLARE
r RECORD;
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || make_standard_name(postcode);
FOR r IN
SELECT word_id FROM word
WHERE word_token = lookup_token and word = postcode
and class='place' and type='postcode'
LOOP
RETURN false;
END LOOP;
INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
'place', 'postcode', null, 0);
RETURN true;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
nospace_lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
null, null, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and lookup its word ids (partial words).
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
words TEXT[];
id INTEGER;
return_word_id INTEGER[];
word_ids INTEGER[];
j INTEGER;
BEGIN
words := string_to_array(make_standard_name(lookup_word), ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
SELECT array_agg(word_id) INTO word_ids
FROM word
WHERE word_token = words[j] and class is null and type is null;
IF word_ids IS NULL THEN
id := nextval('seq_word');
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
return_word_id := return_word_id || id;
ELSE
return_word_id := array_merge(return_word_id, word_ids);
END IF;
END IF;
END LOOP;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and look up its name ids (full words).
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
return_word_ids INTEGER[];
BEGIN
lookup_token := ' '|| make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_ids;
RETURN return_word_ids;
END;
$$
LANGUAGE plpgsql STABLE STRICT;
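-- To illustrate the difference between the two lookups above (ids made up,
-- assuming make_standard_name yields 'rue de la paix'):
--
--   SELECT word_ids_from_name('Rue de la Paix');
--   --> id of the single full term ' rue de la paix', e.g. {2001}
--   SELECT addr_ids_from_name('Rue de la Paix');
--   --> ids of the partial words 'rue', 'de', 'la', 'paix', e.g. {301,302,303,304}
--
-- Note that addr_ids_from_name creates missing partial words on the fly,
-- while word_ids_from_name only looks up existing full names.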
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_name_id(s, item.value);
IF not(ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(item.value, '市$', '');
IF s != item.value THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, item.value);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$
LANGUAGE plpgsql;
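-- To make the loop above concrete: for a name value 'Berlin, Germany'
-- (assuming make_standard_name yields 'berlin germany') the result collects
--   * the full-name token for ' berlin germany' (getorcreate_name_id),
--   * the word token for the whole standardized string, unless too common,
--   * word tokens for the individual words 'berlin' and 'germany',
--   * word tokens for the standardized comma-separated variants
--     'Berlin' and 'Germany',
--   * and, for names ending in the CJK city suffix 市, an extra full-name
--     token for the name with the suffix stripped.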
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
RETURNS INTEGER
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
i INTEGER;
j INTEGER;
BEGIN
s := make_standard_name(src);
w := getorcreate_name_id(s, src);
w := getorcreate_word_id(s);
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w := getorcreate_word_id(words[j]);
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(src, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
END IF;
END LOOP;
END IF;
s := regexp_replace(src, '市$', '');
IF s != src THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, src);
END IF;
END IF;
RETURN 1;
END;
$$
LANGUAGE plpgsql;

View File

@@ -0,0 +1,2 @@
CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
ON word USING BTREE (word_id) {{db.tablespace.search_index}};

View File

@@ -0,0 +1,19 @@
DROP TABLE IF EXISTS word;
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
word text,
class text,
type text,
country_code varchar(2),
search_name_count INTEGER,
operator TEXT
) {{db.tablespace.search_data}};
CREATE INDEX idx_word_word_token ON word
USING BTREE (word_token) {{db.tablespace.search_index}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
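-- Illustrative rows only (ids made up): the tokenizer functions store full
-- names with a leading blank in word_token and partial words without one;
-- special tokens carry a class/type:
--
--   word_id | word_token | word   | class | type     | operator
--   --------+------------+--------+-------+----------+---------
--       101 | ' berlin'  | Berlin |       |          |
--       102 | berlin     |        |       |          |
--       103 | ' 12045'   | 12045  | place | postcode |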

View File

@@ -32,8 +32,11 @@ class UpdateIndex:
    @staticmethod
    def run(args):
        from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory

-        indexer = Indexer(args.config.get_libpq_dsn(),
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                          args.threads or psutil.cpu_count() or 1)

        if not args.no_boundaries:

View File

@@ -46,6 +46,7 @@ class UpdateRefresh:
    @staticmethod
    def run(args):
        from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory

        if args.postcodes:
            LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config,
                                         args.diffs, args.enable_debug_statements)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+            tokenizer.update_sql_functions(args.config)

        if args.wiki_data:
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH

View File

@@ -83,6 +83,7 @@ class UpdateReplication:
    def _update(args):
        from ..tools import replication
        from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory

        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
@@ -106,6 +107,8 @@ class UpdateReplication:
            raise UsageError("Bad argument '--no-index'.")
        recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
        while True:
            with connect(args.config.get_libpq_dsn()) as conn:
                start = dt.datetime.now(dt.timezone.utc)
@@ -116,7 +119,7 @@ class UpdateReplication:
                if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                    index_start = dt.datetime.now(dt.timezone.utc)
-                    indexer = Indexer(args.config.get_libpq_dsn(),
+                    indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                      args.threads or 1)
                    indexer.index_boundaries(0, 30)
                    indexer.index_by_rank(0, 30)

View File

@@ -56,6 +56,7 @@ class SetupAll:
        from ..tools import refresh
        from ..indexer.indexer import Indexer
        from ..tools import postcodes
+        from ..tokenizer import factory as tokenizer_factory

        if args.osm_file and not Path(args.osm_file).is_file():
            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
@@ -67,12 +68,6 @@ class SetupAll:
                                            args.no_partitions,
                                            rouser=args.config.DATABASE_WEBUSER)

-            LOG.warning('Installing database module')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.install_module(args.module_dir, args.project_dir,
-                                               args.config.DATABASE_MODULE_PATH,
-                                               conn=conn)
-
            LOG.warning('Importing OSM data file')
            database_import.import_osm_data(Path(args.osm_file),
                                            args.osm2pgsql_options(0, 1),
@@ -105,22 +100,31 @@ class SetupAll:
        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                      args.threads or psutil.cpu_count() or 1)

+        LOG.warning("Setting up tokenizer")
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            if args.continue_at is not None and args.continue_at != 'load-data':
                with connect(args.config.get_libpq_dsn()) as conn:
                    SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
            LOG.warning('Indexing places')
-            indexer = Indexer(args.config.get_libpq_dsn(),
+            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                              args.threads or psutil.cpu_count() or 1)
            indexer.index_full(analyse=not args.index_noanalyse)
@@ -129,7 +133,9 @@ class SetupAll:
                database_import.create_search_indices(conn, args.config,
                                                      drop=args.no_updates)
                LOG.warning('Create search index for default country names.')
-                database_import.create_country_names(conn, args.config)
+                database_import.create_country_names(conn, tokenizer,
+                                                     args.config.LANGUAGES)
+        tokenizer.finalize_import(args.config)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)

View File

@@ -9,6 +9,8 @@ LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
+# Using non-top-level imports to avoid potentially unused imports.
+# pylint: disable=E0012,C0415

class ImportSpecialPhrases:
    """\
@@ -22,10 +24,13 @@ class ImportSpecialPhrases:
    @staticmethod
    def run(args):
+        from ..tokenizer import factory as tokenizer_factory
+
        if args.import_from_wiki:
            LOG.warning('Special phrases import starting')
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            with connect(args.config.get_libpq_dsn()) as db_connection:
                SpecialPhrasesImporter(
                    args.config, args.phplib_dir, db_connection
-                ).import_from_wiki()
+                ).import_from_wiki(tokenizer)
        return 0

View File

@@ -48,14 +48,14 @@ class DBConnection:
    """ A single non-blocking database connection.
    """

-    def __init__(self, dsn):
+    def __init__(self, dsn, cursor_factory=None):
        self.current_query = None
        self.current_params = None
        self.dsn = dsn

        self.conn = None
        self.cursor = None
-        self.connect()
+        self.connect(cursor_factory=cursor_factory)

    def close(self):
        """ Close all open connections. Does not wait for pending requests.
@@ -66,7 +66,7 @@ class DBConnection:
            self.conn = None

-    def connect(self):
+    def connect(self, cursor_factory=None):
        """ (Re)connect to the database. Creates an asynchronous connection
            with JIT and parallel processing disabled. If a connection was
            already open, it is closed and a new connection established.
@@ -79,7 +79,7 @@ class DBConnection:
        self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
        self.wait()

-        self.cursor = self.conn.cursor()
+        self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
        # Disable JIT and parallel workers as they are known to cause problems.
        # Update pg_settings instead of using SET because it does not yield
        # errors on older versions of Postgres where the settings are not

View File

@@ -89,8 +89,6 @@ class SQLPreprocessor:
        self.env.globals['db'] = db_info
        self.env.globals['sql'] = _setup_postgres_sql(conn)
        self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())

    def run_sql_file(self, conn, name, **kwargs):

View File

@@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database.
"""
import logging
import select
+import time
+
+import psycopg2.extras

from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
@@ -11,6 +14,73 @@ from nominatim.db.connection import connect
LOG = logging.getLogger()

+class PlaceFetcher:
+    """ Asynchronous connection that fetches place details for processing.
+    """
+    def __init__(self, dsn, setup_conn):
+        self.wait_time = 0
+        self.current_ids = None
+        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+
+        with setup_conn.cursor() as cur:
+            # The hstore OIDs must be fetched manually here because
+            # register_hstore cannot fetch them on the asynchronous
+            # connection used below.
+            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
+            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
+
+        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
+                                        array_oid=hstore_array_oid)
+
+    def close(self):
+        """ Close the underlying asynchronous connection.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+    def fetch_next_batch(self, cur, runner):
+        """ Send a request for the next batch of places.
+            If details for the places are required, they will be fetched
+            asynchronously.
+
+            Returns true if there is still data available.
+        """
+        ids = cur.fetchmany(100)
+
+        if not ids:
+            self.current_ids = None
+            return False
+
+        if hasattr(runner, 'get_place_details'):
+            runner.get_place_details(self.conn, ids)
+            self.current_ids = []
+        else:
+            self.current_ids = ids
+
+        return True
+
+    def get_batch(self):
+        """ Get the next batch of data, previously requested with
+            `fetch_next_batch`.
+        """
+        if self.current_ids is not None and not self.current_ids:
+            tstart = time.time()
+            self.conn.wait()
+            self.wait_time += time.time() - tstart
+            self.current_ids = self.conn.cursor.fetchall()
+
+        return self.current_ids
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.wait()
+        self.close()
+
+
class WorkerPool:
    """ A pool of asynchronous database connections.
@@ -21,6 +91,7 @@ class WorkerPool:
    def __init__(self, dsn, pool_size):
        self.threads = [DBConnection(dsn) for _ in range(pool_size)]
        self.free_workers = self._yield_free_worker()
+        self.wait_time = 0

    def finish_all(self):
@@ -64,7 +135,9 @@ class WorkerPool:
                ready = self.threads
                command_stat = 0
            else:
+                tstart = time.time()
                _, ready, _ = select.select([], self.threads, [])
+                self.wait_time += time.time() - tstart

    def __enter__(self):
@@ -72,6 +145,7 @@ class WorkerPool:

    def __exit__(self, exc_type, exc_value, traceback):
+        self.finish_all()
        self.close()

@@ -79,8 +153,9 @@ class Indexer:
    """ Main indexing routine.
    """

-    def __init__(self, dsn, num_threads):
+    def __init__(self, dsn, tokenizer, num_threads):
        self.dsn = dsn
+        self.tokenizer = tokenizer
        self.num_threads = num_threads
@@ -123,8 +198,9 @@ class Indexer:
        LOG.warning("Starting indexing boundaries using %s threads",
                    self.num_threads)

-        for rank in range(max(minrank, 4), min(maxrank, 26)):
-            self._index(runners.BoundaryRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(minrank, 4), min(maxrank, 26)):
+                self._index(runners.BoundaryRunner(rank, analyzer))

    def index_by_rank(self, minrank, maxrank):
        """ Index all entries of placex in the given rank range (inclusive)
@@ -137,15 +213,16 @@ class Indexer:
        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                    minrank, maxrank, self.num_threads)

-        for rank in range(max(1, minrank), maxrank):
-            self._index(runners.RankRunner(rank))
-
-        if maxrank == 30:
-            self._index(runners.RankRunner(0))
-            self._index(runners.InterpolationRunner(), 20)
-            self._index(runners.RankRunner(30), 20)
-        else:
-            self._index(runners.RankRunner(maxrank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(1, minrank), maxrank):
+                self._index(runners.RankRunner(rank, analyzer))
+
+            if maxrank == 30:
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(analyzer), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
+            else:
+                self._index(runners.RankRunner(maxrank, analyzer))

    def index_postcodes(self):
@@ -173,6 +250,7 @@ class Indexer:
        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)

        with connect(self.dsn) as conn:
+            psycopg2.extras.register_hstore(conn)
            with conn.cursor() as cur:
                total_tuples = cur.scalar(runner.sql_count_objects())
                LOG.debug("Total number of rows: %i", total_tuples)
@@ -185,19 +263,24 @@ class Indexer:
            with conn.cursor(name='places') as cur:
                cur.execute(runner.sql_get_objects())

-                with WorkerPool(self.dsn, self.num_threads) as pool:
-                    while True:
-                        places = [p[0] for p in cur.fetchmany(batch)]
-                        if not places:
-                            break
-
-                        LOG.debug("Processing places: %s", str(places))
-
-                        worker = pool.next_free_worker()
-                        worker.perform(runner.sql_index_place(places))
-                        progress.add(len(places))
+                with PlaceFetcher(self.dsn, conn) as fetcher:
+                    with WorkerPool(self.dsn, self.num_threads) as pool:
+                        has_more = fetcher.fetch_next_batch(cur, runner)
+                        while has_more:
+                            places = fetcher.get_batch()
+
+                            # asynchronously get the next batch
+                            has_more = fetcher.fetch_next_batch(cur, runner)
+
+                            # and insert the current batch
+                            for idx in range(0, len(places), batch):
+                                part = places[idx:idx+batch]
+                                LOG.debug("Processing places: %s", str(part))
+                                runner.index_places(pool.next_free_worker(), part)
+                                progress.add(len(part))

-                    pool.finish_all()
+                LOG.info("Wait time: fetcher: %.2fs,  pool: %.2fs",
+                         fetcher.wait_time, pool.wait_time)

            conn.commit()

View File

@@ -2,15 +2,52 @@
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
+import functools
+
+import psycopg2.extras

# pylint: disable=C0111

-class RankRunner:
+class AbstractPlacexRunner:
+    """ Returns SQL commands for indexing of the placex table.
+    """
+    SELECT_SQL = 'SELECT place_id FROM placex'
+
+    def __init__(self, rank, analyzer):
+        self.rank = rank
+        self.analyzer = analyzer
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+                          FROM placex WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
+
+class RankRunner(AbstractPlacexRunner):
    """ Returns SQL commands for indexing one rank within the placex table.
    """

-    def __init__(self, rank):
-        self.rank = rank
-
    def name(self):
        return "rank {}".format(self.rank)
@@ -20,24 +57,16 @@ class RankRunner:
               """.format(self.rank)

    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_address = {}
-                  ORDER BY geometry_sector""".format(self.rank)
+        return """{} WHERE indexed_status > 0 and rank_address = {}
+                     ORDER BY geometry_sector
+               """.format(self.SELECT_SQL, self.rank)

-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))

-class BoundaryRunner:
+class BoundaryRunner(AbstractPlacexRunner):
    """ Returns SQL commands for indexing the administrative boundaries
        of a certain rank.
    """

-    def __init__(self, rank):
-        self.rank = rank
-
    def name(self):
        return "boundaries rank {}".format(self.rank)
@@ -49,16 +78,10 @@ class BoundaryRunner:
               """.format(self.rank)

    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_search = {}
-                        and class = 'boundary' and type = 'administrative'
-                  ORDER BY partition, admin_level
-               """.format(self.rank)
+        return """{} WHERE indexed_status > 0 and rank_search = {}
+                     and class = 'boundary' and type = 'administrative'
+                     ORDER BY partition, admin_level
+               """.format(self.SELECT_SQL, self.rank)

-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))

class InterpolationRunner:
@@ -66,6 +89,10 @@ class InterpolationRunner:
        location_property_osmline.
    """

+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
    @staticmethod
    def name():
        return "interpolation lines (location_property_osmline)"
@@ -77,15 +104,37 @@ class InterpolationRunner:

    @staticmethod
    def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id
+                  FROM location_property_osmline
                  WHERE indexed_status > 0
                  ORDER BY geometry_sector"""

    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_property_osmline
-                  SET indexed_status = 0 WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
+                          FROM location_property_osmline WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)


class PostcodeRunner:
@@ -107,7 +156,7 @@ class PostcodeRunner:
                  ORDER BY country_code, postcode"""

    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_postcode SET indexed_status = 0
-                  WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def index_places(worker, ids):
+        worker.perform(""" UPDATE location_postcode SET indexed_status = 0
+                           WHERE place_id IN ({})
+                       """.format(','.join((str(i[0]) for i in ids))))

View File

View File

@ -0,0 +1,88 @@
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
import logging
import importlib
from ..errors import UsageError
from ..db import properties
from ..db.connection import connect
LOG = logging.getLogger()
def _import_tokenizer(name):
""" Load the tokenizer.py module from project directory.
"""
try:
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
except ModuleNotFoundError as exp:
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found') from exp
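The module name is derived from the tokenizer name by convention. A hedged sketch of the resolution for the default setting:

    module = _import_tokenizer('legacy')     # -> nominatim.tokenizer.legacy_tokenizer
    # Every tokenizer module is expected to expose a create() factory;
    # dsn and data directory are prepared by create_tokenizer() below.
    tokenizer = module.create(dsn, basedir)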
def create_tokenizer(config, init_db=True, module_name=None):
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory
of the project directory and the tokenizer loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config):
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
raise UsageError('Cannot initialize tokenizer.')
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project()
return tokenizer
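Taken together, a typical lifecycle looks roughly as follows; `config` is assumed to be a Configuration object with project_dir and TOKENIZER set (a sketch, not part of the diff):

    # During the one-off import: choose the tokenizer and persist the choice.
    tokenizer = create_tokenizer(config)

    # On every later run (indexing, updates, checks): restore the stored tokenizer.
    tokenizer = get_tokenizer_for_db(config)
    with tokenizer.name_analyzer() as analyzer:
        token_info = analyzer.process_place({'name': {'name': 'Main Street'}})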

View File

@ -0,0 +1,540 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path, src_dir, module_dir):
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return module_dir
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return module_dir
def _check_module(module_dir, conn):
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(module_dir))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.normalization = None
def init_new_db(self, config, init_db=True):
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def finalize_import(self, config):
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config):
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self):
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with statement, the caller must make sure to
call the close() function before the analyzer is destroyed.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def _install_php(self, config):
""" Install the php script for the tokenizer.
"""
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
<?php
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
""".format(config)))
def _init_db_tables(self, config):
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config):
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer:
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn, normalizer):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
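The rule string stored under DBCFG_NORMALIZATION originates from NOMINATIM_TERM_NORMALIZATION. A sketch of the effect with an illustrative rule set (not the shipped default):

    from icu import Transliterator

    # Illustrative rules: decompose, strip accents, lowercase, recompose.
    rules = ":: NFD; :: [:Nonspacing Mark:] Remove; :: Lower; :: NFC;"
    norm = Transliterator.createFromRules("phrase normalizer", rules)
    print(norm.transliterate("Bäcker Straße"))   # -> backer straße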
def add_postcodes_from_db(self):
""" Add postcodes from the location_postcode table to the word table.
"""
with self.conn.cursor() as cur:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM (SELECT distinct(postcode) as pc
FROM location_postcode) x""")
def update_special_phrases(self, phrases):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
psycopg2.extras.execute_values(
cur,
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete:
psycopg2.extras.execute_values(
cur,
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
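The new index is computed purely as set arithmetic over (word, class, type, operator) tuples, so unchanged phrases cause no database writes. A small illustration with made-up phrases:

    norm_phrases = {('pub', 'amenity', 'pub', '-'), ('hotel', 'tourism', 'hotel', 'in')}
    existing_phrases = {('pub', 'amenity', 'pub', '-'), ('zoo', 'tourism', 'zoo', '-')}

    to_add = norm_phrases - existing_phrases      # {('hotel', 'tourism', 'hotel', 'in')}
    to_delete = existing_phrases - norm_phrases   # {('zoo', 'tourism', 'zoo', '-')}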
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s) n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, names, country_code))
def process_place(self, place):
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
if names:
token_info.add_names(self.conn, names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), list(names.values()))
address = place.get('address')
if address:
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
self._add_postcode(value)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
return token_info.data
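For orientation, a sketch of how an address dict is dispatched by process_place(); the key names follow the OSM addr:* conventions handled above, the values are made up:

    place = {
        'name': {'name': 'Green Moss'},          # -> token_info.add_names()
        'address': {
            'postcode': '45001',                 # -> self._add_postcode()
            'housenumber': '23;25',              # -> token_info.add_housenumbers()
            'street': 'Rose Street',             # -> token_info.add_street()
            'city': 'Walltown',                  # -> token_info.add_address_terms()
        },
    }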
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
def _create_postcode_from_db(pcode):
with self.conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
if re.search(r'[:,;]', postcode) is None:
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
self.cache = cache
self.data = {}
def add_names(self, conn, names):
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
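House numbers may arrive as separate tags or as a single ';'- or ','-separated value; both forms collapse into one deduplicated list before create_housenumbers() is called. For example:

    import re

    hnrs = ['4;6', '8, 8']
    simple_list = []
    for hnr in hnrs:
        simple_list.extend(x.strip() for x in re.split(r'[;,]', hnr))
    # simple_list == ['4', '6', '8', '8'] and is deduplicated
    # (in arbitrary order) before being sent to the database.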
def add_street(self, conn, street):
""" Add addr:street match terms.
"""
def _get_street(name):
with conn.cursor() as cur:
return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
self.data['street'] = self.cache.streets.get(street, _get_street)
def add_place(self, conn, place):
""" Add addr:place search and match terms.
"""
def _get_place(name):
with conn.cursor() as cur:
cur.execute("""SELECT (addr_ids_from_name(%s)
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
word_ids_from_name(%s)::text""",
(name, name, name))
return cur.fetchone()
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn, terms):
""" Add additional address terms.
"""
def _get_address_term(name):
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cur.fetchone()
tokens = {}
for key, value in terms:
tokens[key] = self.cache.address_terms.get(value, _get_address_term)
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize=128, init_data=None):
self.data = init_data or OrderedDict()
self.maxsize = maxsize
if init_data is not None and len(init_data) > maxsize:
self.maxsize = len(init_data)
def get(self, key, generator):
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
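Usage sketch for the cache: the generator is only consulted on a miss, and once maxsize is reached the least recently used entry is evicted:

    cache = _LRU(maxsize=2)
    cache.get('a', str.upper)   # miss: calls str.upper('a'), stores 'A'
    cache.get('a', str.upper)   # hit: no generator call
    cache.get('b', str.upper)   # miss: stores 'B'
    cache.get('c', str.upper)   # miss: evicts 'a', stores 'C'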
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up house numbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
# Get postcodes that are already saved
postcodes = OrderedDict()
with conn.cursor() as cur:
cur.execute("""SELECT word FROM word
WHERE class ='place' and type = 'postcode'""")
for row in cur:
postcodes[row[0]] = None
self.postcodes = _LRU(maxsize=32, init_data=postcodes)
def get_housenumber(self, number):
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)

View File

@ -4,10 +4,9 @@ Collection of functions that check if the database is complete and functional.
from enum import Enum from enum import Enum
from textwrap import dedent from textwrap import dedent
import psycopg2
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.errors import UsageError from nominatim.errors import UsageError
from nominatim.tokenizer import factory as tokenizer_factory
CHECKLIST = [] CHECKLIST = []
@ -78,8 +77,7 @@ def check_database(config):
def _get_indexes(conn): def _get_indexes(conn):
indexes = ['idx_word_word_id', indexes = ['idx_place_addressline_address_place_id',
'idx_place_addressline_address_place_id',
'idx_placex_rank_search', 'idx_placex_rank_search',
'idx_placex_rank_address', 'idx_placex_rank_address',
'idx_placex_parent_place_id', 'idx_placex_parent_place_id',
@ -149,7 +147,7 @@ def check_placex_table(conn, config):
@_check(hint="""placex table has no data. Did the import finish sucessfully?""") @_check(hint="""placex table has no data. Did the import finish sucessfully?""")
def check_placex_size(conn, config): # pylint: disable=W0613 def check_placex_size(conn, _):
""" Checking for placex content """ Checking for placex content
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
@ -158,38 +156,30 @@ def check_placex_size(conn, config): # pylint: disable=W0613
return CheckState.OK if cnt > 0 else CheckState.FATAL return CheckState.OK if cnt > 0 else CheckState.FATAL
@_check(hint="""\ @_check(hint="""{msg}""")
The Postgresql extension nominatim.so was not correctly loaded. def check_tokenizer(_, config):
""" Checking that tokenizer works
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
""")
def check_module(conn, config): # pylint: disable=W0613
""" Checking that nominatim.so module is installed
""" """
with conn.cursor() as cur: try:
try: tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
out = cur.scalar("SELECT make_standard_name('a')") except UsageError:
except psycopg2.ProgrammingError as err: return CheckState.FAIL, dict(msg="""\
return CheckState.FAIL, dict(error=str(err)) Cannot load tokenizer. Did the import finish successfully?""")
if out != 'a': result = tokenizer.check_database()
return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
if result is None:
return CheckState.OK return CheckState.OK
return CheckState.FAIL, dict(msg=result)
@_check(hint="""\ @_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed. The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd} To index the remaining entries, run: {index_cmd}
""") """)
def check_indexing(conn, config): # pylint: disable=W0613 def check_indexing(conn, _):
""" Checking indexing status """ Checking indexing status
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
@ -198,7 +188,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
if cnt == 0: if cnt == 0:
return CheckState.OK return CheckState.OK
if conn.index_exists('idx_word_word_id'): if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update. # Likely just an interrupted update.
index_cmd = 'nominatim index' index_cmd = 'nominatim index'
else: else:
@ -214,7 +204,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
Rerun the index creation with: nominatim import --continue db-postprocess Rerun the index creation with: nominatim import --continue db-postprocess
""") """)
def check_database_indexes(conn, config): # pylint: disable=W0613 def check_database_indexes(conn, _):
""" Checking that database indexes are complete """ Checking that database indexes are complete
""" """
missing = [] missing = []
@ -236,7 +226,7 @@ def check_database_indexes(conn, config): # pylint: disable=W0613
Invalid indexes: Invalid indexes:
{indexes} {indexes}
""") """)
def check_database_index_valid(conn, config): # pylint: disable=W0613 def check_database_index_valid(conn, _):
""" Checking that all database indexes are valid """ Checking that all database indexes are valid
""" """
with conn.cursor() as cur: with conn.cursor() as cur:

View File

@ -5,11 +5,10 @@ import logging
import os import os
import selectors import selectors
import subprocess import subprocess
import shutil
from pathlib import Path from pathlib import Path
import psutil import psutil
import psycopg2 import psycopg2.extras
from nominatim.db.connection import connect, get_pg_env from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils from nominatim.db import utils as db_utils
@ -89,49 +88,6 @@ def setup_extensions(conn):
raise UsageError('PostGIS version is too old.') raise UsageError('PostGIS version is too old.')
def install_module(src_dir, project_dir, module_dir, conn=None):
""" Copy the normalization module from src_dir into the project
directory under the '/module' directory. If 'module_dir' is set, then
use the module from there instead and check that it is accessible
for Postgresql.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
If 'conn' is given, then the function also tests if the module
can be accessed via the given database.
"""
if not module_dir:
module_dir = project_dir / 'module'
if not module_dir.exists() or not src_dir.samefile(module_dir):
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
else:
LOG.info('Running from build directory. Leaving database module as is.')
else:
LOG.info("Using custom path for database module at '%s'", module_dir)
if conn is not None:
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(module_dir))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
def import_base_data(dsn, sql_dir, ignore_partitions=False): def import_base_data(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides """ Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist. the background for geocoding. Data is assumed to not yet exist.
@ -205,11 +161,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql') sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None): def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load. """ Truncate all data tables to prepare for a fresh load.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex') cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline') cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area') cur.execute('TRUNCATE location_area')
@ -228,23 +183,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]: for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table) cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None: conn.commit()
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry' _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads): def load_data(dsn, threads):
""" Copy data into the word and placex table. """ Copy data into the word and placex table.
""" """
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector() sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks. # Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1) place_threads = max(1, threads - 1)
@ -306,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
sql.run_sql_file(conn, 'indices.sql', drop=drop) sql.run_sql_file(conn, 'indices.sql', drop=drop)
def create_country_names(conn, config): def create_country_names(conn, tokenizer, languages=None):
""" Create search index for default country names. """ Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
""" """
if languages:
languages = languages.split(',')
def _include_key(key):
return key == 'name' or \
(key.startswith('name:') \
and (not languages or key[5:] in languages))
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""") psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""") cur.execute("""SELECT country_code, name FROM country_name
cur.execute("""SELECT COUNT(*) FROM WHERE country_code is not null""")
(SELECT getorcreate_country(make_standard_name(country_code),
country_code) FROM country_name WHERE country_code is not null) AS x""")
cur.execute("""SELECT COUNT(*) FROM
(SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
FROM country_name WHERE name ? 'name') AS x""")
sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
country_code) FROM (SELECT country_code, skeys(name)
AS k, svals(name) AS v FROM country_name) x WHERE k"""
languages = config.LANGUAGES with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = [code]
if code == 'gb':
names.append('UK')
if code == 'us':
names.append('United States')
# country names (only in languages as provided)
if name:
names.extend((v for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names)
if languages:
sql_statement = "{} IN (".format(sql_statement)
delim = ''
for language in languages.split(','):
sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
delim = ', '
sql_statement = '{})'.format(sql_statement)
else:
sql_statement = "{} LIKE 'name:%'".format(sql_statement)
sql_statement = "{}) v".format(sql_statement)
cur.execute(sql_statement)
conn.commit() conn.commit()
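The language filter only restricts name:XX translations; the plain name tag is always included. A self-contained restatement of _include_key() for languages 'de,en' (illustrative):

    languages = ['de', 'en']

    def _include_key(key):
        return key == 'name' or \
               (key.startswith('name:') and (not languages or key[5:] in languages))

    assert [_include_key(k) for k in ('name', 'name:de', 'name:fr', 'ref')] \
            == [True, True, False, False]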

View File

@ -6,7 +6,8 @@ import logging
from nominatim.db import properties from nominatim.db import properties
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.version import NOMINATIM_VERSION from nominatim.version import NOMINATIM_VERSION
from nominatim.tools import refresh, database_import from nominatim.tools import refresh
from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.errors import UsageError from nominatim.errors import UsageError
LOG = logging.getLogger() LOG = logging.getLogger()
@ -43,11 +44,14 @@ def migrate(config, paths):
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version)) '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
kwargs = dict(conn=conn, config=config, paths=paths) kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs) func(**kwargs)
conn.commit()
has_run_migration = True has_run_migration = True
if has_run_migration: if has_run_migration:
LOG.warning('Updating SQL functions.') LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config) refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version', properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION)) '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@ -108,17 +112,6 @@ def import_status_timestamp_change(conn, **_):
TYPE timestamp with time zone;""") TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def install_database_module_in_project_directory(conn, config, paths, **_):
""" Install database module in project directory.
The database module needs to be present in the project directory
since those were introduced.
"""
database_import.install_module(paths.module_dir, paths.project_dir,
config.DATABASE_MODULE_PATH, conn=conn)
@_migration(3, 5, 0, 99) @_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn, config, **_): def add_nominatim_property_table(conn, config, **_):
""" Add nominatim_property table. """ Add nominatim_property table.
@ -137,6 +130,9 @@ def change_housenumber_transliteration(conn, **_):
The database schema switched from saving raw housenumbers in The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones. placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
""" """
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
@ -173,3 +169,25 @@ def switch_placenode_geometry_index(conn, **_):
and class = 'place' and type != 'postcode' and class = 'place' and type != 'postcode'
and linked_place_id is null""") and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """) cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn, config, **_):
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer.
"""
if properties.get_property(conn, 'tokenizer') is None:
with conn.cursor() as cur:
for table in ('placex', 'location_property_osmline'):
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = 'token_info'""",
(table, ))
if has_column == 0:
cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config)

View File

@ -6,7 +6,7 @@ of artificial postcode centroids.
from nominatim.db.utils import execute_file from nominatim.db.utils import execute_file
from nominatim.db.connection import connect from nominatim.db.connection import connect
def import_postcodes(dsn, project_dir): def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes. """ Set up the initial list of postcodes.
""" """
@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code, SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc, token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry))) ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%' WHERE address ? 'postcode'
and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null AND geometry IS NOT null
GROUP BY country_code, pc GROUP BY country_code, pc
""") """)
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
cur.execute(""" cur.execute("""
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'us', postcode, SELECT nextval('seq_place'), 1, 'us',
token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326) ST_SetSRID(ST_Point(x,y),4326)
FROM us_postcode WHERE postcode NOT IN FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode (SELECT postcode FROM location_postcode
WHERE country_code = 'us') WHERE country_code = 'us')
""") """)
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
cur.execute(""" cur.execute("""
INSERT INTO location_postcode INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry) (place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry SELECT nextval('seq_place'), 1, 'gb',
FROM gb_postcode WHERE postcode NOT IN token_normalized_postcode(postcode), geometry
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode (SELECT postcode FROM location_postcode
WHERE country_code = 'gb') WHERE country_code = 'gb')
""") """)
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
DELETE FROM word WHERE class='place' and type='postcode' DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode) and word NOT IN (SELECT postcode FROM location_postcode)
""") """)
cur.execute("""
SELECT count(getorcreate_postcode_id(v)) FROM
(SELECT distinct(postcode) as v FROM location_postcode) p
""")
conn.commit() conn.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.add_postcodes_from_db()
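token_normalized_postcode() is an SQL helper supplied by the tokenizer; its body is not part of this hunk. A hedged Python model of the behaviour implied by the replaced inline SQL (reject list-like values, otherwise trim and uppercase):

    import re

    def token_normalized_postcode(postcode):
        # Assumption from the replaced SQL and the analyzer code above:
        # postcodes containing separators are treated as invalid.
        if postcode is None or re.search(r'[:,;]', postcode) is not None:
            return None
        return postcode.strip().upper()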

View File

@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
('Default_Language', 'DEFAULT_LANGUAGE', str), ('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool), ('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', str), ('Log_File', 'LOG_FILE', str),
('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool), ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int), ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int), ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool), ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str), ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool), ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str), ('MapIcon_URL', 'MAPICON_URL', str),
) )
@ -175,9 +173,11 @@ def setup_website(basedir, config):
@define('CONST_Debug', $_GET['debug'] ?? false); @define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{0}'); @define('CONST_LibDir', '{0}');
@define('CONST_TokenizerDir', '{2}');
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}'); @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
""".format(config.lib_dir.php, NOMINATIM_VERSION)) """.format(config.lib_dir.php, NOMINATIM_VERSION,
config.project_dir / 'tokenizer'))
for php_name, conf_name, var_type in PHP_CONST_DEFS: for php_name, conf_name, var_type in PHP_CONST_DEFS:
if var_type == bool: if var_type == bool:

View File

@ -24,9 +24,6 @@ class SpecialPhrasesImporterStatistics():
self.tables_deleted = 0 self.tables_deleted = 0
self.tables_ignored = 0 self.tables_ignored = 0
self.global_phrases_invalid = 0 self.global_phrases_invalid = 0
self.global_phrases_added = 0
self.global_phrases_ignored = 0
self.global_phrases_deleted = 0
def _set_lang_values_to_0(self): def _set_lang_values_to_0(self):
""" """
@ -34,8 +31,6 @@ class SpecialPhrasesImporterStatistics():
lang to 0. lang to 0.
""" """
self.lang_phrases_invalid = 0 self.lang_phrases_invalid = 0
self.lang_phrases_added = 0
self.lang_phrases_ignored = 0
def notify_one_phrase_invalid(self): def notify_one_phrase_invalid(self):
""" """
@ -45,29 +40,6 @@ class SpecialPhrasesImporterStatistics():
self.lang_phrases_invalid += 1 self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1 self.global_phrases_invalid += 1
def notify_one_phrase_added(self):
"""
Add +1 to the count of entries
added to the db.
"""
self.lang_phrases_added += 1
self.global_phrases_added += 1
def notify_one_phrase_ignored(self):
"""
Add +1 to the count of ignored
entries as it was already in the db.
"""
self.lang_phrases_ignored += 1
self.global_phrases_ignored += 1
def notify_one_phrase_deleted(self):
"""
Add +1 to the count of phrases deleted
from the database.
"""
self.global_phrases_deleted += 1
def notify_one_table_created(self): def notify_one_table_created(self):
""" """
Add +1 to the count of created tables. Add +1 to the count of created tables.
@ -97,12 +69,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid) LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0: if self.global_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.') LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.global_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.global_phrases_added)
LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
if self.global_phrases_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
LOG.info('- %s tables were ignored as they already exist on the database', LOG.info('- %s tables were ignored as they already exist on the database',
self.tables_ignored) self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created) LOG.info('- %s tables were created', self.tables_created)
@ -126,9 +92,6 @@ class SpecialPhrasesImporterStatistics():
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid) LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0: if self.lang_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.') LOG.info(' Those invalid phrases have been skipped.')
LOG.info('- %s phrases were ignored as they are already in the database',
self.lang_phrases_ignored)
LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
LOG.info('====================================================================') LOG.info('====================================================================')
if self.lang_phrases_invalid > 0: if self.lang_phrases_invalid > 0:

View File

@ -9,7 +9,6 @@ import re
import subprocess import subprocess
import json import json
from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url from nominatim.tools.exec_utils import get_url
@ -33,21 +32,14 @@ class SpecialPhrasesImporter():
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
) )
self.sanity_check_pattern = re.compile(r'^\w+$') self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer", # This set will contain all existing phrases to be added.
self.config.TERM_NORMALIZATION) # It contains tuples with the following format: (label, class, type, operator)
#This set will contain all existing phrases from the word table which self.word_phrases = set()
#no longer exist on the wiki.
#It contains tuples with the following format: (normalized_word, class, type, operator)
self.words_phrases_to_delete = set()
#This set will contain the phrases which still exist from the wiki.
#It is used to prevent duplicates on the wiki by removing them from
#the word_phrases_to_delete only at the end.
self.words_phrases_still_exist = set()
#This set will contain all existing place_classtype tables which don't match any #This set will contain all existing place_classtype tables which don't match any
#special phrases class/type on the wiki. #special phrases class/type on the wiki.
self.table_phrases_to_delete = set() self.table_phrases_to_delete = set()
def import_from_wiki(self, languages=None): def import_from_wiki(self, tokenizer, languages=None):
""" """
Iterate through all specified languages and Iterate through all specified languages and
extract corresponding special phrases from the wiki. extract corresponding special phrases from the wiki.
@ -55,7 +47,6 @@ class SpecialPhrasesImporter():
if languages is not None and not isinstance(languages, list): if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.') raise TypeError('The \'languages\' argument should be of type list.')
self._fetch_existing_words_phrases()
self._fetch_existing_place_classtype_tables() self._fetch_existing_place_classtype_tables()
#Get all languages to process. #Get all languages to process.
@ -71,30 +62,15 @@ class SpecialPhrasesImporter():
self.statistics_handler.notify_current_lang_done(lang) self.statistics_handler.notify_current_lang_done(lang)
self._create_place_classtype_table_and_indexes(class_type_pairs) self._create_place_classtype_table_and_indexes(class_type_pairs)
self._remove_non_existent_phrases_from_db() self._remove_non_existent_tables_from_db()
self.db_connection.commit() self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases)
LOG.warning('Import done.') LOG.warning('Import done.')
self.statistics_handler.notify_import_done() self.statistics_handler.notify_import_done()
def _fetch_existing_words_phrases(self):
"""
Fetch existing special phrases from the word table.
Fill the word_phrases_to_delete set of the class.
"""
#Only extract special phrases terms:
#If class=place and type=house then it is a housenumber term.
#If class=place and type=postcode then it is a postcode term.
word_query = """
SELECT word, class, type, operator FROM word
WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(word_query))
for row in db_cursor:
row[3] = '-' if row[3] is None else row[3]
self.words_phrases_to_delete.add(
(row[0], row[1], row[2], row[3])
)
def _fetch_existing_place_classtype_tables(self): def _fetch_existing_place_classtype_tables(self):
""" """
@ -176,7 +152,6 @@ class SpecialPhrasesImporter():
for match in matches: for match in matches:
phrase_label = match[0].strip() phrase_label = match[0].strip()
normalized_label = self.transliterator.transliterate(phrase_label)
phrase_class = match[1].strip() phrase_class = match[1].strip()
phrase_type = match[2].strip() phrase_type = match[2].strip()
phrase_operator = match[3].strip() phrase_operator = match[3].strip()
@ -198,20 +173,6 @@ class SpecialPhrasesImporter():
): ):
continue continue
#Check if the phrase already exists in the database.
if (
(normalized_label, phrase_class, phrase_type, phrase_operator)
in self.words_phrases_to_delete
):
#Remove this phrase from the ones to delete as it still exist on the wiki.
self.words_phrases_still_exist.add(
(normalized_label, phrase_class, phrase_type, phrase_operator)
)
class_type_pairs.add((phrase_class, phrase_type))
self.statistics_handler.notify_one_phrase_ignored()
#Dont need to add this phrase as it already exists in the word table.
continue
#sanity check, in case somebody added garbage in the wiki #sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type): if not self._check_sanity(lang, phrase_class, phrase_type):
self.statistics_handler.notify_one_phrase_invalid() self.statistics_handler.notify_one_phrase_invalid()
@ -219,35 +180,11 @@ class SpecialPhrasesImporter():
class_type_pairs.add((phrase_class, phrase_type)) class_type_pairs.add((phrase_class, phrase_type))
self._process_amenity( self.word_phrases.add((phrase_label, phrase_class,
phrase_label, normalized_label, phrase_class, phrase_type, phrase_operator))
phrase_type, phrase_operator
)
self.statistics_handler.notify_one_phrase_added()
return class_type_pairs return class_type_pairs
def _process_amenity(self, phrase_label, normalized_label,
phrase_class, phrase_type, phrase_operator):
# pylint: disable-msg=too-many-arguments
"""
Add phrase lookup and corresponding class and
type to the word table based on the operator.
"""
with self.db_connection.cursor() as db_cursor:
if phrase_operator == 'near':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'near')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
elif phrase_operator == 'in':
db_cursor.execute("""SELECT getorcreate_amenityoperator(
make_standard_name(%s), %s, %s, %s, 'in')""",
(phrase_label, normalized_label, phrase_class, phrase_type))
else:
db_cursor.execute("""SELECT getorcreate_amenity(
make_standard_name(%s), %s, %s, %s)""",
(phrase_label, normalized_label, phrase_class, phrase_type))
def _create_place_classtype_table_and_indexes(self, class_type_pairs): def _create_place_classtype_table_and_indexes(self, class_type_pairs):
""" """
@ -339,33 +276,15 @@ class SpecialPhrasesImporter():
.format(Identifier(table_name), .format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER))) Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_phrases_from_db(self): def _remove_non_existent_tables_from_db(self):
""" """
Remove special phrases which no longer exist on the wiki. Remove special phrases which no longer exist on the wiki.
Delete from the word table and delete the place_classtype tables. Delete the place_classtype tables.
""" """
LOG.warning('Cleaning database...') LOG.warning('Cleaning database...')
self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
#Array containing all queries to execute. Contain tuples of format (query, parameters) #Array containing all queries to execute. Contain tuples of format (query, parameters)
queries_parameters = [] queries_parameters = []
#Delete phrases from the word table which are not on the wiki anymore.
for phrase_to_delete in self.words_phrases_to_delete:
self.statistics_handler.notify_one_phrase_deleted()
if phrase_to_delete[3] == '-':
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
queries_parameters.append((query, parameters))
else:
query = """
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
"""
parameters = (phrase_to_delete[0], phrase_to_delete[1],
phrase_to_delete[2], phrase_to_delete[3], )
queries_parameters.append((query, parameters))
#Delete place_classtype tables corresponding to class/type which are not on the wiki anymore #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
for table in self.table_phrases_to_delete: for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted() self.statistics_handler.notify_one_table_deleted()

View File

@ -10,7 +10,7 @@ Version information for Nominatim.
# and must always be increased when there is a change to the database or code # and must always be increased when there is a change to the database or code
# that requires a migration. # that requires a migration.
# Released versions always have a database patch level of 0. # Released versions always have a database patch level of 0.
NOMINATIM_VERSION = (3, 7, 0, 1) NOMINATIM_VERSION = (3, 7, 0, 2)
POSTGRESQL_REQUIRED_VERSION = (9, 3) POSTGRESQL_REQUIRED_VERSION = (9, 3)
POSTGIS_REQUIRED_VERSION = (2, 2) POSTGIS_REQUIRED_VERSION = (2, 2)

View File

@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
# Changing this value requires to run 'nominatim refresh --functions'. # Changing this value requires to run 'nominatim refresh --functions'.
NOMINATIM_DATABASE_MODULE_PATH= NOMINATIM_DATABASE_MODULE_PATH=
# Tokenizer used for normalizing and parsing queries and names.
# The tokenizer is set up during import and cannot be changed afterwards
# without a reimport.
# Currently available tokenizers: legacy
NOMINATIM_TOKENIZER="legacy"
# Number of occurrences of a word before it is considered frequent. # Number of occurrences of a word before it is considered frequent.
# Similar to the concept of stop words. Frequent partial words get ignored # Similar to the concept of stop words. Frequent partial words get ignored
# or handled differently during search. # or handled differently during search.

View File

@ -24,7 +24,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | nameaddress_vector | | object | nameaddress_vector |
| N1 | Rose, Street, Walltown | | N1 | #Rose Street, Walltown |
When searching for "23 Rose Street, Walltown" When searching for "23 Rose Street, Walltown"
Then results contain Then results contain
| osm_type | osm_id | name | | osm_type | osm_id | name |
@ -248,7 +248,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | name_vector | nameaddress_vector | | object | name_vector | nameaddress_vector |
| N1 | #Green Moss | Rose, Street, Walltown | | N1 | #Green Moss | #Rose Street, Walltown |
When searching for "Green Moss, Rose Street, Walltown" When searching for "Green Moss, Rose Street, Walltown"
Then results contain Then results contain
| osm_type | osm_id | name | | osm_type | osm_id | name |
@ -299,7 +299,7 @@ Feature: Creation of search terms
When importing When importing
Then search_name contains Then search_name contains
| object | name_vector | nameaddress_vector | | object | name_vector | nameaddress_vector |
| N1 | foo | the road | | N1 | foo | #the road |
Scenario: Some addr: tags are added to address Scenario: Some addr: tags are added to address
Given the scene roads-with-pois Given the scene roads-with-pois

View File

@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
from nominatim import cli from nominatim import cli
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.tools import refresh from nominatim.tools import refresh
from nominatim.tokenizer import factory as tokenizer_factory
from steps.utils import run_script from steps.utils import run_script
class NominatimEnvironment: class NominatimEnvironment:
@ -106,9 +107,19 @@ class NominatimEnvironment:
self.website_dir.cleanup() self.website_dir.cleanup()
self.website_dir = tempfile.TemporaryDirectory() self.website_dir = tempfile.TemporaryDirectory()
cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env) refresh.setup_website(Path(self.website_dir.name) / 'website',
cfg.lib_dir.php = self.src_dir / 'lib-php' self.get_test_config())
refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
def get_test_config(self):
cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
environ=self.test_env)
cfg.set_libdirs(module=self.build_dir / 'module',
osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
php=self.src_dir / 'lib-php',
sql=self.src_dir / 'lib-sql',
data=self.src_dir / 'data')
return cfg
def get_libpq_dsn(self): def get_libpq_dsn(self):
dsn = self.test_env['NOMINATIM_DATABASE_DSN'] dsn = self.test_env['NOMINATIM_DATABASE_DSN']
@ -169,33 +180,41 @@ class NominatimEnvironment:
""" """
self.write_nominatim_config(self.api_test_db) self.write_nominatim_config(self.api_test_db)
if self.api_db_done: if not self.api_db_done:
return self.api_db_done = True
self.api_db_done = True if not self._reuse_or_drop_db(self.api_test_db):
testdata = Path('__file__') / '..' / '..' / 'testdb'
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
if self._reuse_or_drop_db(self.api_test_db): try:
return self.run_nominatim('import', '--osm-file', str(self.api_test_file))
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
testdata = Path('__file__') / '..' / '..' / 'testdb' phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
except:
self.db_drop_database(self.api_test_db)
raise
try: tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
self.run_nominatim('import', '--osm-file', str(self.api_test_file))
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
except:
self.db_drop_database(self.api_test_db)
raise
def setup_unknown_db(self): def setup_unknown_db(self):
""" Setup a test against a non-existing database. """ Setup a test against a non-existing database.
""" """
self.write_nominatim_config('UNKNOWN_DATABASE_NAME') # The tokenizer needs an existing database to function.
# So start with the usual database
class _Context:
db = None
context = _Context()
self.setup_db(context)
tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
# Then drop the DB again
self.teardown_db(context, force_drop=True)
def setup_db(self, context): def setup_db(self, context):
""" Setup a test against a fresh, empty test database. """ Setup a test against a fresh, empty test database.
@ -212,13 +231,13 @@ class NominatimEnvironment:
context.db.autocommit = True context.db.autocommit = True
psycopg2.extras.register_hstore(context.db, globally=False) psycopg2.extras.register_hstore(context.db, globally=False)
def teardown_db(self, context): def teardown_db(self, context, force_drop=False):
""" Remove the test database, if it exists. """ Remove the test database, if it exists.
""" """
if 'db' in context: if hasattr(context, 'db'):
context.db.close() context.db.close()
if not self.keep_scenario_db: if force_drop or not self.keep_scenario_db:
self.db_drop_database(self.test_db) self.db_drop_database(self.test_db)
def _reuse_or_drop_db(self, name): def _reuse_or_drop_db(self, name):

View File

@ -7,6 +7,7 @@ from place_inserter import PlaceColumn
from table_compare import NominatimID, DBRow from table_compare import NominatimID, DBRow
from nominatim.indexer import indexer from nominatim.indexer import indexer
from nominatim.tokenizer import factory as tokenizer_factory
def check_database_integrity(context): def check_database_integrity(context):
""" Check some generic constraints on the tables. """ Check some generic constraints on the tables.
@ -86,6 +87,9 @@ def add_data_to_planet_ways(context):
def import_and_index_data_from_place_table(context): def import_and_index_data_from_place_table(context):
""" Import data previously set up in the place table. """ Import data previously set up in the place table.
""" """
nctx = context.nominatim
tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
context.nominatim.copy_from_place(context.db) context.nominatim.copy_from_place(context.db)
# XXX use tool function as soon as it is ported # XXX use tool function as soon as it is ported
@ -105,7 +109,7 @@ def import_and_index_data_from_place_table(context):
# Call directly as the refresh function does not include postcodes. # Call directly as the refresh function does not include postcodes.
indexer.LOG.setLevel(logging.ERROR) indexer.LOG.setLevel(logging.ERROR)
indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False) indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
check_database_integrity(context) check_database_integrity(context)
@ -230,7 +234,7 @@ def check_search_name_contents(context, exclude):
if exclude: if exclude:
assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1]) assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
else: else:
assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1]) assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
elif name != 'object': elif name != 'object':
assert db_row.contains(name, value), db_row.assert_msg(name, value) assert db_row.contains(name, value), db_row.assert_msg(name, value)

View File

@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testEmptyPhrase() public function testEmptyPhrase()
{ {
$oPhrase = new Phrase('', ''); $oPhrase = new Phrase('', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array(), new TokensFullSet());
$this->assertEquals( $this->assertNull($oPhrase->getWordSets());
array(array('')),
$oPhrase->getWordSets()
);
} }
public function testSingleWordPhrase() public function testSingleWordPhrase()
{ {
$oPhrase = new Phrase('a', ''); $oPhrase = new Phrase('a', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a)', '(a)',
@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMultiWordPhrase() public function testMultiWordPhrase()
{ {
$oPhrase = new Phrase('a b', ''); $oPhrase = new Phrase('a b', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b),(a|b)', '(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c),(a|b c),(a b|c),(a|b|c)', '(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
); );
$oPhrase = new Phrase('a b c d', ''); $oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
$this->assertEquals( $this->assertEquals(
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testInverseWordSets() public function testInverseWordSets()
{ {
$oPhrase = new Phrase('a b c', ''); $oPhrase = new Phrase('a b c', '');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals( $this->assertEquals(
@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testMaxWordSets() public function testMaxWordSets()
{ {
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); $aWords = array_fill(0, 4, 'a');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets())); $this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); $aWords = array_fill(0, 18, 'a');
$oPhrase->computeWordSets(new TokensFullSet()); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(100, count($oPhrase->getWordSets())); $this->assertEquals(100, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets(); $oPhrase->invertWordSets();
$this->assertEquals(100, count($oPhrase->getWordSets())); $this->assertEquals(100, count($oPhrase->getWordSets()));
@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensShortTerm() public function testPartialTokensShortTerm()
{ {
$oPhrase = new Phrase('a b c d', ''); $oPhrase = new Phrase('a b c d', '');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
$this->assertEquals( $this->assertEquals(
'(a|b c d),(a|b c|d)', '(a|b c d),(a|b c|d)',
$this->serializeSets($oPhrase->getWordSets()) $this->serializeSets($oPhrase->getWordSets())
@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
public function testPartialTokensLongTerm() public function testPartialTokensLongTerm()
{ {
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); $aWords = array_fill(0, 18, 'a');
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a'))); $oPhrase = new Phrase(join(' ', $aWords), '');
$oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
$this->assertEquals(80, count($oPhrase->getWordSets())); $this->assertEquals(80, count($oPhrase->getWordSets()));
} }
} }

View File

@ -2,6 +2,8 @@
namespace Nominatim; namespace Nominatim;
@define('CONST_TokenizerDir', dirname(__FILE__));
require_once(CONST_LibDir.'/DB.php'); require_once(CONST_LibDir.'/DB.php');
require_once(CONST_LibDir.'/Status.php'); require_once(CONST_LibDir.'/Status.php');
@ -40,45 +42,6 @@ class StatusTest extends \PHPUnit\Framework\TestCase
$this->assertEquals('No database', $oStatus->status()); $this->assertEquals('No database', $oStatus->status());
} }
public function testModuleFail()
{
$this->expectException(\Exception::class);
$this->expectExceptionMessage('Module call failed');
$this->expectExceptionCode(702);
// stub has getOne method but doesn't return anything
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('connect', 'getOne'))
->getMock();
$oStatus = new Status($oDbStub);
$this->assertNull($oStatus->status());
}
public function testWordIdQueryFail()
{
$this->expectException(\Exception::class);
$this->expectExceptionMessage('No value');
$this->expectExceptionCode(704);
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('connect', 'getOne'))
->getMock();
// return no word_id
$oDbStub->method('getOne')
->will($this->returnCallback(function ($sql) {
if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
}));
$oStatus = new Status($oDbStub);
$this->assertNull($oStatus->status());
}
public function testOK() public function testOK()
{ {
$oDbStub = $this->getMockBuilder(Nominatim\DB::class) $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
@ -100,7 +63,7 @@ class StatusTest extends \PHPUnit\Framework\TestCase
$oDbStub = $this->getMockBuilder(Nominatim\DB::class) $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getOne')) ->setMethods(array('getOne'))
->getMock(); ->getMock();
$oDbStub->method('getOne') $oDbStub->method('getOne')
->willReturn(1519430221); ->willReturn(1519430221);

View File

@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($TL->contains('unknownword')); $this->assertFalse($TL->contains('unknownword'));
$this->assertEquals(array(), $TL->get('unknownword')); $this->assertEquals(array(), $TL->get('unknownword'));
} }
public function testAddress()
{
$this->expectOutputRegex('/<p><tt>/');
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getAll', 'getDBQuotedList'))
->getMock();
$oDbStub->method('getDBQuotedList')
->will($this->returnCallback(function ($aVals) {
return array_map(function ($sVal) {
return "'".$sVal."'";
}, $aVals);
}));
$oDbStub->method('getAll')
->will($this->returnCallback(function ($sql) {
$aResults = array();
if (preg_match('/1051/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '1051',
'class' => 'place',
'type' => 'house'
));
}
if (preg_match('/hauptstr/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'hauptstr',
'class' => 'place',
'type' => 'street',
'operator' => true
));
}
if (preg_match('/64286/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => '64286',
'word' => '64286',
'class' => 'place',
'type' => 'postcode'
));
}
if (preg_match('/darmstadt/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'darmstadt',
'count' => 533
));
}
if (preg_match('/alemagne/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'alemagne',
'country_code' => 'de',
));
}
if (preg_match('/mexico/', $sql)) {
$aResults[] = $this->wordResult(array(
'word_id' => 999,
'word_token' => 'mexico',
'country_code' => 'mx',
));
}
return $aResults;
}));
$aCountryCodes = array('de', 'fr');
$sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
$aTokens = explode(' ', $sNormQuery);
$TL = new TokenList;
$TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
$this->assertEquals(5, $TL->count());
$this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
$this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
$this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
$this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
$this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
}
} }

View File

@ -0,0 +1,17 @@
<?php
namespace Nominatim;
class Tokenizer
{
private $oDB;
public function __construct(&$oDB)
{
$this->oDB =& $oDB;
}
public function checkStatus()
{
}
}

View File

@ -1,3 +1,4 @@
import importlib
import itertools import itertools
import sys import sys
from pathlib import Path from pathlib import Path
@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
from nominatim.config import Configuration from nominatim.config import Configuration
from nominatim.db import connection from nominatim.db import connection
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.db import properties
import dummy_tokenizer
class _TestingCursor(psycopg2.extras.DictCursor): class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution """ Extension to the DictCursor class that provides execution
@ -117,9 +121,8 @@ def table_factory(temp_db_cursor):
def mk_table(name, definition='id INT', content=None): def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None: if content is not None:
if not isinstance(content, str): psycopg2.extras.execute_values(
content = '),('.join([str(x) for x in content]) temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
return mk_table return mk_table
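With execute_values, the content argument must now be an iterable of row tuples rather than bare values; a minimal usage sketch (table name and values are placeholders):

    table_factory('example_table', 'id INT', ((1, ), (2, ), (3, )))  # one tuple per row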
@ -144,6 +147,11 @@ def tmp_phplib_dir():
yield Path(phpdir) yield Path(phpdir)
@pytest.fixture
def property_table(table_factory):
table_factory('nominatim_properties', 'property TEXT, value TEXT')
@pytest.fixture @pytest.fixture
def status_table(temp_db_conn): def status_table(temp_db_conn):
""" Create an empty version of the status table and """ Create an empty version of the status table and
@ -281,10 +289,29 @@ def osm2pgsql_options(temp_db):
@pytest.fixture @pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.') table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
table_factory('country_name', 'partition INT', (0, 1, 2))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings') cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php', cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
sql=tmp_path, data=SRC_DIR / 'data') sql=tmp_path, data=SRC_DIR / 'data')
return SQLPreprocessor(temp_db_conn, cfg) return SQLPreprocessor(temp_db_conn, cfg)
@pytest.fixture
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
""" Sets up the configuration so that the test dummy tokenizer will be
loaded when the tokenizer factory is used. Also returns a factory
with which a new dummy tokenizer may be created.
"""
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
def _import_dummy(module, *args, **kwargs):
return dummy_tokenizer
monkeypatch.setattr(importlib, "import_module", _import_dummy)
properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
def _create_tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None)
return _create_tokenizer
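The fixture works by intercepting module resolution: once importlib.import_module is patched, whatever tokenizer module the factory asks for resolves to the dummy. A short sketch of the effect while the fixture is active (the dsn is a placeholder):

    import importlib
    mod = importlib.import_module('nominatim.tokenizer.legacy_tokenizer')  # any name works
    tok = mod.create('dbname=placeholder', None)  # -> dummy_tokenizer.DummyTokenizer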

View File

@ -0,0 +1,64 @@
"""
Tokenizer for testing.
"""
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
"""
return DummyTokenizer(dsn, data_dir)
class DummyTokenizer:
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.init_state = None
self.analyser_cache = {}
def init_new_db(self, *args, **kwargs):
assert self.init_state is None
self.init_state = "new"
def init_from_project(self):
assert self.init_state is None
self.init_state = "loaded"
def finalize_import(self, _):
pass
def name_analyzer(self):
return DummyNameAnalyzer(self.analyser_cache)
class DummyNameAnalyzer:
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def __init__(self, cache):
self.analyser_cache = cache
cache['countries'] = []
def close(self):
pass
def add_postcodes_from_db(self):
pass
def update_special_phrases(self, phrases):
self.analyser_cache['special_phrases'] = phrases
def add_country_names(self, code, names):
self.analyser_cache['countries'].append((code, names))
def process_place(self, place):
return {}
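For orientation, a brief sketch of how this stub records calls so tests can assert on them afterwards (dsn and data_dir are placeholders):

    tok = create('dbname=placeholder', None)
    tok.init_new_db()
    with tok.name_analyzer() as analyzer:
        analyzer.update_special_phrases([('bar', 'amenity', 'bar', '-')])
    assert tok.analyser_cache['special_phrases'] == [('bar', 'amenity', 'bar', '-')]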

View File

@ -22,6 +22,7 @@ import nominatim.tools.database_import
import nominatim.tools.freeze import nominatim.tools.freeze
import nominatim.tools.refresh import nominatim.tools.refresh
import nominatim.tools.postcodes import nominatim.tools.postcodes
import nominatim.tokenizer.factory
from mocks import MockParamCapture from mocks import MockParamCapture
@ -56,6 +57,28 @@ def mock_func_factory(monkeypatch):
return get_mock return get_mock
@pytest.fixture
def tokenizer_mock(monkeypatch):
class DummyTokenizer:
def __init__(self, *args, **kwargs):
self.update_sql_functions_called = False
self.finalize_import_called = False
def update_sql_functions(self, *args):
self.update_sql_functions_called = True
def finalize_import(self, *args):
self.finalize_import_called = True
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
lambda *args: tok)
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
lambda *args: tok)
return tok
def test_cli_help(capsys): def test_cli_help(capsys):
""" Running nominatim tool without arguments prints help. """ Running nominatim tool without arguments prints help.
""" """
@ -84,10 +107,9 @@ def test_import_bad_file(temp_db):
assert 1 == call_nominatim('import', '--osm-file', '.') assert 1 == call_nominatim('import', '--osm-file', '.')
def test_import_full(temp_db, mock_func_factory): def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'), mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
mock_func_factory(nominatim.tools.database_import, 'install_module'),
mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'), mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
@ -107,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert 0 == call_nominatim('import', '--osm-file', __file__) assert 0 == call_nominatim('import', '--osm-file', __file__)
assert tokenizer_mock.finalize_import_called
assert cf_mock.called > 1 assert cf_mock.called > 1
@ -114,7 +137,7 @@ def test_import_full(temp_db, mock_func_factory):
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
def test_import_continue_load_data(temp_db, mock_func_factory): def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'load_data'),
@ -127,12 +150,14 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
] ]
assert 0 == call_nominatim('import', '--continue', 'load-data') assert 0 == call_nominatim('import', '--continue', 'load-data')
assert tokenizer_mock.finalize_import_called
for mock in mocks: for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn): def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
temp_db_conn, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@ -153,7 +178,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
assert temp_db_conn.index_exists('idx_placex_pendingsector') assert temp_db_conn.index_exists('idx_placex_pendingsector')
def test_import_continue_postprocess(temp_db, mock_func_factory): def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
mocks = [ mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@ -163,6 +188,8 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
assert 0 == call_nominatim('import', '--continue', 'db-postprocess') assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
assert tokenizer_mock.finalize_import_called
for mock in mocks: for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
@ -217,7 +244,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
(['--boundaries-only'], 1, 0), (['--boundaries-only'], 1, 0),
(['--no-boundaries'], 0, 1), (['--no-boundaries'], 0, 1),
(['--boundaries-only', '--no-boundaries'], 0, 0)]) (['--boundaries-only', '--no-boundaries'], 0, 0)])
def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks): def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
params, do_bnds, do_ranks):
temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)") temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries') bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank') rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
@ -227,7 +255,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
assert bnd_mock.called == do_bnds assert bnd_mock.called == do_bnds
assert rank_mock.called == do_ranks assert rank_mock.called == do_ranks
def test_special_phrases_command(temp_db, mock_func_factory): def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki') func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
call_nominatim('special-phrases', '--import-from-wiki') call_nominatim('special-phrases', '--import-from-wiki')
@ -238,7 +266,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
('postcodes', 'update_postcodes'), ('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'), ('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'), ('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'), ('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'), ('importance', 'recompute_importance'),
('website', 'setup_website'), ('website', 'setup_website'),
@ -250,6 +277,14 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1 assert func_mock.called == 1
def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert 0 == call_nominatim('refresh', '--functions')
assert func_mock.called == 1
assert tokenizer_mock.update_sql_functions_called
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = [] calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',

View File

@ -27,7 +27,29 @@ def call_nominatim(*args):
cli_args=['replication'] + list(args)) cli_args=['replication'] + list(args))
@pytest.fixture @pytest.fixture
def index_mock(monkeypatch): def tokenizer_mock(monkeypatch):
class DummyTokenizer:
def __init__(self, *args, **kwargs):
self.update_sql_functions_called = False
self.finalize_import_called = False
def update_sql_functions(self, *args):
self.update_sql_functions_called = True
def finalize_import(self, *args):
self.finalize_import_called = True
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
lambda *args: tok)
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
lambda *args: tok)
return tok
@pytest.fixture
def index_mock(monkeypatch, tokenizer_mock):
mock = MockParamCapture() mock = MockParamCapture()
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock) monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock) monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@ -52,7 +74,7 @@ def init_status(temp_db_conn, status_table):
@pytest.fixture @pytest.fixture
def update_mock(mock_func_factory, init_status): def update_mock(mock_func_factory, init_status, tokenizer_mock):
return mock_func_factory(nominatim.tools.replication, 'update') return mock_func_factory(nominatim.tools.replication, 'update')
@pytest.mark.parametrize("params,func", [ @pytest.mark.parametrize("params,func", [

View File

@ -24,7 +24,6 @@ def sql_factory(tmp_path):
("'{{db.partitions|join}}'", '012'), ("'{{db.partitions|join}}'", '012'),
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"), ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"), ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
("'{{config.DATABASE_MODULE_PATH}}'", '.')
]) ])
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret): def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
sqlfile = sql_factory("RETURN {};".format(expr)) sqlfile = sql_factory("RETURN {};".format(expr))

View File

@ -6,6 +6,7 @@ import psycopg2
import pytest import pytest
from nominatim.indexer import indexer from nominatim.indexer import indexer
from nominatim.tokenizer import factory
class IndexerTestDB: class IndexerTestDB:
@ -17,6 +18,7 @@ class IndexerTestDB:
self.conn = conn self.conn = conn
self.conn.set_isolation_level(0) self.conn.set_isolation_level(0)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute('CREATE EXTENSION hstore')
cur.execute("""CREATE TABLE placex (place_id BIGINT, cur.execute("""CREATE TABLE placex (place_id BIGINT,
class TEXT, class TEXT,
type TEXT, type TEXT,
@ -26,9 +28,14 @@ class IndexerTestDB:
indexed_date TIMESTAMP, indexed_date TIMESTAMP,
partition SMALLINT, partition SMALLINT,
admin_level SMALLINT, admin_level SMALLINT,
address HSTORE,
token_info JSONB,
geometry_sector INTEGER)""") geometry_sector INTEGER)""")
cur.execute("""CREATE TABLE location_property_osmline ( cur.execute("""CREATE TABLE location_property_osmline (
place_id BIGINT, place_id BIGINT,
osm_id BIGINT,
address HSTORE,
token_info JSONB,
indexed_status SMALLINT, indexed_status SMALLINT,
indexed_date TIMESTAMP, indexed_date TIMESTAMP,
geometry_sector INTEGER)""") geometry_sector INTEGER)""")
@ -46,6 +53,25 @@ class IndexerTestDB:
END IF; END IF;
RETURN NEW; RETURN NEW;
END; $$ LANGUAGE plpgsql;""") END; $$ LANGUAGE plpgsql;""")
cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR)
AS $$
BEGIN
address := p.address;
name := p.address;
END;
$$ LANGUAGE plpgsql STABLE;
""")
cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
RETURNS HSTORE AS $$
BEGIN
RETURN in_address;
END;
$$ LANGUAGE plpgsql STABLE;
""")
for table in ('placex', 'location_property_osmline', 'location_postcode'): for table in ('placex', 'location_property_osmline', 'location_postcode'):
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0} cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
FOR EACH ROW EXECUTE PROCEDURE date_update() FOR EACH ROW EXECUTE PROCEDURE date_update()
@ -76,9 +102,9 @@ class IndexerTestDB:
next_id = next(self.osmline_id) next_id = next(self.osmline_id)
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
cur.execute("""INSERT INTO location_property_osmline cur.execute("""INSERT INTO location_property_osmline
(place_id, indexed_status, geometry_sector) (place_id, osm_id, indexed_status, geometry_sector)
VALUES (%s, 1, %s)""", VALUES (%s, %s, 1, %s)""",
(next_id, sector)) (next_id, next_id, sector))
return next_id return next_id
def add_postcode(self, country, postcode): def add_postcode(self, country, postcode):
@ -102,8 +128,14 @@ def test_db(temp_db_conn):
yield IndexerTestDB(temp_db_conn) yield IndexerTestDB(temp_db_conn)
@pytest.fixture
def test_tokenizer(tokenizer_mock, def_config, tmp_path):
def_config.project_dir = tmp_path
return factory.create_tokenizer(def_config)
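All indexer tests below then share the same shape: the tokenizer is passed explicitly to the Indexer constructor, e.g.:

    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 1)
    idx.index_full(analyse=False)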
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_all_by_rank(test_db, threads): def test_index_all_by_rank(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -111,7 +143,7 @@ def test_index_all_by_rank(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(0, 30) idx.index_by_rank(0, 30)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()
@ -142,7 +174,7 @@ def test_index_all_by_rank(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_without_30(test_db, threads): def test_index_partial_without_30(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -150,7 +182,8 @@ def test_index_partial_without_30(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
test_tokenizer, threads)
idx.index_by_rank(4, 15) idx.index_by_rank(4, 15)
assert 19 == test_db.placex_unindexed() assert 19 == test_db.placex_unindexed()
@ -162,7 +195,7 @@ def test_index_partial_without_30(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_with_30(test_db, threads): def test_index_partial_with_30(test_db, threads, test_tokenizer):
for rank in range(31): for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank) test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline() test_db.add_osmline()
@ -170,7 +203,7 @@ def test_index_partial_with_30(test_db, threads):
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30) idx.index_by_rank(28, 30)
assert 27 == test_db.placex_unindexed() assert 27 == test_db.placex_unindexed()
@ -181,7 +214,7 @@ def test_index_partial_with_30(test_db, threads):
WHERE indexed_status = 0 AND rank_address between 1 and 27""") WHERE indexed_status = 0 AND rank_address between 1 and 27""")
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_boundaries(test_db, threads): def test_index_boundaries(test_db, threads, test_tokenizer):
for rank in range(4, 10): for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank) test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31): for rank in range(31):
@ -191,7 +224,7 @@ def test_index_boundaries(test_db, threads):
assert 37 == test_db.placex_unindexed() assert 37 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed() assert 1 == test_db.osmline_unindexed()
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_boundaries(0, 30) idx.index_boundaries(0, 30)
assert 31 == test_db.placex_unindexed() assert 31 == test_db.placex_unindexed()
@ -203,13 +236,13 @@ def test_index_boundaries(test_db, threads):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_postcodes(test_db, threads): def test_index_postcodes(test_db, threads, test_tokenizer):
for postcode in range(1000): for postcode in range(1000):
test_db.add_postcode('de', postcode) test_db.add_postcode('de', postcode)
for postcode in range(32000, 33000): for postcode in range(32000, 33000):
test_db.add_postcode('us', postcode) test_db.add_postcode('us', postcode)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_postcodes() idx.index_postcodes()
assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
@ -217,7 +250,7 @@ def test_index_postcodes(test_db, threads):
@pytest.mark.parametrize("analyse", [True, False]) @pytest.mark.parametrize("analyse", [True, False])
def test_index_full(test_db, analyse): def test_index_full(test_db, analyse, test_tokenizer):
for rank in range(4, 10): for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank) test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31): for rank in range(31):
@ -226,7 +259,7 @@ def test_index_full(test_db, analyse):
for postcode in range(1000): for postcode in range(1000):
test_db.add_postcode('de', postcode) test_db.add_postcode('de', postcode)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
idx.index_full(analyse=analyse) idx.index_full(analyse=analyse)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()
@ -236,13 +269,13 @@ def test_index_full(test_db, analyse):
@pytest.mark.parametrize("threads", [1, 15]) @pytest.mark.parametrize("threads", [1, 15])
def test_index_reopen_connection(test_db, threads, monkeypatch): def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15) monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
for _ in range(1000): for _ in range(1000):
test_db.add_place(rank_address=30, rank_search=30) test_db.add_place(rank_address=30, rank_search=30)
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads) idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30) idx.index_by_rank(28, 30)
assert 0 == test_db.placex_unindexed() assert 0 == test_db.placex_unindexed()

View File

@ -0,0 +1,77 @@
"""
Tests for creating new tokenizers.
"""
import importlib
import pytest
from nominatim.db import properties
from nominatim.tokenizer import factory
from nominatim.errors import UsageError
from dummy_tokenizer import DummyTokenizer
@pytest.fixture
def test_config(def_config, tmp_path):
def_config.project_dir = tmp_path
return def_config
def test_setup_dummy_tokenizer(temp_db_conn, test_config,
tokenizer_mock, property_table):
tokenizer = factory.create_tokenizer(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "new"
assert (test_config.project_dir / 'tokenizer').is_dir()
assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
(test_config.project_dir / 'tokenizer').mkdir()
tokenizer = factory.create_tokenizer(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "new"
def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
(test_config.project_dir / 'tokenizer').write_text("foo")
with pytest.raises(UsageError):
factory.create_tokenizer(test_config)
def test_setup_bad_tokenizer_name(test_config, monkeypatch):
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
with pytest.raises(UsageError):
factory.create_tokenizer(test_config)
def test_load_tokenizer(temp_db_conn, test_config,
tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
tokenizer = factory.get_tokenizer_for_db(test_config)
assert isinstance(tokenizer, DummyTokenizer)
assert tokenizer.init_state == "loaded"
def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
test_config.project_dir = test_config.project_dir / 'foo'
with pytest.raises(UsageError):
factory.get_tokenizer_for_db(test_config)
def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock, property_table):
factory.create_tokenizer(test_config)
temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
with pytest.raises(UsageError):
factory.get_tokenizer_for_db(test_config)
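Taken together, these tests pin down the factory round-trip contract; schematically (with the dummy fixture active):

    factory.create_tokenizer(config)            # creates <project_dir>/tokenizer and
                                                # stores the property 'tokenizer' = 'dummy'
    tok = factory.get_tokenizer_for_db(config)  # reads the property back
    assert tok.init_state == 'loaded'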

View File

@ -0,0 +1,298 @@
"""
Test for legacy tokenizer.
"""
import shutil
import pytest
from nominatim.tokenizer import legacy_tokenizer
from nominatim.db import properties
from nominatim.errors import UsageError
@pytest.fixture
def test_config(def_config, tmp_path):
def_config.project_dir = tmp_path / 'project'
def_config.project_dir.mkdir()
module_dir = tmp_path / 'module_src'
module_dir.mkdir()
(module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
def_config.lib_dir.module = module_dir
sqldir = tmp_path / 'sql'
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
(sqldir / 'words.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir
def_config.lib_dir.data = sqldir
return def_config
@pytest.fixture
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
(tmp_path / 'tokenizer').mkdir()
def _maker():
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
return _maker
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
@pytest.fixture
def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
word_table, temp_db_with_extensions, tmp_path):
sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
sql.write_text("""
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
""")
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
tok = tokenizer_factory()
tok.init_new_db(test_config)
monkeypatch.undo()
with tok.name_analyzer() as analyzer:
yield analyzer
@pytest.fixture
def make_standard_name(temp_db_cursor):
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
@pytest.fixture
def create_postcode_id(table_factory, temp_db_cursor):
table_factory('out_postcode_table', 'postcode TEXT')
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
RETURNS BOOLEAN AS $$
INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
$$ LANGUAGE SQL""")
@pytest.fixture
def create_housenumbers(temp_db_cursor):
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
housenumbers TEXT[],
OUT tokens TEXT, OUT normtext TEXT)
AS $$
SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
$$ LANGUAGE SQL""")
@pytest.fixture
def make_keywords(temp_db_cursor, temp_db_with_extensions):
temp_db_cursor.execute(
"""CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
def test_init_new(tokenizer_factory, test_config, monkeypatch,
temp_db_conn, sql_preprocessor):
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
outfile = test_config.project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_init_module_load_failed(tokenizer_factory, test_config,
monkeypatch, temp_db_conn):
tok = tokenizer_factory()
with pytest.raises(UsageError):
tok.init_new_db(test_config)
def test_init_module_custom(tokenizer_factory, test_config,
monkeypatch, tmp_path, sql_preprocessor):
module_dir = (tmp_path / 'custom').resolve()
module_dir.mkdir()
(module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
assert not (test_config.project_dir / 'module').exists()
def test_init_from_project(tokenizer_setup, tokenizer_factory):
tok = tokenizer_factory()
tok.init_from_project()
assert tok.normalization is not None
def test_update_sql_functions(sql_preprocessor, temp_db_conn,
tokenizer_factory, test_config, table_factory,
monkeypatch, temp_db_cursor):
monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
monkeypatch.undo()
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
table_factory('test', 'txt TEXT')
func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
('{{modulepath}}')""")
tok.update_sql_functions(test_config)
test_content = temp_db_cursor.row_set('SELECT * FROM test')
assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
tok = tokenizer_factory()
tok.migrate_database(test_config)
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
outfile = test_config.project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_normalize(analyzer):
assert analyzer.normalize('TEsT') == 'test'
def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
create_postcode_id):
table_factory('location_postcode', 'postcode TEXT',
content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
analyzer.add_postcodes_from_db()
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
== set((('1234', ), ('12 34', ), ('AB23',)))
def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
make_standard_name):
analyzer.update_special_phrases([
("König bei", "amenity", "royal", "near"),
("Könige", "amenity", "royal", "-"),
("strasse", "highway", "primary", "in")
])
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
FROM word WHERE class != 'place'""") \
== set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
(' könige', 'könige', 'amenity', 'royal', None),
(' strasse', 'strasse', 'highway', 'primary', 'in')))
def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
make_standard_name):
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', null)""")
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
analyzer.update_special_phrases([])
assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
make_standard_name):
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', null)""")
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
analyzer.update_special_phrases([
('prison', 'amenity', 'prison', 'in'),
('bar', 'highway', 'road', '-'),
('garden', 'leisure', 'garden', 'near')
])
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
FROM word WHERE class != 'place'""") \
== set(((' prison', 'prison', 'amenity', 'prison', 'in'),
(' bar', 'bar', 'highway', 'road', None),
(' garden', 'garden', 'leisure', 'garden', 'near')))
def test_process_place_names(analyzer, make_keywords):
info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
assert info['names'] == '{1,2,3}'
@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
info = analyzer.process_place({'address': {'postcode' : pc}})
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
== set(((pc, ),))
@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
pc):
info = analyzer.process_place({'address': {'postcode' : pc}})
assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
info = analyzer.process_place({'address': {'housenumber' : hnr}})
assert info['hnr'] == hnr
assert info['hnr_tokens'].startswith("{")
def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
info = analyzer.process_place({'address': {'housenumber' : '134',
'conscriptionnumber' : '134',
'streetnumber' : '99a'}})
assert set(info['hnr'].split(';')) == set(('134', '99a'))
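The analyzer tests above all go through the context-manager pattern set up in the analyzer fixture; condensed:

    with tok.name_analyzer() as analyzer:
        info = analyzer.process_place({'address': {'housenumber': '134'}})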

View File

@ -43,8 +43,22 @@ def test_check_placex_table_size_bad(temp_db_cursor, temp_db_conn, def_config):
assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
def test_check_module_bad(temp_db_conn, def_config): def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL def_config.project_dir = tmp_path
assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
("Something wrong", chkdb.CheckState.FAIL)])
def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
check_result, state):
class _TestTokenizer:
def check_database(self):
return check_result
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
lambda *a, **k: _TestTokenizer())
assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config): def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):

View File

@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
database_import.setup_extensions(temp_db_conn) database_import.setup_extensions(temp_db_conn)
def test_install_module(tmp_path):
src_dir = tmp_path / 'source'
src_dir.mkdir()
(src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
project_dir = tmp_path / 'project'
project_dir.mkdir()
database_import.install_module(src_dir, project_dir, '')
outfile = project_dir / 'module' / 'nominatim.so'
assert outfile.exists()
assert outfile.read_text() == 'TEST nomiantim.so'
assert outfile.stat().st_mode == 33261
def test_install_module_custom(tmp_path):
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
assert not (tmp_path / 'module').exists()
def test_install_module_fail_access(temp_db_conn, tmp_path):
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
database_import.install_module(tmp_path, tmp_path, '',
conn=temp_db_conn)
def test_import_base_data(src_dir, temp_db, temp_db_cursor): def test_import_base_data(src_dir, temp_db, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION hstore') temp_db_cursor.execute('CREATE EXTENSION hstore')
temp_db_cursor.execute('CREATE EXTENSION postgis') temp_db_cursor.execute('CREATE EXTENSION postgis')
@ -171,14 +138,15 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory): def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
tables = ('word', 'placex', 'place_addressline', 'location_area', tables = ('placex', 'place_addressline', 'location_area',
'location_area_country', 'location_area_country',
'location_property_tiger', 'location_property_osmline', 'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23') 'location_postcode', 'search_name', 'location_road_23')
for table in tables: for table in tables:
table_factory(table, content=(1, 2, 3)) table_factory(table, content=((1, ), (2, ), (3, )))
assert temp_db_cursor.table_rows(table) == 3
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23) database_import.truncate_data_tables(temp_db_conn)
for table in tables: for table in tables:
assert temp_db_cursor.table_rows(table) == 0 assert temp_db_cursor.table_rows(table) == 0
@ -196,36 +164,33 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
place_row(osm_type='W', osm_id=342, cls='place', typ='houses', place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)') geom='SRID=4326;LINESTRING(0 0, 10 10)')
database_import.load_data(dsn, src_dir / 'data', threads) database_import.load_data(dsn, threads)
assert temp_db_cursor.table_rows('placex') == 30 assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1 assert temp_db_cursor.table_rows('location_property_osmline') == 1
@pytest.mark.parametrize("languages", (False, True))
def test_create_country_names(temp_db_conn, temp_db_cursor, def_config, @pytest.mark.parametrize("languages", (None, ' fr,en'))
temp_db_with_extensions, monkeypatch, languages): def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
table_factory, tokenizer_mock, languages):
table_factory('country_name', 'country_code varchar(2), name hstore',
content=(('us', '"name"=>"us1","name:af"=>"us2"'),
('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
tokenizer = tokenizer_mock()
database_import.create_country_names(temp_db_conn, tokenizer, languages)
assert len(tokenizer.analyser_cache['countries']) == 2
result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
if languages: if languages:
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en') assert result_set == {'us' : set(('us', 'us1', 'United States')),
temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT) 'fr' : set(('fr', 'Fra', 'Fren'))}
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
""")
temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
'"name"=>"us","name:af"=>"us"')""")
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
BEGIN
INSERT INTO word VALUES (lookup_country_code);
RETURN 5;
END;
$$
LANGUAGE plpgsql;
""")
database_import.create_country_names(temp_db_conn, def_config)
if languages:
assert temp_db_cursor.table_rows('word') == 4
else: else:
assert temp_db_cursor.table_rows('word') == 5 assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
'fr' : set(('fr', 'Fra', 'Fren'))}

View File

@ -11,41 +11,6 @@ from nominatim.tools import SpecialPhrasesImporter
TEST_BASE_DIR = Path(__file__) / '..' / '..' TEST_BASE_DIR = Path(__file__) / '..' / '..'
def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
temp_db_cursor):
"""
Check for the fetch_existing_words_phrases() method.
It should return the special phrase terms added to the word
table.
"""
query = """
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
'class', 'type', null, 0, 'near');
"""
temp_db_cursor.execute(query)
assert not special_phrases_importer.words_phrases_to_delete
special_phrases_importer._fetch_existing_words_phrases()
contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
@pytest.mark.parametrize("house_type", ['house', 'postcode'])
def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
house_type, temp_db_cursor):
"""
Check for the fetch_existing_words_phrases() method.
It should return nothing, as the added terms correspond
to housenumber and postcode terms.
"""
query ="""
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
'place', %s, null, 0, 'near');
"""
temp_db_cursor.execute(query, (house_type,))
special_phrases_importer._fetch_existing_words_phrases()
assert not special_phrases_importer.words_phrases_to_delete
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor): def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
""" """
Check for the fetch_existing_place_classtype_tables() method. Check for the fetch_existing_place_classtype_tables() method.
@@ -118,41 +83,11 @@ def test_convert_settings_giving_json(special_phrases_importer):
         the same path is directly returned
     """
     json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()

     returned = special_phrases_importer._convert_php_settings_if_needed(json_file)

     assert returned == json_file

-def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
-                                       temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenityoperator() SQL function and that
-        the 2 differents operators are well handled.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', 'near')
-    special_phrases_importer._process_amenity('', '', '', '', 'in')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
-        results = temp_db_cursor.fetchall()
-
-    assert len(results) == 2
-
-def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
-                                          temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenity() SQL function.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', '')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
-        result = temp_db_cursor.fetchone()
-
-    assert result
-
 def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
     """
         Test that _create_place_classtype_indexes() create the
@@ -215,8 +150,7 @@ def test_create_place_classtype_table_and_indexes(
         assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
         assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])

-def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
-                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
     """
         Test that _process_xml_content() process the given xml content right
        by executing the right SQL functions for amenities and
@@ -228,11 +162,9 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
     #Converted output set to a dict for easy assert further.
     results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))

-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert results[class_test] and type_test in results.values()

-def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
                                              temp_db_conn):
     """
         Check for the remove_non_existent_phrases_from_db() method.
@@ -245,22 +177,10 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
         be deleted.
     """
     with temp_db_conn.cursor() as temp_db_cursor:
-        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
-        to_keep_phrase_tuple = (
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        )
-        special_phrases_importer.words_phrases_to_delete = {
-            to_delete_phrase_tuple,
-            to_keep_phrase_tuple
-        }
-        special_phrases_importer.words_phrases_still_exist = {
-            to_keep_phrase_tuple
-        }
         special_phrases_importer.table_phrases_to_delete = {
             'place_classtype_testclasstypetable_to_delete'
         }

-        query_words = 'SELECT word, class, type, operator FROM word;'
         query_tables = """
             SELECT table_name
             FROM information_schema.tables
@@ -268,21 +188,16 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
             AND table_name like 'place_classtype_%';
         """

-        special_phrases_importer._remove_non_existent_phrases_from_db()
+        special_phrases_importer._remove_non_existent_tables_from_db()

-        temp_db_cursor.execute(query_words)
-        words_result = temp_db_cursor.fetchall()
         temp_db_cursor.execute(query_tables)
         tables_result = temp_db_cursor.fetchall()
-        assert len(words_result) == 1 and words_result[0] == [
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        ]
         assert (len(tables_result) == 1 and
                 tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
                )

-def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
-                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
+                          placex_table, tokenizer_mock):
     """
         Check that the main import_from_wiki() method is well executed.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -294,17 +209,14 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     #what is deleted and what is preserved.
     with temp_db_conn.cursor() as temp_db_cursor:
         temp_db_cursor.execute("""
-            INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
-            'amenity', 'animal_shelter', null, 0, null);
-            INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
-            'wrong_class', 'wrong_type', null, 0, 'near');
             CREATE TABLE place_classtype_amenity_animal_shelter();
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")

     monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
-    special_phrases_importer.import_from_wiki(['en'])
+    tokenizer = tokenizer_mock()
+    special_phrases_importer.import_from_wiki(tokenizer, ['en'])
+
+    assert len(tokenizer.analyser_cache['special_phrases']) == 18

     class_test = 'aerialway'
     type_test = 'zip_line'
@@ -312,22 +224,12 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     assert check_table_exist(temp_db_conn, class_test, type_test)
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
     assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')

     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()

-    #Used to check that the correct phrase already in the word table before is still there.
-    query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
-    queries_tests.add((query_correct_word, True))
-
-    #Used to check if wrong phrase was deleted from the word table of the database.
-    query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
-    queries_tests.add((query_wrong_word, False))
-
     #Used to check that correct place_classtype table already in the datase before is still there.
     query_existing_table = """
         SELECT table_name
@@ -412,24 +314,6 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
         temp_db_conn.index_exists(index_prefix + 'place_id')
     )

-def check_amenities_with_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenityoperator()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
-def check_amenities_without_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenity()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
 @pytest.fixture
 def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
     """
@@ -453,48 +337,7 @@ def temp_phplib_dir_with_migration():
         yield Path(phpdir)

 @pytest.fixture
-def default_phrases(word_table, temp_db_cursor):
+def default_phrases(temp_db_cursor):
     temp_db_cursor.execute("""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
-        'class_exists', 'type_exists', null, 0, 'near');
-
         CREATE TABLE place_classtype_testclasstypetable_to_delete();
         CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
-
-@pytest.fixture
-def make_strandard_name_func(temp_db_cursor):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
-        BEGIN
-            RETURN trim(name); --Basically return only the trimed name for the tests
-        END;
-        $$ LANGUAGE plpgsql IMMUTABLE;""")
-
-@pytest.fixture
-def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                                       lookup_class text, lookup_type text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, 'no_operator');
-        END;
-        $$ LANGUAGE plpgsql""")
-
-@pytest.fixture
-def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE TABLE temp_with_operator(op TEXT);
-
-        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
-                                                               lookup_class text, lookup_type text, op text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, op);
-        END;
-        $$ LANGUAGE plpgsql""")

View File

@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest

 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)

 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                       postcode TEXT,
                       geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)

-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

     assert temp_db_cursor.table_exists('gb_postcode')
     assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0

-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
         VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
         """)
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                              ST_X(geometry), ST_Y(geometry)
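The postcode fixture above replaces the old getorcreate_postcode_id() stub,
which faked an INTEGER word id, with a token_normalized_postcode() pass-through
returning TEXT: token creation now happens in the tokenizer, so the SQL side is
only expected to normalize. A quick sanity check of that stub (a hypothetical
illustration using the fixtures above, not a test from this commit) could read:

    def test_token_normalized_postcode_stub(postcode_table, temp_db_cursor):
        # The fixture's stub simply echoes the postcode back unchanged.
        assert temp_db_cursor.scalar(
            "SELECT token_normalized_postcode('EC1R 3HW')") == 'EC1R 3HW'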

View File

@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
     return tmp_path

 @pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
     return temp_db_conn

View File

@@ -26,6 +26,7 @@ def test_script(envdir):
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir

     refresh.setup_website(envdir, config)

     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',

View File

@@ -1,120 +1,170 @@
-SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
-SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
-SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
-SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
-SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
-SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
-SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
-SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
+CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+                                                    lookup_class text, lookup_type text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and word = normalized_word
+          and class = lookup_class and type = lookup_type
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
+                                                            normalized_word TEXT,
+                                                            lookup_class text,
+                                                            lookup_type text,
+                                                            op text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and word = normalized_word
+          and class = lookup_class and type = lookup_type and operator = op
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0, op);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
+SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
+SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
 CREATE INDEX idx_placex_classtype ON placex (class, type);CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
@@ -175,4 +225,7 @@ CREATE TABLE place_classtype_memorial_war_memorial AS SELECT place_id AS place_i
 CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classtype_memorial_war_memorial USING GIST (centroid);
 CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
 GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
 DROP INDEX idx_placex_classtype;
+DROP FUNCTION test_getorcreate_amenity;
+DROP FUNCTION test_getorcreate_amenityoperator;