mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-10 11:42:13 +03:00
Merge pull request #2305 from lonvia/tokenizer
Factor out normalization into a separate module
This commit is contained in:
commit
8b1a509442
@ -8,12 +8,14 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
|
||||
require_once(CONST_LibDir.'/SearchDescription.php');
|
||||
require_once(CONST_LibDir.'/SearchContext.php');
|
||||
require_once(CONST_LibDir.'/TokenList.php');
|
||||
require_once(CONST_TokenizerDir.'/tokenizer.php');
|
||||
|
||||
class Geocode
|
||||
{
|
||||
protected $oDB;
|
||||
|
||||
protected $oPlaceLookup;
|
||||
protected $oTokenizer;
|
||||
|
||||
protected $aLangPrefOrder = array();
|
||||
|
||||
@ -41,23 +43,12 @@ class Geocode
|
||||
protected $sQuery = false;
|
||||
protected $aStructuredQuery = false;
|
||||
|
||||
protected $oNormalizer = null;
|
||||
|
||||
|
||||
public function __construct(&$oDB)
|
||||
{
|
||||
$this->oDB =& $oDB;
|
||||
$this->oPlaceLookup = new PlaceLookup($this->oDB);
|
||||
$this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
|
||||
}
|
||||
|
||||
private function normTerm($sTerm)
|
||||
{
|
||||
if ($this->oNormalizer === null) {
|
||||
return $sTerm;
|
||||
}
|
||||
|
||||
return $this->oNormalizer->transliterate($sTerm);
|
||||
$this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
|
||||
}
|
||||
|
||||
public function setLanguagePreference($aLangPref)
|
||||
@ -510,12 +501,10 @@ class Geocode
|
||||
if ($this->aCountryCodes) {
|
||||
$oCtx->setCountryList($this->aCountryCodes);
|
||||
}
|
||||
$this->oTokenizer->setCountryRestriction($this->aCountryCodes);
|
||||
|
||||
Debug::newSection('Query Preprocessing');
|
||||
|
||||
$sNormQuery = $this->normTerm($this->sQuery);
|
||||
Debug::printVar('Normalized query', $sNormQuery);
|
||||
|
||||
$sLanguagePrefArraySQL = $this->oDB->getArraySQL(
|
||||
$this->oDB->getDBQuotedList($this->aLangPrefOrder)
|
||||
);
|
||||
@ -569,108 +558,55 @@ class Geocode
|
||||
}
|
||||
|
||||
if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
|
||||
$sSpecialTerm = pg_escape_string($sSpecialTerm);
|
||||
$sToken = $this->oDB->getOne(
|
||||
'SELECT make_standard_name(:term)',
|
||||
array(':term' => $sSpecialTerm),
|
||||
'Cannot decode query. Wrong encoding?'
|
||||
);
|
||||
$sSQL = 'SELECT class, type FROM word ';
|
||||
$sSQL .= ' WHERE word_token in (\' '.$sToken.'\')';
|
||||
$sSQL .= ' AND class is not null AND class not in (\'place\')';
|
||||
$aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
$aSearchWords = $this->oDB->getAll($sSQL);
|
||||
if (!empty($aTokens)) {
|
||||
$aNewSearches = array();
|
||||
foreach ($aSearches as $oSearch) {
|
||||
foreach ($aSearchWords as $aSearchTerm) {
|
||||
foreach ($aTokens as $oToken) {
|
||||
$oNewSearch = clone $oSearch;
|
||||
$oNewSearch->setPoiSearch(
|
||||
Operator::TYPE,
|
||||
$aSearchTerm['class'],
|
||||
$aSearchTerm['type']
|
||||
$oToken->iOperator,
|
||||
$oToken->sClass,
|
||||
$oToken->sType
|
||||
);
|
||||
$aNewSearches[] = $oNewSearch;
|
||||
}
|
||||
}
|
||||
$aSearches = $aNewSearches;
|
||||
}
|
||||
}
|
||||
|
||||
// Split query into phrases
|
||||
// Commas are used to reduce the search space by indicating where phrases split
|
||||
$aPhrases = array();
|
||||
if ($this->aStructuredQuery) {
|
||||
$aInPhrases = $this->aStructuredQuery;
|
||||
foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
|
||||
$aPhrases[] = new Phrase($sPhrase, $iPhrase);
|
||||
}
|
||||
} else {
|
||||
$aInPhrases = explode(',', $sQuery);
|
||||
foreach (explode(',', $sQuery) as $sPhrase) {
|
||||
$aPhrases[] = new Phrase($sPhrase, '');
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printDebugArray('Search context', $oCtx);
|
||||
Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
|
||||
Debug::printVar('Final query phrases', $aInPhrases);
|
||||
|
||||
// Convert each phrase to standard form
|
||||
// Create a list of standard words
|
||||
// Get all 'sets' of words
|
||||
// Generate a complete list of all
|
||||
Debug::newSection('Tokenization');
|
||||
$aTokens = array();
|
||||
$aPhrases = array();
|
||||
foreach ($aInPhrases as $iPhrase => $sPhrase) {
|
||||
$sPhrase = $this->oDB->getOne(
|
||||
'SELECT make_standard_name(:phrase)',
|
||||
array(':phrase' => $sPhrase),
|
||||
'Cannot normalize query string (is it a UTF-8 string?)'
|
||||
);
|
||||
if (trim($sPhrase)) {
|
||||
$oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
|
||||
$oPhrase->addTokens($aTokens);
|
||||
$aPhrases[] = $oPhrase;
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printVar('Tokens', $aTokens);
|
||||
|
||||
$oValidTokens = new TokenList();
|
||||
|
||||
if (!empty($aTokens)) {
|
||||
$oValidTokens->addTokensFromDB(
|
||||
$this->oDB,
|
||||
$aTokens,
|
||||
$this->aCountryCodes,
|
||||
$sNormQuery,
|
||||
$this->oNormalizer
|
||||
);
|
||||
$oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
|
||||
|
||||
if ($oValidTokens->count() > 0) {
|
||||
$oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
|
||||
|
||||
// Try more interpretations for Tokens that could not be matched.
|
||||
foreach ($aTokens as $sToken) {
|
||||
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
|
||||
if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
|
||||
// US ZIP+4 codes - merge in the 5-digit ZIP code
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\Postcode(null, $aData[1], 'us')
|
||||
);
|
||||
} elseif (preg_match('/^ [0-9]+$/', $sToken)) {
|
||||
// Unknown single word token with a number.
|
||||
// Assume it is a house number.
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\HouseNumber(null, trim($sToken))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
$aPhrases = array_filter($aPhrases, function ($oPhrase) {
|
||||
return $oPhrase->getWordSets() !== null;
|
||||
});
|
||||
|
||||
// Any words that have failed completely?
|
||||
// TODO: suggestions
|
||||
|
||||
Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
|
||||
|
||||
foreach ($aPhrases as $oPhrase) {
|
||||
$oPhrase->computeWordSets($oValidTokens);
|
||||
}
|
||||
Debug::printDebugTable('Phrases', $aPhrases);
|
||||
|
||||
Debug::newSection('Search candidates');
|
||||
|
@ -16,8 +16,6 @@ class Phrase
|
||||
private $sPhrase;
|
||||
// Element type for structured searches.
|
||||
private $sPhraseType;
|
||||
// Space-separated words of the phrase.
|
||||
private $aWords;
|
||||
// Possible segmentations of the phrase.
|
||||
private $aWordSets;
|
||||
|
||||
@ -38,7 +36,14 @@ class Phrase
|
||||
{
|
||||
$this->sPhrase = trim($sPhrase);
|
||||
$this->sPhraseType = $sPhraseType;
|
||||
$this->aWords = explode(' ', $this->sPhrase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the orginal phrase of the string.
|
||||
*/
|
||||
public function getPhrase()
|
||||
{
|
||||
return $this->sPhrase;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -63,30 +68,6 @@ class Phrase
|
||||
return $this->aWordSets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the tokens from this phrase to the given list of tokens.
|
||||
*
|
||||
* @param string[] $aTokens List of tokens to append.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function addTokens(&$aTokens)
|
||||
{
|
||||
$iNumWords = count($this->aWords);
|
||||
|
||||
for ($i = 0; $i < $iNumWords; $i++) {
|
||||
$sPhrase = $this->aWords[$i];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
|
||||
for ($j = $i + 1; $j < $iNumWords; $j++) {
|
||||
$sPhrase .= ' '.$this->aWords[$j];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Invert the set of possible segmentations.
|
||||
*
|
||||
@ -99,21 +80,27 @@ class Phrase
|
||||
}
|
||||
}
|
||||
|
||||
public function computeWordSets($oTokens)
|
||||
public function computeWordSets($aWords, $oTokens)
|
||||
{
|
||||
$iNumWords = count($this->aWords);
|
||||
$iNumWords = count($aWords);
|
||||
|
||||
if ($iNumWords == 0) {
|
||||
$this->aWordSets = null;
|
||||
return;
|
||||
}
|
||||
|
||||
// Caches the word set for the partial phrase up to word i.
|
||||
$aSetCache = array_fill(0, $iNumWords, array());
|
||||
|
||||
// Initialise first element of cache. There can only be the word.
|
||||
if ($oTokens->containsAny($this->aWords[0])) {
|
||||
$aSetCache[0][] = array($this->aWords[0]);
|
||||
if ($oTokens->containsAny($aWords[0])) {
|
||||
$aSetCache[0][] = array($aWords[0]);
|
||||
}
|
||||
|
||||
// Now do the next elements using what we already have.
|
||||
for ($i = 1; $i < $iNumWords; $i++) {
|
||||
for ($j = $i; $j > 0; $j--) {
|
||||
$sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
|
||||
$sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
|
||||
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
|
||||
$aPartial = array($sPartial);
|
||||
foreach ($aSetCache[$j - 1] as $aSet) {
|
||||
@ -136,7 +123,7 @@ class Phrase
|
||||
}
|
||||
|
||||
// finally the current full phrase
|
||||
$sPartial = $this->aWords[0].' '.$sPartial;
|
||||
$sPartial = $aWords[0].' '.$sPartial;
|
||||
if ($oTokens->containsAny($sPartial)) {
|
||||
$aSetCache[$i][] = array($sPartial);
|
||||
}
|
||||
@ -153,7 +140,6 @@ class Phrase
|
||||
return array(
|
||||
'Type' => $this->sPhraseType,
|
||||
'Phrase' => $this->sPhrase,
|
||||
'Words' => $this->aWords,
|
||||
'WordSets' => $this->aWordSets
|
||||
);
|
||||
}
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
require_once(CONST_TokenizerDir.'/tokenizer.php');
|
||||
|
||||
use Exception;
|
||||
|
||||
class Status
|
||||
@ -25,24 +27,8 @@ class Status
|
||||
throw new Exception('Database connection failed', 700);
|
||||
}
|
||||
|
||||
$sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
|
||||
if ($sStandardWord === false) {
|
||||
throw new Exception('Module failed', 701);
|
||||
}
|
||||
|
||||
if ($sStandardWord != 'a') {
|
||||
throw new Exception('Module call failed', 702);
|
||||
}
|
||||
|
||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
|
||||
$sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
|
||||
$iWordID = $this->oDB->getOne($sSQL);
|
||||
if ($iWordID === false) {
|
||||
throw new Exception('Query failed', 703);
|
||||
}
|
||||
if (!$iWordID) {
|
||||
throw new Exception('No value', 704);
|
||||
}
|
||||
$oTokenizer = new \Nominatim\Tokenizer($this->oDB);
|
||||
$oTokenizer->checkStatus();
|
||||
}
|
||||
|
||||
public function dataDate()
|
||||
|
@ -95,88 +95,6 @@ class TokenList
|
||||
return $ids;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add token information from the word table in the database.
|
||||
*
|
||||
* @param object $oDB Nominatim::DB instance.
|
||||
* @param string[] $aTokens List of tokens to look up in the database.
|
||||
* @param string[] $aCountryCodes List of country restrictions.
|
||||
* @param string $sNormQuery Normalized query string.
|
||||
* @param object $oNormalizer Normalizer function to use on tokens.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
|
||||
{
|
||||
// Check which tokens we have, get the ID numbers
|
||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
|
||||
$sSQL .= ' operator, coalesce(search_name_count, 0) as count';
|
||||
$sSQL .= ' FROM word WHERE word_token in (';
|
||||
$sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
|
||||
$aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
|
||||
|
||||
foreach ($aDBWords as $aWord) {
|
||||
$oToken = null;
|
||||
$iId = (int) $aWord['word_id'];
|
||||
|
||||
if ($aWord['class']) {
|
||||
// Special terms need to appear in their normalized form.
|
||||
if ($aWord['word']) {
|
||||
$sNormWord = $aWord['word'];
|
||||
if ($oNormalizer != null) {
|
||||
$sNormWord = $oNormalizer->transliterate($aWord['word']);
|
||||
}
|
||||
if (strpos($sNormQuery, $sNormWord) === false) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
|
||||
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
|
||||
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
|
||||
if ($aWord['word']
|
||||
&& pg_escape_string($aWord['word']) == $aWord['word']
|
||||
) {
|
||||
$oToken = new Token\Postcode(
|
||||
$iId,
|
||||
$aWord['word'],
|
||||
$aWord['country_code']
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// near and in operator the same at the moment
|
||||
$oToken = new Token\SpecialTerm(
|
||||
$iId,
|
||||
$aWord['class'],
|
||||
$aWord['type'],
|
||||
$aWord['operator'] ? Operator::NEAR : Operator::NONE
|
||||
);
|
||||
}
|
||||
} elseif ($aWord['country_code']) {
|
||||
// Filter country tokens that do not match restricted countries.
|
||||
if (!$aCountryCodes
|
||||
|| in_array($aWord['country_code'], $aCountryCodes)
|
||||
) {
|
||||
$oToken = new Token\Country($iId, $aWord['country_code']);
|
||||
}
|
||||
} else {
|
||||
$oToken = new Token\Word(
|
||||
$iId,
|
||||
$aWord['word_token'][0] != ' ',
|
||||
(int) $aWord['count'],
|
||||
substr_count($aWord['word_token'], ' ')
|
||||
);
|
||||
}
|
||||
|
||||
if ($oToken) {
|
||||
$this->addToken($aWord['word_token'], $oToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a new token for the given word.
|
||||
*
|
||||
|
@ -2,7 +2,6 @@
|
||||
@define('CONST_LibDir', dirname(dirname(__FILE__)));
|
||||
|
||||
require_once(CONST_LibDir.'/init-cmd.php');
|
||||
require_once(CONST_LibDir.'/Geocode.php');
|
||||
require_once(CONST_LibDir.'/ParameterParser.php');
|
||||
ini_set('memory_limit', '800M');
|
||||
|
||||
@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
|
||||
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
||||
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
||||
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
||||
@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
|
||||
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
||||
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
||||
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
||||
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
||||
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
||||
@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
|
||||
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
||||
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
||||
@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
|
||||
|
||||
require_once(CONST_LibDir.'/Geocode.php');
|
||||
|
||||
$oDB = new Nominatim\DB;
|
||||
$oDB->connect();
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
require_once(CONST_LibDir.'/init-cmd.php');
|
||||
require_once(CONST_LibDir.'/log.php');
|
||||
require_once(CONST_LibDir.'/Geocode.php');
|
||||
require_once(CONST_LibDir.'/PlaceLookup.php');
|
||||
require_once(CONST_LibDir.'/ReverseGeocode.php');
|
||||
|
||||
@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
|
||||
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
||||
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
||||
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
||||
@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
|
||||
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
||||
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
||||
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
||||
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
||||
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
||||
@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
|
||||
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
||||
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
||||
@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
|
||||
|
||||
require_once(CONST_LibDir.'/Geocode.php');
|
||||
|
||||
$oDB = new Nominatim\DB();
|
||||
$oDB->connect();
|
||||
|
254
lib-php/tokenizer/legacy_tokenizer.php
Normal file
254
lib-php/tokenizer/legacy_tokenizer.php
Normal file
@ -0,0 +1,254 @@
|
||||
<?php
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
class Tokenizer
|
||||
{
|
||||
private $oDB;
|
||||
|
||||
private $oNormalizer = null;
|
||||
private $aCountryRestriction = null;
|
||||
|
||||
public function __construct(&$oDB)
|
||||
{
|
||||
$this->oDB =& $oDB;
|
||||
$this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
|
||||
}
|
||||
|
||||
public function checkStatus()
|
||||
{
|
||||
$sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
|
||||
if ($sStandardWord === false) {
|
||||
throw new Exception('Module failed', 701);
|
||||
}
|
||||
|
||||
if ($sStandardWord != 'a') {
|
||||
throw new Exception('Module call failed', 702);
|
||||
}
|
||||
|
||||
$sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
|
||||
$iWordID = $this->oDB->getOne($sSQL);
|
||||
if ($iWordID === false) {
|
||||
throw new Exception('Query failed', 703);
|
||||
}
|
||||
if (!$iWordID) {
|
||||
throw new Exception('No value', 704);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public function setCountryRestriction($aCountries)
|
||||
{
|
||||
$this->aCountryRestriction = $aCountries;
|
||||
}
|
||||
|
||||
|
||||
public function normalizeString($sTerm)
|
||||
{
|
||||
if ($this->oNormalizer === null) {
|
||||
return $sTerm;
|
||||
}
|
||||
|
||||
return $this->oNormalizer->transliterate($sTerm);
|
||||
}
|
||||
|
||||
|
||||
public function tokensForSpecialTerm($sTerm)
|
||||
{
|
||||
$aResults = array();
|
||||
|
||||
$sSQL = 'SELECT word_id, class, type FROM word ';
|
||||
$sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)';
|
||||
$sSQL .= ' AND class is not null AND class not in (\'place\')';
|
||||
|
||||
Debug::printVar('Term', $sTerm);
|
||||
Debug::printSQL($sSQL);
|
||||
$aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));
|
||||
|
||||
Debug::printVar('Results', $aSearchWords);
|
||||
|
||||
foreach ($aSearchWords as $aSearchTerm) {
|
||||
$aResults[] = new \Nominatim\Token\SpecialTerm(
|
||||
$aSearchTerm['word_id'],
|
||||
$aSearchTerm['class'],
|
||||
$aSearchTerm['type'],
|
||||
\Nominatim\Operator::TYPE
|
||||
);
|
||||
}
|
||||
|
||||
Debug::printVar('Special term tokens', $aResults);
|
||||
|
||||
return $aResults;
|
||||
}
|
||||
|
||||
|
||||
public function extractTokensFromPhrases(&$aPhrases)
|
||||
{
|
||||
// First get the normalized version of all phrases
|
||||
$sNormQuery = '';
|
||||
$sSQL = 'SELECT ';
|
||||
$aParams = array();
|
||||
foreach ($aPhrases as $iPhrase => $oPhrase) {
|
||||
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
|
||||
$sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
|
||||
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
|
||||
}
|
||||
$sSQL = substr($sSQL, 0, -1);
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
Debug::printVar('SQL parameters', $aParams);
|
||||
|
||||
$aNormPhrases = $this->oDB->getRow($sSQL, $aParams);
|
||||
|
||||
Debug::printVar('SQL result', $aNormPhrases);
|
||||
|
||||
// now compute all possible tokens
|
||||
$aWordLists = array();
|
||||
$aTokens = array();
|
||||
foreach ($aNormPhrases as $sTitle => $sPhrase) {
|
||||
if (strlen($sPhrase) > 0) {
|
||||
$aWords = explode(' ', $sPhrase);
|
||||
Tokenizer::addTokens($aTokens, $aWords);
|
||||
$aWordLists[] = $aWords;
|
||||
} else {
|
||||
$aWordLists[] = array();
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printVar('Tokens', $aTokens);
|
||||
Debug::printVar('WordLists', $aWordLists);
|
||||
|
||||
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
|
||||
|
||||
foreach ($aPhrases as $iPhrase => $oPhrase) {
|
||||
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
|
||||
}
|
||||
|
||||
return $oValidTokens;
|
||||
}
|
||||
|
||||
|
||||
private function computeValidTokens($aTokens, $sNormQuery)
|
||||
{
|
||||
$oValidTokens = new TokenList();
|
||||
|
||||
if (!empty($aTokens)) {
|
||||
$this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
|
||||
|
||||
// Try more interpretations for Tokens that could not be matched.
|
||||
foreach ($aTokens as $sToken) {
|
||||
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
|
||||
if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
|
||||
// US ZIP+4 codes - merge in the 5-digit ZIP code
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\Postcode(null, $aData[1], 'us')
|
||||
);
|
||||
} elseif (preg_match('/^ [0-9]+$/', $sToken)) {
|
||||
// Unknown single word token with a number.
|
||||
// Assume it is a house number.
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\HouseNumber(null, trim($sToken))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $oValidTokens;
|
||||
}
|
||||
|
||||
|
||||
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
|
||||
{
|
||||
// Check which tokens we have, get the ID numbers
|
||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
|
||||
$sSQL .= ' operator, coalesce(search_name_count, 0) as count';
|
||||
$sSQL .= ' FROM word WHERE word_token in (';
|
||||
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
|
||||
$aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
|
||||
|
||||
foreach ($aDBWords as $aWord) {
|
||||
$oToken = null;
|
||||
$iId = (int) $aWord['word_id'];
|
||||
|
||||
if ($aWord['class']) {
|
||||
// Special terms need to appear in their normalized form.
|
||||
// (postcodes are not normalized in the word table)
|
||||
$sNormWord = $this->normalizeString($aWord['word']);
|
||||
if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
|
||||
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
|
||||
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
|
||||
if ($aWord['word']
|
||||
&& pg_escape_string($aWord['word']) == $aWord['word']
|
||||
) {
|
||||
$oToken = new Token\Postcode(
|
||||
$iId,
|
||||
$aWord['word'],
|
||||
$aWord['country_code']
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// near and in operator the same at the moment
|
||||
$oToken = new Token\SpecialTerm(
|
||||
$iId,
|
||||
$aWord['class'],
|
||||
$aWord['type'],
|
||||
$aWord['operator'] ? Operator::NEAR : Operator::NONE
|
||||
);
|
||||
}
|
||||
} elseif ($aWord['country_code']) {
|
||||
// Filter country tokens that do not match restricted countries.
|
||||
if (!$this->aCountryRestriction
|
||||
|| in_array($aWord['country_code'], $this->aCountryRestriction)
|
||||
) {
|
||||
$oToken = new Token\Country($iId, $aWord['country_code']);
|
||||
}
|
||||
} else {
|
||||
$oToken = new Token\Word(
|
||||
$iId,
|
||||
$aWord['word_token'][0] != ' ',
|
||||
(int) $aWord['count'],
|
||||
substr_count($aWord['word_token'], ' ')
|
||||
);
|
||||
}
|
||||
|
||||
if ($oToken) {
|
||||
$oValidTokens->addToken($aWord['word_token'], $oToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add the tokens from this phrase to the given list of tokens.
|
||||
*
|
||||
* @param string[] $aTokens List of tokens to append.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
private static function addTokens(&$aTokens, $aWords)
|
||||
{
|
||||
$iNumWords = count($aWords);
|
||||
|
||||
for ($i = 0; $i < $iNumWords; $i++) {
|
||||
$sPhrase = $aWords[$i];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
|
||||
for ($j = $i + 1; $j < $iNumWords; $j++) {
|
||||
$sPhrase .= ' '.$aWords[$j];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,5 +1,4 @@
|
||||
{% include('functions/utils.sql') %}
|
||||
{% include('functions/normalization.sql') %}
|
||||
{% include('functions/ranking.sql') %}
|
||||
{% include('functions/importance.sql') %}
|
||||
{% include('functions/address_lookup.sql') %}
|
||||
|
@ -12,39 +12,47 @@ $$
|
||||
LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
|
||||
RETURNS HSTORE
|
||||
AS $$
|
||||
DECLARE
|
||||
location RECORD;
|
||||
waynodes BIGINT[];
|
||||
BEGIN
|
||||
IF akeys(in_address) != ARRAY['interpolation'] THEN
|
||||
RETURN in_address;
|
||||
END IF;
|
||||
|
||||
SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
|
||||
FOR location IN
|
||||
SELECT placex.address, placex.osm_id FROM placex
|
||||
WHERE osm_type = 'N' and osm_id = ANY(waynodes)
|
||||
and placex.address is not null
|
||||
and (placex.address ? 'street' or placex.address ? 'place')
|
||||
and indexed_status < 100
|
||||
LOOP
|
||||
-- mark it as a derived address
|
||||
RETURN location.address || in_address || hstore('_inherited', '');
|
||||
END LOOP;
|
||||
|
||||
RETURN in_address;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
|
||||
-- find the parent road of the cut road parts
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
|
||||
place TEXT, partition SMALLINT,
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
|
||||
partition SMALLINT,
|
||||
centroid GEOMETRY, geom GEOMETRY)
|
||||
RETURNS BIGINT
|
||||
AS $$
|
||||
DECLARE
|
||||
addr_street TEXT;
|
||||
addr_place TEXT;
|
||||
parent_place_id BIGINT;
|
||||
|
||||
waynodes BIGINT[];
|
||||
|
||||
location RECORD;
|
||||
BEGIN
|
||||
addr_street = street;
|
||||
addr_place = place;
|
||||
|
||||
IF addr_street is null and addr_place is null THEN
|
||||
select nodes from planet_osm_ways where id = wayid INTO waynodes;
|
||||
FOR location IN SELECT placex.address from placex
|
||||
where osm_type = 'N' and osm_id = ANY(waynodes)
|
||||
and placex.address is not null
|
||||
and (placex.address ? 'street' or placex.address ? 'place')
|
||||
and indexed_status < 100
|
||||
limit 1 LOOP
|
||||
addr_street = location.address->'street';
|
||||
addr_place = location.address->'place';
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
parent_place_id := find_parent_for_address(addr_street, addr_place,
|
||||
partition, centroid);
|
||||
parent_place_id := find_parent_for_address(street, place, partition, centroid);
|
||||
|
||||
IF parent_place_id is null THEN
|
||||
FOR location IN SELECT place_id FROM placex
|
||||
@ -147,15 +155,15 @@ BEGIN
|
||||
NEW.interpolationtype = NEW.address->'interpolation';
|
||||
|
||||
place_centroid := ST_PointOnSurface(NEW.linegeo);
|
||||
NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
|
||||
NEW.address->'place',
|
||||
NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
|
||||
token_addr_place_match_tokens(NEW.token_info),
|
||||
NEW.partition, place_centroid, NEW.linegeo);
|
||||
|
||||
IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
|
||||
interpol_postcode := NEW.address->'postcode';
|
||||
housenum := getorcreate_postcode_id(NEW.address->'postcode');
|
||||
ELSE
|
||||
interpol_postcode := NULL;
|
||||
interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
|
||||
|
||||
NEW.token_info := token_strip_info(NEW.token_info);
|
||||
IF NEW.address ? '_inherited' THEN
|
||||
NEW.address := hstore('interpolation', NEW.interpolationtype);
|
||||
END IF;
|
||||
|
||||
-- if the line was newly inserted, split the line as necessary
|
||||
@ -202,12 +210,13 @@ BEGIN
|
||||
|
||||
-- determine postcode
|
||||
postcode := coalesce(interpol_postcode,
|
||||
prevnode.address->'postcode',
|
||||
nextnode.address->'postcode',
|
||||
token_normalized_postcode(prevnode.address->'postcode'),
|
||||
token_normalized_postcode(nextnode.address->'postcode'),
|
||||
postcode);
|
||||
|
||||
IF postcode is NULL THEN
|
||||
SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
|
||||
SELECT token_normalized_postcode(placex.postcode)
|
||||
FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
|
||||
END IF;
|
||||
IF postcode is NULL THEN
|
||||
postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
|
||||
@ -217,7 +226,7 @@ BEGIN
|
||||
NEW.startnumber := startnumber;
|
||||
NEW.endnumber := endnumber;
|
||||
NEW.linegeo := sectiongeo;
|
||||
NEW.postcode := upper(trim(postcode));
|
||||
NEW.postcode := postcode;
|
||||
ELSE
|
||||
insert into location_property_osmline
|
||||
(linegeo, partition, osm_id, parent_place_id,
|
||||
|
@ -1,525 +0,0 @@
|
||||
-- Functions for term normalisation and access to the 'word' table.
|
||||
|
||||
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
|
||||
AS '{{ modulepath }}/nominatim.so', 'transliteration'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
|
||||
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
|
||||
AS $$
|
||||
DECLARE
|
||||
o TEXT;
|
||||
BEGIN
|
||||
o := public.gettokenstring(public.transliteration(name));
|
||||
RETURN trim(substr(o,1,length(o)));
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
-- returns NULL if the word is too common
|
||||
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
count INTEGER;
|
||||
BEGIN
|
||||
lookup_token := trim(lookup_word);
|
||||
SELECT min(word_id), max(search_name_count) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_id, count;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
|
||||
ELSE
|
||||
IF count > get_maxwordfreq() THEN
|
||||
return_word_id := NULL;
|
||||
END IF;
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
-- Create housenumber tokens from an OSM addr:housenumber.
|
||||
-- The housnumber is split at comma and semicolon as necessary.
|
||||
-- The function returns the normalized form of the housenumber suitable
|
||||
-- for comparison.
|
||||
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
||||
RETURNS TEXT
|
||||
AS $$
|
||||
DECLARE
|
||||
normtext TEXT;
|
||||
BEGIN
|
||||
SELECT array_to_string(array_agg(trans), ';')
|
||||
INTO normtext
|
||||
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
|
||||
FROM (SELECT make_standard_name(h) as lookup_word
|
||||
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
|
||||
|
||||
return normtext;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE STRICT;
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' ' || trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class='place' and type='house'
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
||||
'place', 'house', null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
lookup_word TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_word := upper(trim(postcode));
|
||||
lookup_token := ' ' || make_standard_name(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and word = lookup_word
|
||||
and class='place' and type='postcode'
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
|
||||
'place', 'postcode', null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
|
||||
lookup_country_code varchar(2))
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' '||trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and country_code=lookup_country_code
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
||||
null, null, lookup_country_code, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
|
||||
lookup_class text, lookup_type text)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' '||trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and word = normalized_word
|
||||
and class = lookup_class and type = lookup_type
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
|
||||
lookup_class, lookup_type, null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
|
||||
normalized_word TEXT,
|
||||
lookup_class text,
|
||||
lookup_type text,
|
||||
op text)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' '||trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and word = normalized_word
|
||||
and class = lookup_class and type = lookup_type and operator = op
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
|
||||
lookup_class, lookup_type, null, 0, op);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
nospace_lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' '||trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
|
||||
null, null, null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
BEGIN
|
||||
RETURN getorcreate_name_id(lookup_word, '');
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
-- Normalize a string and lookup its word ids (partial words).
|
||||
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
words TEXT[];
|
||||
id INTEGER;
|
||||
return_word_id INTEGER[];
|
||||
word_ids INTEGER[];
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
words := string_to_array(make_standard_name(lookup_word), ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
SELECT array_agg(word_id) INTO word_ids
|
||||
FROM word
|
||||
WHERE word_token = words[j] and class is null and type is null;
|
||||
|
||||
IF word_ids IS NULL THEN
|
||||
id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
|
||||
return_word_id := return_word_id || id;
|
||||
ELSE
|
||||
return_word_id := array_merge(return_word_id, word_ids);
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
-- Normalize a string and look up its name ids (full words).
|
||||
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_ids INTEGER[];
|
||||
BEGIN
|
||||
lookup_token := ' '|| make_standard_name(lookup_word);
|
||||
SELECT array_agg(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_ids;
|
||||
RETURN return_word_ids;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
|
||||
RETURNS VOID
|
||||
AS $$
|
||||
DECLARE
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
item RECORD;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
FOR item IN SELECT (each(src)).* LOOP
|
||||
|
||||
s := make_standard_name(item.value);
|
||||
w := getorcreate_country(s, country_code);
|
||||
|
||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_country(s, country_code);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
result INTEGER[];
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
item RECORD;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
result := '{}'::INTEGER[];
|
||||
|
||||
FOR item IN SELECT (each(src)).* LOOP
|
||||
|
||||
s := make_standard_name(item.value);
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
|
||||
IF not(ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
w := getorcreate_word_id(s);
|
||||
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
words := string_to_array(s, ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
w = getorcreate_word_id(words[j]);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_word_id(s);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
s := regexp_replace(item.value, '市$', '');
|
||||
IF s != item.value THEN
|
||||
s := make_standard_name(s);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
IF NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
END LOOP;
|
||||
|
||||
RETURN result;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
i INTEGER;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
s := make_standard_name(src);
|
||||
w := getorcreate_name_id(s, src);
|
||||
|
||||
w := getorcreate_word_id(s);
|
||||
|
||||
words := string_to_array(s, ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
w := getorcreate_word_id(words[j]);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
words := regexp_split_to_array(src, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_word_id(s);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
s := regexp_replace(src, '市$', '');
|
||||
IF s != src THEN
|
||||
s := make_standard_name(s);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, src);
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
RETURN 1;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
|
||||
in_partition SMALLINT,
|
||||
parent_place_id BIGINT,
|
||||
address HSTORE,
|
||||
country TEXT,
|
||||
housenumber TEXT,
|
||||
initial_name_vector INTEGER[],
|
||||
geometry GEOMETRY,
|
||||
OUT name_vector INTEGER[],
|
||||
OUT nameaddress_vector INTEGER[])
|
||||
AS $$
|
||||
DECLARE
|
||||
parent_name_vector INTEGER[];
|
||||
parent_address_vector INTEGER[];
|
||||
addr_place_ids INTEGER[];
|
||||
|
||||
addr_item RECORD;
|
||||
parent_address_place_ids BIGINT[];
|
||||
filtered_address HSTORE;
|
||||
BEGIN
|
||||
nameaddress_vector := '{}'::INTEGER[];
|
||||
|
||||
SELECT s.name_vector, s.nameaddress_vector
|
||||
INTO parent_name_vector, parent_address_vector
|
||||
FROM search_name s
|
||||
WHERE s.place_id = parent_place_id;
|
||||
|
||||
-- Find all address tags that don't appear in the parent search names.
|
||||
SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
|
||||
FROM (SELECT skeys(address) as k, svals(address) as v) a
|
||||
WHERE not addr_ids_from_name(v) && parent_address_vector
|
||||
AND k not in ('country', 'street', 'place', 'postcode',
|
||||
'housenumber', 'streetnumber', 'conscriptionnumber');
|
||||
|
||||
-- Compute all search terms from the addr: tags.
|
||||
IF filtered_address IS NOT NULL THEN
|
||||
FOR addr_item IN
|
||||
SELECT * FROM
|
||||
get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
|
||||
LOOP
|
||||
IF addr_item.place_id is null THEN
|
||||
nameaddress_vector := array_merge(nameaddress_vector,
|
||||
addr_item.keywords);
|
||||
CONTINUE;
|
||||
END IF;
|
||||
|
||||
IF parent_address_place_ids is null THEN
|
||||
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
|
||||
FROM place_addressline
|
||||
WHERE place_id = parent_place_id;
|
||||
END IF;
|
||||
|
||||
IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
|
||||
nameaddress_vector := array_merge(nameaddress_vector,
|
||||
addr_item.keywords);
|
||||
|
||||
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
|
||||
isaddress, distance, cached_rank_address)
|
||||
VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
|
||||
true, addr_item.distance, addr_item.rank_address);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
name_vector := initial_name_vector;
|
||||
|
||||
-- Check if the parent covers all address terms.
|
||||
-- If not, create a search name entry with the house number as the name.
|
||||
-- This is unusual for the search_name table but prevents that the place
|
||||
-- is returned when we only search for the street/place.
|
||||
|
||||
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
|
||||
name_vector := array_merge(name_vector,
|
||||
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
|
||||
END IF;
|
||||
|
||||
IF not address ? 'street' and address ? 'place' THEN
|
||||
addr_place_ids := addr_ids_from_name(address->'place');
|
||||
IF not addr_place_ids <@ parent_name_vector THEN
|
||||
-- make sure addr:place terms are always searchable
|
||||
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
|
||||
-- If there is a housenumber, also add the place name as a name,
|
||||
-- so we can search it by the usual housenumber+place algorithms.
|
||||
IF housenumber is not null THEN
|
||||
name_vector := array_merge(name_vector,
|
||||
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Cheating here by not recomputing all terms but simply using the ones
|
||||
-- from the parent object.
|
||||
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
|
||||
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
|
||||
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
@ -63,54 +63,36 @@ END
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
|
||||
feature GEOMETRY,
|
||||
address HSTORE, country TEXT)
|
||||
RETURNS SETOF nearfeaturecentr
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
|
||||
from_rank SMALLINT, to_rank SMALLINT,
|
||||
extent FLOAT, tokens INT[])
|
||||
RETURNS nearfeaturecentr
|
||||
AS $$
|
||||
DECLARE
|
||||
r nearfeaturecentr%rowtype;
|
||||
item RECORD;
|
||||
BEGIN
|
||||
FOR item IN
|
||||
SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
|
||||
(SELECT skeys(address) as key, svals(address) as name) x
|
||||
LOOP
|
||||
IF item.from_rank is null THEN
|
||||
CONTINUE;
|
||||
END IF;
|
||||
|
||||
{% for partition in db.partitions %}
|
||||
IF in_partition = {{ partition }} THEN
|
||||
SELECT place_id, keywords, rank_address, rank_search,
|
||||
min(ST_Distance(feature, centroid)) as distance,
|
||||
isguess, postcode, centroid INTO r
|
||||
FROM location_area_large_{{ partition }}
|
||||
WHERE geometry && ST_Expand(feature, item.extent)
|
||||
AND rank_address between item.from_rank and item.to_rank
|
||||
AND word_ids_from_name(item.name) && keywords
|
||||
WHERE geometry && ST_Expand(feature, extent)
|
||||
AND rank_address between from_rank and to_rank
|
||||
AND tokens && keywords
|
||||
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
|
||||
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
|
||||
IF r.place_id is null THEN
|
||||
-- If we cannot find a place for the term, just return the
|
||||
-- search term for the given name. That ensures that the address
|
||||
-- element can still be searched for, even though it will not be
|
||||
-- displayed.
|
||||
RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
|
||||
null, null, null, null)::nearfeaturecentr;
|
||||
ELSE
|
||||
RETURN NEXT r;
|
||||
END IF;
|
||||
CONTINUE;
|
||||
RETURN r;
|
||||
END IF;
|
||||
{% endfor %}
|
||||
|
||||
RAISE EXCEPTION 'Unknown partition %', in_partition;
|
||||
END LOOP;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
|
||||
DECLARE
|
||||
BEGIN
|
||||
|
@ -1,28 +1,58 @@
|
||||
-- Trigger functions for the placex table.
|
||||
|
||||
-- Find the parent road of a POI.
|
||||
-- Retrieve the data needed by the indexer for updating the place.
|
||||
--
|
||||
-- \returns Place ID of parent object or NULL if none
|
||||
--
|
||||
-- Copy data from linked items (POIs on ways, addr:street links, relations).
|
||||
--
|
||||
CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
|
||||
poi_osm_id BIGINT,
|
||||
poi_partition SMALLINT,
|
||||
bbox GEOMETRY,
|
||||
addr_street TEXT,
|
||||
addr_place TEXT,
|
||||
fallback BOOL = true)
|
||||
-- Return parameters:
|
||||
-- name list of names
|
||||
-- address list of address tags, either from the object or a surrounding
|
||||
-- building
|
||||
-- country_feature If the place is a country feature, this contains the
|
||||
-- country code, otherwise it is null.
|
||||
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
|
||||
OUT name HSTORE,
|
||||
OUT address HSTORE,
|
||||
OUT country_feature VARCHAR)
|
||||
AS $$
|
||||
BEGIN
|
||||
-- For POI nodes, check if the address should be derived from a surrounding
|
||||
-- building.
|
||||
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
|
||||
address := p.address;
|
||||
ELSE
|
||||
-- The additional && condition works around the misguided query
|
||||
-- planner of postgis 3.0.
|
||||
SELECT placex.address || hstore('_inherited', '') INTO address
|
||||
FROM placex
|
||||
WHERE ST_Covers(geometry, p.centroid)
|
||||
and geometry && p.centroid
|
||||
and placex.address is not null
|
||||
and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
|
||||
and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
|
||||
LIMIT 1;
|
||||
END IF;
|
||||
|
||||
address := address - '_unlisted_place'::TEXT;
|
||||
name := p.name;
|
||||
|
||||
country_feature := CASE WHEN p.admin_level = 2
|
||||
and p.class = 'boundary' and p.type = 'administrative'
|
||||
and p.osm_type = 'R'
|
||||
THEN p.country_code
|
||||
ELSE null
|
||||
END;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
|
||||
poi_osm_id BIGINT)
|
||||
RETURNS BIGINT
|
||||
AS $$
|
||||
DECLARE
|
||||
parent_place_id BIGINT DEFAULT NULL;
|
||||
location RECORD;
|
||||
parent RECORD;
|
||||
BEGIN
|
||||
{% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
|
||||
|
||||
-- Is this object part of an associatedStreet relation?
|
||||
FOR location IN
|
||||
SELECT members FROM planet_osm_rels
|
||||
WHERE parts @> ARRAY[poi_osm_id]
|
||||
@ -43,26 +73,54 @@ BEGIN
|
||||
END LOOP;
|
||||
END LOOP;
|
||||
|
||||
RETURN NULL;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
-- Find the parent road of a POI.
|
||||
--
|
||||
-- \returns Place ID of parent object or NULL if none
|
||||
--
|
||||
-- Copy data from linked items (POIs on ways, addr:street links, relations).
|
||||
--
|
||||
CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
|
||||
poi_osm_id BIGINT,
|
||||
poi_partition SMALLINT,
|
||||
bbox GEOMETRY,
|
||||
addr_street INTEGER[],
|
||||
addr_place INTEGER[],
|
||||
is_place_addr BOOLEAN)
|
||||
RETURNS BIGINT
|
||||
AS $$
|
||||
DECLARE
|
||||
parent_place_id BIGINT DEFAULT NULL;
|
||||
location RECORD;
|
||||
BEGIN
|
||||
{% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
|
||||
|
||||
-- Is this object part of an associatedStreet relation?
|
||||
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
|
||||
|
||||
IF parent_place_id is null THEN
|
||||
parent_place_id := find_parent_for_address(addr_street, addr_place,
|
||||
poi_partition, bbox);
|
||||
IF parent_place_id is not null THEN
|
||||
RETURN parent_place_id;
|
||||
END IF;
|
||||
|
||||
IF poi_osm_type = 'N' THEN
|
||||
IF parent_place_id is null and poi_osm_type = 'N' THEN
|
||||
-- Is this node part of an interpolation?
|
||||
FOR parent IN
|
||||
FOR location IN
|
||||
SELECT q.parent_place_id
|
||||
FROM location_property_osmline q, planet_osm_ways x
|
||||
WHERE q.linegeo && bbox and x.id = q.osm_id
|
||||
and poi_osm_id = any(x.nodes)
|
||||
LIMIT 1
|
||||
LOOP
|
||||
{% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
|
||||
RETURN parent.parent_place_id;
|
||||
{% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
|
||||
RETURN location.parent_place_id;
|
||||
END LOOP;
|
||||
|
||||
-- Is this node part of any other way?
|
||||
FOR location IN
|
||||
SELECT p.place_id, p.osm_id, p.rank_search, p.address,
|
||||
coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
|
||||
@ -76,23 +134,15 @@ BEGIN
|
||||
-- Way IS a road then we are on it - that must be our road
|
||||
IF location.rank_search < 28 THEN
|
||||
{% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
|
||||
return location.place_id;
|
||||
RETURN location.place_id;
|
||||
END IF;
|
||||
|
||||
SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
|
||||
location.centroid,
|
||||
location.address->'street',
|
||||
location.address->'place',
|
||||
false)
|
||||
INTO parent_place_id;
|
||||
IF parent_place_id is not null THEN
|
||||
RETURN parent_place_id;
|
||||
END IF;
|
||||
parent_place_id := find_associated_street('W', location.osm_id);
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
IF fallback THEN
|
||||
IF addr_street is null and addr_place is not null THEN
|
||||
IF parent_place_id is NULL THEN
|
||||
IF is_place_addr THEN
|
||||
-- The address is attached to a place we don't know.
|
||||
-- Instead simply use the containing area with the largest rank.
|
||||
FOR location IN
|
||||
@ -240,6 +290,101 @@ $$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
|
||||
in_partition SMALLINT,
|
||||
parent_place_id BIGINT,
|
||||
is_place_addr BOOLEAN,
|
||||
country TEXT,
|
||||
token_info JSONB,
|
||||
geometry GEOMETRY,
|
||||
OUT name_vector INTEGER[],
|
||||
OUT nameaddress_vector INTEGER[])
|
||||
AS $$
|
||||
DECLARE
|
||||
parent_name_vector INTEGER[];
|
||||
parent_address_vector INTEGER[];
|
||||
addr_place_ids INTEGER[];
|
||||
hnr_vector INTEGER[];
|
||||
|
||||
addr_item RECORD;
|
||||
addr_place RECORD;
|
||||
parent_address_place_ids BIGINT[];
|
||||
BEGIN
|
||||
nameaddress_vector := '{}'::INTEGER[];
|
||||
|
||||
SELECT s.name_vector, s.nameaddress_vector
|
||||
INTO parent_name_vector, parent_address_vector
|
||||
FROM search_name s
|
||||
WHERE s.place_id = parent_place_id;
|
||||
|
||||
FOR addr_item IN
|
||||
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
|
||||
FROM token_get_address_tokens(token_info)
|
||||
WHERE not search_tokens <@ parent_address_vector
|
||||
LOOP
|
||||
addr_place := get_address_place(in_partition, geometry,
|
||||
addr_item.from_rank, addr_item.to_rank,
|
||||
addr_item.extent, addr_item.match_tokens);
|
||||
|
||||
IF addr_place is null THEN
|
||||
-- No place found in OSM that matches. Make it at least searchable.
|
||||
nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
|
||||
ELSE
|
||||
IF parent_address_place_ids is null THEN
|
||||
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
|
||||
FROM place_addressline
|
||||
WHERE place_id = parent_place_id;
|
||||
END IF;
|
||||
|
||||
-- If the parent already lists the place in place_address line, then we
|
||||
-- are done. Otherwise, add its own place_address line.
|
||||
IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
|
||||
nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
|
||||
|
||||
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
|
||||
isaddress, distance, cached_rank_address)
|
||||
VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
|
||||
true, addr_place.distance, addr_place.rank_address);
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
|
||||
name_vector := token_get_name_search_tokens(token_info);
|
||||
|
||||
-- Check if the parent covers all address terms.
|
||||
-- If not, create a search name entry with the house number as the name.
|
||||
-- This is unusual for the search_name table but prevents the place from being
-- returned when we only search for the street/place.
|
||||
|
||||
hnr_vector := token_get_housenumber_search_tokens(token_info);
|
||||
|
||||
IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
|
||||
name_vector := array_merge(name_vector, hnr_vector);
|
||||
END IF;
|
||||
|
||||
IF is_place_addr THEN
|
||||
addr_place_ids := token_addr_place_search_tokens(token_info);
|
||||
IF not addr_place_ids <@ parent_name_vector THEN
|
||||
-- make sure addr:place terms are always searchable
|
||||
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
|
||||
-- If there is a housenumber, also add the place name as a name,
|
||||
-- so we can search it by the usual housenumber+place algorithms.
|
||||
IF hnr_vector is not null THEN
|
||||
name_vector := array_merge(name_vector, addr_place_ids);
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Cheating here by not recomputing all terms but simply using the ones
|
||||
-- from the parent object.
|
||||
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
|
||||
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
|
||||
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
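The vector composition above can be summarised in plain Python. This is only an illustrative sketch, not project code: token vectors are modelled as sets of integer ids, the tokens collected by the address loop are assumed to be handed in already as addr_tokens, and the place_addressline bookkeeping is left out.

    def poi_search_terms(token_info, parent_name, parent_address, is_place_addr, addr_tokens):
        # token_info mirrors the JSONB column; parent_* are the parent's search_name vectors
        name = set(token_info.get('names', []))
        hnr = set(token_info.get('hnr_tokens', []))
        nameaddress = set(addr_tokens)
        if hnr and not nameaddress <= parent_address:
            # parent does not cover all address terms: make the housenumber searchable on its own
            name |= hnr
        if is_place_addr:
            place = set(token_info.get('place_search', []))
            if not place <= parent_name:
                nameaddress |= place          # keep addr:place terms searchable
                if hnr:
                    name |= place             # allow housenumber+place searches
        # reuse the parent's terms instead of recomputing the full address
        nameaddress |= parent_name | parent_address
        return name, nameaddress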
|
||||
|
||||
|
||||
-- Insert address of a place into the place_addressline table.
|
||||
--
|
||||
-- \param obj_place_id Place_id of the place to compute the address for.
|
||||
@ -260,7 +405,7 @@ LANGUAGE plpgsql STABLE;
|
||||
CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
|
||||
partition SMALLINT,
|
||||
maxrank SMALLINT,
|
||||
address HSTORE,
|
||||
token_info JSONB,
|
||||
geometry GEOMETRY,
|
||||
country TEXT,
|
||||
OUT parent_place_id BIGINT,
|
||||
@ -275,7 +420,8 @@ DECLARE
|
||||
current_node_area GEOMETRY := NULL;
|
||||
|
||||
parent_place_rank INT := 0;
|
||||
addr_place_ids BIGINT[];
|
||||
addr_place_ids BIGINT[] := '{}'::int[];
|
||||
new_address_vector INT[];
|
||||
|
||||
location RECORD;
|
||||
BEGIN
|
||||
@ -285,16 +431,21 @@ BEGIN
|
||||
address_havelevel := array_fill(false, ARRAY[maxrank]);
|
||||
|
||||
FOR location IN
|
||||
SELECT * FROM get_places_for_addr_tags(partition, geometry,
|
||||
address, country)
|
||||
SELECT (get_address_place(partition, geometry, from_rank, to_rank,
|
||||
extent, match_tokens)).*, search_tokens
|
||||
FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
|
||||
FROM token_get_address_tokens(token_info)) x
|
||||
ORDER BY rank_address, distance, isguess desc
|
||||
LOOP
|
||||
IF location.place_id is null THEN
|
||||
{% if not db.reverse_only %}
|
||||
nameaddress_vector := array_merge(nameaddress_vector,
|
||||
location.keywords::int[]);
|
||||
nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
|
||||
{% endif %}
|
||||
ELSE
|
||||
{% if not db.reverse_only %}
|
||||
nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
|
||||
{% endif %}
|
||||
|
||||
IF location.place_id is not null THEN
|
||||
location_isaddress := not address_havelevel[location.rank_address];
|
||||
IF not address_havelevel[location.rank_address] THEN
|
||||
address_havelevel[location.rank_address] := true;
|
||||
@ -309,13 +460,13 @@ BEGIN
|
||||
VALUES (obj_place_id, location.place_id, not location.isguess,
|
||||
true, location.distance, location.rank_address);
|
||||
|
||||
addr_place_ids := array_append(addr_place_ids, location.place_id);
|
||||
addr_place_ids := addr_place_ids || location.place_id;
|
||||
END IF;
|
||||
END LOOP;
|
||||
|
||||
FOR location IN
|
||||
SELECT * FROM getNearFeatures(partition, geometry, maxrank)
|
||||
WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
|
||||
WHERE not addr_place_ids @> ARRAY[place_id]
|
||||
ORDER BY rank_address, isguess asc,
|
||||
distance *
|
||||
CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2
|
||||
@ -397,10 +548,11 @@ BEGIN
|
||||
NEW.place_id := nextval('seq_place');
|
||||
NEW.indexed_status := 1; --STATUS_NEW
|
||||
|
||||
NEW.country_code := lower(get_country_code(NEW.geometry));
|
||||
NEW.centroid := ST_PointOnSurface(NEW.geometry);
|
||||
NEW.country_code := lower(get_country_code(NEW.centroid));
|
||||
|
||||
NEW.partition := get_partition(NEW.country_code);
|
||||
NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
|
||||
NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);
|
||||
|
||||
IF NEW.osm_type = 'X' THEN
|
||||
-- E'X'ternal records should already be in the right format so do nothing
|
||||
@ -522,8 +674,8 @@ DECLARE
|
||||
parent_address_level SMALLINT;
|
||||
place_address_level SMALLINT;
|
||||
|
||||
addr_street TEXT;
|
||||
addr_place TEXT;
|
||||
addr_street INTEGER[];
|
||||
addr_place INTEGER[];
|
||||
|
||||
max_rank SMALLINT;
|
||||
|
||||
@ -531,12 +683,11 @@ DECLARE
|
||||
nameaddress_vector INTEGER[];
|
||||
addr_nameaddress_vector INTEGER[];
|
||||
|
||||
inherited_address HSTORE;
|
||||
|
||||
linked_node_id BIGINT;
|
||||
linked_importance FLOAT;
|
||||
linked_wikipedia TEXT;
|
||||
|
||||
is_place_address BOOLEAN;
|
||||
result BOOLEAN;
|
||||
BEGIN
|
||||
-- deferred delete
|
||||
@ -566,9 +717,9 @@ BEGIN
|
||||
-- update not necessary for osmline because linked_place_id does not exist
|
||||
|
||||
NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
|
||||
NEW.address := NEW.address - '_unlisted_place'::TEXT;
|
||||
|
||||
IF NEW.linked_place_id is not null THEN
|
||||
NEW.token_info := null;
|
||||
{% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
|
||||
RETURN NEW;
|
||||
END IF;
|
||||
@ -579,13 +730,34 @@ BEGIN
|
||||
-- imported as place=postcode. That's why relations are allowed to pass here.
|
||||
-- This can go away in a couple of versions.
|
||||
IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
|
||||
NEW.token_info := null;
|
||||
RETURN NEW;
|
||||
END IF;
|
||||
|
||||
-- Speed up searches - just use the centroid of the feature
|
||||
-- cheaper but less acurate
|
||||
-- Compute a preliminary centroid.
|
||||
NEW.centroid := ST_PointOnSurface(NEW.geometry);
|
||||
{% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
|
||||
|
||||
-- recalculate country and partition
|
||||
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
|
||||
-- for countries, believe the mapped country code,
|
||||
-- so that we remain in the right partition if the boundaries
|
||||
-- suddenly expand.
|
||||
NEW.country_code := lower(NEW.address->'country');
|
||||
NEW.partition := get_partition(lower(NEW.country_code));
|
||||
IF NEW.partition = 0 THEN
|
||||
NEW.country_code := lower(get_country_code(NEW.centroid));
|
||||
NEW.partition := get_partition(NEW.country_code);
|
||||
END IF;
|
||||
ELSE
|
||||
IF NEW.rank_search >= 4 THEN
|
||||
NEW.country_code := lower(get_country_code(NEW.centroid));
|
||||
ELSE
|
||||
NEW.country_code := NULL;
|
||||
END IF;
|
||||
NEW.partition := get_partition(NEW.country_code);
|
||||
END IF;
|
||||
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
|
||||
|
||||
|
||||
-- recompute the ranks, they might change when linking changes
|
||||
SELECT * INTO NEW.rank_search, NEW.rank_address
|
||||
@ -665,54 +837,12 @@ BEGIN
|
||||
parent_address_level := 3;
|
||||
END IF;
|
||||
|
||||
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
|
||||
-- housenumber is a computed field, so start with an empty value
|
||||
NEW.housenumber := NULL;
|
||||
IF NEW.address is not NULL THEN
|
||||
IF NEW.address ? 'conscriptionnumber' THEN
|
||||
IF NEW.address ? 'streetnumber' THEN
|
||||
NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
|
||||
ELSE
|
||||
NEW.housenumber := NEW.address->'conscriptionnumber';
|
||||
END IF;
|
||||
ELSEIF NEW.address ? 'streetnumber' THEN
|
||||
NEW.housenumber := NEW.address->'streetnumber';
|
||||
ELSEIF NEW.address ? 'housenumber' THEN
|
||||
NEW.housenumber := NEW.address->'housenumber';
|
||||
END IF;
|
||||
NEW.housenumber := create_housenumber_id(NEW.housenumber);
|
||||
|
||||
addr_street := NEW.address->'street';
|
||||
addr_place := NEW.address->'place';
|
||||
|
||||
IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
|
||||
i := getorcreate_postcode_id(NEW.address->'postcode');
|
||||
END IF;
|
||||
END IF;
|
||||
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
|
||||
addr_street := token_addr_street_match_tokens(NEW.token_info);
|
||||
addr_place := token_addr_place_match_tokens(NEW.token_info);
|
||||
|
||||
NEW.postcode := null;
|
||||
|
||||
-- recalculate country and partition
|
||||
IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
|
||||
-- for countries, believe the mapped country code,
|
||||
-- so that we remain in the right partition if the boundaries
|
||||
-- suddenly expand.
|
||||
NEW.country_code := lower(NEW.address->'country');
|
||||
NEW.partition := get_partition(lower(NEW.country_code));
|
||||
IF NEW.partition = 0 THEN
|
||||
NEW.country_code := lower(get_country_code(NEW.centroid));
|
||||
NEW.partition := get_partition(NEW.country_code);
|
||||
END IF;
|
||||
ELSE
|
||||
IF NEW.rank_search >= 4 THEN
|
||||
NEW.country_code := lower(get_country_code(NEW.centroid));
|
||||
ELSE
|
||||
NEW.country_code := NULL;
|
||||
END IF;
|
||||
NEW.partition := get_partition(NEW.country_code);
|
||||
END IF;
|
||||
{% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
|
||||
|
||||
-- waterway ways are linked when they are part of a relation and have the same class/type
|
||||
IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
|
||||
FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
|
||||
@ -749,33 +879,14 @@ BEGIN
|
||||
|
||||
{% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
|
||||
NEW.parent_place_id := null;
|
||||
|
||||
-- if we have a POI and there is no address information,
|
||||
-- see if we can get it from a surrounding building
|
||||
inherited_address := ''::HSTORE;
|
||||
IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
|
||||
AND NEW.housenumber IS NULL THEN
|
||||
FOR location IN
|
||||
-- The additional && condition works around the misguided query
|
||||
-- planner of postgis 3.0.
|
||||
SELECT address from placex where ST_Covers(geometry, NEW.centroid)
|
||||
and geometry && NEW.centroid
|
||||
and (address ? 'housenumber' or address ? 'street' or address ? 'place')
|
||||
and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
|
||||
limit 1
|
||||
LOOP
|
||||
NEW.housenumber := location.address->'housenumber';
|
||||
addr_street := location.address->'street';
|
||||
addr_place := location.address->'place';
|
||||
inherited_address := location.address;
|
||||
END LOOP;
|
||||
END IF;
|
||||
is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
|
||||
|
||||
-- We have to find our parent road.
|
||||
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
|
||||
NEW.partition,
|
||||
ST_Envelope(NEW.geometry),
|
||||
addr_street, addr_place);
|
||||
addr_street, addr_place,
|
||||
is_place_address);
|
||||
|
||||
-- If we found the road take a shortcut here.
|
||||
-- Otherwise fall back to the full address getting method below.
|
||||
@ -785,12 +896,12 @@ BEGIN
|
||||
SELECT p.country_code, p.postcode, p.name FROM placex p
|
||||
WHERE p.place_id = NEW.parent_place_id INTO location;
|
||||
|
||||
IF addr_street is null and addr_place is not null THEN
|
||||
IF is_place_address THEN
|
||||
-- Check if the addr:place tag is part of the parent name
|
||||
SELECT count(*) INTO i
|
||||
FROM svals(location.name) AS pname WHERE pname = addr_place;
|
||||
FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
|
||||
IF i = 0 THEN
|
||||
NEW.address = NEW.address || hstore('_unlisted_place', addr_place);
|
||||
NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
@ -798,39 +909,21 @@ BEGIN
|
||||
{% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
|
||||
|
||||
-- determine postcode
|
||||
IF NEW.address is not null AND NEW.address ? 'postcode' THEN
|
||||
NEW.postcode = upper(trim(NEW.address->'postcode'));
|
||||
ELSE
|
||||
NEW.postcode := location.postcode;
|
||||
END IF;
|
||||
IF NEW.postcode is null THEN
|
||||
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
|
||||
END IF;
|
||||
NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
|
||||
location.postcode,
|
||||
get_nearest_postcode(NEW.country_code, NEW.geometry));
|
||||
|
||||
IF NEW.name is not NULL THEN
|
||||
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
|
||||
name_vector := make_keywords(NEW.name);
|
||||
|
||||
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
|
||||
result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
|
||||
name_vector, NEW.rank_search, NEW.rank_address,
|
||||
upper(trim(NEW.address->'postcode')), NEW.geometry,
|
||||
NEW.centroid);
|
||||
{% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
|
||||
END IF;
|
||||
|
||||
END IF;
|
||||
|
||||
{% if not db.reverse_only %}
|
||||
IF array_length(name_vector, 1) is not NULL
|
||||
OR inherited_address is not NULL OR NEW.address is not NULL
|
||||
THEN
|
||||
IF NEW.name is not NULL OR NEW.address is not NULL THEN
|
||||
SELECT * INTO name_vector, nameaddress_vector
|
||||
FROM create_poi_search_terms(NEW.place_id,
|
||||
NEW.partition, NEW.parent_place_id,
|
||||
inherited_address || NEW.address,
|
||||
NEW.country_code, NEW.housenumber,
|
||||
name_vector, NEW.centroid);
|
||||
is_place_address, NEW.country_code,
|
||||
NEW.token_info, NEW.centroid);
|
||||
|
||||
IF array_length(name_vector, 1) is not NULL THEN
|
||||
INSERT INTO search_name (place_id, search_rank, address_rank,
|
||||
@ -844,6 +937,17 @@ BEGIN
|
||||
END IF;
|
||||
{% endif %}
|
||||
|
||||
NEW.token_info := token_strip_info(NEW.token_info);
|
||||
-- If the address was inherited from a surrounding building,
|
||||
-- do not add it permanently to the table.
|
||||
IF NEW.address ? '_inherited' THEN
|
||||
IF NEW.address ? '_unlisted_place' THEN
|
||||
NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
|
||||
ELSE
|
||||
NEW.address := null;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
RETURN NEW;
|
||||
END IF;
|
||||
|
||||
@ -914,19 +1018,11 @@ BEGIN
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Initialise the name vector using our name
|
||||
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
|
||||
name_vector := make_keywords(NEW.name);
|
||||
|
||||
-- make sure all names are in the word table
|
||||
IF NEW.admin_level = 2
|
||||
AND NEW.class = 'boundary' AND NEW.type = 'administrative'
|
||||
AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
|
||||
THEN
|
||||
PERFORM create_country(NEW.name, lower(NEW.country_code));
|
||||
{% if debug %}RAISE WARNING 'Country names updated';{% endif %}
|
||||
|
||||
-- Also update the list of country names. Adding an additional sanity
|
||||
-- Update the list of country names. Adding an additional sanity
|
||||
-- check here: make sure the country does overlap with the area where
|
||||
-- we expect it to be as per static country grid.
|
||||
FOR location IN
|
||||
@ -959,29 +1055,28 @@ BEGIN
|
||||
ELSEIF NEW.rank_address > 25 THEN
|
||||
max_rank := 25;
|
||||
ELSE
|
||||
max_rank = NEW.rank_address;
|
||||
max_rank := NEW.rank_address;
|
||||
END IF;
|
||||
|
||||
SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
|
||||
NEW.address, geom, NEW.country_code)
|
||||
NEW.token_info, geom, NEW.country_code)
|
||||
INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
|
||||
|
||||
{% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
|
||||
|
||||
IF NEW.address is not null AND NEW.address ? 'postcode'
|
||||
AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
|
||||
NEW.postcode := upper(trim(NEW.address->'postcode'));
|
||||
END IF;
|
||||
|
||||
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
|
||||
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
|
||||
END IF;
|
||||
NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
|
||||
NEW.postcode);
|
||||
|
||||
-- if we have a name add this to the name search table
|
||||
IF NEW.name IS NOT NULL THEN
|
||||
-- Initialise the name vector using our name
|
||||
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
|
||||
name_vector := token_get_name_search_tokens(NEW.token_info);
|
||||
|
||||
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
|
||||
result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid);
|
||||
result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
|
||||
name_vector, NEW.rank_search, NEW.rank_address,
|
||||
NEW.postcode, NEW.geometry, NEW.centroid);
|
||||
{% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
|
||||
END IF;
|
||||
|
||||
@ -990,8 +1085,11 @@ BEGIN
|
||||
{% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
|
||||
END IF;
|
||||
|
||||
result := insertSearchName(NEW.partition, NEW.place_id, name_vector,
|
||||
IF NEW.rank_address between 16 and 27 THEN
|
||||
result := insertSearchName(NEW.partition, NEW.place_id,
|
||||
token_get_name_match_tokens(NEW.token_info),
|
||||
NEW.rank_search, NEW.rank_address, NEW.geometry);
|
||||
END IF;
|
||||
{% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
|
||||
|
||||
{% if not db.reverse_only %}
|
||||
@ -1002,11 +1100,15 @@ BEGIN
|
||||
NEW.importance, NEW.country_code, name_vector,
|
||||
nameaddress_vector, NEW.centroid);
|
||||
{% endif %}
|
||||
END IF;
|
||||
|
||||
IF NEW.postcode is null AND NEW.rank_search > 8 THEN
|
||||
NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
|
||||
END IF;
|
||||
|
||||
{% if debug %}RAISE WARNING 'place update % % finished.', NEW.osm_type, NEW.osm_id;{% endif %}
|
||||
|
||||
NEW.token_info := token_strip_info(NEW.token_info);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$
|
||||
|
@ -221,39 +221,32 @@ LANGUAGE plpgsql STABLE;
|
||||
-- \param centroid Location of the address.
|
||||
--
|
||||
-- \return Place ID of the parent if one was found, NULL otherwise.
|
||||
CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
|
||||
CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
|
||||
partition SMALLINT,
|
||||
centroid GEOMETRY)
|
||||
RETURNS BIGINT
|
||||
AS $$
|
||||
DECLARE
|
||||
parent_place_id BIGINT;
|
||||
word_ids INTEGER[];
|
||||
BEGIN
|
||||
IF street is not null THEN
|
||||
-- Check for addr:street attributes
|
||||
-- Note that addr:street links can only be indexed once the street itself is indexed
|
||||
word_ids := word_ids_from_name(street);
|
||||
IF word_ids is not null THEN
|
||||
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
|
||||
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
|
||||
IF parent_place_id is not null THEN
|
||||
{% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
|
||||
RETURN parent_place_id;
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Check for addr:place attributes.
|
||||
IF place is not null THEN
|
||||
word_ids := word_ids_from_name(place);
|
||||
IF word_ids is not null THEN
|
||||
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
|
||||
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
|
||||
IF parent_place_id is not null THEN
|
||||
{% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
|
||||
RETURN parent_place_id;
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
RETURN NULL;
|
||||
END;
|
||||
|
@ -1,9 +1,6 @@
|
||||
-- Indices used only during search and update.
|
||||
-- These indices are created only after the indexing process is done.
|
||||
|
||||
CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
|
||||
ON word USING BTREE (word_id) {{db.tablespace.search_index}};
|
||||
|
||||
CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
|
||||
ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
|
||||
|
||||
|
@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
|
||||
);
|
||||
GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
|
||||
|
||||
drop table IF EXISTS word;
|
||||
CREATE TABLE word (
|
||||
word_id INTEGER,
|
||||
word_token text,
|
||||
word text,
|
||||
class text,
|
||||
type text,
|
||||
country_code varchar(2),
|
||||
search_name_count INTEGER,
|
||||
operator TEXT
|
||||
) {{db.tablespace.search_data}};
|
||||
CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
|
||||
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
|
||||
DROP SEQUENCE IF EXISTS seq_word;
|
||||
CREATE SEQUENCE seq_word start 1;
|
||||
|
||||
drop table IF EXISTS location_area CASCADE;
|
||||
CREATE TABLE location_area (
|
||||
place_id BIGINT,
|
||||
@ -109,6 +93,7 @@ CREATE TABLE location_property_osmline (
|
||||
linegeo GEOMETRY,
|
||||
interpolationtype TEXT,
|
||||
address HSTORE,
|
||||
token_info JSONB, -- custom column for tokenizer use only
|
||||
postcode TEXT,
|
||||
country_code VARCHAR(2)
|
||||
){{db.tablespace.search_data}};
|
||||
@ -158,6 +143,7 @@ CREATE TABLE placex (
|
||||
indexed_status SMALLINT,
|
||||
LIKE place INCLUDING CONSTRAINTS,
|
||||
wikipedia TEXT, -- calculated wikipedia article name (language:title)
|
||||
token_info JSONB, -- custom column for tokenizer use only
|
||||
country_code varchar(2),
|
||||
housenumber TEXT,
|
||||
postcode TEXT,
|
||||
@ -168,6 +154,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tabl
|
||||
CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
|
||||
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
|
||||
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
|
||||
CREATE INDEX idx_placex_geometry_buildings ON placex
|
||||
USING GIST (geometry) {{db.tablespace.search_index}}
|
||||
WHERE address is not null and rank_search = 30
|
||||
and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
|
||||
CREATE INDEX idx_placex_geometry_placenode ON placex
|
||||
USING GIST (geometry) {{db.tablespace.search_index}}
|
||||
WHERE osm_type = 'N' and rank_search < 26
|
||||
@ -178,7 +168,6 @@ DROP SEQUENCE IF EXISTS seq_place;
|
||||
CREATE SEQUENCE seq_place start 1;
|
||||
GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
|
||||
GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
|
||||
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
|
||||
GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
|
||||
GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
|
||||
GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;
|
||||
|
lib-sql/tokenizer/legacy_tokenizer.sql (new file, 399 lines)
@ -0,0 +1,399 @@
|
||||
-- Get tokens used for searching the given place.
|
||||
--
|
||||
-- These are the tokens that will be saved in the search_name table.
|
||||
CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'names')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
-- Get tokens for matching the place name against others.
|
||||
--
|
||||
-- This should usually be restricted to full name tokens.
|
||||
CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'names')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
-- Return the housenumber tokens applicable for the place.
|
||||
CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'hnr_tokens')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
-- Return the housenumber in the form that it can be matched during search.
|
||||
CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
|
||||
RETURNS TEXT
|
||||
AS $$
|
||||
SELECT info->>'hnr';
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'street')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'place_match')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
SELECT (info->>'place_search')::INTEGER[]
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
DROP TYPE IF EXISTS token_addresstoken CASCADE;
|
||||
CREATE TYPE token_addresstoken AS (
|
||||
key TEXT,
|
||||
match_tokens INT[],
|
||||
search_tokens INT[]
|
||||
);
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
|
||||
RETURNS SETOF token_addresstoken
|
||||
AS $$
|
||||
SELECT key, (value->>1)::int[] as match_tokens,
|
||||
(value->>0)::int[] as search_tokens
|
||||
FROM jsonb_each(info->'addr');
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
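Read together, the accessor functions above imply a token_info layout roughly like the following. This is a reconstruction from the accessors, not a documented format; the keys are the ones read above, the ids are invented, and the values are Postgres array literals stored as JSON strings, which is what the ->> extraction plus cast expects:

    token_info = {
        'names': '{1,2,3}',             # name tokens (search and match use the same field here)
        'hnr_tokens': '{4}',            # housenumber tokens for searching
        'hnr': '34;36a',                # normalized housenumber form
        'street': '{5,6}',              # match tokens for addr:street
        'place_match': '{7}',           # match tokens for addr:place
        'place_search': '{7,8}',        # search tokens for addr:place
        'addr': {                       # per address key: [search tokens, match tokens]
            'city': ['{9,10}', '{9}'],
            'suburb': ['{11}', '{11}'],
        },
    }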
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
|
||||
RETURNS TEXT
|
||||
AS $$
|
||||
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
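The same rule in Python, as a direct transcription of the SQL above for readability (STRICT makes a NULL input return NULL):

    def token_normalized_postcode(postcode):
        # multi-value postcodes (separated by ',' or ';') are rejected, the rest trimmed and upper-cased
        if postcode is None or ',' in postcode or ';' in postcode:
            return None
        return postcode.strip().upper()

    assert token_normalized_postcode(' ab1 2cd ') == 'AB1 2CD'
    assert token_normalized_postcode('12345;67890') is None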
|
||||
|
||||
|
||||
-- Return token info that should be saved permanently in the database.
|
||||
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
|
||||
RETURNS JSONB
|
||||
AS $$
|
||||
SELECT NULL::JSONB;
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
--------------- private functions ----------------------------------------------
|
||||
|
||||
-- Functions for term normalisation and access to the 'word' table.
|
||||
|
||||
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
|
||||
AS '{{ modulepath }}/nominatim.so', 'transliteration'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
|
||||
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
|
||||
AS $$
|
||||
DECLARE
|
||||
o TEXT;
|
||||
BEGIN
|
||||
o := public.gettokenstring(public.transliteration(name));
|
||||
RETURN trim(substr(o,1,length(o)));
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
-- returns NULL if the word is too common
|
||||
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
count INTEGER;
|
||||
BEGIN
|
||||
lookup_token := trim(lookup_word);
|
||||
SELECT min(word_id), max(search_name_count) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_id, count;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
|
||||
ELSE
|
||||
IF count > {{ max_word_freq }} THEN
|
||||
return_word_id := NULL;
|
||||
END IF;
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
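The same lookup-or-create logic expressed in Python, as a rough sketch only (cur is assumed to be a psycopg2 cursor and max_word_freq the value substituted for the {{ max_word_freq }} template parameter):

    def getorcreate_word_id(cur, lookup_word, max_word_freq):
        token = lookup_word.strip()
        cur.execute("SELECT min(word_id), max(search_name_count) FROM word"
                    " WHERE word_token = %s AND class is null AND type is null", (token,))
        word_id, count = cur.fetchone()
        if word_id is None:
            cur.execute("SELECT nextval('seq_word')")
            word_id = cur.fetchone()[0]
            cur.execute("INSERT INTO word VALUES (%s, %s, null, null, null, null, 0)",
                        (word_id, token))
        elif count is not None and count > max_word_freq:
            word_id = None      # word is too common to be useful for searching
        return word_id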
|
||||
|
||||
|
||||
-- Create housenumber tokens from an OSM addr:housenumber.
|
||||
-- The housenumber is split at comma and semicolon as necessary.
|
||||
-- The function returns the normalized form of the housenumber suitable
|
||||
-- for comparison.
|
||||
CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
|
||||
OUT tokens TEXT,
|
||||
OUT normtext TEXT)
|
||||
AS $$
|
||||
BEGIN
|
||||
SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
|
||||
INTO normtext, tokens
|
||||
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
|
||||
FROM (SELECT make_standard_name(h) as lookup_word
|
||||
FROM unnest(housenumbers) h) x) y;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE STRICT;
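As a worked example (token ids invented, and assuming make_standard_name() leaves plain numbers unchanged): create_housenumbers(ARRAY['34', '36a']) would return normtext = '34;36a', the normalized forms joined with ';', and tokens = '{1812,1813}', the text form of the array of token ids handed out by getorcreate_housenumber_id() below.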
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' ' || trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class='place' and type='house'
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
||||
'place', 'house', null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' ' || make_standard_name(postcode);
|
||||
FOR r IN
|
||||
SELECT word_id FROM word
|
||||
WHERE word_token = lookup_token and word = postcode
|
||||
and class='place' and type='postcode'
|
||||
LOOP
|
||||
RETURN false;
|
||||
END LOOP;
|
||||
|
||||
INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
|
||||
'place', 'postcode', null, 0);
|
||||
RETURN true;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
nospace_lookup_token TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_token := ' '||trim(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
|
||||
null, null, null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
-- Normalize a string and lookup its word ids (partial words).
|
||||
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
words TEXT[];
|
||||
id INTEGER;
|
||||
return_word_id INTEGER[];
|
||||
word_ids INTEGER[];
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
words := string_to_array(make_standard_name(lookup_word), ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
SELECT array_agg(word_id) INTO word_ids
|
||||
FROM word
|
||||
WHERE word_token = words[j] and class is null and type is null;
|
||||
|
||||
IF word_ids IS NULL THEN
|
||||
id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
|
||||
return_word_id := return_word_id || id;
|
||||
ELSE
|
||||
return_word_id := array_merge(return_word_id, word_ids);
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
RETURN return_word_id;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
-- Normalize a string and look up its name ids (full words).
|
||||
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
lookup_token TEXT;
|
||||
return_word_ids INTEGER[];
|
||||
BEGIN
|
||||
lookup_token := ' '|| make_standard_name(lookup_word);
|
||||
SELECT array_agg(word_id) FROM word
|
||||
WHERE word_token = lookup_token and class is null and type is null
|
||||
INTO return_word_ids;
|
||||
RETURN return_word_ids;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
|
||||
RETURNS INTEGER[]
|
||||
AS $$
|
||||
DECLARE
|
||||
result INTEGER[];
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
item RECORD;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
result := '{}'::INTEGER[];
|
||||
|
||||
FOR item IN SELECT (each(src)).* LOOP
|
||||
|
||||
s := make_standard_name(item.value);
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
|
||||
IF not(ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
w := getorcreate_word_id(s);
|
||||
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
words := string_to_array(s, ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
w = getorcreate_word_id(words[j]);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_word_id(s);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
s := regexp_replace(item.value, '市$', '');
|
||||
IF s != item.value THEN
|
||||
s := make_standard_name(s);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
IF NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
END LOOP;
|
||||
|
||||
RETURN result;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
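As an illustration of what make_keywords() collects (assuming make_standard_name('Rue de la Paix') yields 'rue de la paix'): the result contains a full-name token for ' rue de la paix' via getorcreate_name_id(), the matching partial token for 'rue de la paix' via getorcreate_word_id() if it is not too frequent, and partial tokens for each word 'rue', 'de', 'la' and 'paix'. Name values containing ',', ';' or parentheses additionally get a token per part, and a trailing '市' (the CJK 'city' suffix) is stripped to create one extra name token.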
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
DECLARE
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
i INTEGER;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
s := make_standard_name(src);
|
||||
w := getorcreate_name_id(s, src);
|
||||
|
||||
w := getorcreate_word_id(s);
|
||||
|
||||
words := string_to_array(s, ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
IF (words[j] != '') THEN
|
||||
w := getorcreate_word_id(words[j]);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
words := regexp_split_to_array(src, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_word_id(s);
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
s := regexp_replace(src, '市$', '');
|
||||
IF s != src THEN
|
||||
s := make_standard_name(s);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, src);
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
RETURN 1;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
lib-sql/tokenizer/legacy_tokenizer_indices.sql (new file, 2 lines)
@ -0,0 +1,2 @@
CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
|
lib-sql/tokenizer/legacy_tokenizer_tables.sql (new file, 19 lines)
@ -0,0 +1,19 @@
DROP TABLE IF EXISTS word;
CREATE TABLE word (
  word_id INTEGER,
  word_token text NOT NULL,
  word text,
  class text,
  type text,
  country_code varchar(2),
  search_name_count INTEGER,
  operator TEXT
) {{db.tablespace.search_data}};

CREATE INDEX idx_word_word_token ON word
  USING BTREE (word_token) {{db.tablespace.search_index}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
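The functions in legacy_tokenizer.sql above store every kind of token in this single table and use word_token, class and type to tell them apart. A rough sketch of typical rows, following the column order above (ids and values invented for illustration; full names and special tokens carry a leading space in word_token, partial words do not):

    (311,  ' hauptstrasse', 'Hauptstraße', null,    null,       null, 0, null)   -- full name
    (312,  'hauptstrasse',  null,          null,    null,       null, 0, null)   -- partial word
    (1812, ' 34',           null,          'place', 'house',    null, 0, null)   -- housenumber
    (2033, ' 52074',        '52074',       'place', 'postcode', null, 0, null)   -- postcode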
|
@ -32,8 +32,11 @@ class UpdateIndex:
|
||||
@staticmethod
|
||||
def run(args):
|
||||
from ..indexer.indexer import Indexer
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
|
||||
indexer = Indexer(args.config.get_libpq_dsn(),
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
|
||||
|
||||
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
|
||||
args.threads or psutil.cpu_count() or 1)
|
||||
|
||||
if not args.no_boundaries:
|
||||
|
@ -46,6 +46,7 @@ class UpdateRefresh:
|
||||
@staticmethod
|
||||
def run(args):
|
||||
from ..tools import refresh
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
|
||||
if args.postcodes:
|
||||
LOG.warning("Update postcodes centroid")
|
||||
@ -66,6 +67,8 @@ class UpdateRefresh:
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
refresh.create_functions(conn, args.config,
|
||||
args.diffs, args.enable_debug_statements)
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
|
||||
tokenizer.update_sql_functions(args.config)
|
||||
|
||||
if args.wiki_data:
|
||||
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
|
||||
|
@ -83,6 +83,7 @@ class UpdateReplication:
|
||||
def _update(args):
|
||||
from ..tools import replication
|
||||
from ..indexer.indexer import Indexer
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
|
||||
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
|
||||
params.update(base_url=args.config.REPLICATION_URL,
|
||||
@ -106,6 +107,8 @@ class UpdateReplication:
|
||||
raise UsageError("Bad argument '--no-index'.")
|
||||
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
|
||||
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
|
||||
|
||||
while True:
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
start = dt.datetime.now(dt.timezone.utc)
|
||||
@ -116,7 +119,7 @@ class UpdateReplication:
|
||||
|
||||
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
|
||||
index_start = dt.datetime.now(dt.timezone.utc)
|
||||
indexer = Indexer(args.config.get_libpq_dsn(),
|
||||
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
|
||||
args.threads or 1)
|
||||
indexer.index_boundaries(0, 30)
|
||||
indexer.index_by_rank(0, 30)
|
||||
|
@ -56,6 +56,7 @@ class SetupAll:
|
||||
from ..tools import refresh
|
||||
from ..indexer.indexer import Indexer
|
||||
from ..tools import postcodes
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
|
||||
if args.osm_file and not Path(args.osm_file).is_file():
|
||||
LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
|
||||
@ -67,12 +68,6 @@ class SetupAll:
|
||||
args.no_partitions,
|
||||
rouser=args.config.DATABASE_WEBUSER)
|
||||
|
||||
LOG.warning('Installing database module')
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
database_import.install_module(args.module_dir, args.project_dir,
|
||||
args.config.DATABASE_MODULE_PATH,
|
||||
conn=conn)
|
||||
|
||||
LOG.warning('Importing OSM data file')
|
||||
database_import.import_osm_data(Path(args.osm_file),
|
||||
args.osm2pgsql_options(0, 1),
|
||||
@ -105,22 +100,31 @@ class SetupAll:
|
||||
if args.continue_at is None or args.continue_at == 'load-data':
|
||||
LOG.warning('Initialise tables')
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
|
||||
database_import.truncate_data_tables(conn)
|
||||
|
||||
LOG.warning('Load data into placex table')
|
||||
database_import.load_data(args.config.get_libpq_dsn(),
|
||||
args.data_dir,
|
||||
args.threads or psutil.cpu_count() or 1)
|
||||
|
||||
LOG.warning("Setting up tokenizer")
|
||||
if args.continue_at is None or args.continue_at == 'load-data':
|
||||
# (re)initialise the tokenizer data
|
||||
tokenizer = tokenizer_factory.create_tokenizer(args.config)
|
||||
else:
|
||||
# just load the tokenizer
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
|
||||
|
||||
if args.continue_at is None or args.continue_at == 'load-data':
|
||||
LOG.warning('Calculate postcodes')
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
|
||||
tokenizer)
|
||||
|
||||
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
|
||||
if args.continue_at is not None and args.continue_at != 'load-data':
|
||||
with connect(args.config.get_libpq_dsn()) as conn:
|
||||
SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
|
||||
LOG.warning('Indexing places')
|
||||
indexer = Indexer(args.config.get_libpq_dsn(),
|
||||
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
|
||||
args.threads or psutil.cpu_count() or 1)
|
||||
indexer.index_full(analyse=not args.index_noanalyse)
|
||||
|
||||
@ -129,7 +133,9 @@ class SetupAll:
|
||||
database_import.create_search_indices(conn, args.config,
|
||||
drop=args.no_updates)
|
||||
LOG.warning('Create search index for default country names.')
|
||||
database_import.create_country_names(conn, args.config)
|
||||
database_import.create_country_names(conn, tokenizer,
|
||||
args.config.LANGUAGES)
|
||||
tokenizer.finalize_import(args.config)
|
||||
|
||||
webdir = args.project_dir / 'website'
|
||||
LOG.warning('Setup website at %s', webdir)
|
||||
|
@ -9,6 +9,8 @@ LOG = logging.getLogger()
|
||||
|
||||
# Do not repeat documentation of subcommand classes.
|
||||
# pylint: disable=C0111
|
||||
# Using non-top-level imports to avoid potentially unused imports.
|
||||
# pylint: disable=E0012,C0415
|
||||
|
||||
class ImportSpecialPhrases:
|
||||
"""\
|
||||
@ -22,10 +24,13 @@ class ImportSpecialPhrases:
|
||||
|
||||
@staticmethod
|
||||
def run(args):
|
||||
from ..tokenizer import factory as tokenizer_factory
|
||||
|
||||
if args.import_from_wiki:
|
||||
LOG.warning('Special phrases importation starting')
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
|
||||
with connect(args.config.get_libpq_dsn()) as db_connection:
|
||||
SpecialPhrasesImporter(
|
||||
args.config, args.phplib_dir, db_connection
|
||||
).import_from_wiki()
|
||||
).import_from_wiki(tokenizer)
|
||||
return 0
|
||||
|
@ -48,14 +48,14 @@ class DBConnection:
|
||||
""" A single non-blocking database connection.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn):
|
||||
def __init__(self, dsn, cursor_factory=None):
|
||||
self.current_query = None
|
||||
self.current_params = None
|
||||
self.dsn = dsn
|
||||
|
||||
self.conn = None
|
||||
self.cursor = None
|
||||
self.connect()
|
||||
self.connect(cursor_factory=cursor_factory)
|
||||
|
||||
def close(self):
|
||||
""" Close all open connections. Does not wait for pending requests.
|
||||
@ -66,7 +66,7 @@ class DBConnection:
|
||||
|
||||
self.conn = None
|
||||
|
||||
def connect(self):
|
||||
def connect(self, cursor_factory=None):
|
||||
""" (Re)connect to the database. Creates an asynchronous connection
|
||||
with JIT and parallel processing disabled. If a connection was
|
||||
already open, it is closed and a new connection established.
|
||||
@ -79,7 +79,7 @@ class DBConnection:
|
||||
self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
|
||||
self.wait()
|
||||
|
||||
self.cursor = self.conn.cursor()
|
||||
self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
|
||||
# Disable JIT and parallel workers as they are known to cause problems.
|
||||
# Update pg_settings instead of using SET because it does not yield
|
||||
# errors on older versions of Postgres where the settings are not
|
||||
|
@ -89,8 +89,6 @@ class SQLPreprocessor:
|
||||
self.env.globals['db'] = db_info
|
||||
self.env.globals['sql'] = _setup_postgres_sql(conn)
|
||||
self.env.globals['postgres'] = _setup_postgresql_features(conn)
|
||||
self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
|
||||
str((config.project_dir / 'module').resolve())
|
||||
|
||||
|
||||
def run_sql_file(self, conn, name, **kwargs):
|
||||
|
@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database.
|
||||
"""
|
||||
import logging
|
||||
import select
|
||||
import time
|
||||
|
||||
import psycopg2.extras
|
||||
|
||||
from nominatim.indexer.progress import ProgressLogger
|
||||
from nominatim.indexer import runners
|
||||
@ -11,6 +14,73 @@ from nominatim.db.connection import connect
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
|
||||
class PlaceFetcher:
|
||||
""" Asynchronous connection that fetches place details for processing.
|
||||
"""
|
||||
def __init__(self, dsn, setup_conn):
|
||||
self.wait_time = 0
|
||||
self.current_ids = None
|
||||
self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
|
||||
|
||||
with setup_conn.cursor() as cur:
|
||||
# need to fetch those manually because register_hstore cannot
|
||||
# fetch them on an asynchronous connection below.
|
||||
hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
|
||||
hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
|
||||
|
||||
psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
|
||||
array_oid=hstore_array_oid)
|
||||
|
||||
def close(self):
|
||||
""" Close the underlying asynchronous connection.
|
||||
"""
|
||||
if self.conn:
|
||||
self.conn.close()
|
||||
self.conn = None
|
||||
|
||||
|
||||
def fetch_next_batch(self, cur, runner):
|
||||
""" Send a request for the next batch of places.
|
||||
If details for the places are required, they will be fetched
|
||||
asynchronously.
|
||||
|
||||
Returns true if there is still data available.
|
||||
"""
|
||||
ids = cur.fetchmany(100)
|
||||
|
||||
if not ids:
|
||||
self.current_ids = None
|
||||
return False
|
||||
|
||||
if hasattr(runner, 'get_place_details'):
|
||||
runner.get_place_details(self.conn, ids)
|
||||
self.current_ids = []
|
||||
else:
|
||||
self.current_ids = ids
|
||||
|
||||
return True
|
||||
|
||||
def get_batch(self):
|
||||
""" Get the next batch of data, previously requested with
|
||||
`fetch_next_batch`.
|
||||
"""
|
||||
if self.current_ids is not None and not self.current_ids:
|
||||
tstart = time.time()
|
||||
self.conn.wait()
|
||||
self.wait_time += time.time() - tstart
|
||||
self.current_ids = self.conn.cursor.fetchall()
|
||||
|
||||
return self.current_ids
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.conn.wait()
|
||||
self.close()
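The intended use is a simple double-buffering loop, the same pattern Indexer._index() below implements (sketch only; dsn, conn, cur, runner and process() stand in for the objects set up there):

    with PlaceFetcher(dsn, conn) as fetcher:
        has_more = fetcher.fetch_next_batch(cur, runner)       # request batch 1
        while has_more:
            places = fetcher.get_batch()                       # wait for batch 1
            has_more = fetcher.fetch_next_batch(cur, runner)   # batch 2 loads in the background
            process(places)                                    # while batch 1 is processed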
|
||||
|
||||
class WorkerPool:
|
||||
""" A pool of asynchronous database connections.
|
||||
|
||||
@ -21,6 +91,7 @@ class WorkerPool:
|
||||
def __init__(self, dsn, pool_size):
|
||||
self.threads = [DBConnection(dsn) for _ in range(pool_size)]
|
||||
self.free_workers = self._yield_free_worker()
|
||||
self.wait_time = 0
|
||||
|
||||
|
||||
def finish_all(self):
|
||||
@ -64,7 +135,9 @@ class WorkerPool:
|
||||
ready = self.threads
|
||||
command_stat = 0
|
||||
else:
|
||||
tstart = time.time()
|
||||
_, ready, _ = select.select([], self.threads, [])
|
||||
self.wait_time += time.time() - tstart
|
||||
|
||||
|
||||
def __enter__(self):
|
||||
@ -72,6 +145,7 @@ class WorkerPool:
|
||||
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.finish_all()
|
||||
self.close()
|
||||
|
||||
|
||||
@ -79,8 +153,9 @@ class Indexer:
|
||||
""" Main indexing routine.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn, num_threads):
|
||||
def __init__(self, dsn, tokenizer, num_threads):
|
||||
self.dsn = dsn
|
||||
self.tokenizer = tokenizer
|
||||
self.num_threads = num_threads
|
||||
|
||||
|
||||
@ -123,8 +198,9 @@ class Indexer:
|
||||
LOG.warning("Starting indexing boundaries using %s threads",
|
||||
self.num_threads)
|
||||
|
||||
with self.tokenizer.name_analyzer() as analyzer:
|
||||
for rank in range(max(minrank, 4), min(maxrank, 26)):
|
||||
self._index(runners.BoundaryRunner(rank))
|
||||
self._index(runners.BoundaryRunner(rank, analyzer))
|
||||
|
||||
def index_by_rank(self, minrank, maxrank):
|
||||
""" Index all entries of placex in the given rank range (inclusive)
|
||||
@ -137,15 +213,16 @@ class Indexer:
|
||||
LOG.warning("Starting indexing rank (%i to %i) using %i threads",
|
||||
minrank, maxrank, self.num_threads)
|
||||
|
||||
with self.tokenizer.name_analyzer() as analyzer:
|
||||
for rank in range(max(1, minrank), maxrank):
|
||||
self._index(runners.RankRunner(rank))
|
||||
self._index(runners.RankRunner(rank, analyzer))
|
||||
|
||||
if maxrank == 30:
|
||||
self._index(runners.RankRunner(0))
|
||||
self._index(runners.InterpolationRunner(), 20)
|
||||
self._index(runners.RankRunner(30), 20)
|
||||
self._index(runners.RankRunner(0, analyzer))
|
||||
self._index(runners.InterpolationRunner(analyzer), 20)
|
||||
self._index(runners.RankRunner(30, analyzer), 20)
|
||||
else:
|
||||
self._index(runners.RankRunner(maxrank))
|
||||
self._index(runners.RankRunner(maxrank, analyzer))
|
||||
|
||||
|
||||
def index_postcodes(self):
|
||||
@ -173,6 +250,7 @@ class Indexer:
|
||||
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
|
||||
|
||||
with connect(self.dsn) as conn:
|
||||
psycopg2.extras.register_hstore(conn)
|
||||
with conn.cursor() as cur:
|
||||
total_tuples = cur.scalar(runner.sql_count_objects())
|
||||
LOG.debug("Total number of rows: %i", total_tuples)
|
||||
@ -185,19 +263,24 @@ class Indexer:
|
||||
with conn.cursor(name='places') as cur:
|
||||
cur.execute(runner.sql_get_objects())
|
||||
|
||||
with PlaceFetcher(self.dsn, conn) as fetcher:
|
||||
with WorkerPool(self.dsn, self.num_threads) as pool:
|
||||
while True:
|
||||
places = [p[0] for p in cur.fetchmany(batch)]
|
||||
if not places:
|
||||
break
|
||||
has_more = fetcher.fetch_next_batch(cur, runner)
|
||||
while has_more:
|
||||
places = fetcher.get_batch()
|
||||
|
||||
LOG.debug("Processing places: %s", str(places))
|
||||
worker = pool.next_free_worker()
|
||||
# asynchronously get the next batch
|
||||
has_more = fetcher.fetch_next_batch(cur, runner)
|
||||
|
||||
worker.perform(runner.sql_index_place(places))
|
||||
progress.add(len(places))
|
||||
# And insert the current batch
|
||||
for idx in range(0, len(places), batch):
|
||||
part = places[idx:idx+batch]
|
||||
LOG.debug("Processing places: %s", str(part))
|
||||
runner.index_places(pool.next_free_worker(), part)
|
||||
progress.add(len(part))
|
||||
|
||||
pool.finish_all()
|
||||
LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
|
||||
fetcher.wait_time, pool.wait_time)
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
@ -2,15 +2,52 @@
|
||||
Mix-ins that provide the actual commands for the indexer for various indexing
|
||||
tasks.
|
||||
"""
|
||||
import functools
|
||||
|
||||
import psycopg2.extras
|
||||
|
||||
# pylint: disable=C0111
|
||||
|
||||
class RankRunner:
|
||||
class AbstractPlacexRunner:
|
||||
""" Returns SQL commands for indexing of the placex table.
|
||||
"""
|
||||
SELECT_SQL = 'SELECT place_id FROM placex'
|
||||
|
||||
def __init__(self, rank, analyzer):
|
||||
self.rank = rank
|
||||
self.analyzer = analyzer
|
||||
|
||||
|
||||
@staticmethod
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _index_sql(num_places):
|
||||
return """ UPDATE placex
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_place_details(worker, ids):
|
||||
worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
|
||||
FROM placex WHERE place_id IN %s""",
|
||||
(tuple((p[0] for p in ids)), ))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
|
||||
values = []
|
||||
for place in places:
|
||||
values.extend((place[x] for x in ('place_id', 'address')))
|
||||
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
|
||||
|
||||
worker.perform(self._index_sql(len(places)), values)
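For a batch of two places, _index_sql(2) renders to the following statement (whitespace trimmed), and index_places() supplies the parameters as one flat list [id1, addr1, Json(token_info1), id2, addr2, Json(token_info2)]; lru_cache keeps one rendered statement per batch size:

    UPDATE placex
      SET indexed_status = 0, address = v.addr, token_info = v.ti
      FROM (VALUES (%s, %s::hstore, %s::jsonb),(%s, %s::hstore, %s::jsonb)) as v(id, addr, ti)
      WHERE place_id = v.id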
|
||||
|
||||
|
||||
class RankRunner(AbstractPlacexRunner):
|
||||
""" Returns SQL commands for indexing one rank within the placex table.
|
||||
"""
|
||||
|
||||
def __init__(self, rank):
|
||||
self.rank = rank
|
||||
|
||||
def name(self):
|
||||
return "rank {}".format(self.rank)
|
||||
|
||||
@ -20,24 +57,16 @@ class RankRunner:
|
||||
""".format(self.rank)
|
||||
|
||||
def sql_get_objects(self):
|
||||
return """SELECT place_id FROM placex
|
||||
WHERE indexed_status > 0 and rank_address = {}
|
||||
ORDER BY geometry_sector""".format(self.rank)
|
||||
|
||||
@staticmethod
|
||||
def sql_index_place(ids):
|
||||
return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
|
||||
.format(','.join((str(i) for i in ids)))
|
||||
return """{} WHERE indexed_status > 0 and rank_address = {}
|
||||
ORDER BY geometry_sector
|
||||
""".format(self.SELECT_SQL, self.rank)
|
||||
|
||||
|
||||
class BoundaryRunner:
|
||||
class BoundaryRunner(AbstractPlacexRunner):
|
||||
""" Returns SQL commands for indexing the administrative boundaries
|
||||
of a certain rank.
|
||||
"""
|
||||
|
||||
def __init__(self, rank):
|
||||
self.rank = rank
|
||||
|
||||
def name(self):
|
||||
return "boundaries rank {}".format(self.rank)
|
||||
|
||||
@ -49,16 +78,10 @@ class BoundaryRunner:
|
||||
""".format(self.rank)
|
||||
|
||||
def sql_get_objects(self):
|
||||
return """SELECT place_id FROM placex
|
||||
WHERE indexed_status > 0 and rank_search = {}
|
||||
return """{} WHERE indexed_status > 0 and rank_search = {}
|
||||
and class = 'boundary' and type = 'administrative'
|
||||
ORDER BY partition, admin_level
|
||||
""".format(self.rank)
|
||||
|
||||
@staticmethod
|
||||
def sql_index_place(ids):
|
||||
return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
|
||||
.format(','.join((str(i) for i in ids)))
|
||||
""".format(self.SELECT_SQL, self.rank)
|
||||
|
||||
|
||||
class InterpolationRunner:
|
||||
@ -66,6 +89,10 @@ class InterpolationRunner:
|
||||
location_property_osmline.
|
||||
"""
|
||||
|
||||
def __init__(self, analyzer):
|
||||
self.analyzer = analyzer
|
||||
|
||||
|
||||
@staticmethod
|
||||
def name():
|
||||
return "interpolation lines (location_property_osmline)"
|
||||
@ -77,15 +104,37 @@ class InterpolationRunner:
|
||||
|
||||
@staticmethod
|
||||
def sql_get_objects():
|
||||
return """SELECT place_id FROM location_property_osmline
|
||||
return """SELECT place_id
|
||||
FROM location_property_osmline
|
||||
WHERE indexed_status > 0
|
||||
ORDER BY geometry_sector"""
|
||||
|
||||
|
||||
@staticmethod
|
||||
def sql_index_place(ids):
|
||||
return """UPDATE location_property_osmline
|
||||
SET indexed_status = 0 WHERE place_id IN ({})
|
||||
""".format(','.join((str(i) for i in ids)))
|
||||
def get_place_details(worker, ids):
|
||||
worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
|
||||
FROM location_property_osmline WHERE place_id IN %s""",
|
||||
(tuple((p[0] for p in ids)), ))
|
||||
|
||||
|
||||
@staticmethod
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _index_sql(num_places):
|
||||
return """ UPDATE location_property_osmline
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
|
||||
values = []
|
||||
for place in places:
|
||||
values.extend((place[x] for x in ('place_id', 'address')))
|
||||
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
|
||||
|
||||
worker.perform(self._index_sql(len(places)), values)
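The `_index_sql()` above prepares a single `UPDATE ... FROM (VALUES ...)` statement so a whole batch of interpolation lines is written back in one round trip. A minimal standalone sketch of that pattern; the helper name `build_index_sql` and the demo call are illustrative and not part of this change:

```python
# Sketch of the batched-update pattern used by _index_sql() above.
# psycopg2 later substitutes the %s placeholders with the flat value list
# built in index_places() (place_id, address, token_info per place).
def build_index_sql(num_places):
    groups = ','.join(['(%s, %s::hstore, %s::jsonb)'] * num_places)
    return ("UPDATE location_property_osmline"
            " SET indexed_status = 0, address = v.addr, token_info = v.ti"
            " FROM (VALUES {}) as v(id, addr, ti)"
            " WHERE place_id = v.id".format(groups))

# For three places the parameter list passed to worker.perform() has
# 3 * 3 = 9 entries: [id1, addr1, ti1, id2, addr2, ti2, id3, addr3, ti3].
print(build_index_sql(3))
```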
class PostcodeRunner:
|
||||
@ -107,7 +156,7 @@ class PostcodeRunner:
|
||||
ORDER BY country_code, postcode"""
|
||||
|
||||
@staticmethod
|
||||
def sql_index_place(ids):
|
||||
return """UPDATE location_postcode SET indexed_status = 0
|
||||
def index_places(worker, ids):
|
||||
worker.perform(""" UPDATE location_postcode SET indexed_status = 0
|
||||
WHERE place_id IN ({})
|
||||
""".format(','.join((str(i) for i in ids)))
|
||||
""".format(','.join((str(i[0]) for i in ids))))
|
||||
|
0
nominatim/tokenizer/__init__.py
Normal file
88
nominatim/tokenizer/factory.py
Normal file
@ -0,0 +1,88 @@
|
||||
"""
|
||||
Functions for creating a tokenizer or initialising the right one for an
|
||||
existing database.
|
||||
|
||||
A tokenizer is something that is bound to the lifetime of a database. It
|
||||
can be chosen and configured before the initial import but then needs to
|
||||
be used consistently when querying and updating the database.
|
||||
|
||||
This module provides the functions to create and configure a new tokenizer
|
||||
as well as instantiating the appropriate tokenizer for updating an existing
|
||||
database.
|
||||
|
||||
A tokenizer usually also includes PHP code for querying. The appropriate PHP
|
||||
normalizer module is installed when the tokenizer is created.
|
||||
"""
|
||||
import logging
|
||||
import importlib
|
||||
|
||||
from ..errors import UsageError
|
||||
from ..db import properties
|
||||
from ..db.connection import connect
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _import_tokenizer(name):
|
||||
""" Load the tokenizer.py module from project directory.
|
||||
"""
|
||||
try:
|
||||
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
|
||||
except ModuleNotFoundError as exp:
|
||||
LOG.fatal("No tokenizer named '%s' available. "
|
||||
"Check the setting of NOMINATIM_TOKENIZER.", name)
|
||||
raise UsageError('Tokenizer not found') from exp
|
||||
|
||||
|
||||
def create_tokenizer(config, init_db=True, module_name=None):
|
||||
""" Create a new tokenizer as defined by the given configuration.
|
||||
|
||||
The tokenizer data and code is copied into the 'tokenizer' directory
|
||||
of the project directory and the tokenizer loaded from its new location.
|
||||
"""
|
||||
if module_name is None:
|
||||
module_name = config.TOKENIZER
|
||||
|
||||
# Create the directory for the tokenizer data
|
||||
basedir = config.project_dir / 'tokenizer'
|
||||
if not basedir.exists():
|
||||
basedir.mkdir()
|
||||
elif not basedir.is_dir():
|
||||
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
|
||||
raise UsageError("Tokenizer setup failed.")
|
||||
|
||||
# Import and initialize the tokenizer.
|
||||
tokenizer_module = _import_tokenizer(module_name)
|
||||
|
||||
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
|
||||
tokenizer.init_new_db(config, init_db=init_db)
|
||||
|
||||
with connect(config.get_libpq_dsn()) as conn:
|
||||
properties.set_property(conn, 'tokenizer', module_name)
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
def get_tokenizer_for_db(config):
|
||||
""" Instantiate a tokenizer for an existing database.
|
||||
|
||||
The function looks up the appropriate tokenizer in the database
|
||||
and initialises it.
|
||||
"""
|
||||
basedir = config.project_dir / 'tokenizer'
|
||||
if not basedir.is_dir():
|
||||
LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
|
||||
raise UsageError('Cannot initialize tokenizer.')
|
||||
|
||||
with connect(config.get_libpq_dsn()) as conn:
|
||||
name = properties.get_property(conn, 'tokenizer')
|
||||
|
||||
if name is None:
|
||||
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
|
||||
raise UsageError('Cannot initialize tokenizer.')
|
||||
|
||||
tokenizer_module = _import_tokenizer(name)
|
||||
|
||||
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
|
||||
tokenizer.init_from_project()
|
||||
|
||||
return tokenizer
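factory.py only relies on a small implicit interface: the module `nominatim.tokenizer.<name>_tokenizer` must expose a `create(dsn, data_dir)` entry point, and the returned object must offer `init_new_db()` for fresh imports and `init_from_project()` for existing databases. A hypothetical skeleton inferred from the calls above (the name `MyTokenizer` is made up):

```python
# Hypothetical minimal tokenizer module satisfying the calls in factory.py.
class MyTokenizer:
    def __init__(self, dsn, data_dir):
        self.dsn = dsn            # libpq DSN handed in by the factory
        self.data_dir = data_dir  # <project_dir>/tokenizer

    def init_new_db(self, config, init_db=True):
        # called by create_tokenizer() right after instantiation
        pass

    def init_from_project(self):
        # called by get_tokenizer_for_db() for an already imported database
        pass


def create(dsn, data_dir):
    # module-level entry point looked up via _import_tokenizer()
    return MyTokenizer(dsn, data_dir)
```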
540
nominatim/tokenizer/legacy_tokenizer.py
Normal file
@ -0,0 +1,540 @@
|
||||
"""
|
||||
Tokenizer implementing normalisation as used before Nominatim 4.
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
from textwrap import dedent
|
||||
|
||||
from icu import Transliterator
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.db import properties
|
||||
from nominatim.db import utils as db_utils
|
||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
DBCFG_NORMALIZATION = "tokenizer_normalization"
|
||||
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def create(dsn, data_dir):
|
||||
""" Create a new instance of the tokenizer provided by this module.
|
||||
"""
|
||||
return LegacyTokenizer(dsn, data_dir)
|
||||
|
||||
|
||||
def _install_module(config_module_path, src_dir, module_dir):
|
||||
""" Copies the PostgreSQL normalisation module into the project
|
||||
directory if necessary. For historical reasons the module is
|
||||
saved in the '/module' subdirectory and not with the other tokenizer
|
||||
data.
|
||||
|
||||
The function detects when the installation is run from the
|
||||
build directory. It doesn't touch the module in that case.
|
||||
"""
|
||||
# Custom module locations are simply used as is.
|
||||
if config_module_path:
|
||||
LOG.info("Using custom path for database module at '%s'", config_module_path)
|
||||
return config_module_path
|
||||
|
||||
# Compatibility mode for builddir installations.
|
||||
if module_dir.exists() and src_dir.samefile(module_dir):
|
||||
LOG.info('Running from build directory. Leaving database module as is.')
|
||||
return module_dir
|
||||
|
||||
# In any other case install the module in the project directory.
|
||||
if not module_dir.exists():
|
||||
module_dir.mkdir()
|
||||
|
||||
destfile = module_dir / 'nominatim.so'
|
||||
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
|
||||
destfile.chmod(0o755)
|
||||
|
||||
LOG.info('Database module installed at %s', str(destfile))
|
||||
|
||||
return module_dir
|
||||
|
||||
|
||||
def _check_module(module_dir, conn):
|
||||
""" Try to use the PostgreSQL module to confirm that it is correctly
|
||||
installed and accessible from PostgreSQL.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
try:
|
||||
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
|
||||
RETURNS text AS '{}/nominatim.so', 'transliteration'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
DROP FUNCTION nominatim_test_import_func(text)
|
||||
""".format(module_dir))
|
||||
except psycopg2.DatabaseError as err:
|
||||
LOG.fatal("Error accessing database module: %s", err)
|
||||
raise UsageError("Database module cannot be accessed.") from err
|
||||
|
||||
|
||||
class LegacyTokenizer:
|
||||
""" The legacy tokenizer uses a special PostgreSQL module to normalize
|
||||
names and queries. The tokenizer thus implements normalization through
|
||||
calls to the database.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn, data_dir):
|
||||
self.dsn = dsn
|
||||
self.data_dir = data_dir
|
||||
self.normalization = None
|
||||
|
||||
|
||||
def init_new_db(self, config, init_db=True):
|
||||
""" Set up a new tokenizer for the database.
|
||||
|
||||
This copies all necessary data in the project directory to make
|
||||
sure the tokenizer remains stable even over updates.
|
||||
"""
|
||||
module_dir = _install_module(config.DATABASE_MODULE_PATH,
|
||||
config.lib_dir.module,
|
||||
config.project_dir / 'module')
|
||||
|
||||
self.normalization = config.TERM_NORMALIZATION
|
||||
|
||||
self._install_php(config)
|
||||
|
||||
with connect(self.dsn) as conn:
|
||||
_check_module(module_dir, conn)
|
||||
self._save_config(conn, config)
|
||||
conn.commit()
|
||||
|
||||
if init_db:
|
||||
self.update_sql_functions(config)
|
||||
self._init_db_tables(config)
|
||||
|
||||
|
||||
def init_from_project(self):
|
||||
""" Initialise the tokenizer from the project directory.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
|
||||
|
||||
|
||||
def finalize_import(self, config):
|
||||
""" Do any required postprocessing to make the tokenizer data ready
|
||||
for use.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
sqlp = SQLPreprocessor(conn, config)
|
||||
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
|
||||
|
||||
|
||||
def update_sql_functions(self, config):
|
||||
""" Reimport the SQL functions for this tokenizer.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
|
||||
modulepath = config.DATABASE_MODULE_PATH or \
|
||||
str((config.project_dir / 'module').resolve())
|
||||
sqlp = SQLPreprocessor(conn, config)
|
||||
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
|
||||
max_word_freq=max_word_freq,
|
||||
modulepath=modulepath)
|
||||
|
||||
|
||||
def check_database(self):
|
||||
""" Check that the tokenizer is set up correctly.
|
||||
"""
|
||||
hint = """\
|
||||
The Postgresql extension nominatim.so was not correctly loaded.
|
||||
|
||||
Error: {error}
|
||||
|
||||
Hints:
|
||||
* Check the output of the CMake/make installation step
|
||||
* Does nominatim.so exist?
|
||||
* Does nominatim.so exist on the database server?
|
||||
* Can nominatim.so be accessed by the database user?
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
with conn.cursor() as cur:
|
||||
try:
|
||||
out = cur.scalar("SELECT make_standard_name('a')")
|
||||
except psycopg2.Error as err:
|
||||
return hint.format(error=str(err))
|
||||
|
||||
if out != 'a':
|
||||
return hint.format(error='Unexpected result for make_standard_name()')
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def migrate_database(self, config):
|
||||
""" Initialise the project directory of an existing database for
|
||||
use with this tokenizer.
|
||||
|
||||
This is a special migration function for updating existing databases
|
||||
to new software versions.
|
||||
"""
|
||||
self.normalization = config.TERM_NORMALIZATION
|
||||
module_dir = _install_module(config.DATABASE_MODULE_PATH,
|
||||
config.lib_dir.module,
|
||||
config.project_dir / 'module')
|
||||
|
||||
with connect(self.dsn) as conn:
|
||||
_check_module(module_dir, conn)
|
||||
self._save_config(conn, config)
|
||||
|
||||
|
||||
def name_analyzer(self):
|
||||
""" Create a new analyzer for tokenizing names and queries
|
||||
using this tokenizer. Analyzers are context managers and should
|
||||
be used accordingly:
|
||||
|
||||
```
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.tokenize()
|
||||
```
|
||||
|
||||
When used outside the with construct, the caller must ensure to
|
||||
call the close() function before destructing the analyzer.
|
||||
|
||||
Analyzers are not thread-safe. You need to instantiate one per thread.
|
||||
"""
|
||||
normalizer = Transliterator.createFromRules("phrase normalizer",
|
||||
self.normalization)
|
||||
return LegacyNameAnalyzer(self.dsn, normalizer)
|
||||
|
||||
|
||||
def _install_php(self, config):
|
||||
""" Install the php script for the tokenizer.
|
||||
"""
|
||||
php_file = self.data_dir / "tokenizer.php"
|
||||
php_file.write_text(dedent("""\
|
||||
<?php
|
||||
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
|
||||
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
|
||||
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
|
||||
""".format(config)))
|
||||
|
||||
|
||||
def _init_db_tables(self, config):
|
||||
""" Set up the word table and fill it with pre-computed word
|
||||
frequencies.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
sqlp = SQLPreprocessor(conn, config)
|
||||
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
|
||||
conn.commit()
|
||||
|
||||
LOG.warning("Precomputing word tokens")
|
||||
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
|
||||
|
||||
|
||||
def _save_config(self, conn, config):
|
||||
""" Save the configuration that needs to remain stable for the given
|
||||
database as database properties.
|
||||
"""
|
||||
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
|
||||
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
|
||||
|
||||
|
||||
class LegacyNameAnalyzer:
|
||||
""" The legacy analyzer uses the special Postgresql module for
|
||||
splitting names.
|
||||
|
||||
Each instance opens a connection to the database to request the
|
||||
normalization.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn, normalizer):
|
||||
self.conn = connect(dsn).connection
|
||||
self.conn.autocommit = True
|
||||
self.normalizer = normalizer
|
||||
psycopg2.extras.register_hstore(self.conn)
|
||||
|
||||
self._cache = _TokenCache(self.conn)
|
||||
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.close()
|
||||
|
||||
|
||||
def close(self):
|
||||
""" Free all resources used by the analyzer.
|
||||
"""
|
||||
if self.conn:
|
||||
self.conn.close()
|
||||
self.conn = None
|
||||
|
||||
|
||||
def normalize(self, phrase):
|
||||
""" Normalize the given phrase, i.e. remove all properties that
|
||||
are irrelevant for search.
|
||||
"""
|
||||
return self.normalizer.transliterate(phrase)
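`normalize()` simply applies the ICU transliterator built from the configured normalisation rules. A toy example with illustrative rules (the real rules come from the `NOMINATIM_TERM_NORMALIZATION` setting, not from this snippet):

```python
from icu import Transliterator

# Illustrative rules: lowercase, decompose, strip combining marks, recompose.
rules = ":: lower (); :: NFD (); [:Nonspacing Mark:] >; :: NFC ();"
demo = Transliterator.createFromRules("demo normalizer", rules)
print(demo.transliterate("Åkersberga Café"))   # -> "akersberga cafe"
```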
def add_postcodes_from_db(self):
|
||||
""" Add postcodes from the location_postcode table to the word table.
|
||||
"""
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT count(create_postcode_id(pc))
|
||||
FROM (SELECT distinct(postcode) as pc
|
||||
FROM location_postcode) x""")
|
||||
|
||||
|
||||
def update_special_phrases(self, phrases):
|
||||
""" Replace the search index for special phrases with the new phrases.
|
||||
"""
|
||||
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
|
||||
for p in phrases))
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
# Get the old phrases.
|
||||
existing_phrases = set()
|
||||
cur.execute("""SELECT word, class, type, operator FROM word
|
||||
WHERE class != 'place'
|
||||
OR (type != 'house' AND type != 'postcode')""")
|
||||
for label, cls, typ, oper in cur:
|
||||
existing_phrases.add((label, cls, typ, oper or '-'))
|
||||
|
||||
to_add = norm_phrases - existing_phrases
|
||||
to_delete = existing_phrases - norm_phrases
|
||||
|
||||
if to_add:
|
||||
psycopg2.extras.execute_values(
|
||||
cur,
|
||||
""" INSERT INTO word (word_id, word_token, word, class, type,
|
||||
search_name_count, operator)
|
||||
(SELECT nextval('seq_word'), make_standard_name(name), name,
|
||||
class, type, 0,
|
||||
CASE WHEN op in ('in', 'near') THEN op ELSE null END
|
||||
FROM (VALUES %s) as v(name, class, type, op))""",
|
||||
to_add)
|
||||
|
||||
if to_delete:
|
||||
psycopg2.extras.execute_values(
|
||||
cur,
|
||||
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
|
||||
WHERE word = name and class = in_class and type = in_type
|
||||
and ((op = '-' and operator is null) or op = operator)""",
|
||||
to_delete)
|
||||
|
||||
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
|
||||
len(norm_phrases), len(to_add), len(to_delete))
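The update above is driven purely by set arithmetic on `(word, class, type, operator)` tuples: only the difference between the incoming phrases and what is already in the word table gets inserted or deleted. A toy illustration with made-up phrases:

```python
# Made-up data illustrating the to_add/to_delete bookkeeping above.
existing = {('bar', 'amenity', 'bar', '-'), ('pub', 'amenity', 'pub', 'in')}
incoming = {('bar', 'amenity', 'bar', '-'), ('cafe', 'amenity', 'cafe', '-')}

to_add = incoming - existing      # {('cafe', 'amenity', 'cafe', '-')}
to_delete = existing - incoming   # {('pub', 'amenity', 'pub', 'in')}
print(sorted(to_add), sorted(to_delete))
```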
def add_country_names(self, country_code, names):
|
||||
""" Add names for the given country to the search index.
|
||||
"""
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""INSERT INTO word (word_id, word_token, country_code)
|
||||
(SELECT nextval('seq_word'), lookup_token, %s
|
||||
FROM (SELECT ' ' || make_standard_name(n) as lookup_token
|
||||
FROM unnest(%s)n) y
|
||||
WHERE NOT EXISTS(SELECT * FROM word
|
||||
WHERE word_token = lookup_token and country_code = %s))
|
||||
""", (country_code, names, country_code))
|
||||
|
||||
|
||||
def process_place(self, place):
|
||||
""" Determine tokenizer information about the given place.
|
||||
|
||||
Returns a JSON-serialisable structure that will be handed into
|
||||
the database via the token_info field.
|
||||
"""
|
||||
token_info = _TokenInfo(self._cache)
|
||||
|
||||
names = place.get('name')
|
||||
|
||||
if names:
|
||||
token_info.add_names(self.conn, names)
|
||||
|
||||
country_feature = place.get('country_feature')
|
||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||
self.add_country_names(country_feature.lower(), list(names.values()))
|
||||
|
||||
address = place.get('address')
|
||||
|
||||
if address:
|
||||
hnrs = []
|
||||
addr_terms = []
|
||||
for key, value in address.items():
|
||||
if key == 'postcode':
|
||||
self._add_postcode(value)
|
||||
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
||||
hnrs.append(value)
|
||||
elif key == 'street':
|
||||
token_info.add_street(self.conn, value)
|
||||
elif key == 'place':
|
||||
token_info.add_place(self.conn, value)
|
||||
elif not key.startswith('_') and \
|
||||
key not in ('country', 'full'):
|
||||
addr_terms.append((key, value))
|
||||
|
||||
if hnrs:
|
||||
token_info.add_housenumbers(self.conn, hnrs)
|
||||
|
||||
if addr_terms:
|
||||
token_info.add_address_terms(self.conn, addr_terms)
|
||||
|
||||
return token_info.data
|
||||
|
||||
|
||||
def _add_postcode(self, postcode):
|
||||
""" Make sure the normalized postcode is present in the word table.
|
||||
"""
|
||||
def _create_postcode_from_db(pcode):
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
|
||||
|
||||
if re.search(r'[:,;]', postcode) is None:
|
||||
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
|
||||
|
||||
|
||||
class _TokenInfo:
|
||||
""" Collect token information to be sent back to the database.
|
||||
"""
|
||||
def __init__(self, cache):
|
||||
self.cache = cache
|
||||
self.data = {}
|
||||
|
||||
|
||||
def add_names(self, conn, names):
|
||||
""" Add token information for the names of the place.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
# Create the token IDs for all names.
|
||||
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
|
||||
(names, ))
|
||||
|
||||
|
||||
def add_housenumbers(self, conn, hnrs):
|
||||
""" Extract housenumber information from the address.
|
||||
"""
|
||||
if len(hnrs) == 1:
|
||||
token = self.cache.get_housenumber(hnrs[0])
|
||||
if token is not None:
|
||||
self.data['hnr_tokens'] = token
|
||||
self.data['hnr'] = hnrs[0]
|
||||
return
|
||||
|
||||
# split numbers if necessary
|
||||
simple_list = []
|
||||
for hnr in hnrs:
|
||||
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
|
||||
|
||||
if len(simple_list) > 1:
|
||||
simple_list = list(set(simple_list))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
|
||||
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
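Multi-value house numbers are split on `;` and `,` and de-duplicated before `create_housenumbers()` is called. A toy run of that splitting step (sample values only):

```python
import re

hnrs = ['12;14', '14, 16']          # sample multi-value housenumbers
simple_list = []
for hnr in hnrs:
    simple_list.extend(x.strip() for x in re.split(r'[;,]', hnr))
if len(simple_list) > 1:
    simple_list = list(set(simple_list))  # drop the duplicate '14'
print(sorted(simple_list))               # ['12', '14', '16']
```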
def add_street(self, conn, street):
|
||||
""" Add addr:street match terms.
|
||||
"""
|
||||
def _get_street(name):
|
||||
with conn.cursor() as cur:
|
||||
return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
|
||||
|
||||
self.data['street'] = self.cache.streets.get(street, _get_street)
|
||||
|
||||
|
||||
def add_place(self, conn, place):
|
||||
""" Add addr:place search and match terms.
|
||||
"""
|
||||
def _get_place(name):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT (addr_ids_from_name(%s)
|
||||
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
|
||||
word_ids_from_name(%s)::text""",
|
||||
(name, name, name))
|
||||
return cur.fetchone()
|
||||
|
||||
self.data['place_search'], self.data['place_match'] = \
|
||||
self.cache.places.get(place, _get_place)
|
||||
|
||||
|
||||
def add_address_terms(self, conn, terms):
|
||||
""" Add additional address terms.
|
||||
"""
|
||||
def _get_address_term(name):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT addr_ids_from_name(%s)::text,
|
||||
word_ids_from_name(%s)::text""",
|
||||
(name, name))
|
||||
return cur.fetchone()
|
||||
|
||||
tokens = {}
|
||||
for key, value in terms:
|
||||
tokens[key] = self.cache.address_terms.get(value, _get_address_term)
|
||||
|
||||
self.data['addr'] = tokens
|
||||
|
||||
|
||||
class _LRU:
|
||||
""" Least recently used cache that accepts a generator function to
|
||||
produce the item when there is a cache miss.
|
||||
"""
|
||||
|
||||
def __init__(self, maxsize=128, init_data=None):
|
||||
self.data = init_data or OrderedDict()
|
||||
self.maxsize = maxsize
|
||||
if init_data is not None and len(init_data) > maxsize:
|
||||
self.maxsize = len(init_data)
|
||||
|
||||
def get(self, key, generator):
|
||||
""" Get the item with the given key from the cache. If nothing
|
||||
is found in the cache, generate the value through the
|
||||
generator function and store it in the cache.
|
||||
"""
|
||||
value = self.data.get(key)
|
||||
if value is not None:
|
||||
self.data.move_to_end(key)
|
||||
else:
|
||||
value = generator(key)
|
||||
if len(self.data) >= self.maxsize:
|
||||
self.data.popitem(last=False)
|
||||
self.data[key] = value
|
||||
|
||||
return value
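A short usage sketch for the `_LRU` helper above, assuming the class definition is in scope; the generator here is a stand-in for the database lookups the analyzer actually performs:

```python
cache = _LRU(maxsize=2)

def lookup(key):
    print('computing', key)   # only printed on a cache miss
    return key.upper()

print(cache.get('a', lookup))  # miss: computes and stores 'A'
print(cache.get('a', lookup))  # hit: returned straight from the cache
cache.get('b', lookup)
cache.get('c', lookup)         # cache full: evicts 'a', the oldest entry
```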
class _TokenCache:
|
||||
""" Cache for token information to avoid repeated database queries.
|
||||
|
||||
This cache is not thread-safe and needs to be instantiated per
|
||||
analyzer.
|
||||
"""
|
||||
def __init__(self, conn):
|
||||
# various LRU caches
|
||||
self.streets = _LRU(maxsize=256)
|
||||
self.places = _LRU(maxsize=128)
|
||||
self.address_terms = _LRU(maxsize=1024)
|
||||
|
||||
# Look up housenumbers up to 100 and cache them
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
||||
FROM generate_series(1, 100) as i""")
|
||||
self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
|
||||
|
||||
# Get postcodes that are already saved
|
||||
postcodes = OrderedDict()
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT word FROM word
|
||||
WHERE class ='place' and type = 'postcode'""")
|
||||
for row in cur:
|
||||
postcodes[row[0]] = None
|
||||
self.postcodes = _LRU(maxsize=32, init_data=postcodes)
|
||||
|
||||
def get_housenumber(self, number):
|
||||
""" Get a housenumber token from the cache.
|
||||
"""
|
||||
return self._cached_housenumbers.get(number)
|
@ -4,10 +4,9 @@ Collection of functions that check if the database is complete and functional.
|
||||
from enum import Enum
|
||||
from textwrap import dedent
|
||||
|
||||
import psycopg2
|
||||
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.errors import UsageError
|
||||
from nominatim.tokenizer import factory as tokenizer_factory
|
||||
|
||||
CHECKLIST = []
|
||||
|
||||
@ -78,8 +77,7 @@ def check_database(config):
|
||||
|
||||
|
||||
def _get_indexes(conn):
|
||||
indexes = ['idx_word_word_id',
|
||||
'idx_place_addressline_address_place_id',
|
||||
indexes = ['idx_place_addressline_address_place_id',
|
||||
'idx_placex_rank_search',
|
||||
'idx_placex_rank_address',
|
||||
'idx_placex_parent_place_id',
|
||||
@ -149,7 +147,7 @@ def check_placex_table(conn, config):
|
||||
|
||||
|
||||
@_check(hint="""placex table has no data. Did the import finish sucessfully?""")
|
||||
def check_placex_size(conn, config): # pylint: disable=W0613
|
||||
def check_placex_size(conn, _):
|
||||
""" Checking for placex content
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
@ -158,38 +156,30 @@ def check_placex_size(conn, config): # pylint: disable=W0613
|
||||
return CheckState.OK if cnt > 0 else CheckState.FATAL
|
||||
|
||||
|
||||
@_check(hint="""\
|
||||
The Postgresql extension nominatim.so was not correctly loaded.
|
||||
|
||||
Error: {error}
|
||||
|
||||
Hints:
|
||||
* Check the output of the CMake/make installation step
|
||||
* Does nominatim.so exist?
|
||||
* Does nominatim.so exist on the database server?
|
||||
* Can nominatim.so be accessed by the database user?
|
||||
""")
|
||||
def check_module(conn, config): # pylint: disable=W0613
|
||||
""" Checking that nominatim.so module is installed
|
||||
@_check(hint="""{msg}""")
|
||||
def check_tokenizer(_, config):
|
||||
""" Checking that tokenizer works
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
try:
|
||||
out = cur.scalar("SELECT make_standard_name('a')")
|
||||
except psycopg2.ProgrammingError as err:
|
||||
return CheckState.FAIL, dict(error=str(err))
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||
except UsageError:
|
||||
return CheckState.FAIL, dict(msg="""\
|
||||
Cannot load tokenizer. Did the import finish successfully?""")
|
||||
|
||||
if out != 'a':
|
||||
return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
|
||||
result = tokenizer.check_database()
|
||||
|
||||
if result is None:
|
||||
return CheckState.OK
|
||||
|
||||
return CheckState.FAIL, dict(msg=result)
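The check relies on a simple contract: the tokenizer's `check_database()` returns `None` when everything is healthy and a ready-to-print hint string otherwise. A minimal sketch with hypothetical objects:

```python
class DummyTokenizer:
    """ Stand-in for a real tokenizer; `problem` is None when healthy. """
    def __init__(self, problem=None):
        self.problem = problem

    def check_database(self):
        return self.problem

def run_check(tokenizer):
    result = tokenizer.check_database()
    return 'OK' if result is None else 'FAIL: ' + result

print(run_check(DummyTokenizer()))
print(run_check(DummyTokenizer('make_standard_name() returned an unexpected result')))
```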
@_check(hint="""\
|
||||
The indexing didn't finish. {count} entries are not yet indexed.
|
||||
|
||||
To index the remaining entries, run: {index_cmd}
|
||||
""")
|
||||
def check_indexing(conn, config): # pylint: disable=W0613
|
||||
def check_indexing(conn, _):
|
||||
""" Checking indexing status
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
@ -198,7 +188,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
|
||||
if cnt == 0:
|
||||
return CheckState.OK
|
||||
|
||||
if conn.index_exists('idx_word_word_id'):
|
||||
if conn.index_exists('idx_placex_rank_search'):
|
||||
# Likely just an interrupted update.
|
||||
index_cmd = 'nominatim index'
|
||||
else:
|
||||
@ -214,7 +204,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
|
||||
|
||||
Rerun the index creation with: nominatim import --continue db-postprocess
|
||||
""")
|
||||
def check_database_indexes(conn, config): # pylint: disable=W0613
|
||||
def check_database_indexes(conn, _):
|
||||
""" Checking that database indexes are complete
|
||||
"""
|
||||
missing = []
|
||||
@ -236,7 +226,7 @@ def check_database_indexes(conn, config): # pylint: disable=W0613
|
||||
Invalid indexes:
|
||||
{indexes}
|
||||
""")
|
||||
def check_database_index_valid(conn, config): # pylint: disable=W0613
|
||||
def check_database_index_valid(conn, _):
|
||||
""" Checking that all database indexes are valid
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
|
@ -5,11 +5,10 @@ import logging
|
||||
import os
|
||||
import selectors
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import psutil
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
from nominatim.db.connection import connect, get_pg_env
|
||||
from nominatim.db import utils as db_utils
|
||||
@ -89,49 +88,6 @@ def setup_extensions(conn):
|
||||
raise UsageError('PostGIS version is too old.')
|
||||
|
||||
|
||||
def install_module(src_dir, project_dir, module_dir, conn=None):
|
||||
""" Copy the normalization module from src_dir into the project
|
||||
directory under the '/module' directory. If 'module_dir' is set, then
|
||||
use the module from there instead and check that it is accessible
|
||||
for Postgresql.
|
||||
|
||||
The function detects when the installation is run from the
|
||||
build directory. It doesn't touch the module in that case.
|
||||
|
||||
If 'conn' is given, then the function also tests if the module
|
||||
can be access via the given database.
|
||||
"""
|
||||
if not module_dir:
|
||||
module_dir = project_dir / 'module'
|
||||
|
||||
if not module_dir.exists() or not src_dir.samefile(module_dir):
|
||||
|
||||
if not module_dir.exists():
|
||||
module_dir.mkdir()
|
||||
|
||||
destfile = module_dir / 'nominatim.so'
|
||||
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
|
||||
destfile.chmod(0o755)
|
||||
|
||||
LOG.info('Database module installed at %s', str(destfile))
|
||||
else:
|
||||
LOG.info('Running from build directory. Leaving database module as is.')
|
||||
else:
|
||||
LOG.info("Using custom path for database module at '%s'", module_dir)
|
||||
|
||||
if conn is not None:
|
||||
with conn.cursor() as cur:
|
||||
try:
|
||||
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
|
||||
RETURNS text AS '{}/nominatim.so', 'transliteration'
|
||||
LANGUAGE c IMMUTABLE STRICT;
|
||||
DROP FUNCTION nominatim_test_import_func(text)
|
||||
""".format(module_dir))
|
||||
except psycopg2.DatabaseError as err:
|
||||
LOG.fatal("Error accessing database module: %s", err)
|
||||
raise UsageError("Database module cannot be accessed.") from err
|
||||
|
||||
|
||||
def import_base_data(dsn, sql_dir, ignore_partitions=False):
|
||||
""" Create and populate the tables with basic static data that provides
|
||||
the background for geocoding. Data is assumed to not yet exist.
|
||||
@ -205,11 +161,10 @@ def create_partition_tables(conn, config):
|
||||
sql.run_sql_file(conn, 'partition-tables.src.sql')
|
||||
|
||||
|
||||
def truncate_data_tables(conn, max_word_frequency=None):
|
||||
def truncate_data_tables(conn):
|
||||
""" Truncate all data tables to prepare for a fresh load.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('TRUNCATE word')
|
||||
cur.execute('TRUNCATE placex')
|
||||
cur.execute('TRUNCATE place_addressline')
|
||||
cur.execute('TRUNCATE location_area')
|
||||
@ -228,23 +183,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
|
||||
for table in [r[0] for r in list(cur)]:
|
||||
cur.execute('TRUNCATE ' + table)
|
||||
|
||||
if max_word_frequency is not None:
|
||||
# Used by getorcreate_word_id to ignore frequent partial words.
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
|
||||
RETURNS integer AS $$
|
||||
SELECT {} as maxwordfreq;
|
||||
$$ LANGUAGE SQL IMMUTABLE
|
||||
""".format(max_word_frequency))
|
||||
conn.commit()
|
||||
|
||||
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
|
||||
|
||||
def load_data(dsn, data_dir, threads):
|
||||
def load_data(dsn, threads):
|
||||
""" Copy data into the word and placex table.
|
||||
"""
|
||||
# Pre-calculate the most important terms in the word list.
|
||||
db_utils.execute_file(dsn, data_dir / 'words.sql')
|
||||
|
||||
sel = selectors.DefaultSelector()
|
||||
# Then copy data from place to placex in <threads - 1> chunks.
|
||||
place_threads = max(1, threads - 1)
|
||||
@ -306,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
|
||||
|
||||
sql.run_sql_file(conn, 'indices.sql', drop=drop)
|
||||
|
||||
def create_country_names(conn, config):
|
||||
""" Create search index for default country names.
|
||||
def create_country_names(conn, tokenizer, languages=None):
|
||||
""" Add default country names to search index. `languages` is a comma-
|
||||
separated list of language codes as used in OSM. If `languages` is not
|
||||
empty then only name translations for the given languages are added
|
||||
to the index.
|
||||
"""
|
||||
if languages:
|
||||
languages = languages.split(',')
|
||||
|
||||
def _include_key(key):
|
||||
return key == 'name' or \
|
||||
(key.startswith('name:') \
|
||||
and (not languages or key[5:] in languages))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
|
||||
cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
|
||||
cur.execute("""SELECT COUNT(*) FROM
|
||||
(SELECT getorcreate_country(make_standard_name(country_code),
|
||||
country_code) FROM country_name WHERE country_code is not null) AS x""")
|
||||
cur.execute("""SELECT COUNT(*) FROM
|
||||
(SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
|
||||
FROM country_name WHERE name ? 'name') AS x""")
|
||||
sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
|
||||
country_code) FROM (SELECT country_code, skeys(name)
|
||||
AS k, svals(name) AS v FROM country_name) x WHERE k"""
|
||||
psycopg2.extras.register_hstore(cur)
|
||||
cur.execute("""SELECT country_code, name FROM country_name
|
||||
WHERE country_code is not null""")
|
||||
|
||||
languages = config.LANGUAGES
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
for code, name in cur:
|
||||
names = [code]
|
||||
if code == 'gb':
|
||||
names.append('UK')
|
||||
if code == 'us':
|
||||
names.append('United States')
|
||||
|
||||
# country names (only in languages as provided)
|
||||
if name:
|
||||
names.extend((v for k, v in name.items() if _include_key(k)))
|
||||
|
||||
analyzer.add_country_names(code, names)
|
||||
|
||||
if languages:
|
||||
sql_statement = "{} IN (".format(sql_statement)
|
||||
delim = ''
|
||||
for language in languages.split(','):
|
||||
sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
|
||||
delim = ', '
|
||||
sql_statement = '{})'.format(sql_statement)
|
||||
else:
|
||||
sql_statement = "{} LIKE 'name:%'".format(sql_statement)
|
||||
sql_statement = "{}) v".format(sql_statement)
|
||||
cur.execute(sql_statement)
|
||||
conn.commit()
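The `_include_key()` filter above decides which translated names end up in the index: `name` always qualifies, `name:<lang>` only when the language is requested. A toy run with made-up data:

```python
languages = ['de', 'en']   # as produced by languages.split(',')

def _include_key(key):
    return key == 'name' or \
           (key.startswith('name:') and (not languages or key[5:] in languages))

name = {'name': 'Deutschland', 'name:de': 'Deutschland',
        'name:fr': 'Allemagne', 'int_name': 'Germany'}
# keeps 'name' and 'name:de', drops 'name:fr' and 'int_name'
print([v for k, v in name.items() if _include_key(k)])
```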
@ -6,7 +6,8 @@ import logging
|
||||
from nominatim.db import properties
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.version import NOMINATIM_VERSION
|
||||
from nominatim.tools import refresh, database_import
|
||||
from nominatim.tools import refresh
|
||||
from nominatim.tokenizer import factory as tokenizer_factory
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
LOG = logging.getLogger()
|
||||
@ -43,11 +44,14 @@ def migrate(config, paths):
|
||||
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
|
||||
kwargs = dict(conn=conn, config=config, paths=paths)
|
||||
func(**kwargs)
|
||||
conn.commit()
|
||||
has_run_migration = True
|
||||
|
||||
if has_run_migration:
|
||||
LOG.warning('Updating SQL functions.')
|
||||
refresh.create_functions(conn, config)
|
||||
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
|
||||
tokenizer.update_sql_functions(config)
|
||||
|
||||
properties.set_property(conn, 'database_version',
|
||||
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
|
||||
@ -108,17 +112,6 @@ def import_status_timestamp_change(conn, **_):
|
||||
TYPE timestamp with time zone;""")
|
||||
|
||||
|
||||
@_migration(3, 5, 0, 99)
|
||||
def install_database_module_in_project_directory(conn, config, paths, **_):
|
||||
""" Install database module in project directory.
|
||||
|
||||
The database module needs to be present in the project directory
|
||||
since those were introduced.
|
||||
"""
|
||||
database_import.install_module(paths.module_dir, paths.project_dir,
|
||||
config.DATABASE_MODULE_PATH, conn=conn)
|
||||
|
||||
|
||||
@_migration(3, 5, 0, 99)
|
||||
def add_nominatim_property_table(conn, config, **_):
|
||||
""" Add nominatim_property table.
|
||||
@ -137,6 +130,9 @@ def change_housenumber_transliteration(conn, **_):
|
||||
|
||||
The database schema switched from saving raw housenumbers in
|
||||
placex.housenumber to saving transliterated ones.
|
||||
|
||||
Note: the function create_housenumber_id() has been dropped in later
|
||||
versions.
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
||||
@ -173,3 +169,25 @@ def switch_placenode_geometry_index(conn, **_):
|
||||
and class = 'place' and type != 'postcode'
|
||||
and linked_place_id is null""")
|
||||
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
|
||||
|
||||
|
||||
@_migration(3, 7, 0, 1)
|
||||
def install_legacy_tokenizer(conn, config, **_):
|
||||
""" Setup legacy tokenizer.
|
||||
|
||||
If no other tokenizer has been configured yet, then create the
|
||||
configuration for the backwards-compatible legacy tokenizer
|
||||
"""
|
||||
if properties.get_property(conn, 'tokenizer') is None:
|
||||
with conn.cursor() as cur:
|
||||
for table in ('placex', 'location_property_osmline'):
|
||||
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
|
||||
WHERE table_name = %s
|
||||
and column_name = 'token_info'""",
|
||||
(table, ))
|
||||
if has_column == 0:
|
||||
cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
|
||||
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
|
||||
module_name='legacy')
|
||||
|
||||
tokenizer.migrate_database(config)
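The migration above probes `information_schema.columns` before adding `token_info`, so it is safe to run against databases that already have the column. A standalone sketch of that check with a hypothetical helper (`cur.scalar()` in the migration is a Nominatim cursor extension; plain psycopg2 calls are used here):

```python
def has_column(conn, table, column):
    """ Return True if `table` already has `column` (psycopg2 connection). """
    with conn.cursor() as cur:
        cur.execute("""SELECT count(*) FROM information_schema.columns
                       WHERE table_name = %s AND column_name = %s""",
                    (table, column))
        return cur.fetchone()[0] > 0

# e.g. add the column only when it is missing:
#   if not has_column(conn, 'placex', 'token_info'):
#       cur.execute('ALTER TABLE placex ADD COLUMN token_info JSONB')
```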
@ -6,7 +6,7 @@ of artificial postcode centroids.
|
||||
from nominatim.db.utils import execute_file
|
||||
from nominatim.db.connection import connect
|
||||
|
||||
def import_postcodes(dsn, project_dir):
|
||||
def import_postcodes(dsn, project_dir, tokenizer):
|
||||
""" Set up the initial list of postcodes.
|
||||
"""
|
||||
|
||||
@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, country_code,
|
||||
upper(trim (both ' ' from address->'postcode')) as pc,
|
||||
token_normalized_postcode(address->'postcode') as pc,
|
||||
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
|
||||
FROM placex
|
||||
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
|
||||
WHERE address ? 'postcode'
|
||||
and token_normalized_postcode(address->'postcode') is not null
|
||||
AND geometry IS NOT null
|
||||
GROUP BY country_code, pc
|
||||
""")
|
||||
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'us', postcode,
|
||||
SELECT nextval('seq_place'), 1, 'us',
|
||||
token_normalized_postcode(postcode),
|
||||
ST_SetSRID(ST_Point(x,y),4326)
|
||||
FROM us_postcode WHERE postcode NOT IN
|
||||
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'us')
|
||||
""")
|
||||
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
|
||||
FROM gb_postcode WHERE postcode NOT IN
|
||||
SELECT nextval('seq_place'), 1, 'gb',
|
||||
token_normalized_postcode(postcode), geometry
|
||||
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'gb')
|
||||
""")
|
||||
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
|
||||
DELETE FROM word WHERE class='place' and type='postcode'
|
||||
and word NOT IN (SELECT postcode FROM location_postcode)
|
||||
""")
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(getorcreate_postcode_id(v)) FROM
|
||||
(SELECT distinct(postcode) as v FROM location_postcode) p
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.add_postcodes_from_db()
|
||||
|
@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
|
||||
('Default_Language', 'DEFAULT_LANGUAGE', str),
|
||||
('Log_DB', 'LOG_DB', bool),
|
||||
('Log_File', 'LOG_FILE', str),
|
||||
('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
|
||||
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
|
||||
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
|
||||
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
|
||||
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
|
||||
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
|
||||
('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
|
||||
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
|
||||
('MapIcon_URL', 'MAPICON_URL', str),
|
||||
)
|
||||
@ -175,9 +173,11 @@ def setup_website(basedir, config):
|
||||
|
||||
@define('CONST_Debug', $_GET['debug'] ?? false);
|
||||
@define('CONST_LibDir', '{0}');
|
||||
@define('CONST_TokenizerDir', '{2}');
|
||||
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
|
||||
|
||||
""".format(config.lib_dir.php, NOMINATIM_VERSION))
|
||||
""".format(config.lib_dir.php, NOMINATIM_VERSION,
|
||||
config.project_dir / 'tokenizer'))
|
||||
|
||||
for php_name, conf_name, var_type in PHP_CONST_DEFS:
|
||||
if var_type == bool:
|
||||
|
@ -24,9 +24,6 @@ class SpecialPhrasesImporterStatistics():
|
||||
self.tables_deleted = 0
|
||||
self.tables_ignored = 0
|
||||
self.global_phrases_invalid = 0
|
||||
self.global_phrases_added = 0
|
||||
self.global_phrases_ignored = 0
|
||||
self.global_phrases_deleted = 0
|
||||
|
||||
def _set_lang_values_to_0(self):
|
||||
"""
|
||||
@ -34,8 +31,6 @@ class SpecialPhrasesImporterStatistics():
|
||||
lang to 0.
|
||||
"""
|
||||
self.lang_phrases_invalid = 0
|
||||
self.lang_phrases_added = 0
|
||||
self.lang_phrases_ignored = 0
|
||||
|
||||
def notify_one_phrase_invalid(self):
|
||||
"""
|
||||
@ -45,29 +40,6 @@ class SpecialPhrasesImporterStatistics():
|
||||
self.lang_phrases_invalid += 1
|
||||
self.global_phrases_invalid += 1
|
||||
|
||||
def notify_one_phrase_added(self):
|
||||
"""
|
||||
Add +1 to the count of entries
|
||||
added to the db.
|
||||
"""
|
||||
self.lang_phrases_added += 1
|
||||
self.global_phrases_added += 1
|
||||
|
||||
def notify_one_phrase_ignored(self):
|
||||
"""
|
||||
Add +1 to the count of ignored
|
||||
entries as it was already in the db.
|
||||
"""
|
||||
self.lang_phrases_ignored += 1
|
||||
self.global_phrases_ignored += 1
|
||||
|
||||
def notify_one_phrase_deleted(self):
|
||||
"""
|
||||
Add +1 to the count of phrases deleted
|
||||
from the database.
|
||||
"""
|
||||
self.global_phrases_deleted += 1
|
||||
|
||||
def notify_one_table_created(self):
|
||||
"""
|
||||
Add +1 to the count of created tables.
|
||||
@ -97,12 +69,6 @@ class SpecialPhrasesImporterStatistics():
|
||||
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
|
||||
if self.global_phrases_invalid > 0:
|
||||
LOG.info(' Those invalid phrases have been skipped.')
|
||||
LOG.info('- %s phrases were ignored as they are already in the database',
|
||||
self.global_phrases_ignored)
|
||||
LOG.info('- %s phrases were added to the database', self.global_phrases_added)
|
||||
LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
|
||||
if self.global_phrases_deleted > 0:
|
||||
LOG.info(' They were deleted as they are not valid anymore.')
|
||||
LOG.info('- %s tables were ignored as they already exist on the database',
|
||||
self.tables_ignored)
|
||||
LOG.info('- %s tables were created', self.tables_created)
|
||||
@ -126,9 +92,6 @@ class SpecialPhrasesImporterStatistics():
|
||||
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
|
||||
if self.lang_phrases_invalid > 0:
|
||||
LOG.info(' Those invalid phrases have been skipped.')
|
||||
LOG.info('- %s phrases were ignored as they are already in the database',
|
||||
self.lang_phrases_ignored)
|
||||
LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
|
||||
LOG.info('====================================================================')
|
||||
|
||||
if self.lang_phrases_invalid > 0:
|
||||
|
@ -9,7 +9,6 @@ import re
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
from icu import Transliterator
|
||||
from psycopg2.sql import Identifier, Literal, SQL
|
||||
|
||||
from nominatim.tools.exec_utils import get_url
|
||||
@ -33,21 +32,14 @@ class SpecialPhrasesImporter():
|
||||
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
|
||||
)
|
||||
self.sanity_check_pattern = re.compile(r'^\w+$')
|
||||
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
|
||||
self.config.TERM_NORMALIZATION)
|
||||
#This set will contain all existing phrases from the word table which
|
||||
#no longer exist on the wiki.
|
||||
#It contain tuples with the following format: (normalized_word, class, type, operator)
|
||||
self.words_phrases_to_delete = set()
|
||||
#This set will contain the phrases which still exist from the wiki.
|
||||
#It is used to prevent duplicates on the wiki by removing them from
|
||||
#the word_phrases_to_delete only at the end.
|
||||
self.words_phrases_still_exist = set()
|
||||
# This set will contain all existing phrases to be added.
|
||||
# It contains tuples with the following format: (label, class, type, operator)
|
||||
self.word_phrases = set()
|
||||
#This set will contain all existing place_classtype tables which doesn't match any
|
||||
#special phrases class/type on the wiki.
|
||||
self.table_phrases_to_delete = set()
|
||||
|
||||
def import_from_wiki(self, languages=None):
|
||||
def import_from_wiki(self, tokenizer, languages=None):
|
||||
"""
|
||||
Iterate through all specified languages and
|
||||
extract corresponding special phrases from the wiki.
|
||||
@ -55,7 +47,6 @@ class SpecialPhrasesImporter():
|
||||
if languages is not None and not isinstance(languages, list):
|
||||
raise TypeError('The \'languages\' argument should be of type list.')
|
||||
|
||||
self._fetch_existing_words_phrases()
|
||||
self._fetch_existing_place_classtype_tables()
|
||||
|
||||
#Get all languages to process.
|
||||
@ -71,30 +62,15 @@ class SpecialPhrasesImporter():
|
||||
self.statistics_handler.notify_current_lang_done(lang)
|
||||
|
||||
self._create_place_classtype_table_and_indexes(class_type_pairs)
|
||||
self._remove_non_existent_phrases_from_db()
|
||||
self._remove_non_existent_tables_from_db()
|
||||
self.db_connection.commit()
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.update_special_phrases(self.word_phrases)
|
||||
|
||||
LOG.warning('Import done.')
|
||||
self.statistics_handler.notify_import_done()
|
||||
|
||||
def _fetch_existing_words_phrases(self):
|
||||
"""
|
||||
Fetch existing special phrases from the word table.
|
||||
Fill the word_phrases_to_delete set of the class.
|
||||
"""
|
||||
#Only extract special phrases terms:
|
||||
#If class=place and type=house then it is a housenumber term.
|
||||
#If class=place and type=postcode then it is a postcode term.
|
||||
word_query = """
|
||||
SELECT word, class, type, operator FROM word
|
||||
WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
|
||||
"""
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
db_cursor.execute(SQL(word_query))
|
||||
for row in db_cursor:
|
||||
row[3] = '-' if row[3] is None else row[3]
|
||||
self.words_phrases_to_delete.add(
|
||||
(row[0], row[1], row[2], row[3])
|
||||
)
|
||||
|
||||
def _fetch_existing_place_classtype_tables(self):
|
||||
"""
|
||||
@ -176,7 +152,6 @@ class SpecialPhrasesImporter():
|
||||
|
||||
for match in matches:
|
||||
phrase_label = match[0].strip()
|
||||
normalized_label = self.transliterator.transliterate(phrase_label)
|
||||
phrase_class = match[1].strip()
|
||||
phrase_type = match[2].strip()
|
||||
phrase_operator = match[3].strip()
|
||||
@ -198,20 +173,6 @@ class SpecialPhrasesImporter():
|
||||
):
|
||||
continue
|
||||
|
||||
#Check if the phrase already exists in the database.
|
||||
if (
|
||||
(normalized_label, phrase_class, phrase_type, phrase_operator)
|
||||
in self.words_phrases_to_delete
|
||||
):
|
||||
#Remove this phrase from the ones to delete as it still exist on the wiki.
|
||||
self.words_phrases_still_exist.add(
|
||||
(normalized_label, phrase_class, phrase_type, phrase_operator)
|
||||
)
|
||||
class_type_pairs.add((phrase_class, phrase_type))
|
||||
self.statistics_handler.notify_one_phrase_ignored()
|
||||
#Dont need to add this phrase as it already exists in the word table.
|
||||
continue
|
||||
|
||||
#sanity check, in case somebody added garbage in the wiki
|
||||
if not self._check_sanity(lang, phrase_class, phrase_type):
|
||||
self.statistics_handler.notify_one_phrase_invalid()
|
||||
@ -219,35 +180,11 @@ class SpecialPhrasesImporter():
|
||||
|
||||
class_type_pairs.add((phrase_class, phrase_type))
|
||||
|
||||
self._process_amenity(
|
||||
phrase_label, normalized_label, phrase_class,
|
||||
phrase_type, phrase_operator
|
||||
)
|
||||
self.statistics_handler.notify_one_phrase_added()
|
||||
self.word_phrases.add((phrase_label, phrase_class,
|
||||
phrase_type, phrase_operator))
|
||||
|
||||
return class_type_pairs
|
||||
|
||||
def _process_amenity(self, phrase_label, normalized_label,
|
||||
phrase_class, phrase_type, phrase_operator):
|
||||
# pylint: disable-msg=too-many-arguments
|
||||
"""
|
||||
Add phrase lookup and corresponding class and
|
||||
type to the word table based on the operator.
|
||||
"""
|
||||
with self.db_connection.cursor() as db_cursor:
|
||||
if phrase_operator == 'near':
|
||||
db_cursor.execute("""SELECT getorcreate_amenityoperator(
|
||||
make_standard_name(%s), %s, %s, %s, 'near')""",
|
||||
(phrase_label, normalized_label, phrase_class, phrase_type))
|
||||
elif phrase_operator == 'in':
|
||||
db_cursor.execute("""SELECT getorcreate_amenityoperator(
|
||||
make_standard_name(%s), %s, %s, %s, 'in')""",
|
||||
(phrase_label, normalized_label, phrase_class, phrase_type))
|
||||
else:
|
||||
db_cursor.execute("""SELECT getorcreate_amenity(
|
||||
make_standard_name(%s), %s, %s, %s)""",
|
||||
(phrase_label, normalized_label, phrase_class, phrase_type))
|
||||
|
||||
|
||||
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
|
||||
"""
|
||||
@ -339,33 +276,15 @@ class SpecialPhrasesImporter():
|
||||
.format(Identifier(table_name),
|
||||
Identifier(self.config.DATABASE_WEBUSER)))
|
||||
|
||||
def _remove_non_existent_phrases_from_db(self):
|
||||
def _remove_non_existent_tables_from_db(self):
|
||||
"""
|
||||
Remove special phrases which doesn't exist on the wiki anymore.
|
||||
Delete from the word table and delete the place_classtype tables.
|
||||
Delete the place_classtype tables.
|
||||
"""
|
||||
LOG.warning('Cleaning database...')
|
||||
self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
|
||||
#Array containing all queries to execute. Contain tuples of format (query, parameters)
|
||||
queries_parameters = []
|
||||
|
||||
#Delete phrases from the word table which are not on the wiki anymore.
|
||||
for phrase_to_delete in self.words_phrases_to_delete:
|
||||
self.statistics_handler.notify_one_phrase_deleted()
|
||||
if phrase_to_delete[3] == '-':
|
||||
query = """
|
||||
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
|
||||
"""
|
||||
parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
|
||||
queries_parameters.append((query, parameters))
|
||||
else:
|
||||
query = """
|
||||
DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
|
||||
"""
|
||||
parameters = (phrase_to_delete[0], phrase_to_delete[1],
|
||||
phrase_to_delete[2], phrase_to_delete[3], )
|
||||
queries_parameters.append((query, parameters))
|
||||
|
||||
#Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
|
||||
for table in self.table_phrases_to_delete:
|
||||
self.statistics_handler.notify_one_table_deleted()
|
||||
|
@ -10,7 +10,7 @@ Version information for Nominatim.
|
||||
# and must always be increased when there is a change to the database or code
|
||||
# that requires a migration.
|
||||
# Released versions always have a database patch level of 0.
|
||||
NOMINATIM_VERSION = (3, 7, 0, 1)
|
||||
NOMINATIM_VERSION = (3, 7, 0, 2)
|
||||
|
||||
POSTGRESQL_REQUIRED_VERSION = (9, 3)
|
||||
POSTGIS_REQUIRED_VERSION = (2, 2)
|
||||
|
@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
|
||||
# Changing this value requires to run 'nominatim refresh --functions'.
|
||||
NOMINATIM_DATABASE_MODULE_PATH=
|
||||
|
||||
# Tokenizer used for normalizing and parsing queries and names.
|
||||
# The tokenizer is set up during import and cannot be changed afterwards
|
||||
# without a reimport.
|
||||
# Currently available tokenizers: legacy
|
||||
NOMINATIM_TOKENIZER="legacy"
|
||||
|
||||
# Number of occurrences of a word before it is considered frequent.
|
||||
# Similar to the concept of stop words. Frequent partial words get ignored
|
||||
# or handled differently during search.
|
||||
|
@ -24,7 +24,7 @@ Feature: Creation of search terms
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | nameaddress_vector |
|
||||
| N1 | Rose, Street, Walltown |
|
||||
| N1 | #Rose Street, Walltown |
|
||||
When searching for "23 Rose Street, Walltown"
|
||||
Then results contain
|
||||
| osm_type | osm_id | name |
|
||||
@ -248,7 +248,7 @@ Feature: Creation of search terms
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | name_vector | nameaddress_vector |
|
||||
| N1 | #Green Moss | Rose, Street, Walltown |
|
||||
| N1 | #Green Moss | #Rose Street, Walltown |
|
||||
When searching for "Green Moss, Rose Street, Walltown"
|
||||
Then results contain
|
||||
| osm_type | osm_id | name |
|
||||
@ -299,7 +299,7 @@ Feature: Creation of search terms
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | name_vector | nameaddress_vector |
|
||||
| N1 | foo | the road |
|
||||
| N1 | foo | #the road |
|
||||
|
||||
Scenario: Some addr: tags are added to address
|
||||
Given the scene roads-with-pois
|
||||
|
@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
|
||||
from nominatim import cli
|
||||
from nominatim.config import Configuration
|
||||
from nominatim.tools import refresh
|
||||
from nominatim.tokenizer import factory as tokenizer_factory
|
||||
from steps.utils import run_script
|
||||
|
||||
class NominatimEnvironment:
|
||||
@ -106,9 +107,19 @@ class NominatimEnvironment:
|
||||
self.website_dir.cleanup()
|
||||
|
||||
self.website_dir = tempfile.TemporaryDirectory()
|
||||
cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env)
|
||||
cfg.lib_dir.php = self.src_dir / 'lib-php'
|
||||
refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
|
||||
refresh.setup_website(Path(self.website_dir.name) / 'website',
|
||||
self.get_test_config())
|
||||
|
||||
|
||||
def get_test_config(self):
|
||||
cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
|
||||
environ=self.test_env)
|
||||
cfg.set_libdirs(module=self.build_dir / 'module',
|
||||
osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
|
||||
php=self.src_dir / 'lib-php',
|
||||
sql=self.src_dir / 'lib-sql',
|
||||
data=self.src_dir / 'data')
|
||||
return cfg
|
||||
|
||||
def get_libpq_dsn(self):
|
||||
dsn = self.test_env['NOMINATIM_DATABASE_DSN']
|
||||
@ -169,14 +180,10 @@ class NominatimEnvironment:
|
||||
"""
|
||||
self.write_nominatim_config(self.api_test_db)
|
||||
|
||||
if self.api_db_done:
|
||||
return
|
||||
|
||||
if not self.api_db_done:
|
||||
self.api_db_done = True
|
||||
|
||||
if self._reuse_or_drop_db(self.api_test_db):
|
||||
return
|
||||
|
||||
if not self._reuse_or_drop_db(self.api_test_db):
|
||||
testdata = Path('__file__') / '..' / '..' / 'testdb'
|
||||
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
|
||||
|
||||
@ -191,11 +198,23 @@ class NominatimEnvironment:
|
||||
self.db_drop_database(self.api_test_db)
|
||||
raise
|
||||
|
||||
tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
|
||||
|
||||
|
||||
def setup_unknown_db(self):
|
||||
""" Setup a test against a non-existing database.
|
||||
"""
|
||||
self.write_nominatim_config('UNKNOWN_DATABASE_NAME')
|
||||
# The tokenizer needs an existing database to function.
|
||||
# So start with the usual database
|
||||
class _Context:
|
||||
db = None
|
||||
|
||||
context = _Context()
|
||||
self.setup_db(context)
|
||||
tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
|
||||
|
||||
# Then drop the DB again
|
||||
self.teardown_db(context, force_drop=True)
|
||||
|
||||
def setup_db(self, context):
|
||||
""" Setup a test against a fresh, empty test database.
|
||||
@ -212,13 +231,13 @@ class NominatimEnvironment:
|
||||
context.db.autocommit = True
|
||||
psycopg2.extras.register_hstore(context.db, globally=False)
|
||||
|
||||
def teardown_db(self, context):
|
||||
def teardown_db(self, context, force_drop=False):
|
||||
""" Remove the test database, if it exists.
|
||||
"""
|
||||
if 'db' in context:
|
||||
if hasattr(context, 'db'):
|
||||
context.db.close()
|
||||
|
||||
if not self.keep_scenario_db:
|
||||
if force_drop or not self.keep_scenario_db:
|
||||
self.db_drop_database(self.test_db)
|
||||
|
||||
def _reuse_or_drop_db(self, name):
|
||||
|
@ -7,6 +7,7 @@ from place_inserter import PlaceColumn
|
||||
from table_compare import NominatimID, DBRow
|
||||
|
||||
from nominatim.indexer import indexer
|
||||
from nominatim.tokenizer import factory as tokenizer_factory
|
||||
|
||||
def check_database_integrity(context):
|
||||
""" Check some generic constraints on the tables.
|
||||
@ -86,6 +87,9 @@ def add_data_to_planet_ways(context):
|
||||
def import_and_index_data_from_place_table(context):
|
||||
""" Import data previously set up in the place table.
|
||||
"""
|
||||
nctx = context.nominatim
|
||||
|
||||
tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
|
||||
context.nominatim.copy_from_place(context.db)
|
||||
|
||||
# XXX use tool function as soon as it is ported
|
||||
@ -105,7 +109,7 @@ def import_and_index_data_from_place_table(context):
|
||||
|
||||
# Call directly as the refresh function does not include postcodes.
|
||||
indexer.LOG.setLevel(logging.ERROR)
|
||||
indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
|
||||
indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
|
||||
|
||||
check_database_integrity(context)
|
||||
|
||||
@ -230,7 +234,7 @@ def check_search_name_contents(context, exclude):
|
||||
if exclude:
|
||||
assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
|
||||
else:
|
||||
assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1])
|
||||
assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
|
||||
elif name != 'object':
|
||||
assert db_row.contains(name, value), db_row.assert_msg(name, value)
|
||||
|
||||
|
@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testEmptyPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array(), new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
array(array('')),
|
||||
$oPhrase->getWordSets()
|
||||
);
|
||||
$this->assertNull($oPhrase->getWordSets());
|
||||
}
|
||||
|
||||
|
||||
public function testSingleWordPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('a', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a'), new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
'(a)',
|
||||
@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testMultiWordPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('a b', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b),(a|b)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
);
|
||||
|
||||
$oPhrase = new Phrase('a b c', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b c),(a|b c),(a b|c),(a|b|c)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
);
|
||||
|
||||
$oPhrase = new Phrase('a b c d', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testInverseWordSets()
|
||||
{
|
||||
$oPhrase = new Phrase('a b c', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
|
||||
$oPhrase->invertWordSets();
|
||||
|
||||
$this->assertEquals(
|
||||
@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
|
||||
public function testMaxWordSets()
|
||||
{
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$aWords = array_fill(0, 4, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensFullSet());
|
||||
$this->assertEquals(8, count($oPhrase->getWordSets()));
|
||||
$oPhrase->invertWordSets();
|
||||
$this->assertEquals(8, count($oPhrase->getWordSets()));
|
||||
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$aWords = array_fill(0, 18, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensFullSet());
|
||||
$this->assertEquals(100, count($oPhrase->getWordSets()));
|
||||
$oPhrase->invertWordSets();
|
||||
$this->assertEquals(100, count($oPhrase->getWordSets()));
|
||||
@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testPartialTokensShortTerm()
|
||||
{
|
||||
$oPhrase = new Phrase('a b c d', '');
|
||||
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
|
||||
$this->assertEquals(
|
||||
'(a|b c d),(a|b c|d)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
|
||||
public function testPartialTokensLongTerm()
|
||||
{
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
|
||||
$aWords = array_fill(0, 18, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
|
||||
$this->assertEquals(80, count($oPhrase->getWordSets()));
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
@define('CONST_TokenizerDir', dirname(__FILE__));
|
||||
|
||||
require_once(CONST_LibDir.'/DB.php');
|
||||
require_once(CONST_LibDir.'/Status.php');
|
||||
|
||||
@ -40,45 +42,6 @@ class StatusTest extends \PHPUnit\Framework\TestCase
|
||||
$this->assertEquals('No database', $oStatus->status());
|
||||
}
|
||||
|
||||
|
||||
public function testModuleFail()
|
||||
{
|
||||
$this->expectException(\Exception::class);
|
||||
$this->expectExceptionMessage('Module call failed');
|
||||
$this->expectExceptionCode(702);
|
||||
|
||||
// stub has getOne method but doesn't return anything
|
||||
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
|
||||
->setMethods(array('connect', 'getOne'))
|
||||
->getMock();
|
||||
|
||||
$oStatus = new Status($oDbStub);
|
||||
$this->assertNull($oStatus->status());
|
||||
}
|
||||
|
||||
|
||||
public function testWordIdQueryFail()
|
||||
{
|
||||
$this->expectException(\Exception::class);
|
||||
$this->expectExceptionMessage('No value');
|
||||
$this->expectExceptionCode(704);
|
||||
|
||||
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
|
||||
->setMethods(array('connect', 'getOne'))
|
||||
->getMock();
|
||||
|
||||
// return no word_id
|
||||
$oDbStub->method('getOne')
|
||||
->will($this->returnCallback(function ($sql) {
|
||||
if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
|
||||
if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
|
||||
}));
|
||||
|
||||
$oStatus = new Status($oDbStub);
|
||||
$this->assertNull($oStatus->status());
|
||||
}
|
||||
|
||||
|
||||
public function testOK()
|
||||
{
|
||||
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
|
||||
|
@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
|
||||
$this->assertFalse($TL->contains('unknownword'));
|
||||
$this->assertEquals(array(), $TL->get('unknownword'));
|
||||
}
|
||||
|
||||
public function testAddress()
|
||||
{
|
||||
$this->expectOutputRegex('/<p><tt>/');
|
||||
|
||||
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
|
||||
->setMethods(array('getAll', 'getDBQuotedList'))
|
||||
->getMock();
|
||||
|
||||
$oDbStub->method('getDBQuotedList')
|
||||
->will($this->returnCallback(function ($aVals) {
|
||||
return array_map(function ($sVal) {
|
||||
return "'".$sVal."'";
|
||||
}, $aVals);
|
||||
}));
|
||||
|
||||
|
||||
$oDbStub->method('getAll')
|
||||
->will($this->returnCallback(function ($sql) {
|
||||
$aResults = array();
|
||||
if (preg_match('/1051/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => '1051',
|
||||
'class' => 'place',
|
||||
'type' => 'house'
|
||||
));
|
||||
}
|
||||
if (preg_match('/hauptstr/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'hauptstr',
|
||||
'class' => 'place',
|
||||
'type' => 'street',
|
||||
'operator' => true
|
||||
));
|
||||
}
|
||||
if (preg_match('/64286/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => '64286',
|
||||
'word' => '64286',
|
||||
'class' => 'place',
|
||||
'type' => 'postcode'
|
||||
));
|
||||
}
|
||||
if (preg_match('/darmstadt/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'darmstadt',
|
||||
'count' => 533
|
||||
));
|
||||
}
|
||||
if (preg_match('/alemagne/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'alemagne',
|
||||
'country_code' => 'de',
|
||||
));
|
||||
}
|
||||
if (preg_match('/mexico/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'mexico',
|
||||
'country_code' => 'mx',
|
||||
));
|
||||
}
|
||||
return $aResults;
|
||||
}));
|
||||
|
||||
$aCountryCodes = array('de', 'fr');
|
||||
$sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
|
||||
$aTokens = explode(' ', $sNormQuery);
|
||||
|
||||
$TL = new TokenList;
|
||||
$TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
|
||||
$this->assertEquals(5, $TL->count());
|
||||
|
||||
$this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
|
||||
$this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
|
||||
$this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
|
||||
$this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
|
||||
$this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
|
||||
}
|
||||
}
|
||||
|
17
test/php/Nominatim/tokenizer.php
Normal file
@ -0,0 +1,17 @@
<?php

namespace Nominatim;

class Tokenizer
{
    private $oDB;

    public function __construct(&$oDB)
    {
        $this->oDB =& $oDB;
    }

    public function checkStatus()
    {
    }
}
@ -1,3 +1,4 @@
|
||||
import importlib
|
||||
import itertools
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
|
||||
from nominatim.config import Configuration
|
||||
from nominatim.db import connection
|
||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||
from nominatim.db import properties
|
||||
|
||||
import dummy_tokenizer
|
||||
|
||||
class _TestingCursor(psycopg2.extras.DictCursor):
|
||||
""" Extension to the DictCursor class that provides execution
|
||||
@ -117,9 +121,8 @@ def table_factory(temp_db_cursor):
|
||||
def mk_table(name, definition='id INT', content=None):
|
||||
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
|
||||
if content is not None:
|
||||
if not isinstance(content, str):
|
||||
content = '),('.join([str(x) for x in content])
|
||||
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
|
||||
psycopg2.extras.execute_values(
|
||||
temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
|
||||
|
||||
return mk_table
|
||||
|
||||
@ -144,6 +147,11 @@ def tmp_phplib_dir():
|
||||
|
||||
yield Path(phpdir)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def property_table(table_factory):
|
||||
table_factory('nominatim_properties', 'property TEXT, value TEXT')
|
||||
|
||||
@pytest.fixture
|
||||
def status_table(temp_db_conn):
|
||||
""" Create an empty version of the status table and
|
||||
@ -281,10 +289,29 @@ def osm2pgsql_options(temp_db):
|
||||
|
||||
@pytest.fixture
|
||||
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
||||
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
|
||||
table_factory('country_name', 'partition INT', (0, 1, 2))
|
||||
table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
|
||||
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
|
||||
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
|
||||
sql=tmp_path, data=SRC_DIR / 'data')
|
||||
|
||||
return SQLPreprocessor(temp_db_conn, cfg)
|
||||
|
||||
|
||||
@pytest.fixture
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
    """ Sets up the configuration so that the test dummy tokenizer will be
        loaded when the tokenizer factory is used. Also returns a factory
        with which a new dummy tokenizer may be created.
    """
    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')

    def _import_dummy(module, *args, **kwargs):
        return dummy_tokenizer

    monkeypatch.setattr(importlib, "import_module", _import_dummy)
    properties.set_property(temp_db_conn, 'tokenizer', 'dummy')

    def _create_tokenizer():
        return dummy_tokenizer.DummyTokenizer(None, None)

    return _create_tokenizer
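
A minimal sketch of how a test can consume this fixture (the test name and the
phrase data are illustrative, not part of the commit):

    def test_dummy_tokenizer_records_phrases(tokenizer_mock):
        # The fixture returns a factory; calling it yields a fresh dummy tokenizer.
        tok = tokenizer_mock()
        with tok.name_analyzer() as analyzer:
            analyzer.update_special_phrases([('bar', 'highway', 'road', '-')])
        # The dummy analyzer records what it was asked to do instead of touching a database.
        assert tok.analyser_cache['special_phrases'] == [('bar', 'highway', 'road', '-')]
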
64
test/python/dummy_tokenizer.py
Normal file
@ -0,0 +1,64 @@
"""
Tokenizer for testing.
"""

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return DummyTokenizer(dsn, data_dir)

class DummyTokenizer:

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.init_state = None
        self.analyser_cache = {}


    def init_new_db(self, *args, **kwargs):
        assert self.init_state == None
        self.init_state = "new"


    def init_from_project(self):
        assert self.init_state == None
        self.init_state = "loaded"


    def finalize_import(self, _):
        pass


    def name_analyzer(self):
        return DummyNameAnalyzer(self.analyser_cache)


class DummyNameAnalyzer:

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def __init__(self, cache):
        self.analyser_cache = cache
        cache['countries'] = []


    def close(self):
        pass

    def add_postcodes_from_db(self):
        pass

    def update_special_phrases(self, phrases):
        self.analyser_cache['special_phrases'] = phrases

    def add_country_names(self, code, names):
        self.analyser_cache['countries'].append((code, names))

    def process_place(self, place):
        return {}
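
The dummy mirrors the tokenizer interface the tools under test expect, but it
records calls instead of touching the database. A short usage sketch (the DSN
and country names are illustrative, not part of the commit):

    tok = DummyTokenizer('dbname=test', None)
    with tok.name_analyzer() as analyzer:
        analyzer.add_country_names('de', ['Deutschland', 'Germany'])
    # Everything the analyzer was asked to index ends up in the shared cache,
    # so tests can assert on it directly.
    assert tok.analyser_cache['countries'] == [('de', ['Deutschland', 'Germany'])]
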
@ -22,6 +22,7 @@ import nominatim.tools.database_import
|
||||
import nominatim.tools.freeze
|
||||
import nominatim.tools.refresh
|
||||
import nominatim.tools.postcodes
|
||||
import nominatim.tokenizer.factory
|
||||
|
||||
from mocks import MockParamCapture
|
||||
|
||||
@ -56,6 +57,28 @@ def mock_func_factory(monkeypatch):
|
||||
return get_mock
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer_mock(monkeypatch):
|
||||
class DummyTokenizer:
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.update_sql_functions_called = False
|
||||
self.finalize_import_called = False
|
||||
|
||||
def update_sql_functions(self, *args):
|
||||
self.update_sql_functions_called = True
|
||||
|
||||
def finalize_import(self, *args):
|
||||
self.finalize_import_called = True
|
||||
|
||||
tok = DummyTokenizer()
|
||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
|
||||
lambda *args: tok)
|
||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' ,
|
||||
lambda *args: tok)
|
||||
|
||||
return tok
|
||||
|
||||
|
||||
def test_cli_help(capsys):
|
||||
""" Running nominatim tool without arguments prints help.
|
||||
"""
|
||||
@ -84,10 +107,9 @@ def test_import_bad_file(temp_db):
|
||||
assert 1 == call_nominatim('import', '--osm-file', '.')
|
||||
|
||||
|
||||
def test_import_full(temp_db, mock_func_factory):
|
||||
def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
|
||||
mocks = [
|
||||
mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'install_module'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
|
||||
mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
|
||||
@ -107,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
|
||||
cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
|
||||
|
||||
assert 0 == call_nominatim('import', '--osm-file', __file__)
|
||||
assert tokenizer_mock.finalize_import_called
|
||||
|
||||
assert cf_mock.called > 1
|
||||
|
||||
@ -114,7 +137,7 @@ def test_import_full(temp_db, mock_func_factory):
|
||||
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
|
||||
|
||||
|
||||
def test_import_continue_load_data(temp_db, mock_func_factory):
|
||||
def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
|
||||
mocks = [
|
||||
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'load_data'),
|
||||
@ -127,12 +150,14 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
|
||||
]
|
||||
|
||||
assert 0 == call_nominatim('import', '--continue', 'load-data')
|
||||
assert tokenizer_mock.finalize_import_called
|
||||
|
||||
for mock in mocks:
|
||||
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
|
||||
|
||||
|
||||
def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
|
||||
def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
|
||||
temp_db_conn, tokenizer_mock):
|
||||
mocks = [
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
@ -153,7 +178,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
|
||||
assert temp_db_conn.index_exists('idx_placex_pendingsector')
|
||||
|
||||
|
||||
def test_import_continue_postprocess(temp_db, mock_func_factory):
|
||||
def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
|
||||
mocks = [
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
|
||||
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
|
||||
@ -163,6 +188,8 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
|
||||
|
||||
assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
|
||||
|
||||
assert tokenizer_mock.finalize_import_called
|
||||
|
||||
for mock in mocks:
|
||||
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
|
||||
|
||||
@ -217,7 +244,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
|
||||
(['--boundaries-only'], 1, 0),
|
||||
(['--no-boundaries'], 0, 1),
|
||||
(['--boundaries-only', '--no-boundaries'], 0, 0)])
|
||||
def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks):
|
||||
def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
|
||||
params, do_bnds, do_ranks):
|
||||
temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
|
||||
bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
|
||||
rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
|
||||
@ -227,7 +255,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
|
||||
assert bnd_mock.called == do_bnds
|
||||
assert rank_mock.called == do_ranks
|
||||
|
||||
def test_special_phrases_command(temp_db, mock_func_factory):
|
||||
def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
|
||||
func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
|
||||
|
||||
call_nominatim('special-phrases', '--import-from-wiki')
|
||||
@ -238,7 +266,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
|
||||
('postcodes', 'update_postcodes'),
|
||||
('word-counts', 'recompute_word_counts'),
|
||||
('address-levels', 'load_address_levels_from_file'),
|
||||
('functions', 'create_functions'),
|
||||
('wiki-data', 'import_wikipedia_articles'),
|
||||
('importance', 'recompute_importance'),
|
||||
('website', 'setup_website'),
|
||||
@ -250,6 +277,14 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
|
||||
assert func_mock.called == 1
|
||||
|
||||
|
||||
def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
|
||||
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
|
||||
|
||||
assert 0 == call_nominatim('refresh', '--functions')
|
||||
assert func_mock.called == 1
|
||||
assert tokenizer_mock.update_sql_functions_called
|
||||
|
||||
|
||||
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
|
||||
calls = []
|
||||
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
|
||||
|
@ -27,7 +27,29 @@ def call_nominatim(*args):
|
||||
cli_args=['replication'] + list(args))
|
||||
|
||||
@pytest.fixture
|
||||
def index_mock(monkeypatch):
|
||||
def tokenizer_mock(monkeypatch):
|
||||
class DummyTokenizer:
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.update_sql_functions_called = False
|
||||
self.finalize_import_called = False
|
||||
|
||||
def update_sql_functions(self, *args):
|
||||
self.update_sql_functions_called = True
|
||||
|
||||
def finalize_import(self, *args):
|
||||
self.finalize_import_called = True
|
||||
|
||||
tok = DummyTokenizer()
|
||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
|
||||
lambda *args: tok)
|
||||
monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' ,
|
||||
lambda *args: tok)
|
||||
|
||||
return tok
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def index_mock(monkeypatch, tokenizer_mock):
|
||||
mock = MockParamCapture()
|
||||
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
|
||||
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
|
||||
@ -52,7 +74,7 @@ def init_status(temp_db_conn, status_table):
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def update_mock(mock_func_factory, init_status):
|
||||
def update_mock(mock_func_factory, init_status, tokenizer_mock):
|
||||
return mock_func_factory(nominatim.tools.replication, 'update')
|
||||
|
||||
@pytest.mark.parametrize("params,func", [
|
||||
|
@ -24,7 +24,6 @@ def sql_factory(tmp_path):
|
||||
("'{{db.partitions|join}}'", '012'),
|
||||
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
|
||||
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
|
||||
("'{{config.DATABASE_MODULE_PATH}}'", '.')
|
||||
])
|
||||
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
|
||||
sqlfile = sql_factory("RETURN {};".format(expr))
|
||||
|
@ -6,6 +6,7 @@ import psycopg2
|
||||
import pytest
|
||||
|
||||
from nominatim.indexer import indexer
|
||||
from nominatim.tokenizer import factory
|
||||
|
||||
class IndexerTestDB:
|
||||
|
||||
@ -17,6 +18,7 @@ class IndexerTestDB:
|
||||
self.conn = conn
|
||||
self.conn.set_isolation_level(0)
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute('CREATE EXTENSION hstore')
|
||||
cur.execute("""CREATE TABLE placex (place_id BIGINT,
|
||||
class TEXT,
|
||||
type TEXT,
|
||||
@ -26,9 +28,14 @@ class IndexerTestDB:
|
||||
indexed_date TIMESTAMP,
|
||||
partition SMALLINT,
|
||||
admin_level SMALLINT,
|
||||
address HSTORE,
|
||||
token_info JSONB,
|
||||
geometry_sector INTEGER)""")
|
||||
cur.execute("""CREATE TABLE location_property_osmline (
|
||||
place_id BIGINT,
|
||||
osm_id BIGINT,
|
||||
address HSTORE,
|
||||
token_info JSONB,
|
||||
indexed_status SMALLINT,
|
||||
indexed_date TIMESTAMP,
|
||||
geometry_sector INTEGER)""")
|
||||
@ -46,6 +53,25 @@ class IndexerTestDB:
|
||||
END IF;
|
||||
RETURN NEW;
|
||||
END; $$ LANGUAGE plpgsql;""")
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
|
||||
OUT name HSTORE,
|
||||
OUT address HSTORE,
|
||||
OUT country_feature VARCHAR)
|
||||
AS $$
|
||||
BEGIN
|
||||
address := p.address;
|
||||
name := p.address;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
""")
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
|
||||
RETURNS HSTORE AS $$
|
||||
BEGIN
|
||||
RETURN in_address;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
""")
|
||||
|
||||
for table in ('placex', 'location_property_osmline', 'location_postcode'):
|
||||
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
|
||||
FOR EACH ROW EXECUTE PROCEDURE date_update()
|
||||
@ -76,9 +102,9 @@ class IndexerTestDB:
|
||||
next_id = next(self.osmline_id)
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""INSERT INTO location_property_osmline
|
||||
(place_id, indexed_status, geometry_sector)
|
||||
VALUES (%s, 1, %s)""",
|
||||
(next_id, sector))
|
||||
(place_id, osm_id, indexed_status, geometry_sector)
|
||||
VALUES (%s, %s, 1, %s)""",
|
||||
(next_id, next_id, sector))
|
||||
return next_id
|
||||
|
||||
def add_postcode(self, country, postcode):
|
||||
@ -102,8 +128,14 @@ def test_db(temp_db_conn):
|
||||
yield IndexerTestDB(temp_db_conn)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_tokenizer(tokenizer_mock, def_config, tmp_path):
|
||||
def_config.project_dir = tmp_path
|
||||
return factory.create_tokenizer(def_config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_all_by_rank(test_db, threads):
|
||||
def test_index_all_by_rank(test_db, threads, test_tokenizer):
|
||||
for rank in range(31):
|
||||
test_db.add_place(rank_address=rank, rank_search=rank)
|
||||
test_db.add_osmline()
|
||||
@ -111,7 +143,7 @@ def test_index_all_by_rank(test_db, threads):
|
||||
assert 31 == test_db.placex_unindexed()
|
||||
assert 1 == test_db.osmline_unindexed()
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
|
||||
idx.index_by_rank(0, 30)
|
||||
|
||||
assert 0 == test_db.placex_unindexed()
|
||||
@ -142,7 +174,7 @@ def test_index_all_by_rank(test_db, threads):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_partial_without_30(test_db, threads):
|
||||
def test_index_partial_without_30(test_db, threads, test_tokenizer):
|
||||
for rank in range(31):
|
||||
test_db.add_place(rank_address=rank, rank_search=rank)
|
||||
test_db.add_osmline()
|
||||
@ -150,7 +182,8 @@ def test_index_partial_without_30(test_db, threads):
|
||||
assert 31 == test_db.placex_unindexed()
|
||||
assert 1 == test_db.osmline_unindexed()
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
|
||||
test_tokenizer, threads)
|
||||
idx.index_by_rank(4, 15)
|
||||
|
||||
assert 19 == test_db.placex_unindexed()
|
||||
@ -162,7 +195,7 @@ def test_index_partial_without_30(test_db, threads):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_partial_with_30(test_db, threads):
|
||||
def test_index_partial_with_30(test_db, threads, test_tokenizer):
|
||||
for rank in range(31):
|
||||
test_db.add_place(rank_address=rank, rank_search=rank)
|
||||
test_db.add_osmline()
|
||||
@ -170,7 +203,7 @@ def test_index_partial_with_30(test_db, threads):
|
||||
assert 31 == test_db.placex_unindexed()
|
||||
assert 1 == test_db.osmline_unindexed()
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
|
||||
idx.index_by_rank(28, 30)
|
||||
|
||||
assert 27 == test_db.placex_unindexed()
|
||||
@ -181,7 +214,7 @@ def test_index_partial_with_30(test_db, threads):
|
||||
WHERE indexed_status = 0 AND rank_address between 1 and 27""")
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_boundaries(test_db, threads):
|
||||
def test_index_boundaries(test_db, threads, test_tokenizer):
|
||||
for rank in range(4, 10):
|
||||
test_db.add_admin(rank_address=rank, rank_search=rank)
|
||||
for rank in range(31):
|
||||
@ -191,7 +224,7 @@ def test_index_boundaries(test_db, threads):
|
||||
assert 37 == test_db.placex_unindexed()
|
||||
assert 1 == test_db.osmline_unindexed()
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
|
||||
idx.index_boundaries(0, 30)
|
||||
|
||||
assert 31 == test_db.placex_unindexed()
|
||||
@ -203,13 +236,13 @@ def test_index_boundaries(test_db, threads):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_postcodes(test_db, threads):
|
||||
def test_index_postcodes(test_db, threads, test_tokenizer):
|
||||
for postcode in range(1000):
|
||||
test_db.add_postcode('de', postcode)
|
||||
for postcode in range(32000, 33000):
|
||||
test_db.add_postcode('us', postcode)
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
|
||||
idx.index_postcodes()
|
||||
|
||||
assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
|
||||
@ -217,7 +250,7 @@ def test_index_postcodes(test_db, threads):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("analyse", [True, False])
|
||||
def test_index_full(test_db, analyse):
|
||||
def test_index_full(test_db, analyse, test_tokenizer):
|
||||
for rank in range(4, 10):
|
||||
test_db.add_admin(rank_address=rank, rank_search=rank)
|
||||
for rank in range(31):
|
||||
@ -226,7 +259,7 @@ def test_index_full(test_db, analyse):
|
||||
for postcode in range(1000):
|
||||
test_db.add_postcode('de', postcode)
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
|
||||
idx.index_full(analyse=analyse)
|
||||
|
||||
assert 0 == test_db.placex_unindexed()
|
||||
@ -236,13 +269,13 @@ def test_index_full(test_db, analyse):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("threads", [1, 15])
|
||||
def test_index_reopen_connection(test_db, threads, monkeypatch):
|
||||
def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
|
||||
monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
|
||||
|
||||
for _ in range(1000):
|
||||
test_db.add_place(rank_address=30, rank_search=30)
|
||||
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
|
||||
idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
|
||||
idx.index_by_rank(28, 30)
|
||||
|
||||
assert 0 == test_db.placex_unindexed()
|
||||
|
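
The change repeated throughout these indexer tests is the new Indexer
signature: the tokenizer is now passed between the DSN and the thread count. A
condensed sketch of the pattern (DSN and thread count are illustrative):

    tokenizer = factory.create_tokenizer(config)    # or a test dummy as above
    idx = indexer.Indexer('dbname=nominatim', tokenizer, 2)
    idx.index_full(analyse=False)
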
77
test/python/test_tokenizer_factory.py
Normal file
@ -0,0 +1,77 @@
|
||||
"""
|
||||
Tests for creating new tokenizers.
|
||||
"""
|
||||
import importlib
|
||||
import pytest
|
||||
|
||||
from nominatim.db import properties
|
||||
from nominatim.tokenizer import factory
|
||||
from nominatim.errors import UsageError
|
||||
from dummy_tokenizer import DummyTokenizer
|
||||
|
||||
@pytest.fixture
|
||||
def test_config(def_config, tmp_path):
|
||||
def_config.project_dir = tmp_path
|
||||
return def_config
|
||||
|
||||
|
||||
def test_setup_dummy_tokenizer(temp_db_conn, test_config,
|
||||
tokenizer_mock, property_table):
|
||||
tokenizer = factory.create_tokenizer(test_config)
|
||||
|
||||
assert isinstance(tokenizer, DummyTokenizer)
|
||||
assert tokenizer.init_state == "new"
|
||||
assert (test_config.project_dir / 'tokenizer').is_dir()
|
||||
|
||||
assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
|
||||
|
||||
|
||||
def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
|
||||
(test_config.project_dir / 'tokenizer').mkdir()
|
||||
|
||||
tokenizer = factory.create_tokenizer(test_config)
|
||||
|
||||
assert isinstance(tokenizer, DummyTokenizer)
|
||||
assert tokenizer.init_state == "new"
|
||||
|
||||
|
||||
def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
|
||||
(test_config.project_dir / 'tokenizer').write_text("foo")
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
factory.create_tokenizer(test_config)
|
||||
|
||||
|
||||
def test_setup_bad_tokenizer_name(test_config, monkeypatch):
|
||||
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
factory.create_tokenizer(test_config)
|
||||
|
||||
def test_load_tokenizer(temp_db_conn, test_config,
|
||||
tokenizer_mock, property_table):
|
||||
factory.create_tokenizer(test_config)
|
||||
|
||||
tokenizer = factory.get_tokenizer_for_db(test_config)
|
||||
|
||||
assert isinstance(tokenizer, DummyTokenizer)
|
||||
assert tokenizer.init_state == "loaded"
|
||||
|
||||
|
||||
def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
|
||||
factory.create_tokenizer(test_config)
|
||||
|
||||
test_config.project_dir = test_config.project_dir / 'foo'
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
factory.get_tokenizer_for_db(test_config)
|
||||
|
||||
|
||||
def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_mock, property_table):
|
||||
factory.create_tokenizer(test_config)
|
||||
|
||||
temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
factory.get_tokenizer_for_db(test_config)
|
||||
|
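
Taken together, these tests pin down the factory contract: create_tokenizer()
prepares a 'tokenizer' directory in the project directory and records the
chosen tokenizer in the nominatim_properties table, while get_tokenizer_for_db()
reads that property back and raises a UsageError when the database was never
set up. A rough sketch of the expected call flow (configuration handling
omitted):

    tokenizer = factory.create_tokenizer(config)        # during import
    ...
    tokenizer = factory.get_tokenizer_for_db(config)    # on every later run
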
298
test/python/test_tokenizer_legacy.py
Normal file
@ -0,0 +1,298 @@
|
||||
"""
|
||||
Test for legacy tokenizer.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
import pytest
|
||||
|
||||
from nominatim.tokenizer import legacy_tokenizer
|
||||
from nominatim.db import properties
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
@pytest.fixture
|
||||
def test_config(def_config, tmp_path):
|
||||
def_config.project_dir = tmp_path / 'project'
|
||||
def_config.project_dir.mkdir()
|
||||
|
||||
module_dir = tmp_path / 'module_src'
|
||||
module_dir.mkdir()
|
||||
(module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
|
||||
|
||||
def_config.lib_dir.module = module_dir
|
||||
|
||||
sqldir = tmp_path / 'sql'
|
||||
sqldir.mkdir()
|
||||
(sqldir / 'tokenizer').mkdir()
|
||||
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
|
||||
(sqldir / 'words.sql').write_text("SELECT 'a'")
|
||||
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
|
||||
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
|
||||
|
||||
def_config.lib_dir.sql = sqldir
|
||||
def_config.lib_dir.data = sqldir
|
||||
|
||||
return def_config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
|
||||
(tmp_path / 'tokenizer').mkdir()
|
||||
|
||||
def _maker():
|
||||
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
|
||||
|
||||
return _maker
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
|
||||
word_table, temp_db_with_extensions, tmp_path):
|
||||
sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
|
||||
sql.write_text("""
|
||||
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
||||
RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
|
||||
""")
|
||||
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
monkeypatch.undo()
|
||||
|
||||
with tok.name_analyzer() as analyzer:
|
||||
yield analyzer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def make_standard_name(temp_db_cursor):
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
|
||||
RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def create_postcode_id(table_factory, temp_db_cursor):
|
||||
table_factory('out_postcode_table', 'postcode TEXT')
|
||||
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
|
||||
RETURNS BOOLEAN AS $$
|
||||
INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
|
||||
$$ LANGUAGE SQL""")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def create_housenumbers(temp_db_cursor):
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
|
||||
housenumbers TEXT[],
|
||||
OUT tokens TEXT, OUT normtext TEXT)
|
||||
AS $$
|
||||
SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
|
||||
$$ LANGUAGE SQL""")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def make_keywords(temp_db_cursor, temp_db_with_extensions):
|
||||
temp_db_cursor.execute(
|
||||
"""CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
|
||||
RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
|
||||
|
||||
def test_init_new(tokenizer_factory, test_config, monkeypatch,
|
||||
temp_db_conn, sql_preprocessor):
|
||||
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
|
||||
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
|
||||
|
||||
outfile = test_config.project_dir / 'module' / 'nominatim.so'
|
||||
|
||||
assert outfile.exists()
|
||||
assert outfile.read_text() == 'TEST nomiantim.so'
|
||||
assert outfile.stat().st_mode == 33261
|
||||
|
||||
|
||||
def test_init_module_load_failed(tokenizer_factory, test_config,
|
||||
monkeypatch, temp_db_conn):
|
||||
tok = tokenizer_factory()
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
tok.init_new_db(test_config)
|
||||
|
||||
|
||||
def test_init_module_custom(tokenizer_factory, test_config,
|
||||
monkeypatch, tmp_path, sql_preprocessor):
|
||||
module_dir = (tmp_path / 'custom').resolve()
|
||||
module_dir.mkdir()
|
||||
(module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
|
||||
|
||||
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
|
||||
assert not (test_config.project_dir / 'module').exists()
|
||||
|
||||
|
||||
def test_init_from_project(tokenizer_setup, tokenizer_factory):
|
||||
tok = tokenizer_factory()
|
||||
|
||||
tok.init_from_project()
|
||||
|
||||
assert tok.normalization is not None
|
||||
|
||||
|
||||
def test_update_sql_functions(sql_preprocessor, temp_db_conn,
|
||||
tokenizer_factory, test_config, table_factory,
|
||||
monkeypatch, temp_db_cursor):
|
||||
monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
tok = tokenizer_factory()
|
||||
tok.init_new_db(test_config)
|
||||
monkeypatch.undo()
|
||||
|
||||
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
|
||||
|
||||
table_factory('test', 'txt TEXT')
|
||||
|
||||
func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
|
||||
func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
|
||||
('{{modulepath}}')""")
|
||||
|
||||
tok.update_sql_functions(test_config)
|
||||
|
||||
test_content = temp_db_cursor.row_set('SELECT * FROM test')
|
||||
assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
|
||||
|
||||
|
||||
def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
|
||||
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
|
||||
tok = tokenizer_factory()
|
||||
tok.migrate_database(test_config)
|
||||
|
||||
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
|
||||
assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
|
||||
|
||||
outfile = test_config.project_dir / 'module' / 'nominatim.so'
|
||||
|
||||
assert outfile.exists()
|
||||
assert outfile.read_text() == 'TEST nomiantim.so'
|
||||
assert outfile.stat().st_mode == 33261
|
||||
|
||||
|
||||
def test_normalize(analyzer):
|
||||
assert analyzer.normalize('TEsT') == 'test'
|
||||
|
||||
|
||||
def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
|
||||
create_postcode_id):
|
||||
table_factory('location_postcode', 'postcode TEXT',
|
||||
content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
|
||||
|
||||
analyzer.add_postcodes_from_db()
|
||||
|
||||
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
|
||||
== set((('1234', ), ('12 34', ), ('AB23',)))
|
||||
|
||||
|
||||
def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
|
||||
make_standard_name):
|
||||
analyzer.update_special_phrases([
|
||||
("König bei", "amenity", "royal", "near"),
|
||||
("Könige", "amenity", "royal", "-"),
|
||||
("strasse", "highway", "primary", "in")
|
||||
])
|
||||
|
||||
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
|
||||
FROM word WHERE class != 'place'""") \
|
||||
== set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
|
||||
(' könige', 'könige', 'amenity', 'royal', None),
|
||||
(' strasse', 'strasse', 'highway', 'primary', 'in')))
|
||||
|
||||
|
||||
def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
|
||||
make_standard_name):
|
||||
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
|
||||
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
|
||||
(' bar', 'bar', 'highway', 'road', null)""")
|
||||
|
||||
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
|
||||
|
||||
analyzer.update_special_phrases([])
|
||||
|
||||
assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
|
||||
|
||||
|
||||
def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
|
||||
make_standard_name):
|
||||
temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
|
||||
VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
|
||||
(' bar', 'bar', 'highway', 'road', null)""")
|
||||
|
||||
assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
|
||||
|
||||
analyzer.update_special_phrases([
|
||||
('prison', 'amenity', 'prison', 'in'),
|
||||
('bar', 'highway', 'road', '-'),
|
||||
('garden', 'leisure', 'garden', 'near')
|
||||
])
|
||||
|
||||
assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
|
||||
FROM word WHERE class != 'place'""") \
|
||||
== set(((' prison', 'prison', 'amenity', 'prison', 'in'),
|
||||
(' bar', 'bar', 'highway', 'road', None),
|
||||
(' garden', 'garden', 'leisure', 'garden', 'near')))
|
||||
|
||||
|
||||
def test_process_place_names(analyzer, make_keywords):
|
||||
|
||||
info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
|
||||
|
||||
assert info['names'] == '{1,2,3}'
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
|
||||
def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
|
||||
|
||||
info = analyzer.process_place({'address': {'postcode' : pc}})
|
||||
|
||||
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
|
||||
== set(((pc, ),))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
|
||||
def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
|
||||
pc):
|
||||
|
||||
info = analyzer.process_place({'address': {'postcode' : pc}})
|
||||
|
||||
assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
|
||||
|
||||
|
||||
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
||||
def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
|
||||
info = analyzer.process_place({'address': {'housenumber' : hnr}})
|
||||
|
||||
assert info['hnr'] == hnr
|
||||
assert info['hnr_tokens'].startswith("{")
|
||||
|
||||
|
||||
def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
|
||||
info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
|
||||
|
||||
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
|
||||
|
||||
|
||||
def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
|
||||
info = analyzer.process_place({'address': {'housenumber' : '134',
|
||||
'conscriptionnumber' : '134',
|
||||
'streetnumber' : '99a'}})
|
||||
|
||||
assert set(info['hnr'].split(';')) == set(('134', '99a'))
|
@ -43,8 +43,22 @@ def test_check_placex_table_size_bad(temp_db_cursor, temp_db_conn, def_config):
|
||||
assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
|
||||
|
||||
|
||||
def test_check_module_bad(temp_db_conn, def_config):
|
||||
assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL
|
||||
def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
|
||||
def_config.project_dir = tmp_path
|
||||
assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
|
||||
("Something wrong", chkdb.CheckState.FAIL)])
|
||||
def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
|
||||
check_result, state):
|
||||
class _TestTokenizer:
|
||||
def check_database(self):
|
||||
return check_result
|
||||
|
||||
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
|
||||
lambda *a, **k: _TestTokenizer())
|
||||
assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
|
||||
|
||||
|
||||
def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):
|
||||
|
@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
|
||||
database_import.setup_extensions(temp_db_conn)
|
||||
|
||||
|
||||
def test_install_module(tmp_path):
|
||||
src_dir = tmp_path / 'source'
|
||||
src_dir.mkdir()
|
||||
(src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
|
||||
|
||||
project_dir = tmp_path / 'project'
|
||||
project_dir.mkdir()
|
||||
|
||||
database_import.install_module(src_dir, project_dir, '')
|
||||
|
||||
outfile = project_dir / 'module' / 'nominatim.so'
|
||||
|
||||
assert outfile.exists()
|
||||
assert outfile.read_text() == 'TEST nomiantim.so'
|
||||
assert outfile.stat().st_mode == 33261
|
||||
|
||||
|
||||
def test_install_module_custom(tmp_path):
|
||||
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
|
||||
|
||||
database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
|
||||
|
||||
assert not (tmp_path / 'module').exists()
|
||||
|
||||
|
||||
def test_install_module_fail_access(temp_db_conn, tmp_path):
|
||||
(tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
|
||||
|
||||
with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
|
||||
database_import.install_module(tmp_path, tmp_path, '',
|
||||
conn=temp_db_conn)
|
||||
|
||||
|
||||
def test_import_base_data(src_dir, temp_db, temp_db_cursor):
|
||||
temp_db_cursor.execute('CREATE EXTENSION hstore')
|
||||
temp_db_cursor.execute('CREATE EXTENSION postgis')
|
||||
@ -171,14 +138,15 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
|
||||
|
||||
|
||||
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
|
||||
tables = ('word', 'placex', 'place_addressline', 'location_area',
|
||||
tables = ('placex', 'place_addressline', 'location_area',
|
||||
'location_area_country',
|
||||
'location_property_tiger', 'location_property_osmline',
|
||||
'location_postcode', 'search_name', 'location_road_23')
|
||||
for table in tables:
|
||||
table_factory(table, content=(1, 2, 3))
|
||||
table_factory(table, content=((1, ), (2, ), (3, )))
|
||||
assert temp_db_cursor.table_rows(table) == 3
|
||||
|
||||
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
|
||||
database_import.truncate_data_tables(temp_db_conn)
|
||||
|
||||
for table in tables:
|
||||
assert temp_db_cursor.table_rows(table) == 0
|
||||
@ -196,36 +164,33 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
|
||||
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
|
||||
geom='SRID=4326;LINESTRING(0 0, 10 10)')
|
||||
|
||||
database_import.load_data(dsn, src_dir / 'data', threads)
|
||||
database_import.load_data(dsn, threads)
|
||||
|
||||
assert temp_db_cursor.table_rows('placex') == 30
|
||||
assert temp_db_cursor.table_rows('location_property_osmline') == 1
|
||||
|
||||
@pytest.mark.parametrize("languages", (False, True))
|
||||
def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
|
||||
temp_db_with_extensions, monkeypatch, languages):
|
||||
|
||||
@pytest.mark.parametrize("languages", (None, ' fr,en'))
|
||||
def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
|
||||
table_factory, tokenizer_mock, languages):
|
||||
|
||||
table_factory('country_name', 'country_code varchar(2), name hstore',
|
||||
content=(('us', '"name"=>"us1","name:af"=>"us2"'),
|
||||
('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
|
||||
|
||||
assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
|
||||
|
||||
tokenizer = tokenizer_mock()
|
||||
|
||||
database_import.create_country_names(temp_db_conn, tokenizer, languages)
|
||||
|
||||
assert len(tokenizer.analyser_cache['countries']) == 2
|
||||
|
||||
result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
|
||||
|
||||
if languages:
|
||||
monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
|
||||
temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
|
||||
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
|
||||
""")
|
||||
temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
|
||||
temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
|
||||
temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
|
||||
'"name"=>"us","name:af"=>"us"')""")
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
|
||||
lookup_country_code varchar(2))
|
||||
RETURNS INTEGER
|
||||
AS $$
|
||||
BEGIN
|
||||
INSERT INTO word VALUES (lookup_country_code);
|
||||
RETURN 5;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
""")
|
||||
database_import.create_country_names(temp_db_conn, def_config)
|
||||
if languages:
|
||||
assert temp_db_cursor.table_rows('word') == 4
|
||||
assert result_set == {'us' : set(('us', 'us1', 'United States')),
|
||||
'fr' : set(('fr', 'Fra', 'Fren'))}
|
||||
else:
|
||||
assert temp_db_cursor.table_rows('word') == 5
|
||||
assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
|
||||
'fr' : set(('fr', 'Fra', 'Fren'))}
|
||||
|
@ -11,41 +11,6 @@ from nominatim.tools import SpecialPhrasesImporter
|
||||
|
||||
TEST_BASE_DIR = Path(__file__) / '..' / '..'
|
||||
|
||||
def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
|
||||
temp_db_cursor):
|
||||
"""
|
||||
Check for the fetch_existing_words_phrases() method.
|
||||
It should return special phrase term added to the word
|
||||
table.
|
||||
"""
|
||||
query ="""
|
||||
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
|
||||
'class', 'type', null, 0, 'near');
|
||||
"""
|
||||
temp_db_cursor.execute(query)
|
||||
|
||||
assert not special_phrases_importer.words_phrases_to_delete
|
||||
special_phrases_importer._fetch_existing_words_phrases()
|
||||
contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
|
||||
assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
|
||||
|
||||
@pytest.mark.parametrize("house_type", ['house', 'postcode'])
|
||||
def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
|
||||
house_type, temp_db_cursor):
|
||||
"""
|
||||
Check for the fetch_existing_words_phrases() method.
|
||||
It should return nothing as the terms added correspond
|
||||
to a housenumber and postcode term.
|
||||
"""
|
||||
query ="""
|
||||
INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
|
||||
'place', %s, null, 0, 'near');
|
||||
"""
|
||||
temp_db_cursor.execute(query, (house_type,))
|
||||
|
||||
special_phrases_importer._fetch_existing_words_phrases()
|
||||
assert not special_phrases_importer.words_phrases_to_delete
|
||||
|
||||
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
|
||||
"""
|
||||
Check for the fetch_existing_place_classtype_tables() method.
|
||||
@ -123,36 +88,6 @@ def test_convert_settings_giving_json(special_phrases_importer):

    assert returned == json_file

def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
                                       temp_db_conn, word_table):
    """
    Test that _process_amenity() execute well the
    getorcreate_amenityoperator() SQL function and that
    the 2 differents operators are well handled.
    """
    special_phrases_importer._process_amenity('', '', '', '', 'near')
    special_phrases_importer._process_amenity('', '', '', '', 'in')

    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
        results = temp_db_cursor.fetchall()

    assert len(results) == 2

def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
                                          temp_db_conn, word_table):
    """
    Test that _process_amenity() execute well the
    getorcreate_amenity() SQL function.
    """
    special_phrases_importer._process_amenity('', '', '', '', '')

    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
        result = temp_db_cursor.fetchone()

    assert result

def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
    """
    Test that _create_place_classtype_indexes() create the
@ -215,8 +150,7 @@ def test_create_place_classtype_table_and_indexes(
        assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
        assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])

def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
    """
    Test that _process_xml_content() process the given xml content right
    by executing the right SQL functions for amenities and
@ -228,11 +162,9 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
    #Converted output set to a dict for easy assert further.
    results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))

    assert check_amenities_with_op(temp_db_conn)
    assert check_amenities_without_op(temp_db_conn)
    assert results[class_test] and type_test in results.values()

def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
                                            temp_db_conn):
    """
    Check for the remove_non_existent_phrases_from_db() method.
@ -245,22 +177,10 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
    be deleted.
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
        to_keep_phrase_tuple = (
            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
        )
        special_phrases_importer.words_phrases_to_delete = {
            to_delete_phrase_tuple,
            to_keep_phrase_tuple
        }
        special_phrases_importer.words_phrases_still_exist = {
            to_keep_phrase_tuple
        }
        special_phrases_importer.table_phrases_to_delete = {
            'place_classtype_testclasstypetable_to_delete'
        }

        query_words = 'SELECT word, class, type, operator FROM word;'
        query_tables = """
            SELECT table_name
            FROM information_schema.tables
@ -268,21 +188,16 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
            AND table_name like 'place_classtype_%';
        """

        special_phrases_importer._remove_non_existent_phrases_from_db()
        special_phrases_importer._remove_non_existent_tables_from_db()

        temp_db_cursor.execute(query_words)
        words_result = temp_db_cursor.fetchall()
        temp_db_cursor.execute(query_tables)
        tables_result = temp_db_cursor.fetchall()
        assert len(words_result) == 1 and words_result[0] == [
            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
        ]
        assert (len(tables_result) == 1 and
                tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
        )

def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
                          placex_table, tokenizer_mock):
    """
    Check that the main import_from_wiki() method is well executed.
    It should create the place_classtype table, the place_id and centroid indexes,
@ -294,17 +209,14 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
    #what is deleted and what is preserved.
    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute("""
            INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
            'amenity', 'animal_shelter', null, 0, null);

            INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
            'wrong_class', 'wrong_type', null, 0, 'near');

            CREATE TABLE place_classtype_amenity_animal_shelter();
            CREATE TABLE place_classtype_wrongclass_wrongtype();""")

    monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
    special_phrases_importer.import_from_wiki(['en'])
    tokenizer = tokenizer_mock()
    special_phrases_importer.import_from_wiki(tokenizer, ['en'])

    assert len(tokenizer.analyser_cache['special_phrases']) == 18

    class_test = 'aerialway'
    type_test = 'zip_line'
@ -312,22 +224,12 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
    assert check_table_exist(temp_db_conn, class_test, type_test)
    assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
    assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
    assert check_amenities_with_op(temp_db_conn)
    assert check_amenities_without_op(temp_db_conn)
    assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
    assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')

    #Format (query, should_return_something_bool) use to easily execute all asserts
    queries_tests = set()

    #Used to check that the correct phrase already in the word table before is still there.
    query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
    queries_tests.add((query_correct_word, True))

    #Used to check if wrong phrase was deleted from the word table of the database.
    query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
    queries_tests.add((query_wrong_word, False))

    #Used to check that correct place_classtype table already in the datase before is still there.
    query_existing_table = """
        SELECT table_name
@ -412,24 +314,6 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
            temp_db_conn.index_exists(index_prefix + 'place_id')
    )

def check_amenities_with_op(temp_db_conn):
    """
    Check that the test table for the SQL function getorcreate_amenityoperator()
    contains more than one value (so that the SQL function was call more than one time).
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
        return len(temp_db_cursor.fetchall()) > 1

def check_amenities_without_op(temp_db_conn):
    """
    Check that the test table for the SQL function getorcreate_amenity()
    contains more than one value (so that the SQL function was call more than one time).
    """
    with temp_db_conn.cursor() as temp_db_cursor:
        temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
        return len(temp_db_cursor.fetchall()) > 1

@pytest.fixture
def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
    """
@ -453,48 +337,7 @@ def temp_phplib_dir_with_migration():
    yield Path(phpdir)

@pytest.fixture
def default_phrases(word_table, temp_db_cursor):
def default_phrases(temp_db_cursor):
    temp_db_cursor.execute("""
        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
        'class', 'type', null, 0, 'near');

        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
        'class_exists', 'type_exists', null, 0, 'near');

        CREATE TABLE place_classtype_testclasstypetable_to_delete();
        CREATE TABLE place_classtype_testclasstypetable_to_keep();""")

@pytest.fixture
def make_strandard_name_func(temp_db_cursor):
    temp_db_cursor.execute("""
        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
        BEGIN
        RETURN trim(name); --Basically return only the trimed name for the tests
        END;
        $$ LANGUAGE plpgsql IMMUTABLE;""")

@pytest.fixture
def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
    temp_db_cursor.execute("""
        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
        lookup_class text, lookup_type text)
        RETURNS void as $$
        BEGIN
            INSERT INTO word VALUES(null, lookup_word, normalized_word,
            lookup_class, lookup_type, null, 0, 'no_operator');
        END;
        $$ LANGUAGE plpgsql""")

@pytest.fixture
def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
    temp_db_cursor.execute("""
        CREATE TABLE temp_with_operator(op TEXT);

        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
        lookup_class text, lookup_type text, op text)
        RETURNS void as $$
        BEGIN
            INSERT INTO word VALUES(null, lookup_word, normalized_word,
            lookup_class, lookup_type, null, 0, op);
        END;
        $$ LANGUAGE plpgsql""")
@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
import pytest

from nominatim.tools import postcodes
import dummy_tokenizer

@pytest.fixture
def tokenizer():
    return dummy_tokenizer.DummyTokenizer(None, None)

@pytest.fixture
def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                          postcode TEXT,
                          geometry GEOMETRY(Geometry, 4326)""")
    temp_db_cursor.execute('CREATE SEQUENCE seq_place')
    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                           """)


def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
    postcodes.import_postcodes(dsn, tmp_path)
def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

    assert temp_db_cursor.table_exists('gb_postcode')
    assert temp_db_cursor.table_exists('us_postcode')
    assert temp_db_cursor.table_rows('location_postcode') == 0


def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
    temp_db_cursor.execute("""
        INSERT INTO placex (place_id, country_code, address, geometry)
        VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
        """)

    postcodes.import_postcodes(dsn, tmp_path)
    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

    rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                      ST_X(geometry), ST_Y(geometry)

@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
    return tmp_path

@pytest.fixture
def conn(temp_db_conn, table_factory, monkeypatch):
    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
    table_factory('country_name', 'partition INT', (0, 1, 2))
def conn(sql_preprocessor, temp_db_conn):
    return temp_db_conn


@ -26,6 +26,7 @@ def test_script(envdir):

def run_website_script(envdir, config):
    config.lib_dir.php = envdir / 'php'
    config.project_dir = envdir
    refresh.setup_website(envdir, config)

    proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',

@ -1,120 +1,170 @@
SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
                                                    lookup_class text, lookup_type text)
  RETURNS INTEGER
  AS $$
DECLARE
  lookup_token TEXT;
  return_word_id INTEGER;
BEGIN
  lookup_token := ' '||trim(lookup_word);
  SELECT min(word_id) FROM word
  WHERE word_token = lookup_token and word = normalized_word
        and class = lookup_class and type = lookup_type
  INTO return_word_id;
  IF return_word_id IS NULL THEN
    return_word_id := nextval('seq_word');
    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
                             lookup_class, lookup_type, null, 0);
  END IF;
  RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
                                                            normalized_word TEXT,
                                                            lookup_class text,
                                                            lookup_type text,
                                                            op text)
  RETURNS INTEGER
  AS $$
DECLARE
  lookup_token TEXT;
  return_word_id INTEGER;
BEGIN
  lookup_token := ' '||trim(lookup_word);
  SELECT min(word_id) FROM word
  WHERE word_token = lookup_token and word = normalized_word
        and class = lookup_class and type = lookup_type and operator = op
  INTO return_word_id;
  IF return_word_id IS NULL THEN
    return_word_id := nextval('seq_word');
    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
                             lookup_class, lookup_type, null, 0, op);
  END IF;
  RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;

SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
CREATE INDEX idx_placex_classtype ON placex (class, type);
CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
@ -176,3 +226,6 @@ CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classty
CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
DROP INDEX idx_placex_classtype;

DROP FUNCTION test_getorcreate_amenity;
DROP FUNCTION test_getorcreate_amenityoperator;