Nominatim/lib/TokenList.php

<?php

namespace Nominatim;

require_once(CONST_BasePath.'/lib/TokenCountry.php');
require_once(CONST_BasePath.'/lib/TokenHousenumber.php');
require_once(CONST_BasePath.'/lib/TokenPostcode.php');
require_once(CONST_BasePath.'/lib/TokenSpecialTerm.php');
require_once(CONST_BasePath.'/lib/TokenWord.php');
require_once(CONST_BasePath.'/lib/SpecialSearchOperator.php');

/**
 * Saves information about the tokens that appear in a search query.
 *
 * Tokens are indexed by their normalized form, the token word. There are
 * different kinds of tokens, represented by different Token* classes. Note
 * that tokens do not have a common base class. All tokens need to have a field
 * with the word id that points to an entry in the `word` database table
 * but otherwise the information saved about a token can be very different.
 *
 * There are two different kinds of token words: full words and partial terms.
 *
 * Full words start with a space. They represent a complete name of a place.
 * All special tokens are normally full words.
 *
 * Partial terms have no space at the beginning. They may represent a part of
 * a name of a place (e.g. in the name 'World Trade Center' a partial term
 * would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
 */
class TokenList
{
    // List of list of tokens indexed by their word_token.
    private $aTokens = array();


    /**
     * Return the number of distinct token words in the list.
     *
     * Several tokens saved under the same token word are counted only once.
     *
     * @return Integer
     */
    public function count()
    {
        return count($this->aTokens);
    }

    /**
     * Check if there are tokens for the given token word.
     *
     * The token word is matched exactly, including a leading space.
     *
     * @param string $sWord Token word to look for.
     *
     * @return bool True if there is one or more token for the token word.
     */
    public function contains($sWord)
    {
        return isset($this->aTokens[$sWord]);
    }

    /**
     * Check if there are partial or full tokens for the given word.
     *
     * The word is looked up as given and with a space prepended.
     *
     * @param string $sWord Token word to look for.
     *
     * @return bool True if there is one or more token for the token word.
     */
    public function containsAny($sWord)
    {
        return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
    }

    /**
     * Get the list of tokens for the given token word.
     *
     * @param string $sWord Token word to look for.
     *
     * @return object[] Array of tokens for the given token word or an
     *                  empty array if no tokens could be found.
     */
    public function get($sWord)
    {
        return isset($this->aTokens[$sWord]) ? $this->aTokens[$sWord] : array();
    }

    /**
     * Add token information from the word table in the database.
     *
     * Every row found for one of the given tokens is converted into the
     * matching Token* object and saved under the row's word_token. Rows that
     * are filtered out (special terms not appearing in the query, postcodes
     * that would need escaping, countries outside the restriction) add
     * nothing. When the list of tokens is empty, the database is not queried.
     *
     * @param object   $oDB           Nominatim::DB instance.
     * @param string[] $aTokens       List of tokens to look up in the database.
     * @param string[] $aCountryCodes List of country restrictions.
     * @param string   $sNormQuery    Normalized query string.
     * @param object   $oNormalizer   Normalizer function to use on tokens.
     *
     * @return void
     */
    public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
    {
        // Without tokens the statement below would end in 'word_token in ()',
        // which is not valid SQL. There is nothing to look up, so stop here.
        if (empty($aTokens)) {
            return;
        }

        // Check which tokens we have, get the ID numbers
        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
        $sSQL .= ' FROM word WHERE word_token in (';
        $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';

        Debug::printSQL($sSQL);

        $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');

        foreach ($aDBWords as $aWord) {
            $oToken = null;
            $iId = (int) $aWord['word_id'];

            if ($aWord['class']) {
                // Special terms need to appear in their normalized form.
                if ($aWord['word']) {
                    $sNormWord = $aWord['word'];
                    if ($oNormalizer != null) {
                        $sNormWord = $oNormalizer->transliterate($aWord['word']);
                    }
                    if (strpos($sNormQuery, $sNormWord) === false) {
                        continue;
                    }
                }

                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
                    // Only accept postcodes that are left unchanged by
                    // escaping; all others are dropped.
                    // NOTE(review): pg_escape_string() is called without an
                    // explicit connection here - confirm this is still
                    // supported by the targeted PHP versions.
                    if ($aWord['word']
                        && pg_escape_string($aWord['word']) == $aWord['word']
                    ) {
                        $oToken = new Token\Postcode(
                            $iId,
                            $aWord['word'],
                            $aWord['country_code']
                        );
                    }
                } else {
                    // near and in operator the same at the moment
                    $oToken = new Token\SpecialTerm(
                        $iId,
                        $aWord['class'],
                        $aWord['type'],
                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
                    );
                }
            } elseif ($aWord['country_code']) {
                // Filter country tokens that do not match restricted countries.
                if (!$aCountryCodes
                    || in_array($aWord['country_code'], $aCountryCodes)
                ) {
                    $oToken = new Token\Country($iId, $aWord['country_code']);
                }
            } else {
                // A token without a leading space is a partial term.
                $oToken = new Token\Word(
                    $iId,
                    $aWord['word_token'][0] != ' ',
                    (int) $aWord['count']
                );
            }

            if ($oToken) {
                $this->addToken($aWord['word_token'], $oToken);
            }
        }
    }

    /**
     * Add a new token for the given word.
     *
     * The token is appended to the tokens already saved for the word.
     *
     * @param string $sWord  Word the token describes.
     * @param object $oToken Token object to add.
     *
     * @return void
     */
    public function addToken($sWord, $oToken)
    {
        if (isset($this->aTokens[$sWord])) {
            $this->aTokens[$sWord][] = $oToken;
        } else {
            $this->aTokens[$sWord] = array($oToken);
        }
    }

    /**
     * Create a debugging overview of the tokens, indexed by word id.
     *
     * Tokens where the word id is null are left out. When the same word id
     * appears more than once, the entry that is visited last is kept.
     *
     * @return string[] Array mapping the word id to a string of the form
     *                  '#<token word>(<word id>)#'.
     */
    public function debugTokenByWordIdList()
    {
        $aWordsIDs = array();
        foreach ($this->aTokens as $sToken => $aWords) {
            foreach ($aWords as $aToken) {
                if ($aToken->iId !== null) {
                    $aWordsIDs[$aToken->iId] =
                        '#'.$sToken.'('.$aToken->iId.')#';
                }
            }
        }

        return $aWordsIDs;
    }

    /**
     * Return the internal token table for debugging output.
     *
     * @return array[] List of tokens for each token word.
     */
    public function debugInfo()
    {
        return $this->aTokens;
    }
}
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`<?php`

			`namespace Nominatim;`

			`require_once(CONST_BasePath.'/lib/TokenCountry.php');`
			`require_once(CONST_BasePath.'/lib/TokenHousenumber.php');`
			`require_once(CONST_BasePath.'/lib/TokenPostcode.php');`
			`require_once(CONST_BasePath.'/lib/TokenSpecialTerm.php');`
			`require_once(CONST_BasePath.'/lib/TokenWord.php');`
			`require_once(CONST_BasePath.'/lib/SpecialSearchOperator.php');`

			`/**`
			`* Saves information about the tokens that appear in a search query.`
			`*`
add documentation for TokenList 2018-05-15 00:17:54 +03:00			`* Tokens are sorted by their normalized form, the token word. There are different`
			`* kinds of tokens, represented by different Token* classes. Note that`
			`* tokens do not have a common base class. All tokens need to have a field`
			* with the word id that points to an entry in the `word` database table
			`* but otherwise the information saved about a token can be very different.`
			`*`
			`* There are two different kinds of token words: full words and partial terms.`
			`*`
			`* Full words start with a space. They represent a complete name of a place.`
			`* All special tokens are normally full words.`
			`*`
			`* Partial terms have no space at the beginning. They may represent a part of`
			`* a name of a place (e.g. in the name 'World Trade Center' a partial term`
			`* would be 'Trade' or 'Trade Center'). They are only used in TokenWord.`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`*/`
			`class TokenList`
			`{`
			`// List of list of tokens indexed by their word_token.`
			`private $aTokens = array();`

tests for Nominatim::TokenList 2018-07-19 03:31:54 +03:00
			`/**`
			`* Return total number of tokens.`
			`*`
			`* @return Integer`
			`*/`
			`public function count()`
			`{`
			`return count($this->aTokens);`
			`}`

add documentation for TokenList 2018-05-15 00:17:54 +03:00			`/**`
			`* Check if there are tokens for the given token word.`
			`*`
			`* @param string $sWord Token word to look for.`
			`*`
			`* @return bool True if there is one or more token for the token word.`
			`*/`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`public function contains($sWord)`
			`{`
			`return isset($this->aTokens[$sWord]);`
			`}`

Rework word set computation Switch from an recursive algorithm for computing the word sets to an iterative one that benefits from caching intermediate results. This considerably reduces the amount of memory needed, so that the depth restriction can be dropped. To ensure that the number of word sets remains manageable, only sets up to a certain length are accepted and only a certain number of total word sets. If word sets need to be dropped, we drop the ones with more words per word set first. To further reduce the number of potential word sets, the valid tokens are looked up first and then only word sets containing valid tokens are computed. Fixes #1403, #1404 and #654. 2019-06-29 19:22:31 +03:00			`/**`
			`* Check if there are partial or full tokens for the given word.`
			`*`
			`* @param string $sWord Token word to look for.`
			`*`
			`* @return bool True if there is one or more token for the token word.`
			`*/`
			`public function containsAny($sWord)`
			`{`
			`return isset($this->aTokens[$sWord]) \|\| isset($this->aTokens[' '.$sWord]);`
			`}`

add documentation for TokenList 2018-05-15 00:17:54 +03:00			`/**`
			`* Get the list of tokens for the given token word.`
			`*`
			`* @param string $sWord Token word to look for.`
			`*`
			`* @return object[] Array of tokens for the given token word or an`
			`* empty array if no tokens could be found.`
			`*/`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`public function get($sWord)`
			`{`
			`return isset($this->aTokens[$sWord]) ? $this->aTokens[$sWord] : array();`
			`}`

			`/**`
			`* Add token information from the word table in the database.`
			`*`
Nominatim::DB support input variables, custom error messages 2019-03-10 17:42:58 +03:00			`* @param object $oDB Nominatim::DB instance.`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`* @param string[] $aTokens List of tokens to look up in the database.`
			`* @param string[] $aCountryCodes List of country restrictions.`
			`* @param string $sNormQuery Normalized query string.`
			`* @param object $oNormalizer Normalizer function to use on tokens.`
			`*`
			`* @return void`
			`*/`
			`public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)`
			`{`
			`// Check which tokens we have, get the ID numbers`
			`$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';`
			`$sSQL .= ' operator, coalesce(search_name_count, 0) as count';`
			`$sSQL .= ' FROM word WHERE word_token in (';`
replace database abstraction DB with PDO 2019-02-24 18:14:36 +03:00			`$sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00
			`Debug::printSQL($sSQL);`

Nominatim::DB support input variables, custom error messages 2019-03-10 17:42:58 +03:00			`$aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00
			`foreach ($aDBWords as $aWord) {`
			`$oToken = null;`
			`$iId = (int) $aWord['word_id'];`

			`if ($aWord['class']) {`
			`// Special terms need to appear in their normalized form.`
			`if ($aWord['word']) {`
			`$sNormWord = $aWord['word'];`
			`if ($oNormalizer != null) {`
			`$sNormWord = $oNormalizer->transliterate($aWord['word']);`
			`}`
			`if (strpos($sNormQuery, $sNormWord) === false) {`
			`continue;`
			`}`
			`}`

			`if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {`
			`$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));`
			`} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {`
			`if ($aWord['word']`
			`&& pg_escape_string($aWord['word']) == $aWord['word']`
			`) {`
			`$oToken = new Token\Postcode(`
			`$iId,`
			`$aWord['word'],`
			`$aWord['country_code']`
			`);`
			`}`
			`} else {`
			`// near and in operator the same at the moment`
			`$oToken = new Token\SpecialTerm(`
			`$iId,`
			`$aWord['class'],`
			`$aWord['type'],`
fix operator type assignment Fixes #1084. 2018-07-20 23:27:27 +03:00			`$aWord['operator'] ? Operator::NEAR : Operator::NONE`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`);`
			`}`
			`} elseif ($aWord['country_code']) {`
			`// Filter country tokens that do not match restricted countries.`
			`if (!$aCountryCodes`
			`\|\| in_array($aWord['country_code'], $aCountryCodes)`
			`) {`
			`$oToken = new Token\Country($iId, $aWord['country_code']);`
			`}`
			`} else {`
			`$oToken = new Token\Word(`
			`$iId,`
Fix partial word computation Partial word tokens have a space at the beginning of the token not the word. 2018-08-13 22:22:45 +03:00			`$aWord['word_token'][0] != ' ',`
introduce classes for token list and token types 2018-05-15 00:04:15 +03:00			`(int) $aWord['count']`
			`);`
			`}`

			`if ($oToken) {`
			`$this->addToken($aWord['word_token'], $oToken);`
			`}`
			`}`
			`}`

			`/**`
			`* Add a new token for the given word.`
			`*`
			`* @param string $sWord Word the token describes.`
			`* @param object $oToken Token object to add.`
			`*`
			`* @return void`
			`*/`
			`public function addToken($sWord, $oToken)`
			`{`
			`if (isset($this->aTokens[$sWord])) {`
			`$this->aTokens[$sWord][] = $oToken;`
			`} else {`
			`$this->aTokens[$sWord] = array($oToken);`
			`}`
			`}`

			`public function debugTokenByWordIdList()`
			`{`
			`$aWordsIDs = array();`
			`foreach ($this->aTokens as $sToken => $aWords) {`
			`foreach ($aWords as $aToken) {`
			`if ($aToken->iId !== null) {`
			`$aWordsIDs[$aToken->iId] =`
			`'#'.$sToken.'('.$aToken->iId.')#';`
			`}`
			`}`
			`}`

			`return $aWordsIDs;`
			`}`

			`public function debugInfo()`
			`{`
			`return $this->aTokens;`
			`}`
			`}`