From 044bb6afa53f2c799490d8a95ca050c8b755ca4c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 28 Apr 2021 14:08:24 +0200 Subject: [PATCH] move tokenization in query into tokenizer --- lib-php/Geocode.php | 123 ++++--------- lib-php/Phrase.php | 54 +++--- lib-php/TokenList.php | 82 --------- lib-php/tokenizer/legacy_tokenizer.php | 232 +++++++++++++++++++++++++ test/php/Nominatim/PhraseTest.php | 34 ++-- test/php/Nominatim/TokenListTest.php | 84 --------- test/php/Nominatim/tokenizer.php | 17 ++ 7 files changed, 315 insertions(+), 311 deletions(-) create mode 100644 test/php/Nominatim/tokenizer.php diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index d9c1b3c0..53ee49c0 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -15,6 +15,7 @@ class Geocode protected $oDB; protected $oPlaceLookup; + protected $oTokenizer; protected $aLangPrefOrder = array(); @@ -42,23 +43,12 @@ class Geocode protected $sQuery = false; protected $aStructuredQuery = false; - protected $oNormalizer = null; - public function __construct(&$oDB) { $this->oDB =& $oDB; $this->oPlaceLookup = new PlaceLookup($this->oDB); - $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); - } - - private function normTerm($sTerm) - { - if ($this->oNormalizer === null) { - return $sTerm; - } - - return $this->oNormalizer->transliterate($sTerm); + $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB); } public function setLanguagePreference($aLangPref) @@ -511,12 +501,10 @@ class Geocode if ($this->aCountryCodes) { $oCtx->setCountryList($this->aCountryCodes); } + $this->oTokenizer->setCountryRestriction($this->aCountryCodes); Debug::newSection('Query Preprocessing'); - $sNormQuery = $this->normTerm($this->sQuery); - Debug::printVar('Normalized query', $sNormQuery); - $sLanguagePrefArraySQL = $this->oDB->getArraySQL( $this->oDB->getDBQuotedList($this->aLangPrefOrder) ); @@ -570,108 +558,55 @@ class Geocode } if ($sSpecialTerm && !$aSearches[0]->hasOperator()) { - $sSpecialTerm = pg_escape_string($sSpecialTerm); - $sToken = $this->oDB->getOne( - 'SELECT make_standard_name(:term)', - array(':term' => $sSpecialTerm), - 'Cannot decode query. Wrong encoding?' - ); - $sSQL = 'SELECT class, type FROM word '; - $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')'; - $sSQL .= ' AND class is not null AND class not in (\'place\')'; + $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm); - Debug::printSQL($sSQL); - $aSearchWords = $this->oDB->getAll($sSQL); - $aNewSearches = array(); - foreach ($aSearches as $oSearch) { - foreach ($aSearchWords as $aSearchTerm) { - $oNewSearch = clone $oSearch; - $oNewSearch->setPoiSearch( - Operator::TYPE, - $aSearchTerm['class'], - $aSearchTerm['type'] - ); - $aNewSearches[] = $oNewSearch; + if (!empty($aTokens)) { + $aNewSearches = array(); + foreach ($aSearches as $oSearch) { + foreach ($aTokens as $oToken) { + $oNewSearch = clone $oSearch; + $oNewSearch->setPoiSearch( + $oToken->iOperator, + $oToken->sClass, + $oToken->sType + ); + $aNewSearches[] = $oNewSearch; + } } + $aSearches = $aNewSearches; } - $aSearches = $aNewSearches; } // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split + $aPhrases = array(); if ($this->aStructuredQuery) { - $aInPhrases = $this->aStructuredQuery; + foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, $iPhrase); + } } else { - $aInPhrases = explode(',', $sQuery); + foreach (explode(',', $sQuery) as $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, ''); + } } Debug::printDebugArray('Search context', $oCtx); Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]); - Debug::printVar('Final query phrases', $aInPhrases); - // Convert each phrase to standard form - // Create a list of standard words - // Get all 'sets' of words - // Generate a complete list of all Debug::newSection('Tokenization'); - $aTokens = array(); - $aPhrases = array(); - foreach ($aInPhrases as $iPhrase => $sPhrase) { - $sPhrase = $this->oDB->getOne( - 'SELECT make_standard_name(:phrase)', - array(':phrase' => $sPhrase), - 'Cannot normalize query string (is it a UTF-8 string?)' - ); - if (trim($sPhrase)) { - $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : ''); - $oPhrase->addTokens($aTokens); - $aPhrases[] = $oPhrase; - } - } - - Debug::printVar('Tokens', $aTokens); - - $oValidTokens = new TokenList(); - - if (!empty($aTokens)) { - $oValidTokens->addTokensFromDB( - $this->oDB, - $aTokens, - $this->aCountryCodes, - $sNormQuery, - $this->oNormalizer - ); + $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases); + if ($oValidTokens->count() > 0) { $oCtx->setFullNameWords($oValidTokens->getFullWordIDs()); - // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken) { - if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { - if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { - // US ZIP+4 codes - merge in the 5-digit ZIP code - $oValidTokens->addToken( - $sToken, - new Token\Postcode(null, $aData[1], 'us') - ); - } elseif (preg_match('/^ [0-9]+$/', $sToken)) { - // Unknown single word token with a number. - // Assume it is a house number. - $oValidTokens->addToken( - $sToken, - new Token\HouseNumber(null, trim($sToken)) - ); - } - } - } + $aPhrases = array_filter($aPhrases, function ($oPhrase) { + return $oPhrase->getWordSets() !== null; + }); // Any words that have failed completely? // TODO: suggestions Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo()); - - foreach ($aPhrases as $oPhrase) { - $oPhrase->computeWordSets($oValidTokens); - } Debug::printDebugTable('Phrases', $aPhrases); Debug::newSection('Search candidates'); diff --git a/lib-php/Phrase.php b/lib-php/Phrase.php index e2643e87..d14c842d 100644 --- a/lib-php/Phrase.php +++ b/lib-php/Phrase.php @@ -16,8 +16,6 @@ class Phrase private $sPhrase; // Element type for structured searches. private $sPhraseType; - // Space-separated words of the phrase. - private $aWords; // Possible segmentations of the phrase. private $aWordSets; @@ -38,7 +36,14 @@ class Phrase { $this->sPhrase = trim($sPhrase); $this->sPhraseType = $sPhraseType; - $this->aWords = explode(' ', $this->sPhrase); + } + + /** + * Get the orginal phrase of the string. + */ + public function getPhrase() + { + return $this->sPhrase; } /** @@ -63,30 +68,6 @@ class Phrase return $this->aWordSets; } - /** - * Add the tokens from this phrase to the given list of tokens. - * - * @param string[] $aTokens List of tokens to append. - * - * @return void - */ - public function addTokens(&$aTokens) - { - $iNumWords = count($this->aWords); - - for ($i = 0; $i < $iNumWords; $i++) { - $sPhrase = $this->aWords[$i]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; - $aTokens[$sPhrase] = $sPhrase; - - for ($j = $i + 1; $j < $iNumWords; $j++) { - $sPhrase .= ' '.$this->aWords[$j]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; - $aTokens[$sPhrase] = $sPhrase; - } - } - } - /** * Invert the set of possible segmentations. * @@ -99,21 +80,27 @@ class Phrase } } - public function computeWordSets($oTokens) + public function computeWordSets($aWords, $oTokens) { - $iNumWords = count($this->aWords); + $iNumWords = count($aWords); + + if ($iNumWords == 0) { + $this->aWordSets = null; + return; + } + // Caches the word set for the partial phrase up to word i. $aSetCache = array_fill(0, $iNumWords, array()); // Initialise first element of cache. There can only be the word. - if ($oTokens->containsAny($this->aWords[0])) { - $aSetCache[0][] = array($this->aWords[0]); + if ($oTokens->containsAny($aWords[0])) { + $aSetCache[0][] = array($aWords[0]); } // Now do the next elements using what we already have. for ($i = 1; $i < $iNumWords; $i++) { for ($j = $i; $j > 0; $j--) { - $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial; + $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial; if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) { $aPartial = array($sPartial); foreach ($aSetCache[$j - 1] as $aSet) { @@ -136,7 +123,7 @@ class Phrase } // finally the current full phrase - $sPartial = $this->aWords[0].' '.$sPartial; + $sPartial = $aWords[0].' '.$sPartial; if ($oTokens->containsAny($sPartial)) { $aSetCache[$i][] = array($sPartial); } @@ -153,7 +140,6 @@ class Phrase return array( 'Type' => $this->sPhraseType, 'Phrase' => $this->sPhrase, - 'Words' => $this->aWords, 'WordSets' => $this->aWordSets ); } diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php index a419da6a..2df9fe05 100644 --- a/lib-php/TokenList.php +++ b/lib-php/TokenList.php @@ -95,88 +95,6 @@ class TokenList return $ids; } - /** - * Add token information from the word table in the database. - * - * @param object $oDB Nominatim::DB instance. - * @param string[] $aTokens List of tokens to look up in the database. - * @param string[] $aCountryCodes List of country restrictions. - * @param string $sNormQuery Normalized query string. - * @param object $oNormalizer Normalizer function to use on tokens. - * - * @return void - */ - public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer) - { - // Check which tokens we have, get the ID numbers - $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,'; - $sSQL .= ' operator, coalesce(search_name_count, 0) as count'; - $sSQL .= ' FROM word WHERE word_token in ('; - $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')'; - - Debug::printSQL($sSQL); - - $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.'); - - foreach ($aDBWords as $aWord) { - $oToken = null; - $iId = (int) $aWord['word_id']; - - if ($aWord['class']) { - // Special terms need to appear in their normalized form. - if ($aWord['word']) { - $sNormWord = $aWord['word']; - if ($oNormalizer != null) { - $sNormWord = $oNormalizer->transliterate($aWord['word']); - } - if (strpos($sNormQuery, $sNormWord) === false) { - continue; - } - } - - if ($aWord['class'] == 'place' && $aWord['type'] == 'house') { - $oToken = new Token\HouseNumber($iId, trim($aWord['word_token'])); - } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') { - if ($aWord['word'] - && pg_escape_string($aWord['word']) == $aWord['word'] - ) { - $oToken = new Token\Postcode( - $iId, - $aWord['word'], - $aWord['country_code'] - ); - } - } else { - // near and in operator the same at the moment - $oToken = new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['type'], - $aWord['operator'] ? Operator::NEAR : Operator::NONE - ); - } - } elseif ($aWord['country_code']) { - // Filter country tokens that do not match restricted countries. - if (!$aCountryCodes - || in_array($aWord['country_code'], $aCountryCodes) - ) { - $oToken = new Token\Country($iId, $aWord['country_code']); - } - } else { - $oToken = new Token\Word( - $iId, - $aWord['word_token'][0] != ' ', - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') - ); - } - - if ($oToken) { - $this->addToken($aWord['word_token'], $oToken); - } - } - } - /** * Add a new token for the given word. * diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index b3d9bbc7..804f69e2 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -1 +1,233 @@ oDB =& $oDB; + $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); + } + + + public function setCountryRestriction($aCountries) + { + $this->aCountryRestriction = $aCountries; + } + + + public function normalizeString($sTerm) + { + if ($this->oNormalizer === null) { + return $sTerm; + } + + return $this->oNormalizer->transliterate($sTerm); + } + + + public function tokensForSpecialTerm($sTerm) + { + $aResults = array(); + + $sSQL = 'SELECT word_id, class, type FROM word '; + $sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)'; + $sSQL .= ' AND class is not null AND class not in (\'place\')'; + + Debug::printVar('Term', $sTerm); + Debug::printSQL($sSQL); + $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm)); + + Debug::printVar('Results', $aSearchWords); + + foreach ($aSearchWords as $aSearchTerm) { + $aResults[] = new \Nominatim\Token\SpecialTerm( + $aSearchTerm['word_id'], + $aSearchTerm['class'], + $aSearchTerm['type'], + \Nominatim\Operator::TYPE + ); + } + + Debug::printVar('Special term tokens', $aResults); + + return $aResults; + } + + + public function extractTokensFromPhrases(&$aPhrases) + { + // First get the normalized version of all phrases + $sNormQuery = ''; + $sSQL = 'SELECT '; + $aParams = array(); + foreach ($aPhrases as $iPhrase => $oPhrase) { + $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); + $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.','; + $aParams[':'.$iPhrase] = $oPhrase->getPhrase(); + } + $sSQL = substr($sSQL, 0, -1); + + Debug::printSQL($sSQL); + Debug::printVar('SQL parameters', $aParams); + + $aNormPhrases = $this->oDB->getRow($sSQL, $aParams); + + Debug::printVar('SQL result', $aNormPhrases); + + // now compute all possible tokens + $aWordLists = array(); + $aTokens = array(); + foreach ($aNormPhrases as $sTitle => $sPhrase) { + if (strlen($sPhrase) > 0) { + $aWords = explode(' ', $sPhrase); + Tokenizer::addTokens($aTokens, $aWords); + $aWordLists[] = $aWords; + } else { + $aWordLists[] = array(); + } + } + + Debug::printVar('Tokens', $aTokens); + Debug::printVar('WordLists', $aWordLists); + + $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); + + foreach ($aPhrases as $iPhrase => $oPhrase) { + $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens); + } + + return $oValidTokens; + } + + + private function computeValidTokens($aTokens, $sNormQuery) + { + $oValidTokens = new TokenList(); + + if (!empty($aTokens)) { + $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery); + + // Try more interpretations for Tokens that could not be matched. + foreach ($aTokens as $sToken) { + if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { + if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { + // US ZIP+4 codes - merge in the 5-digit ZIP code + $oValidTokens->addToken( + $sToken, + new Token\Postcode(null, $aData[1], 'us') + ); + } elseif (preg_match('/^ [0-9]+$/', $sToken)) { + // Unknown single word token with a number. + // Assume it is a house number. + $oValidTokens->addToken( + $sToken, + new Token\HouseNumber(null, trim($sToken)) + ); + } + } + } + } + + return $oValidTokens; + } + + + private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) + { + // Check which tokens we have, get the ID numbers + $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,'; + $sSQL .= ' operator, coalesce(search_name_count, 0) as count'; + $sSQL .= ' FROM word WHERE word_token in ('; + $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; + + Debug::printSQL($sSQL); + + $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.'); + + foreach ($aDBWords as $aWord) { + $oToken = null; + $iId = (int) $aWord['word_id']; + + if ($aWord['class']) { + // Special terms need to appear in their normalized form. + // (postcodes are not normalized in the word table) + $sNormWord = $this->normalizeString($aWord['word']); + if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) { + continue; + } + + if ($aWord['class'] == 'place' && $aWord['type'] == 'house') { + $oToken = new Token\HouseNumber($iId, trim($aWord['word_token'])); + } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') { + if ($aWord['word'] + && pg_escape_string($aWord['word']) == $aWord['word'] + ) { + $oToken = new Token\Postcode( + $iId, + $aWord['word'], + $aWord['country_code'] + ); + } + } else { + // near and in operator the same at the moment + $oToken = new Token\SpecialTerm( + $iId, + $aWord['class'], + $aWord['type'], + $aWord['operator'] ? Operator::NEAR : Operator::NONE + ); + } + } elseif ($aWord['country_code']) { + // Filter country tokens that do not match restricted countries. + if (!$this->aCountryRestriction + || in_array($aWord['country_code'], $this->aCountryRestriction) + ) { + $oToken = new Token\Country($iId, $aWord['country_code']); + } + } else { + $oToken = new Token\Word( + $iId, + $aWord['word_token'][0] != ' ', + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + ); + } + + if ($oToken) { + $oValidTokens->addToken($aWord['word_token'], $oToken); + } + } + } + + + /** + * Add the tokens from this phrase to the given list of tokens. + * + * @param string[] $aTokens List of tokens to append. + * + * @return void + */ + private static function addTokens(&$aTokens, $aWords) + { + $iNumWords = count($aWords); + + for ($i = 0; $i < $iNumWords; $i++) { + $sPhrase = $aWords[$i]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; + + for ($j = $i + 1; $j < $iNumWords; $j++) { + $sPhrase .= ' '.$aWords[$j]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; + } + } + } +} diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php index 42166e34..e4c2bbd1 100644 --- a/test/php/Nominatim/PhraseTest.php +++ b/test/php/Nominatim/PhraseTest.php @@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testEmptyPhrase() { $oPhrase = new Phrase('', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array(), new TokensFullSet()); - $this->assertEquals( - array(array('')), - $oPhrase->getWordSets() - ); + $this->assertNull($oPhrase->getWordSets()); } public function testSingleWordPhrase() { $oPhrase = new Phrase('a', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array('a'), new TokensFullSet()); $this->assertEquals( '(a)', @@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testMultiWordPhrase() { $oPhrase = new Phrase('a b', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet()); $this->assertEquals( '(a b),(a|b)', $this->serializeSets($oPhrase->getWordSets()) ); $oPhrase = new Phrase('a b c', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet()); $this->assertEquals( '(a b c),(a|b c),(a b|c),(a|b|c)', $this->serializeSets($oPhrase->getWordSets()) ); $oPhrase = new Phrase('a b c d', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet()); $this->assertEquals( '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', $this->serializeSets($oPhrase->getWordSets()) @@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testInverseWordSets() { $oPhrase = new Phrase('a b c', ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet()); $oPhrase->invertWordSets(); $this->assertEquals( @@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testMaxWordSets() { - $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $aWords = array_fill(0, 4, 'a'); + $oPhrase = new Phrase(join(' ', $aWords), ''); + $oPhrase->computeWordSets($aWords, new TokensFullSet()); $this->assertEquals(8, count($oPhrase->getWordSets())); $oPhrase->invertWordSets(); $this->assertEquals(8, count($oPhrase->getWordSets())); - $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); - $oPhrase->computeWordSets(new TokensFullSet()); + $aWords = array_fill(0, 18, 'a'); + $oPhrase = new Phrase(join(' ', $aWords), ''); + $oPhrase->computeWordSets($aWords, new TokensFullSet()); $this->assertEquals(100, count($oPhrase->getWordSets())); $oPhrase->invertWordSets(); $this->assertEquals(100, count($oPhrase->getWordSets())); @@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testPartialTokensShortTerm() { $oPhrase = new Phrase('a b c d', ''); - $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); + $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); $this->assertEquals( '(a|b c d),(a|b c|d)', $this->serializeSets($oPhrase->getWordSets()) @@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testPartialTokensLongTerm() { - $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); - $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a'))); + $aWords = array_fill(0, 18, 'a'); + $oPhrase = new Phrase(join(' ', $aWords), ''); + $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a'))); $this->assertEquals(80, count($oPhrase->getWordSets())); } } diff --git a/test/php/Nominatim/TokenListTest.php b/test/php/Nominatim/TokenListTest.php index 14a595ea..f0139d76 100644 --- a/test/php/Nominatim/TokenListTest.php +++ b/test/php/Nominatim/TokenListTest.php @@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase $this->assertFalse($TL->contains('unknownword')); $this->assertEquals(array(), $TL->get('unknownword')); } - - public function testAddress() - { - $this->expectOutputRegex('/

/'); - - $oDbStub = $this->getMockBuilder(Nominatim\DB::class) - ->setMethods(array('getAll', 'getDBQuotedList')) - ->getMock(); - - $oDbStub->method('getDBQuotedList') - ->will($this->returnCallback(function ($aVals) { - return array_map(function ($sVal) { - return "'".$sVal."'"; - }, $aVals); - })); - - - $oDbStub->method('getAll') - ->will($this->returnCallback(function ($sql) { - $aResults = array(); - if (preg_match('/1051/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => '1051', - 'class' => 'place', - 'type' => 'house' - )); - } - if (preg_match('/hauptstr/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => 'hauptstr', - 'class' => 'place', - 'type' => 'street', - 'operator' => true - )); - } - if (preg_match('/64286/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => '64286', - 'word' => '64286', - 'class' => 'place', - 'type' => 'postcode' - )); - } - if (preg_match('/darmstadt/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => 'darmstadt', - 'count' => 533 - )); - } - if (preg_match('/alemagne/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => 'alemagne', - 'country_code' => 'de', - )); - } - if (preg_match('/mexico/', $sql)) { - $aResults[] = $this->wordResult(array( - 'word_id' => 999, - 'word_token' => 'mexico', - 'country_code' => 'mx', - )); - } - return $aResults; - })); - - $aCountryCodes = array('de', 'fr'); - $sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico'; - $aTokens = explode(' ', $sNormQuery); - - $TL = new TokenList; - $TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer); - $this->assertEquals(5, $TL->count()); - - $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051')); - $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne')); - $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286')); - $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt')); - $this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr')); - } } diff --git a/test/php/Nominatim/tokenizer.php b/test/php/Nominatim/tokenizer.php new file mode 100644 index 00000000..0735e661 --- /dev/null +++ b/test/php/Nominatim/tokenizer.php @@ -0,0 +1,17 @@ +oDB =& $oDB; + } + + public function checkStatus() + { + } +}