mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-23 05:35:13 +03:00
move tokenization in query into tokenizer
This commit is contained in:
parent
3eb4d88057
commit
044bb6afa5
@ -15,6 +15,7 @@ class Geocode
|
||||
protected $oDB;
|
||||
|
||||
protected $oPlaceLookup;
|
||||
protected $oTokenizer;
|
||||
|
||||
protected $aLangPrefOrder = array();
|
||||
|
||||
@ -42,23 +43,12 @@ class Geocode
|
||||
protected $sQuery = false;
|
||||
protected $aStructuredQuery = false;
|
||||
|
||||
protected $oNormalizer = null;
|
||||
|
||||
|
||||
public function __construct(&$oDB)
|
||||
{
|
||||
$this->oDB =& $oDB;
|
||||
$this->oPlaceLookup = new PlaceLookup($this->oDB);
|
||||
$this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
|
||||
}
|
||||
|
||||
private function normTerm($sTerm)
|
||||
{
|
||||
if ($this->oNormalizer === null) {
|
||||
return $sTerm;
|
||||
}
|
||||
|
||||
return $this->oNormalizer->transliterate($sTerm);
|
||||
$this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
|
||||
}
|
||||
|
||||
public function setLanguagePreference($aLangPref)
|
||||
@ -511,12 +501,10 @@ class Geocode
|
||||
if ($this->aCountryCodes) {
|
||||
$oCtx->setCountryList($this->aCountryCodes);
|
||||
}
|
||||
$this->oTokenizer->setCountryRestriction($this->aCountryCodes);
|
||||
|
||||
Debug::newSection('Query Preprocessing');
|
||||
|
||||
$sNormQuery = $this->normTerm($this->sQuery);
|
||||
Debug::printVar('Normalized query', $sNormQuery);
|
||||
|
||||
$sLanguagePrefArraySQL = $this->oDB->getArraySQL(
|
||||
$this->oDB->getDBQuotedList($this->aLangPrefOrder)
|
||||
);
|
||||
@ -570,108 +558,55 @@ class Geocode
|
||||
}
|
||||
|
||||
if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
|
||||
$sSpecialTerm = pg_escape_string($sSpecialTerm);
|
||||
$sToken = $this->oDB->getOne(
|
||||
'SELECT make_standard_name(:term)',
|
||||
array(':term' => $sSpecialTerm),
|
||||
'Cannot decode query. Wrong encoding?'
|
||||
);
|
||||
$sSQL = 'SELECT class, type FROM word ';
|
||||
$sSQL .= ' WHERE word_token in (\' '.$sToken.'\')';
|
||||
$sSQL .= ' AND class is not null AND class not in (\'place\')';
|
||||
$aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
$aSearchWords = $this->oDB->getAll($sSQL);
|
||||
$aNewSearches = array();
|
||||
foreach ($aSearches as $oSearch) {
|
||||
foreach ($aSearchWords as $aSearchTerm) {
|
||||
$oNewSearch = clone $oSearch;
|
||||
$oNewSearch->setPoiSearch(
|
||||
Operator::TYPE,
|
||||
$aSearchTerm['class'],
|
||||
$aSearchTerm['type']
|
||||
);
|
||||
$aNewSearches[] = $oNewSearch;
|
||||
if (!empty($aTokens)) {
|
||||
$aNewSearches = array();
|
||||
foreach ($aSearches as $oSearch) {
|
||||
foreach ($aTokens as $oToken) {
|
||||
$oNewSearch = clone $oSearch;
|
||||
$oNewSearch->setPoiSearch(
|
||||
$oToken->iOperator,
|
||||
$oToken->sClass,
|
||||
$oToken->sType
|
||||
);
|
||||
$aNewSearches[] = $oNewSearch;
|
||||
}
|
||||
}
|
||||
$aSearches = $aNewSearches;
|
||||
}
|
||||
$aSearches = $aNewSearches;
|
||||
}
|
||||
|
||||
// Split query into phrases
|
||||
// Commas are used to reduce the search space by indicating where phrases split
|
||||
$aPhrases = array();
|
||||
if ($this->aStructuredQuery) {
|
||||
$aInPhrases = $this->aStructuredQuery;
|
||||
foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
|
||||
$aPhrases[] = new Phrase($sPhrase, $iPhrase);
|
||||
}
|
||||
} else {
|
||||
$aInPhrases = explode(',', $sQuery);
|
||||
foreach (explode(',', $sQuery) as $sPhrase) {
|
||||
$aPhrases[] = new Phrase($sPhrase, '');
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printDebugArray('Search context', $oCtx);
|
||||
Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
|
||||
Debug::printVar('Final query phrases', $aInPhrases);
|
||||
|
||||
// Convert each phrase to standard form
|
||||
// Create a list of standard words
|
||||
// Get all 'sets' of words
|
||||
// Generate a complete list of all
|
||||
Debug::newSection('Tokenization');
|
||||
$aTokens = array();
|
||||
$aPhrases = array();
|
||||
foreach ($aInPhrases as $iPhrase => $sPhrase) {
|
||||
$sPhrase = $this->oDB->getOne(
|
||||
'SELECT make_standard_name(:phrase)',
|
||||
array(':phrase' => $sPhrase),
|
||||
'Cannot normalize query string (is it a UTF-8 string?)'
|
||||
);
|
||||
if (trim($sPhrase)) {
|
||||
$oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
|
||||
$oPhrase->addTokens($aTokens);
|
||||
$aPhrases[] = $oPhrase;
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printVar('Tokens', $aTokens);
|
||||
|
||||
$oValidTokens = new TokenList();
|
||||
|
||||
if (!empty($aTokens)) {
|
||||
$oValidTokens->addTokensFromDB(
|
||||
$this->oDB,
|
||||
$aTokens,
|
||||
$this->aCountryCodes,
|
||||
$sNormQuery,
|
||||
$this->oNormalizer
|
||||
);
|
||||
$oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
|
||||
|
||||
if ($oValidTokens->count() > 0) {
|
||||
$oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
|
||||
|
||||
// Try more interpretations for Tokens that could not be matched.
|
||||
foreach ($aTokens as $sToken) {
|
||||
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
|
||||
if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
|
||||
// US ZIP+4 codes - merge in the 5-digit ZIP code
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\Postcode(null, $aData[1], 'us')
|
||||
);
|
||||
} elseif (preg_match('/^ [0-9]+$/', $sToken)) {
|
||||
// Unknown single word token with a number.
|
||||
// Assume it is a house number.
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\HouseNumber(null, trim($sToken))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
$aPhrases = array_filter($aPhrases, function ($oPhrase) {
|
||||
return $oPhrase->getWordSets() !== null;
|
||||
});
|
||||
|
||||
// Any words that have failed completely?
|
||||
// TODO: suggestions
|
||||
|
||||
Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
|
||||
|
||||
foreach ($aPhrases as $oPhrase) {
|
||||
$oPhrase->computeWordSets($oValidTokens);
|
||||
}
|
||||
Debug::printDebugTable('Phrases', $aPhrases);
|
||||
|
||||
Debug::newSection('Search candidates');
|
||||
|
@ -16,8 +16,6 @@ class Phrase
|
||||
private $sPhrase;
|
||||
// Element type for structured searches.
|
||||
private $sPhraseType;
|
||||
// Space-separated words of the phrase.
|
||||
private $aWords;
|
||||
// Possible segmentations of the phrase.
|
||||
private $aWordSets;
|
||||
|
||||
@ -38,7 +36,14 @@ class Phrase
|
||||
{
|
||||
$this->sPhrase = trim($sPhrase);
|
||||
$this->sPhraseType = $sPhraseType;
|
||||
$this->aWords = explode(' ', $this->sPhrase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the orginal phrase of the string.
|
||||
*/
|
||||
public function getPhrase()
|
||||
{
|
||||
return $this->sPhrase;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -63,30 +68,6 @@ class Phrase
|
||||
return $this->aWordSets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the tokens from this phrase to the given list of tokens.
|
||||
*
|
||||
* @param string[] $aTokens List of tokens to append.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function addTokens(&$aTokens)
|
||||
{
|
||||
$iNumWords = count($this->aWords);
|
||||
|
||||
for ($i = 0; $i < $iNumWords; $i++) {
|
||||
$sPhrase = $this->aWords[$i];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
|
||||
for ($j = $i + 1; $j < $iNumWords; $j++) {
|
||||
$sPhrase .= ' '.$this->aWords[$j];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Invert the set of possible segmentations.
|
||||
*
|
||||
@ -99,21 +80,27 @@ class Phrase
|
||||
}
|
||||
}
|
||||
|
||||
public function computeWordSets($oTokens)
|
||||
public function computeWordSets($aWords, $oTokens)
|
||||
{
|
||||
$iNumWords = count($this->aWords);
|
||||
$iNumWords = count($aWords);
|
||||
|
||||
if ($iNumWords == 0) {
|
||||
$this->aWordSets = null;
|
||||
return;
|
||||
}
|
||||
|
||||
// Caches the word set for the partial phrase up to word i.
|
||||
$aSetCache = array_fill(0, $iNumWords, array());
|
||||
|
||||
// Initialise first element of cache. There can only be the word.
|
||||
if ($oTokens->containsAny($this->aWords[0])) {
|
||||
$aSetCache[0][] = array($this->aWords[0]);
|
||||
if ($oTokens->containsAny($aWords[0])) {
|
||||
$aSetCache[0][] = array($aWords[0]);
|
||||
}
|
||||
|
||||
// Now do the next elements using what we already have.
|
||||
for ($i = 1; $i < $iNumWords; $i++) {
|
||||
for ($j = $i; $j > 0; $j--) {
|
||||
$sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
|
||||
$sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
|
||||
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
|
||||
$aPartial = array($sPartial);
|
||||
foreach ($aSetCache[$j - 1] as $aSet) {
|
||||
@ -136,7 +123,7 @@ class Phrase
|
||||
}
|
||||
|
||||
// finally the current full phrase
|
||||
$sPartial = $this->aWords[0].' '.$sPartial;
|
||||
$sPartial = $aWords[0].' '.$sPartial;
|
||||
if ($oTokens->containsAny($sPartial)) {
|
||||
$aSetCache[$i][] = array($sPartial);
|
||||
}
|
||||
@ -153,7 +140,6 @@ class Phrase
|
||||
return array(
|
||||
'Type' => $this->sPhraseType,
|
||||
'Phrase' => $this->sPhrase,
|
||||
'Words' => $this->aWords,
|
||||
'WordSets' => $this->aWordSets
|
||||
);
|
||||
}
|
||||
|
@ -95,88 +95,6 @@ class TokenList
|
||||
return $ids;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add token information from the word table in the database.
|
||||
*
|
||||
* @param object $oDB Nominatim::DB instance.
|
||||
* @param string[] $aTokens List of tokens to look up in the database.
|
||||
* @param string[] $aCountryCodes List of country restrictions.
|
||||
* @param string $sNormQuery Normalized query string.
|
||||
* @param object $oNormalizer Normalizer function to use on tokens.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
|
||||
{
|
||||
// Check which tokens we have, get the ID numbers
|
||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
|
||||
$sSQL .= ' operator, coalesce(search_name_count, 0) as count';
|
||||
$sSQL .= ' FROM word WHERE word_token in (';
|
||||
$sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
|
||||
$aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
|
||||
|
||||
foreach ($aDBWords as $aWord) {
|
||||
$oToken = null;
|
||||
$iId = (int) $aWord['word_id'];
|
||||
|
||||
if ($aWord['class']) {
|
||||
// Special terms need to appear in their normalized form.
|
||||
if ($aWord['word']) {
|
||||
$sNormWord = $aWord['word'];
|
||||
if ($oNormalizer != null) {
|
||||
$sNormWord = $oNormalizer->transliterate($aWord['word']);
|
||||
}
|
||||
if (strpos($sNormQuery, $sNormWord) === false) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
|
||||
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
|
||||
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
|
||||
if ($aWord['word']
|
||||
&& pg_escape_string($aWord['word']) == $aWord['word']
|
||||
) {
|
||||
$oToken = new Token\Postcode(
|
||||
$iId,
|
||||
$aWord['word'],
|
||||
$aWord['country_code']
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// near and in operator the same at the moment
|
||||
$oToken = new Token\SpecialTerm(
|
||||
$iId,
|
||||
$aWord['class'],
|
||||
$aWord['type'],
|
||||
$aWord['operator'] ? Operator::NEAR : Operator::NONE
|
||||
);
|
||||
}
|
||||
} elseif ($aWord['country_code']) {
|
||||
// Filter country tokens that do not match restricted countries.
|
||||
if (!$aCountryCodes
|
||||
|| in_array($aWord['country_code'], $aCountryCodes)
|
||||
) {
|
||||
$oToken = new Token\Country($iId, $aWord['country_code']);
|
||||
}
|
||||
} else {
|
||||
$oToken = new Token\Word(
|
||||
$iId,
|
||||
$aWord['word_token'][0] != ' ',
|
||||
(int) $aWord['count'],
|
||||
substr_count($aWord['word_token'], ' ')
|
||||
);
|
||||
}
|
||||
|
||||
if ($oToken) {
|
||||
$this->addToken($aWord['word_token'], $oToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a new token for the given word.
|
||||
*
|
||||
|
@ -1 +1,233 @@
|
||||
<?php
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
class Tokenizer
|
||||
{
|
||||
private $oDB;
|
||||
|
||||
private $oNormalizer = null;
|
||||
private $aCountryRestriction = null;
|
||||
|
||||
public function __construct(&$oDB)
|
||||
{
|
||||
$this->oDB =& $oDB;
|
||||
$this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
|
||||
}
|
||||
|
||||
|
||||
public function setCountryRestriction($aCountries)
|
||||
{
|
||||
$this->aCountryRestriction = $aCountries;
|
||||
}
|
||||
|
||||
|
||||
public function normalizeString($sTerm)
|
||||
{
|
||||
if ($this->oNormalizer === null) {
|
||||
return $sTerm;
|
||||
}
|
||||
|
||||
return $this->oNormalizer->transliterate($sTerm);
|
||||
}
|
||||
|
||||
|
||||
public function tokensForSpecialTerm($sTerm)
|
||||
{
|
||||
$aResults = array();
|
||||
|
||||
$sSQL = 'SELECT word_id, class, type FROM word ';
|
||||
$sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)';
|
||||
$sSQL .= ' AND class is not null AND class not in (\'place\')';
|
||||
|
||||
Debug::printVar('Term', $sTerm);
|
||||
Debug::printSQL($sSQL);
|
||||
$aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));
|
||||
|
||||
Debug::printVar('Results', $aSearchWords);
|
||||
|
||||
foreach ($aSearchWords as $aSearchTerm) {
|
||||
$aResults[] = new \Nominatim\Token\SpecialTerm(
|
||||
$aSearchTerm['word_id'],
|
||||
$aSearchTerm['class'],
|
||||
$aSearchTerm['type'],
|
||||
\Nominatim\Operator::TYPE
|
||||
);
|
||||
}
|
||||
|
||||
Debug::printVar('Special term tokens', $aResults);
|
||||
|
||||
return $aResults;
|
||||
}
|
||||
|
||||
|
||||
public function extractTokensFromPhrases(&$aPhrases)
|
||||
{
|
||||
// First get the normalized version of all phrases
|
||||
$sNormQuery = '';
|
||||
$sSQL = 'SELECT ';
|
||||
$aParams = array();
|
||||
foreach ($aPhrases as $iPhrase => $oPhrase) {
|
||||
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
|
||||
$sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
|
||||
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
|
||||
}
|
||||
$sSQL = substr($sSQL, 0, -1);
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
Debug::printVar('SQL parameters', $aParams);
|
||||
|
||||
$aNormPhrases = $this->oDB->getRow($sSQL, $aParams);
|
||||
|
||||
Debug::printVar('SQL result', $aNormPhrases);
|
||||
|
||||
// now compute all possible tokens
|
||||
$aWordLists = array();
|
||||
$aTokens = array();
|
||||
foreach ($aNormPhrases as $sTitle => $sPhrase) {
|
||||
if (strlen($sPhrase) > 0) {
|
||||
$aWords = explode(' ', $sPhrase);
|
||||
Tokenizer::addTokens($aTokens, $aWords);
|
||||
$aWordLists[] = $aWords;
|
||||
} else {
|
||||
$aWordLists[] = array();
|
||||
}
|
||||
}
|
||||
|
||||
Debug::printVar('Tokens', $aTokens);
|
||||
Debug::printVar('WordLists', $aWordLists);
|
||||
|
||||
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
|
||||
|
||||
foreach ($aPhrases as $iPhrase => $oPhrase) {
|
||||
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
|
||||
}
|
||||
|
||||
return $oValidTokens;
|
||||
}
|
||||
|
||||
|
||||
private function computeValidTokens($aTokens, $sNormQuery)
|
||||
{
|
||||
$oValidTokens = new TokenList();
|
||||
|
||||
if (!empty($aTokens)) {
|
||||
$this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
|
||||
|
||||
// Try more interpretations for Tokens that could not be matched.
|
||||
foreach ($aTokens as $sToken) {
|
||||
if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
|
||||
if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
|
||||
// US ZIP+4 codes - merge in the 5-digit ZIP code
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\Postcode(null, $aData[1], 'us')
|
||||
);
|
||||
} elseif (preg_match('/^ [0-9]+$/', $sToken)) {
|
||||
// Unknown single word token with a number.
|
||||
// Assume it is a house number.
|
||||
$oValidTokens->addToken(
|
||||
$sToken,
|
||||
new Token\HouseNumber(null, trim($sToken))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $oValidTokens;
|
||||
}
|
||||
|
||||
|
||||
private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
|
||||
{
|
||||
// Check which tokens we have, get the ID numbers
|
||||
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
|
||||
$sSQL .= ' operator, coalesce(search_name_count, 0) as count';
|
||||
$sSQL .= ' FROM word WHERE word_token in (';
|
||||
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
|
||||
|
||||
Debug::printSQL($sSQL);
|
||||
|
||||
$aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
|
||||
|
||||
foreach ($aDBWords as $aWord) {
|
||||
$oToken = null;
|
||||
$iId = (int) $aWord['word_id'];
|
||||
|
||||
if ($aWord['class']) {
|
||||
// Special terms need to appear in their normalized form.
|
||||
// (postcodes are not normalized in the word table)
|
||||
$sNormWord = $this->normalizeString($aWord['word']);
|
||||
if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
|
||||
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
|
||||
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
|
||||
if ($aWord['word']
|
||||
&& pg_escape_string($aWord['word']) == $aWord['word']
|
||||
) {
|
||||
$oToken = new Token\Postcode(
|
||||
$iId,
|
||||
$aWord['word'],
|
||||
$aWord['country_code']
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// near and in operator the same at the moment
|
||||
$oToken = new Token\SpecialTerm(
|
||||
$iId,
|
||||
$aWord['class'],
|
||||
$aWord['type'],
|
||||
$aWord['operator'] ? Operator::NEAR : Operator::NONE
|
||||
);
|
||||
}
|
||||
} elseif ($aWord['country_code']) {
|
||||
// Filter country tokens that do not match restricted countries.
|
||||
if (!$this->aCountryRestriction
|
||||
|| in_array($aWord['country_code'], $this->aCountryRestriction)
|
||||
) {
|
||||
$oToken = new Token\Country($iId, $aWord['country_code']);
|
||||
}
|
||||
} else {
|
||||
$oToken = new Token\Word(
|
||||
$iId,
|
||||
$aWord['word_token'][0] != ' ',
|
||||
(int) $aWord['count'],
|
||||
substr_count($aWord['word_token'], ' ')
|
||||
);
|
||||
}
|
||||
|
||||
if ($oToken) {
|
||||
$oValidTokens->addToken($aWord['word_token'], $oToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add the tokens from this phrase to the given list of tokens.
|
||||
*
|
||||
* @param string[] $aTokens List of tokens to append.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
private static function addTokens(&$aTokens, $aWords)
|
||||
{
|
||||
$iNumWords = count($aWords);
|
||||
|
||||
for ($i = 0; $i < $iNumWords; $i++) {
|
||||
$sPhrase = $aWords[$i];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
|
||||
for ($j = $i + 1; $j < $iNumWords; $j++) {
|
||||
$sPhrase .= ' '.$aWords[$j];
|
||||
$aTokens[' '.$sPhrase] = ' '.$sPhrase;
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testEmptyPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array(), new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
array(array('')),
|
||||
$oPhrase->getWordSets()
|
||||
);
|
||||
$this->assertNull($oPhrase->getWordSets());
|
||||
}
|
||||
|
||||
|
||||
public function testSingleWordPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('a', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a'), new TokensFullSet());
|
||||
|
||||
$this->assertEquals(
|
||||
'(a)',
|
||||
@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testMultiWordPhrase()
|
||||
{
|
||||
$oPhrase = new Phrase('a b', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b),(a|b)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
);
|
||||
|
||||
$oPhrase = new Phrase('a b c', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b c),(a|b c),(a b|c),(a|b|c)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
);
|
||||
|
||||
$oPhrase = new Phrase('a b c d', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
|
||||
$this->assertEquals(
|
||||
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testInverseWordSets()
|
||||
{
|
||||
$oPhrase = new Phrase('a b c', '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
|
||||
$oPhrase->invertWordSets();
|
||||
|
||||
$this->assertEquals(
|
||||
@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
|
||||
public function testMaxWordSets()
|
||||
{
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$aWords = array_fill(0, 4, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensFullSet());
|
||||
$this->assertEquals(8, count($oPhrase->getWordSets()));
|
||||
$oPhrase->invertWordSets();
|
||||
$this->assertEquals(8, count($oPhrase->getWordSets()));
|
||||
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensFullSet());
|
||||
$aWords = array_fill(0, 18, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensFullSet());
|
||||
$this->assertEquals(100, count($oPhrase->getWordSets()));
|
||||
$oPhrase->invertWordSets();
|
||||
$this->assertEquals(100, count($oPhrase->getWordSets()));
|
||||
@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
public function testPartialTokensShortTerm()
|
||||
{
|
||||
$oPhrase = new Phrase('a b c d', '');
|
||||
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
|
||||
$oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
|
||||
$this->assertEquals(
|
||||
'(a|b c d),(a|b c|d)',
|
||||
$this->serializeSets($oPhrase->getWordSets())
|
||||
@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
|
||||
|
||||
public function testPartialTokensLongTerm()
|
||||
{
|
||||
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
|
||||
$oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
|
||||
$aWords = array_fill(0, 18, 'a');
|
||||
$oPhrase = new Phrase(join(' ', $aWords), '');
|
||||
$oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
|
||||
$this->assertEquals(80, count($oPhrase->getWordSets()));
|
||||
}
|
||||
}
|
||||
|
@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
|
||||
$this->assertFalse($TL->contains('unknownword'));
|
||||
$this->assertEquals(array(), $TL->get('unknownword'));
|
||||
}
|
||||
|
||||
public function testAddress()
|
||||
{
|
||||
$this->expectOutputRegex('/<p><tt>/');
|
||||
|
||||
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
|
||||
->setMethods(array('getAll', 'getDBQuotedList'))
|
||||
->getMock();
|
||||
|
||||
$oDbStub->method('getDBQuotedList')
|
||||
->will($this->returnCallback(function ($aVals) {
|
||||
return array_map(function ($sVal) {
|
||||
return "'".$sVal."'";
|
||||
}, $aVals);
|
||||
}));
|
||||
|
||||
|
||||
$oDbStub->method('getAll')
|
||||
->will($this->returnCallback(function ($sql) {
|
||||
$aResults = array();
|
||||
if (preg_match('/1051/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => '1051',
|
||||
'class' => 'place',
|
||||
'type' => 'house'
|
||||
));
|
||||
}
|
||||
if (preg_match('/hauptstr/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'hauptstr',
|
||||
'class' => 'place',
|
||||
'type' => 'street',
|
||||
'operator' => true
|
||||
));
|
||||
}
|
||||
if (preg_match('/64286/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => '64286',
|
||||
'word' => '64286',
|
||||
'class' => 'place',
|
||||
'type' => 'postcode'
|
||||
));
|
||||
}
|
||||
if (preg_match('/darmstadt/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'darmstadt',
|
||||
'count' => 533
|
||||
));
|
||||
}
|
||||
if (preg_match('/alemagne/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'alemagne',
|
||||
'country_code' => 'de',
|
||||
));
|
||||
}
|
||||
if (preg_match('/mexico/', $sql)) {
|
||||
$aResults[] = $this->wordResult(array(
|
||||
'word_id' => 999,
|
||||
'word_token' => 'mexico',
|
||||
'country_code' => 'mx',
|
||||
));
|
||||
}
|
||||
return $aResults;
|
||||
}));
|
||||
|
||||
$aCountryCodes = array('de', 'fr');
|
||||
$sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
|
||||
$aTokens = explode(' ', $sNormQuery);
|
||||
|
||||
$TL = new TokenList;
|
||||
$TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
|
||||
$this->assertEquals(5, $TL->count());
|
||||
|
||||
$this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
|
||||
$this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
|
||||
$this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
|
||||
$this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
|
||||
$this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
|
||||
}
|
||||
}
|
||||
|
17
test/php/Nominatim/tokenizer.php
Normal file
17
test/php/Nominatim/tokenizer.php
Normal file
@ -0,0 +1,17 @@
|
||||
<?php
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
class Tokenizer
|
||||
{
|
||||
private $oDB;
|
||||
|
||||
public function __construct(&$oDB)
|
||||
{
|
||||
$this->oDB =& $oDB;
|
||||
}
|
||||
|
||||
public function checkStatus()
|
||||
{
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user