2017-10-12 23:37:44 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace Nominatim;
|
|
|
|
|
|
|
|
/**
 * Segment of a query string.
 *
 * The parts of a query string are usually separated by commas.
 */
class Phrase
{
    // Maximum number of terms a single segmentation may consist of.
    const MAX_WORDSET_LEN = 20;
    // Maximum number of segmentations kept for a phrase.
    const MAX_WORDSETS = 100;

    // Complete phrase as a string.
    private $sPhrase;
    // Element type for structured searches.
    private $sPhraseType;
    // Space-separated words of the phrase.
    private $aWords;
    // Possible segmentations of the phrase. Empty until computeWordSets()
    // has been called, so that the accessors always work on an array.
    private $aWordSets = array();

    /**
     * Comparison callback that orders arrays by their number of elements.
     *
     * @param array $aA First array.
     * @param array $aB Second array.
     *
     * @return integer -1, 0 or 1 when $aA has fewer, equally many or more
     *                 elements than $aB.
     */
    public static function cmpByArraylen($aA, $aB)
    {
        $iALen = count($aA);
        $iBLen = count($aB);

        if ($iALen == $iBLen) {
            return 0;
        }

        return ($iALen < $iBLen) ? -1 : 1;
    }

    /**
     * Create a phrase from its string representation.
     *
     * The phrase is trimmed and then split into words at every single
     * space character.
     *
     * @param string $sPhrase     Text of the phrase.
     * @param string $sPhraseType Element type of the phrase, see
     *                            getPhraseType().
     */
    public function __construct($sPhrase, $sPhraseType)
    {
        $this->sPhrase = trim($sPhrase);
        $this->sPhraseType = $sPhraseType;
        $this->aWords = explode(' ', $this->sPhrase);
    }

    /**
     * Return the element type of the phrase.
     *
     * @return string Phrase type if the phrase comes from a structured query
     *                or empty string otherwise.
     */
    public function getPhraseType()
    {
        return $this->sPhraseType;
    }

    /**
     * Return the array of possible segmentations of the phrase.
     *
     * @return string[][] Array of segmentations, each consisting of an
     *                    array of terms. Empty as long as computeWordSets()
     *                    has not been called.
     */
    public function getWordSets()
    {
        return $this->aWordSets;
    }

    /**
     * Add the tokens from this phrase to the given list of tokens.
     *
     * Every run of consecutive words is added twice: once as is and once
     * with a leading space. Each token is used as key and as value.
     *
     * @param string[] $aTokens List of tokens to append.
     *
     * @return void
     */
    public function addTokens(&$aTokens)
    {
        $iNumWords = count($this->aWords);

        for ($i = 0; $i < $iNumWords; $i++) {
            $sPhrase = $this->aWords[$i];
            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
            $aTokens[$sPhrase] = $sPhrase;

            // Extend the run starting at word i by one word at a time.
            for ($j = $i + 1; $j < $iNumWords; $j++) {
                $sPhrase .= ' '.$this->aWords[$j];
                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
                $aTokens[$sPhrase] = $sPhrase;
            }
        }
    }

    /**
     * Invert the set of possible segmentations.
     *
     * Reverses the order of the terms within each segmentation. The order
     * of the segmentations themselves is left untouched.
     *
     * @return void
     */
    public function invertWordSets()
    {
        foreach ($this->aWordSets as $i => $aSet) {
            $this->aWordSets[$i] = array_reverse($aSet);
        }
    }

    /**
     * Compute the possible segmentations of the phrase.
     *
     * A segmentation splits the words of the phrase into terms of one or
     * more consecutive words. Only terms accepted by $oTokens are used.
     * The result replaces the current word sets; it is sorted by number of
     * terms (fewest first) and cut off after MAX_WORDSETS entries.
     *
     * @param object $oTokens Object with a method containsAny($sTerm) that
     *                        tells if a term may be used in a segmentation.
     *
     * @return void
     */
    public function computeWordSets($oTokens)
    {
        $iNumWords = count($this->aWords);
        // Caches the word sets for the partial phrase up to word i.
        $aSetCache = array_fill(0, $iNumWords, array());

        // Initialise first element of cache. There can only be the word.
        if ($oTokens->containsAny($this->aWords[0])) {
            $aSetCache[0][] = array($this->aWords[0]);
        }

        // Now do the next elements using what we already have.
        for ($i = 1; $i < $iNumWords; $i++) {
            for ($j = $i; $j > 0; $j--) {
                // Term made of the words j to i, grown to the left while
                // j decreases.
                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;

                if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                    $aPartial = array($sPartial);
                    // Append the term to every segmentation of the words
                    // before j that still has room for another term.
                    foreach ($aSetCache[$j - 1] as $aSet) {
                        if (count($aSet) < self::MAX_WORDSET_LEN) {
                            $aSetCache[$i][] = array_merge($aSet, $aPartial);
                        }
                    }

                    // Prune only once the list has grown well beyond the
                    // limit, so that sorting is not done on every round.
                    if (count($aSetCache[$i]) > 2 * self::MAX_WORDSETS) {
                        $aSetCache[$i] = self::keepShortestWordSets($aSetCache[$i]);
                    }
                }
            }

            // finally the current full phrase
            $sPartial = $this->aWords[0].' '.$sPartial;
            if ($oTokens->containsAny($sPartial)) {
                $aSetCache[$i][] = array($sPartial);
            }
        }

        $this->aWordSets = self::keepShortestWordSets($aSetCache[$iNumWords - 1]);
    }

    /**
     * Return the internal state of the phrase for debugging output.
     *
     * @return array Phrase type, phrase string, words and word sets under
     *               the keys 'Type', 'Phrase', 'Words' and 'WordSets'.
     */
    public function debugInfo()
    {
        return array(
                'Type' => $this->sPhraseType,
                'Phrase' => $this->sPhrase,
                'Words' => $this->aWords,
                'WordSets' => $this->aWordSets
               );
    }

    /**
     * Sort segmentations by number of terms and keep the shortest ones.
     *
     * @param string[][] $aSets List of segmentations.
     *
     * @return string[][] At most MAX_WORDSETS segmentations, those with the
     *                    fewest terms first.
     */
    private static function keepShortestWordSets($aSets)
    {
        // __CLASS__ keeps the callback valid in whatever namespace the
        // class is declared.
        usort($aSets, array(__CLASS__, 'cmpByArraylen'));

        return array_slice($aSets, 0, self::MAX_WORDSETS);
    }
}
|