move special hack for US states to legacy tokenizer

The hack for IL, AL and LA is only needed because these abbreviations
are removed by the legacy tokenizer as stop words. There is no need
to keep the hack for future tokenizers. Therefore, move it to the
token extraction function.
This commit is contained in:
Sarah Hoffmann 2021-08-17 14:28:55 +02:00
parent 5f2b9e317a
commit f00b8dd1c3
3 changed files with 19 additions and 8 deletions

View File

@ -506,13 +506,6 @@ class Geocode
userError('Query string is not UTF-8 encoded.');
}
// Conflicts between US state abbreviations and various words for 'the' in different languages
if (isset($this->aLangPrefOrder['name:en'])) {
$sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery);
$sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery);
$sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery);
}
// Do we have anything that looks like a lat/lon pair?
$sQuery = $oCtx->setNearPointFromQuery($sQuery);

View File

@ -9,7 +9,8 @@ namespace Nominatim;
*/
class Phrase
{
// Complete phrase as a string.
// Complete phrase as a string (guaranteed to have no leading or trailing
// spaces).
private $sPhrase;
// Element type for structured searches.
private $sPhraseType;

View File

@ -87,6 +87,23 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
// Conflicts between US state abbreviations and various words
// for 'the' in different languages
switch (strtolower($oPhrase->getPhrase())) {
case 'il':
$aParams[':'.$iPhrase] = 'illinois';
break;
case 'al':
$aParams[':'.$iPhrase] = 'alabama';
break;
case 'la':
$aParams[':'.$iPhrase] = 'louisiana';
break;
default:
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();
break;
}
}
$sSQL = substr($sSQL, 0, -1);