avoid dropping tokens completely just because they are expensive. Use ' ' token in preference to just dropping

This commit is contained in:
Brian Quinion 2013-01-31 14:17:41 +00:00
parent 4bc40d2c0b
commit 3852096c80
2 changed files with 43 additions and 3 deletions

View File

@ -646,7 +646,7 @@
}
}
echo "<table border=\"1\">";
echo "<tr><th>rank</th><th>Name Tokens</th><th>Address Tokens</th><th>country</th><th>operator</th><th>class</th><th>type</th><th>house#</th><th>Lat</th><th>Lon</th><th>Radius</th></tr>";
echo "<tr><th>rank</th><th>Name Tokens</th><th>Name Not</th><th>Address Tokens</th><th>Address Not</th><th>country</th><th>operator</th><th>class</th><th>type</th><th>house#</th><th>Lat</th><th>Lon</th><th>Radius</th></tr>";
foreach($aData as $iRank => $aRankedSet)
{
foreach($aRankedSet as $aRow)
@ -663,6 +663,15 @@
}
echo "</td>";
echo "<td>";
$sSep = '';
foreach($aRow['aNameNonSearch'] as $iWordID)
{
echo $sSep.'#'.$aWordsIDs[$iWordID].'#';
$sSep = ', ';
}
echo "</td>";
echo "<td>";
$sSep = '';
foreach($aRow['aAddress'] as $iWordID)
@ -672,6 +681,15 @@
}
echo "</td>";
echo "<td>";
$sSep = '';
foreach($aRow['aAddressNonSearch'] as $iWordID)
{
echo $sSep.'#'.$aWordsIDs[$iWordID].'#';
$sSep = ', ';
}
echo "</td>";
echo "<td>".$aRow['sCountryCode']."</td>";
echo "<td>".$aRow['sOperator']."</td>";

View File

@ -638,10 +638,30 @@
$aSearch = $aCurrentSearch;
$aSearch['iSearchRank'] += 1;
if ($aWordFrequencyScores[$aSearchTerm['word_id']] < CONST_Max_Word_Frequency)
{
$aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id'];
if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch;
}
elseif (isset($aValidTokens[' '.$sToken])) // revert to the token version?
{
foreach($aValidTokens[' '.$sToken] as $aSearchTermToken)
{
if (empty($aSearchTermToken['country_code'])
&& empty($aSearchTermToken['lat'])
&& empty($aSearchTermToken['class']))
{
$aSearch = $aCurrentSearch;
$aSearch['iSearchRank'] += 1;
$aSearch['aAddress'][$aSearchTermToken['word_id']] = $aSearchTermToken['word_id'];
if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch;
}
}
}
else
{
$aSearch['aAddressNonSearch'][$aSearchTerm['word_id']] = $aSearchTerm['word_id'];
if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch;
if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch;
}
}
if (!sizeof($aCurrentSearch['aName']) || $aCurrentSearch['iNamePhrase'] == $iPhrase)
@ -900,6 +920,7 @@
// TODO: filter out the pointless search terms (2 letter name tokens and less)
// they might be right - but they are just too darned expensive to run
if (sizeof($aSearch['aName'])) $aTerms[] = "name_vector @> ARRAY[".join($aSearch['aName'],",")."]";
if (sizeof($aSearch['aNameNonSearch'])) $aTerms[] = "array_cat(name_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aNameNonSearch'],",")."]";
if (sizeof($aSearch['aAddress']) && $aSearch['aName'] != $aSearch['aAddress'])
{
// For infrequent name terms disable index usage for address
@ -907,11 +928,12 @@
sizeof($aSearch['aName']) == 1 &&
$aWordFrequencyScores[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold)
{
$aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddress'],",")."]";
$aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join(array_merge($aSearch['aAddress'],$aSearch['aAddressNonSearch']),",")."]";
}
else
{
$aTerms[] = "nameaddress_vector @> ARRAY[".join($aSearch['aAddress'],",")."]";
if (sizeof($aSearch['aAddressNonSearch'])) $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddressNonSearch'],",")."]";
}
}
if ($aSearch['sCountryCode']) $aTerms[] = "country_code = '".pg_escape_string($aSearch['sCountryCode'])."'";