ignore frequent partial search terms

Adds word counts from a full planet to the word table. A new configuration option, CONST_Max_Word_Frequency, makes it possible to take these word counts into account: the value set at import time determines whether a word is added to the search_name table, and the value at runtime determines whether a single term is used for partial search or simply ignored.
2024-09-11 19:17:17 +03:00 · 2012-09-25 00:36:34 +02:00 · 2012-09-25 00:36:34 +02:00 · e73e67001e
commit e73e67001e
parent f1063497ef
5 changed files with 49667 additions and 12 deletions
--- a/data/words.sql
+++ b/data/words.sql
--- a/settings/settings.php
+++ b/settings/settings.php
@ -5,6 +5,7 @@
 	// General settings
 	@define('CONST_Debug', false);
 	@define('CONST_Database_DSN', 'pgsql://@/nominatim');
+	@define('CONST_Max_Word_Frequency', '50000');

 	// Paths
 	@define('CONST_Postgresql_Version', '9.1');
--- a/sql/functions.sql
+++ b/sql/functions.sql
@ -83,18 +83,24 @@ END;
 $$
 LANGUAGE 'plpgsql' IMMUTABLE;

+-- returns NULL if the word is too common
 CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) 
  RETURNS INTEGER
  AS $$
 DECLARE
  lookup_token TEXT;
  return_word_id INTEGER;
+  count INTEGER;
 BEGIN
  lookup_token := trim(lookup_word);
-  SELECT min(word_id) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id;
+  SELECT min(word_id), max(search_name_count) FROM word WHERE word_token = lookup_token and class is null and type is null into return_word_id, count;
  IF return_word_id IS NULL THEN
    return_word_id := nextval('seq_word');
    INSERT INTO word VALUES (return_word_id, lookup_token, regexp_replace(lookup_token,E'([^0-9])\\1+',E'\\1','g'), null, null, null, null, 0, null);
+  ELSE
+    IF count > get_maxwordfreq() THEN
+      return_word_id := NULL;
+    END IF;
  END IF;
  RETURN return_word_id;
 END;
@ -317,7 +323,7 @@ BEGIN
      FOR j IN 1..array_upper(words, 1) LOOP
        IF (words[j] != '') THEN
          w = getorcreate_word_id(words[j]);
-          IF NOT (ARRAY[w] <@ result) THEN
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
            result := result || w;
          END IF;
        END IF;
@ -330,7 +336,7 @@ BEGIN
        s := make_standard_name(words[j]);
        IF s != '' THEN
          w := getorcreate_word_id(s);
-          IF NOT (ARRAY[w] <@ result) THEN
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
            result := result || w;
          END IF;
        END IF;
@ -379,7 +385,7 @@ BEGIN
    FOR j IN 1..array_upper(words, 1) LOOP
      IF (words[j] != '') THEN
        w = getorcreate_word_id(words[j]);
-        IF NOT (ARRAY[w] <@ result) THEN
+        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
          result := result || w;
        END IF;
      END IF;
@ -392,7 +398,7 @@ BEGIN
      s := make_standard_name(words[j]);
      IF s != '' THEN
        w := getorcreate_word_id(s);
-        IF NOT (ARRAY[w] <@ result) THEN
+        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
          result := result || w;
        END IF;
      END IF;
--- a/utils/setup.php
+++ b/utils/setup.php
@ -271,7 +271,7 @@

 	if ($aCMDResult['load-data'] || $aCMDResult['all'])
 	{
-		echo "Load Data\n";
+		echo "Drop old Data\n";
 		$bDidSomething = true;

 		$oDB =& getDB();
@ -307,17 +307,18 @@
 			echo '.';
 		}

+		// used by getorcreate_word_id to ignore frequent partial words
+		if (!pg_query($oDB->connection, 'CREATE OR REPLACE FUNCTION get_maxwordfreq() RETURNS integer AS $$ SELECT '.CONST_Max_Word_Frequency.' as maxwordfreq; $$ LANGUAGE SQL IMMUTABLE')) fail(pg_last_error($oDB->connection));
+		echo ".\n";
+
 		// pre-create the word list
 		if (!$aCMDResult['disable-token-precalc'])
 		{
-			if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct svals(name) as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection));
-			echo '.';
-			if (!pg_query($oDB->connection, 'select count(make_keywords(v)) from (select distinct postcode as v from place) as w where v is not null;')) fail(pg_last_error($oDB->connection));
-			echo '.';
-			if (!pg_query($oDB->connection, 'select count(getorcreate_housenumber_id(v)) from (select distinct housenumber as v from place where housenumber is not null) as w;')) fail(pg_last_error($oDB->connection));
-			echo '.';
+			echo "Loading word list\n";
+			pgsqlRunScriptFile(CONST_BasePath.'/data/words.sql');
 		}

+		echo "Load Data\n";
 		$aDBInstances = array();
 		for($i = 0; $i < $iInstances; $i++)
 		{
--- a/website/search.php
+++ b/website/search.php
@ -312,6 +312,7 @@
 			// Check which tokens we have, get the ID numbers			
 			$sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator';
 			$sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')';
+			$sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency;
 			$sSQL .= ' and (class is null or class not in (\'highway\'))';
 //			$sSQL .= ' group by word_token, word, class, type, location, country_code';