Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql

ICU tokenizer: switch match method to using partial terms
Sarah Hoffmann 2021-09-28 09:45:15 +02:00 committed by GitHub
commit 40f9d52ad8
11 changed files with 236 additions and 189 deletions

View File

@@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no
 house numbers.
 
 ```sql
-FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching street from the
-`addr:street` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:street` tag name. Must return either NULL or FALSE
+when the place has no `addr:street` tag.
 
 ```sql
-FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching place from the
-`addr:place` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:place` tag name. Must return either NULL or FALSE
+when the place has no `addr:place` tag.
 
 ```sql
 FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
@@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
 database. Must be NULL when the place has no `addr:place` tag.
 
 ```sql
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
 ```
 
-Return the match and search token IDs for explicit `addr:*` tags for the place
-other than `addr:street` and `addr:place`. For each address item there are
-three pieces of information returned:
-
-* _key_ contains the type of address item (city, county, etc.). This is the
-  key handed in with the `address` dictionary.
-* *match_tokens* is the list of token IDs used to find the corresponding
-  place object for the address part. The list is matched against the IDs
-  from `token_get_name_match_tokens`.
-* *search_tokens* is the list of token IDs under which to search the address
-  item. It is used when no corresponding place object was found.
+Return the set of keys for which address information is provided. This
+should correspond to the list of (relevant) `addr:*` tags with the `addr:`
+prefix removed or the keys used in the `address` dictionary of the place info.
+
+```sql
+FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
+```
+
+Return the array of search tokens for the given address part. `key` can be
+expected to be one of those returned with `token_get_address_keys()`. The
+search tokens are added to the address search vector of the place, when no
+corresponding OSM object could be found for the given address part from which
+to copy the name information.
+
+```sql
+FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+```
+
+Check if the given tokens match against the address part `key`.
+
+__Warning:__ the tokens that are handed in are the lists previously saved
+from `token_get_name_search_tokens()`, _not_ from the match token list. This
+is an historical oddity which will be fixed at some point in the future.
+Currently, tokenizers are encouraged to make sure that matching works against
+both the search token list and the match token list.
 
 ```sql
 FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
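
The new interface can be exercised directly in psql. The following is a
minimal sketch assuming the ICU tokenizer's JSON layout from this commit;
the `addr` payload and all token IDs are invented for illustration:

```sql
-- Each addr:* key maps to a Postgres array literal of search token IDs.
SELECT key,
       token_get_address_search_tokens(info, key) AS search_tokens,
       token_matches_address(info, key, ARRAY[100, 564, 23]) AS matches
  FROM (SELECT '{"addr": {"city": "{564,23}", "suburb": "{77}"}}'::jsonb AS info) t,
       LATERAL token_get_address_keys(info) AS key;
-- city   -> {564,23} -> matches = true  (all tokens contained)
-- suburb -> {77}     -> matches = false
```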

View File

@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
 
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
   parent_place_id BIGINT;
   location RECORD;
 BEGIN
-  parent_place_id := find_parent_for_address(street, place, partition, centroid);
+  parent_place_id := find_parent_for_address(token_info, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
-                                                 token_addr_place_match_tokens(NEW.token_info),
-                                                 NEW.partition, place_centroid, NEW.linegeo);
+  NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
+                                                 place_centroid, NEW.linegeo);
 
   interpol_postcode := token_normalized_postcode(NEW.address->'postcode');

View File

@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                              from_rank SMALLINT, to_rank SMALLINT,
-                                             extent FLOAT, tokens INT[])
+                                             extent FLOAT, token_info JSONB, key TEXT)
   RETURNS nearfeaturecentr
   AS $$
 DECLARE
@@ -80,7 +80,7 @@ BEGIN
       FROM location_area_large_{{ partition }}
       WHERE geometry && ST_Expand(feature, extent)
             AND rank_address between from_rank and to_rank
-            AND tokens && keywords
+            AND token_matches_address(token_info, key, keywords)
       GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
       ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
     RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
 
 CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                       point GEOMETRY,
-                                                      isin_token INTEGER[])
+                                                      token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_street(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id FROM search_name_{{ partition }}
       INTO parent
-      WHERE name_vector && isin_token
+      WHERE token_matches_street(token_info, name_vector)
            AND centroid && ST_Expand(point, 0.015)
            AND address_rank between 26 and 27
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                        point GEOMETRY,
-                                                       isin_token INTEGER[])
+                                                       token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_place(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id
      INTO parent
      FROM search_name_{{ partition }}
-      WHERE name_vector && isin_token
+      WHERE token_matches_place(token_info, name_vector)
            AND centroid && ST_Expand(point, 0.04)
            AND address_rank between 16 and 25
      ORDER BY ST_Distance(centroid, point) ASC limit 1;
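
The new early-return guards mean the partition tables are not scanned at all
when the relevant tag is absent. A minimal sketch of the guard/match pair,
using the ICU tokenizer definitions that appear later in this diff (token IDs
invented):

```sql
-- No 'street' key: the guard returns false and the partition scan is skipped.
SELECT token_has_addr_street('{"place": "{10,11}"}'::jsonb);             -- false
-- All street tokens {1,2} are contained in the candidate's name_vector.
SELECT token_matches_street('{"street": "{1,2}"}'::jsonb, ARRAY[1,2,5]); -- true
```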

View File

@@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street INTEGER[],
-                                               addr_place INTEGER[],
+                                               token_info JSONB,
                                                is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
@@ -119,8 +118,7 @@ BEGIN
     parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
     IF parent_place_id is null THEN
-      parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                                 poi_partition, bbox);
+      parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
     END IF;
 
     IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -333,13 +331,14 @@ BEGIN
     WHERE s.place_id = parent_place_id;
 
   FOR addr_item IN
-    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-      FROM token_get_address_tokens(token_info)
-      WHERE not search_tokens <@ parent_address_vector
+    SELECT (get_addr_tag_rank(key, country)).*, key,
+           token_get_address_search_tokens(token_info, key) as search_tokens
+      FROM token_get_address_keys(token_info) as key
+      WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
   LOOP
     addr_place := get_address_place(in_partition, geometry,
                                     addr_item.from_rank, addr_item.to_rank,
-                                    addr_item.extent, addr_item.match_tokens);
+                                    addr_item.extent, token_info, addr_item.key);
 
     IF addr_place is null THEN
       -- No place found in OSM that matches. Make it at least searchable.
@@ -447,14 +446,16 @@ BEGIN
 
   FOR location IN
     SELECT (get_address_place(partition, geometry, from_rank, to_rank,
-                              extent, match_tokens)).*, search_tokens
-      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-              FROM token_get_address_tokens(token_info)) x
+                              extent, token_info, key)).*, key
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, key
+              FROM token_get_address_keys(token_info) as key) x
      ORDER BY rank_address, distance, isguess desc
  LOOP
    IF location.place_id is null THEN
 {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      nameaddress_vector := array_merge(nameaddress_vector,
+                                        token_get_address_search_tokens(token_info,
+                                                                        location.key));
 {% endif %}
    ELSE
 {% if not db.reverse_only %}
@@ -689,9 +690,6 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street INTEGER[];
-  addr_place INTEGER[];
-
   max_rank SMALLINT;
 
   name_vector INTEGER[];
@@ -860,8 +858,6 @@ BEGIN
     END IF;
 
     NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-    addr_street := token_addr_street_match_tokens(NEW.token_info);
-    addr_place := token_addr_place_match_tokens(NEW.token_info);
 
     NEW.postcode := null;
@@ -907,7 +903,7 @@ BEGIN
       NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                  NEW.partition,
                                                  ST_Envelope(NEW.geometry),
-                                                 addr_street, addr_place,
+                                                 NEW.token_info,
                                                  is_place_address);
 
       -- If we found the road take a shortcut here.
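
The rewritten loop filter can be tried standalone. A sketch with invented
values, assuming the ICU implementations below: keys whose search tokens are
already contained in the parent's address vector drop out of the loop.

```sql
-- 'city' tokens {5} are already in the parent vector and are filtered out;
-- 'suburb' tokens {9} are not, so that key survives the filter.
WITH t AS (SELECT '{"addr": {"city": "{5}", "suburb": "{9}"}}'::jsonb AS info)
SELECT key
  FROM t, LATERAL token_get_address_keys(info) AS key
 WHERE NOT token_get_address_search_tokens(info, key) <@ ARRAY[5, 6, 7];
-- returns: suburb
```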

View File

@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
 
 -- Find the parent of an address with addr:street/addr:place tag.
 --
--- \param street     Value of addr:street or NULL if tag is missing.
--- \param place      Value of addr:place or NULL if tag is missing.
+-- \param token_info Naming info with the address information.
 -- \param partition  Partition where to search the parent.
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
 DECLARE
   parent_place_id BIGINT;
 BEGIN
-  IF street is not null THEN
-    -- Check for addr:street attributes
-    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
+  -- Check for addr:street attributes
+  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
+  IF parent_place_id is not null THEN
+    {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+    RETURN parent_place_id;
   END IF;
 
   -- Check for addr:place attributes.
-  IF place is not null THEN
-    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
-  END IF;
-
-  RETURN NULL;
+  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
+  {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
 
 CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
   RETURNS BOOLEAN
   AS $$

View File

@@ -14,7 +14,6 @@ DECLARE
   out_partition INTEGER;
   out_parent_place_id BIGINT;
   location RECORD;
-  address_street_word_ids INTEGER[];
 
 BEGIN
@@ -54,13 +53,9 @@ BEGIN
   place_centroid := ST_Centroid(linegeo);
   out_partition := get_partition('us');
-  out_parent_place_id := null;
 
-  address_street_word_ids := token_addr_street_match_tokens(token_info);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
+  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
+                                                    token_info);
 
   IF out_parent_place_id IS NULL THEN
     SELECT getNearestParallelRoadFeature(out_partition, linegeo)

View File

@@ -34,40 +34,59 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -127,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;
 
-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
   END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
 
 
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+      VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
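
`getorcreate_partial_word` is the write-side counterpart of the new matching
functions: it allocates a token ID per partial term exactly once. A usage
sketch, assuming the standard `word` table and `seq_word` sequence are present
in the database:

```sql
SELECT getorcreate_partial_word('road');  -- inserts a new 'w' row, returns its id
SELECT getorcreate_partial_word('road');  -- finds the existing row, same id again
```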

View File

@@ -34,17 +34,31 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place_match' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] && street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place_match')::INTEGER[] && place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -55,19 +69,24 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->key->>0)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
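
The two tokenizers now differ mainly in the array operator they apply: the
ICU version requires all stored tokens to be contained in the candidate
vector (`<@`), while the legacy version keeps the looser overlap test (`&&`).
A plain-operator illustration with invented token IDs:

```sql
SELECT ARRAY[1,2] <@ ARRAY[1,2,3] AS icu_match,     -- true: every token present
       ARRAY[1,9] <@ ARRAY[1,2,3] AS icu_no_match,  -- false: token 9 missing
       ARRAY[1,9] && ARRAY[1,2,3] AS legacy_match;  -- true: any overlap suffices
```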

View File

@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.data_dir = data_dir
         self.naming_rules = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -52,10 +50,9 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                                      config='TOKENIZER_CONFIG'))
         self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
@@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.naming_rules = ICUNameProcessorRules(conn=conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
     def check_database(self):
@@ -122,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
             @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         with connect(self.dsn) as conn:
             self.naming_rules.save_rules(conn)
 
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
@@ -424,12 +417,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                     hnrs.append(value)
                 elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
+                    token_info.add_street(self._compute_partial_tokens(value))
                 elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
+                    token_info.add_place(self._compute_partial_tokens(value))
                 elif not key.startswith('_') and \
                      key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+                    addr_terms.append((key, self._compute_partial_tokens(value)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -438,6 +431,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+        """
+        norm_name = self.name_processor.get_search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
+
 
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
@@ -551,30 +570,25 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
 
 
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
         """ Add addr:place search and match terms.
         """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
 
 
     def add_address_terms(self, terms):
         """ Add additional address terms.
        """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
 
         if tokens:
             self.data['addr'] = tokens
@@ -588,6 +602,7 @@ class _TokenCache:
     """
     def __init__(self):
        self.names = {}
+        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}
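
After this change the Python side emits one flat token array per address
part. An invented example of the resulting token_info payload and how the
SQL layer from earlier in this diff reads it back:

```sql
-- _TokenInfo now stores e.g. {"street": "{201,202}", "place": "{310}",
--                             "addr": {"city": "{42}"}}  (token ids invented)
SELECT token_addr_place_search_tokens('{"place": "{310}"}'::jsonb);  -- {310}
```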

View File

@@ -125,9 +125,6 @@ Feature: Creation of search terms
        Then placex contains
         | object | parent_place_id |
         | N1     | N2 |
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
        When sending search query "23 Rose Street"
        Then exactly 1 results are returned
        And results contain
@@ -156,9 +153,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | city        | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector            | nameaddress_vector |
-         | N1     | #Walltown, #Blue house | Walltown, Strange, Town |
        When sending search query "23 Walltown, Strange Town"
        Then results contain
         | osm | display_name |
@@ -190,9 +184,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | city        | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector            | nameaddress_vector |
-         | N1     | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
        When sending search query "23 Moon Sun, Strange Town"
        Then results contain
         | osm | display_name |
@@ -212,9 +203,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | Walltown | :w-north |
         | N2  | place   | suburb      | Strange Town | Walltown | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
        When sending search query "23 Rose Street, Walltown"
        Then exactly 1 result is returned
        And results contain
@@ -303,9 +291,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | suburb      | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Walltown |
        When sending search query "Green Moss, Rose Street, Walltown"
        Then exactly 0 result is returned
        When sending search query "Green Moss, Walltown"

View File

@@ -10,6 +10,7 @@ from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
+from nominatim.db.sql_preprocessor import SQLPreprocessor
 
 from mock_icu_word_table import MockIcuWordTable
@@ -76,6 +77,15 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     return _mk_analyser
 
 
+@pytest.fixture
+def sql_functions(temp_db_conn, def_config, src_dir):
+    orig_sql = def_config.lib_dir.sql
+    def_config.lib_dir.sql = src_dir / 'lib-sql'
+    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
+    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
+    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
+    def_config.lib_dir.sql = orig_sql
+
+
 @pytest.fixture
 def getorcreate_full_word(temp_db_cursor):
@@ -144,7 +154,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok.init_new_db(test_config)
 
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -163,7 +172,6 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
     monkeypatch.undo()
@@ -173,23 +181,18 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
 
     assert tok.naming_rules is not None
     assert tok.term_normalization == ':: lower();'
-    assert tok.max_word_frequency == '90300'
 
 
 def test_update_sql_functions(db_prop, temp_db_cursor,
                               tokenizer_factory, test_config, table_factory,
                               monkeypatch):
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
 
     func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""")
 
     tok.update_sql_functions(test_config)
@@ -304,7 +307,7 @@ def test_add_country_names_extend(analyzer, word_table):
 class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
         with analyzer() as anl:
             self.analyzer = anl
             yield anl
@@ -351,7 +354,7 @@ class TestPlaceNames:
 class TestPlaceAddress:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
         with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
             self.analyzer = anl
             yield anl
@@ -424,7 +427,7 @@ class TestPlaceAddress:
     def test_process_place_street(self):
         info = self.process_address(street='Grand Road')
 
-        assert eval(info['street']) == self.name_token_set('#GRAND ROAD')
+        assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
 
 
     def test_process_place_street_empty(self):
@@ -436,16 +439,13 @@ class TestPlaceAddress:
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')
 
-        assert eval(info['place_search']) == self.name_token_set('#HONU LULU',
-                                                                 'HONU', 'LULU')
-        assert eval(info['place_match']) == self.name_token_set('#HONU LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
 
 
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')
 
-        assert 'place_search' not in info
-        assert 'place_match' not in info
+        assert 'place' not in info
 
 
     def test_process_place_address_terms(self):
@@ -453,16 +453,12 @@ class TestPlaceAddress:
                                     suburb='Zwickau', street='Hauptstr',
                                     full='right behind the church')
 
-        city_full = self.name_token_set('#ZWICKAU')
-        city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU')
-        state_full = self.name_token_set('#SACHSEN')
-        state_all = self.name_token_set('#SACHSEN', 'SACHSEN')
+        city = self.name_token_set('ZWICKAU')
+        state = self.name_token_set('SACHSEN')
 
-        result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()}
+        result = {k: eval(v) for k,v in info['addr'].items()}
 
-        assert result == {'city': [city_all, city_full],
-                          'suburb': [city_all, city_full],
-                          'state': [state_all, state_full]}
+        assert result == {'city': city, 'suburb': city, 'state': state}
 
 
     def test_process_place_address_terms_empty(self):