Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql

ICU tokenizer: switch match method to using partial terms
Sarah Hoffmann 2021-09-28 09:45:15 +02:00 committed by GitHub
commit 40f9d52ad8
11 changed files with 236 additions and 189 deletions

View File

@@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no
 house numbers.
 
 ```sql
-FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching street from the
-`addr:street` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:street` tag name. Must return either NULL or FALSE
+when the place has no `addr:street` tag.
 
 ```sql
-FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching place from the
-`addr:place` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:place` tag name. Must return either NULL or FALSE
+when the place has no `addr:place` tag.
 
 ```sql
 FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
@@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
 database. Must be NULL when the place has no `addr:place` tag.
 
 ```sql
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
 ```
 
-Return the match and search token IDs for explicit `addr:*` tags for the place
-other than `addr:street` and `addr:place`. For each address item there are
-three pieces of information returned:
-
-* _key_ contains the type of address item (city, county, etc.). This is the
-  key handed in with the `address` dictionary.
-* *match_tokens* is the list of token IDs used to find the corresponding
-  place object for the address part. The list is matched against the IDs
-  from `token_get_name_match_tokens`.
-* *search_tokens* is the list of token IDs under which to search the address
-  item. It is used when no corresponding place object was found.
+Return the set of keys for which address information is provided. This
+should correspond to the list of (relevant) `addr:*` tags with the `addr:`
+prefix removed or the keys used in the `address` dictionary of the place info.
+
+```sql
+FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
+```
+
+Return the array of search tokens for the given address part. `key` can be
+expected to be one of those returned with `token_get_address_keys()`. The
+search tokens are added to the address search vector of the place, when no
+corresponding OSM object could be found for the given address part from which
+to copy the name information.
+
+```sql
+FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+```
+
+Check if the given tokens match against the address part `key`.
+
+__Warning:__ the tokens that are handed in are the lists previously saved
+from `token_get_name_search_tokens()`, _not_ from the match token list. This
+is an historical oddity which will be fixed at some point in the future.
+Currently, tokenizers are encouraged to make sure that matching works against
+both the search token list and the match token list.
 
 ```sql
 FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
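
The new interface can be exercised directly in psql. The following is a
minimal sketch assuming the ICU tokenizer's JSON layout from this commit;
the `addr` payload and all token IDs are invented for illustration:

```sql
-- Each addr:* key maps to a Postgres array literal of search token IDs.
SELECT key,
       token_get_address_search_tokens(info, key) AS search_tokens,
       token_matches_address(info, key, ARRAY[100, 564, 23]) AS matches
  FROM (SELECT '{"addr": {"city": "{564,23}", "suburb": "{77}"}}'::jsonb AS info) t,
       LATERAL token_get_address_keys(info) AS key;
-- city   -> {564,23} -> matches = true  (all tokens contained)
-- suburb -> {77}     -> matches = false
```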

View File

@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
 
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
   parent_place_id BIGINT;
   location RECORD;
 BEGIN
-  parent_place_id := find_parent_for_address(street, place, partition, centroid);
+  parent_place_id := find_parent_for_address(token_info, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
-                                                 token_addr_place_match_tokens(NEW.token_info),
-                                                 NEW.partition, place_centroid, NEW.linegeo);
+  NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
+                                                 place_centroid, NEW.linegeo);
 
   interpol_postcode := token_normalized_postcode(NEW.address->'postcode');

View File

@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                              from_rank SMALLINT, to_rank SMALLINT,
-                                             extent FLOAT, tokens INT[])
+                                             extent FLOAT, token_info JSONB, key TEXT)
   RETURNS nearfeaturecentr
   AS $$
 DECLARE
@@ -80,7 +80,7 @@ BEGIN
       FROM location_area_large_{{ partition }}
       WHERE geometry && ST_Expand(feature, extent)
             AND rank_address between from_rank and to_rank
-            AND tokens && keywords
+            AND token_matches_address(token_info, key, keywords)
       GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
       ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
     RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
 
 CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                       point GEOMETRY,
-                                                      isin_token INTEGER[])
+                                                      token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_street(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id FROM search_name_{{ partition }}
       INTO parent
-      WHERE name_vector && isin_token
+      WHERE token_matches_street(token_info, name_vector)
            AND centroid && ST_Expand(point, 0.015)
            AND address_rank between 26 and 27
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                        point GEOMETRY,
-                                                       isin_token INTEGER[])
+                                                       token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_place(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id
      INTO parent
      FROM search_name_{{ partition }}
-      WHERE name_vector && isin_token
+      WHERE token_matches_place(token_info, name_vector)
            AND centroid && ST_Expand(point, 0.04)
            AND address_rank between 16 and 25
      ORDER BY ST_Distance(centroid, point) ASC limit 1;
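
The new early-return guards mean the partition tables are not scanned at all
when the relevant tag is absent. A minimal sketch of the guard/match pair,
using the ICU tokenizer definitions that appear later in this diff (token IDs
invented):

```sql
-- No 'street' key: the guard returns false and the partition scan is skipped.
SELECT token_has_addr_street('{"place": "{10,11}"}'::jsonb);             -- false
-- All street tokens {1,2} are contained in the candidate's name_vector.
SELECT token_matches_street('{"street": "{1,2}"}'::jsonb, ARRAY[1,2,5]); -- true
```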

View File

@@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street INTEGER[],
-                                               addr_place INTEGER[],
+                                               token_info JSONB,
                                                is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
@@ -119,8 +118,7 @@ BEGIN
     parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
     IF parent_place_id is null THEN
-      parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                                 poi_partition, bbox);
+      parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
     END IF;
 
     IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -333,13 +331,14 @@ BEGIN
     WHERE s.place_id = parent_place_id;
 
   FOR addr_item IN
-    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-      FROM token_get_address_tokens(token_info)
-      WHERE not search_tokens <@ parent_address_vector
+    SELECT (get_addr_tag_rank(key, country)).*, key,
+           token_get_address_search_tokens(token_info, key) as search_tokens
+      FROM token_get_address_keys(token_info) as key
+      WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
   LOOP
     addr_place := get_address_place(in_partition, geometry,
                                     addr_item.from_rank, addr_item.to_rank,
-                                    addr_item.extent, addr_item.match_tokens);
+                                    addr_item.extent, token_info, addr_item.key);
 
     IF addr_place is null THEN
       -- No place found in OSM that matches. Make it at least searchable.
@@ -447,14 +446,16 @@ BEGIN
 
   FOR location IN
     SELECT (get_address_place(partition, geometry, from_rank, to_rank,
-                              extent, match_tokens)).*, search_tokens
-      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-              FROM token_get_address_tokens(token_info)) x
+                              extent, token_info, key)).*, key
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, key
+              FROM token_get_address_keys(token_info) as key) x
      ORDER BY rank_address, distance, isguess desc
  LOOP
    IF location.place_id is null THEN
 {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      nameaddress_vector := array_merge(nameaddress_vector,
+                                        token_get_address_search_tokens(token_info,
+                                                                        location.key));
 {% endif %}
    ELSE
 {% if not db.reverse_only %}
@@ -689,9 +690,6 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street INTEGER[];
-  addr_place INTEGER[];
-
   max_rank SMALLINT;
 
   name_vector INTEGER[];
@@ -860,8 +858,6 @@ BEGIN
     END IF;
 
     NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-    addr_street := token_addr_street_match_tokens(NEW.token_info);
-    addr_place := token_addr_place_match_tokens(NEW.token_info);
 
     NEW.postcode := null;
@@ -907,7 +903,7 @@ BEGIN
       NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                  NEW.partition,
                                                  ST_Envelope(NEW.geometry),
-                                                 addr_street, addr_place,
+                                                 NEW.token_info,
                                                  is_place_address);
 
       -- If we found the road take a shortcut here.
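
The rewritten loop filter can be tried standalone. A sketch with invented
values, assuming the ICU implementations below: keys whose search tokens are
already contained in the parent's address vector drop out of the loop.

```sql
-- 'city' tokens {5} are already in the parent vector and are filtered out;
-- 'suburb' tokens {9} are not, so that key survives the filter.
WITH t AS (SELECT '{"addr": {"city": "{5}", "suburb": "{9}"}}'::jsonb AS info)
SELECT key
  FROM t, LATERAL token_get_address_keys(info) AS key
 WHERE NOT token_get_address_search_tokens(info, key) <@ ARRAY[5, 6, 7];
-- returns: suburb
```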

View File

@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
 
 -- Find the parent of an address with addr:street/addr:place tag.
 --
--- \param street     Value of addr:street or NULL if tag is missing.
--- \param place      Value of addr:place or NULL if tag is missing.
+-- \param token_info Naming info with the address information.
 -- \param partition  Partition where to search the parent.
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
 DECLARE
   parent_place_id BIGINT;
 BEGIN
-  IF street is not null THEN
-    -- Check for addr:street attributes
-    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
+  -- Check for addr:street attributes
+  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
+  IF parent_place_id is not null THEN
+    {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+    RETURN parent_place_id;
   END IF;
 
   -- Check for addr:place attributes.
-  IF place is not null THEN
-    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
-  END IF;
-
-  RETURN NULL;
+  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
+  {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
 
 CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
   RETURNS BOOLEAN
   AS $$

View File

@@ -14,7 +14,6 @@ DECLARE
   out_partition INTEGER;
   out_parent_place_id BIGINT;
   location RECORD;
-  address_street_word_ids INTEGER[];
 
 BEGIN
@@ -54,13 +53,9 @@ BEGIN
   place_centroid := ST_Centroid(linegeo);
   out_partition := get_partition('us');
-  out_parent_place_id := null;
 
-  address_street_word_ids := token_addr_street_match_tokens(token_info);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
+  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
+                                                    token_info);
 
   IF out_parent_place_id IS NULL THEN
     SELECT getNearestParallelRoadFeature(out_partition, linegeo)

View File

@@ -34,40 +34,59 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -127,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;
 
-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
   END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
 
 
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+      VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
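
`getorcreate_partial_word` is the write-side counterpart of the new matching
functions: it allocates a token ID per partial term exactly once. A usage
sketch, assuming the standard `word` table and `seq_word` sequence are present
in the database:

```sql
SELECT getorcreate_partial_word('road');  -- inserts a new 'w' row, returns its id
SELECT getorcreate_partial_word('road');  -- finds the existing row, same id again
```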

View File

@@ -34,17 +34,31 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place_match' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] && street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place_match')::INTEGER[] && place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -55,19 +69,24 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->key->>0)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
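
The two tokenizers now differ mainly in the array operator they apply: the
ICU version requires all stored tokens to be contained in the candidate
vector (`<@`), while the legacy version keeps the looser overlap test (`&&`).
A plain-operator illustration with invented token IDs:

```sql
SELECT ARRAY[1,2] <@ ARRAY[1,2,3] AS icu_match,     -- true: every token present
       ARRAY[1,9] <@ ARRAY[1,2,3] AS icu_no_match,  -- false: token 9 missing
       ARRAY[1,9] && ARRAY[1,2,3] AS legacy_match;  -- true: any overlap suffices
```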

View File

@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.data_dir = data_dir
         self.naming_rules = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -52,10 +50,9 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                                      config='TOKENIZER_CONFIG'))
         self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
@@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.naming_rules = ICUNameProcessorRules(conn=conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
     def check_database(self):
@@ -122,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
             @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         with connect(self.dsn) as conn:
             self.naming_rules.save_rules(conn)
 
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
@@ -424,12 +417,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                     hnrs.append(value)
                 elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
+                    token_info.add_street(self._compute_partial_tokens(value))
                 elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
+                    token_info.add_place(self._compute_partial_tokens(value))
                 elif not key.startswith('_') and \
                      key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+                    addr_terms.append((key, self._compute_partial_tokens(value)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -438,6 +431,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+        """
+        norm_name = self.name_processor.get_search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
+
 
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
@@ -551,30 +570,25 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
 
 
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
         """ Add addr:place search and match terms.
         """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
 
 
     def add_address_terms(self, terms):
         """ Add additional address terms.
        """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
 
         if tokens:
             self.data['addr'] = tokens
@@ -588,6 +602,7 @@ class _TokenCache:
     """
     def __init__(self):
        self.names = {}
+        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}
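
After this change the Python side emits one flat token array per address
part. An invented example of the resulting token_info payload and how the
SQL layer from earlier in this diff reads it back:

```sql
-- _TokenInfo now stores e.g. {"street": "{201,202}", "place": "{310}",
--                             "addr": {"city": "{42}"}}  (token ids invented)
SELECT token_addr_place_search_tokens('{"place": "{310}"}'::jsonb);  -- {310}
```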

View File

@@ -125,9 +125,6 @@ Feature: Creation of search terms
        Then placex contains
         | object | parent_place_id |
         | N1     | N2 |
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
        When sending search query "23 Rose Street"
        Then exactly 1 results are returned
        And results contain
@@ -156,9 +153,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | city        | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector            | nameaddress_vector |
-         | N1     | #Walltown, #Blue house | Walltown, Strange, Town |
        When sending search query "23 Walltown, Strange Town"
        Then results contain
         | osm | display_name |
@@ -190,9 +184,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | city        | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector            | nameaddress_vector |
-         | N1     | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
        When sending search query "23 Moon Sun, Strange Town"
        Then results contain
         | osm | display_name |
@@ -212,9 +203,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | Walltown | :w-north |
         | N2  | place   | suburb      | Strange Town | Walltown | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
        When sending search query "23 Rose Street, Walltown"
        Then exactly 1 result is returned
        And results contain
@@ -303,9 +291,6 @@ Feature: Creation of search terms
         | W1  | highway | residential | Rose Street  | :w-north |
         | N2  | place   | suburb      | Strange Town | :p-N1 |
        When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Walltown |
        When sending search query "Green Moss, Rose Street, Walltown"
        Then exactly 0 result is returned
        When sending search query "Green Moss, Walltown"

View File

@@ -10,6 +10,7 @@ from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
+from nominatim.db.sql_preprocessor import SQLPreprocessor
 
 from mock_icu_word_table import MockIcuWordTable
@@ -76,6 +77,15 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     return _mk_analyser
 
 
+@pytest.fixture
+def sql_functions(temp_db_conn, def_config, src_dir):
+    orig_sql = def_config.lib_dir.sql
+    def_config.lib_dir.sql = src_dir / 'lib-sql'
+    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
+    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
+    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
+    def_config.lib_dir.sql = orig_sql
+
+
 @pytest.fixture
 def getorcreate_full_word(temp_db_cursor):
@@ -144,7 +154,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok.init_new_db(test_config)
 
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -163,7 +172,6 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
     monkeypatch.undo()
@@ -173,23 +181,18 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
 
     assert tok.naming_rules is not None
     assert tok.term_normalization == ':: lower();'
-    assert tok.max_word_frequency == '90300'
 
 
 def test_update_sql_functions(db_prop, temp_db_cursor,
                               tokenizer_factory, test_config, table_factory,
                               monkeypatch):
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
 
     func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""")
 
     tok.update_sql_functions(test_config)
@@ -304,7 +307,7 @@ def test_add_country_names_extend(analyzer, word_table):
 class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
         with analyzer() as anl:
             self.analyzer = anl
             yield anl
@@ -351,7 +354,7 @@ class TestPlaceNames:
 class TestPlaceAddress:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
         with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
             self.analyzer = anl
             yield anl
@@ -424,7 +427,7 @@ class TestPlaceAddress:
     def test_process_place_street(self):
         info = self.process_address(street='Grand Road')
 
-        assert eval(info['street']) == self.name_token_set('#GRAND ROAD')
+        assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
 
 
     def test_process_place_street_empty(self):
@@ -436,16 +439,13 @@ class TestPlaceAddress:
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')
 
-        assert eval(info['place_search']) == self.name_token_set('#HONU LULU',
-                                                                 'HONU', 'LULU')
-        assert eval(info['place_match']) == self.name_token_set('#HONU LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
 
 
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')
 
-        assert 'place_search' not in info
-        assert 'place_match' not in info
+        assert 'place' not in info
 
 
     def test_process_place_address_terms(self):
@@ -453,16 +453,12 @@ class TestPlaceAddress:
                                     suburb='Zwickau', street='Hauptstr',
                                     full='right behind the church')
 
-        city_full = self.name_token_set('#ZWICKAU')
-        city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU')
-        state_full = self.name_token_set('#SACHSEN')
-        state_all = self.name_token_set('#SACHSEN', 'SACHSEN')
+        city = self.name_token_set('ZWICKAU')
+        state = self.name_token_set('SACHSEN')
 
-        result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()}
+        result = {k: eval(v) for k,v in info['addr'].items()}
 
-        assert result == {'city': [city_all, city_full],
-                          'suburb': [city_all, city_full],
-                          'state': [state_all, state_full]}
+        assert result == {'city': city, 'suburb': city, 'state': state}
 
 
     def test_process_place_address_terms_empty(self):