mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-25 05:52:32 +03:00
Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql
ICU tokenizer: switch match method to using partial terms
This commit is contained in:
commit
40f9d52ad8
@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no
|
|||||||
house numbers.
|
house numbers.
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
|
FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
|
||||||
```
|
```
|
||||||
|
|
||||||
Return the match token IDs by which to search a matching street from the
|
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
|
||||||
`addr:street` tag. These IDs will be matched against the IDs supplied by
|
match against the `addr:street` tag name. Must return either NULL or FALSE
|
||||||
`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
|
when the place has no `addr:street` tag.
|
||||||
tag.
|
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
|
FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
|
||||||
```
|
```
|
||||||
|
|
||||||
Return the match token IDs by which to search a matching place from the
|
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
|
||||||
`addr:place` tag. These IDs will be matched against the IDs supplied by
|
match against the `addr:place` tag name. Must return either NULL or FALSE
|
||||||
`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
|
when the place has no `addr:place` tag.
|
||||||
tag.
|
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
|
FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
|
||||||
@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
|
|||||||
database. Must be NULL when the place has no `addr:place` tag.
|
database. Must be NULL when the place has no `addr:place` tag.
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
CREATE TYPE token_addresstoken AS (
|
FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
|
||||||
key TEXT,
|
|
||||||
match_tokens INT[],
|
|
||||||
search_tokens INT[]
|
|
||||||
);
|
|
||||||
|
|
||||||
FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Return the match and search token IDs for explicit `addr:*` tags for the place
|
Return the set of keys for which address information is provided. This
|
||||||
other than `addr:street` and `addr:place`. For each address item there are
|
should correspond to the list of (relevant) `addr:*` tags with the `addr:`
|
||||||
three pieces of information returned:
|
prefix removed or the keys used in the `address` dictionary of the place info.
|
||||||
|
|
||||||
* _key_ contains the type of address item (city, county, etc.). This is the
|
```sql
|
||||||
key handed in with the `address` dictionary.
|
FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
|
||||||
* *match_tokens* is the list of token IDs used to find the corresponding
|
```
|
||||||
place object for the address part. The list is matched against the IDs
|
|
||||||
from `token_get_name_match_tokens`.
|
Return the array of search tokens for the given address part. `key` can be
|
||||||
* *search_tokens* is the list of token IDs under which to search the address
|
expected to be one of those returned with `token_get_address_keys()`. The
|
||||||
item. It is used when no corresponding place object was found.
|
search tokens are added to the address search vector of the place, when no
|
||||||
|
corresponding OSM object could be found for the given address part from which
|
||||||
|
to copy the name information.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) RETURNS BOOLEAN
|
||||||
|
```
|
||||||
|
|
||||||
|
Check if the given tokens match against the address part `key`.
|
||||||
|
|
||||||
|
__Warning:__ the tokens that are handed in are the lists previously saved
|
||||||
|
from `token_get_name_search_tokens()`, _not_ from the match token list. This
|
||||||
|
is a historical oddity which will be fixed at some point in the future.
|
||||||
|
Currently, tokenizers are encouraged to make sure that matching works against
|
||||||
|
both the search token list and the match token list.
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
|
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
|
||||||
|
@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
|
|||||||
|
|
||||||
|
|
||||||
-- find the parent road of the cut road parts
|
-- find the parent road of the cut road parts
|
||||||
CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
|
CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
|
||||||
partition SMALLINT,
|
partition SMALLINT,
|
||||||
centroid GEOMETRY, geom GEOMETRY)
|
centroid GEOMETRY, geom GEOMETRY)
|
||||||
RETURNS BIGINT
|
RETURNS BIGINT
|
||||||
@ -52,7 +52,7 @@ DECLARE
|
|||||||
parent_place_id BIGINT;
|
parent_place_id BIGINT;
|
||||||
location RECORD;
|
location RECORD;
|
||||||
BEGIN
|
BEGIN
|
||||||
parent_place_id := find_parent_for_address(street, place, partition, centroid);
|
parent_place_id := find_parent_for_address(token_info, partition, centroid);
|
||||||
|
|
||||||
IF parent_place_id is null THEN
|
IF parent_place_id is null THEN
|
||||||
FOR location IN SELECT place_id FROM placex
|
FOR location IN SELECT place_id FROM placex
|
||||||
@ -155,9 +155,8 @@ BEGIN
|
|||||||
NEW.interpolationtype = NEW.address->'interpolation';
|
NEW.interpolationtype = NEW.address->'interpolation';
|
||||||
|
|
||||||
place_centroid := ST_PointOnSurface(NEW.linegeo);
|
place_centroid := ST_PointOnSurface(NEW.linegeo);
|
||||||
NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
|
NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
|
||||||
token_addr_place_match_tokens(NEW.token_info),
|
place_centroid, NEW.linegeo);
|
||||||
NEW.partition, place_centroid, NEW.linegeo);
|
|
||||||
|
|
||||||
interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
|
interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
|
|||||||
|
|
||||||
CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
|
CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
|
||||||
from_rank SMALLINT, to_rank SMALLINT,
|
from_rank SMALLINT, to_rank SMALLINT,
|
||||||
extent FLOAT, tokens INT[])
|
extent FLOAT, token_info JSONB, key TEXT)
|
||||||
RETURNS nearfeaturecentr
|
RETURNS nearfeaturecentr
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
DECLARE
|
||||||
@ -80,7 +80,7 @@ BEGIN
|
|||||||
FROM location_area_large_{{ partition }}
|
FROM location_area_large_{{ partition }}
|
||||||
WHERE geometry && ST_Expand(feature, extent)
|
WHERE geometry && ST_Expand(feature, extent)
|
||||||
AND rank_address between from_rank and to_rank
|
AND rank_address between from_rank and to_rank
|
||||||
AND tokens && keywords
|
AND token_matches_address(token_info, key, keywords)
|
||||||
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
|
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
|
||||||
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
|
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
|
||||||
RETURN r;
|
RETURN r;
|
||||||
@ -148,18 +148,21 @@ LANGUAGE plpgsql;
|
|||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
|
CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
|
||||||
point GEOMETRY,
|
point GEOMETRY,
|
||||||
isin_token INTEGER[])
|
token_info JSONB)
|
||||||
RETURNS BIGINT
|
RETURNS BIGINT
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
DECLARE
|
||||||
parent BIGINT;
|
parent BIGINT;
|
||||||
BEGIN
|
BEGIN
|
||||||
|
IF not token_has_addr_street(token_info) THEN
|
||||||
|
RETURN NULL;
|
||||||
|
END IF;
|
||||||
|
|
||||||
{% for partition in db.partitions %}
|
{% for partition in db.partitions %}
|
||||||
IF in_partition = {{ partition }} THEN
|
IF in_partition = {{ partition }} THEN
|
||||||
SELECT place_id FROM search_name_{{ partition }}
|
SELECT place_id FROM search_name_{{ partition }}
|
||||||
INTO parent
|
INTO parent
|
||||||
WHERE name_vector && isin_token
|
WHERE token_matches_street(token_info, name_vector)
|
||||||
AND centroid && ST_Expand(point, 0.015)
|
AND centroid && ST_Expand(point, 0.015)
|
||||||
AND address_rank between 26 and 27
|
AND address_rank between 26 and 27
|
||||||
ORDER BY ST_Distance(centroid, point) ASC limit 1;
|
ORDER BY ST_Distance(centroid, point) ASC limit 1;
|
||||||
@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
|
|||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
|
CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
|
||||||
point GEOMETRY,
|
point GEOMETRY,
|
||||||
isin_token INTEGER[])
|
token_info JSONB)
|
||||||
RETURNS BIGINT
|
RETURNS BIGINT
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
DECLARE
|
||||||
parent BIGINT;
|
parent BIGINT;
|
||||||
BEGIN
|
BEGIN
|
||||||
|
IF not token_has_addr_place(token_info) THEN
|
||||||
|
RETURN NULL;
|
||||||
|
END IF;
|
||||||
|
|
||||||
{% for partition in db.partitions %}
|
{% for partition in db.partitions %}
|
||||||
IF in_partition = {{ partition }} THEN
|
IF in_partition = {{ partition }} THEN
|
||||||
SELECT place_id
|
SELECT place_id
|
||||||
INTO parent
|
INTO parent
|
||||||
FROM search_name_{{ partition }}
|
FROM search_name_{{ partition }}
|
||||||
WHERE name_vector && isin_token
|
WHERE token_matches_place(token_info, name_vector)
|
||||||
AND centroid && ST_Expand(point, 0.04)
|
AND centroid && ST_Expand(point, 0.04)
|
||||||
AND address_rank between 16 and 25
|
AND address_rank between 16 and 25
|
||||||
ORDER BY ST_Distance(centroid, point) ASC limit 1;
|
ORDER BY ST_Distance(centroid, point) ASC limit 1;
|
||||||
|
@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
|
|||||||
poi_osm_id BIGINT,
|
poi_osm_id BIGINT,
|
||||||
poi_partition SMALLINT,
|
poi_partition SMALLINT,
|
||||||
bbox GEOMETRY,
|
bbox GEOMETRY,
|
||||||
addr_street INTEGER[],
|
token_info JSONB,
|
||||||
addr_place INTEGER[],
|
|
||||||
is_place_addr BOOLEAN)
|
is_place_addr BOOLEAN)
|
||||||
RETURNS BIGINT
|
RETURNS BIGINT
|
||||||
AS $$
|
AS $$
|
||||||
@ -119,8 +118,7 @@ BEGIN
|
|||||||
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
|
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
|
||||||
|
|
||||||
IF parent_place_id is null THEN
|
IF parent_place_id is null THEN
|
||||||
parent_place_id := find_parent_for_address(addr_street, addr_place,
|
parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
|
||||||
poi_partition, bbox);
|
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF parent_place_id is null and poi_osm_type = 'N' THEN
|
IF parent_place_id is null and poi_osm_type = 'N' THEN
|
||||||
@ -333,13 +331,14 @@ BEGIN
|
|||||||
WHERE s.place_id = parent_place_id;
|
WHERE s.place_id = parent_place_id;
|
||||||
|
|
||||||
FOR addr_item IN
|
FOR addr_item IN
|
||||||
SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
|
SELECT (get_addr_tag_rank(key, country)).*, key,
|
||||||
FROM token_get_address_tokens(token_info)
|
token_get_address_search_tokens(token_info, key) as search_tokens
|
||||||
WHERE not search_tokens <@ parent_address_vector
|
FROM token_get_address_keys(token_info) as key
|
||||||
|
WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
|
||||||
LOOP
|
LOOP
|
||||||
addr_place := get_address_place(in_partition, geometry,
|
addr_place := get_address_place(in_partition, geometry,
|
||||||
addr_item.from_rank, addr_item.to_rank,
|
addr_item.from_rank, addr_item.to_rank,
|
||||||
addr_item.extent, addr_item.match_tokens);
|
addr_item.extent, token_info, addr_item.key);
|
||||||
|
|
||||||
IF addr_place is null THEN
|
IF addr_place is null THEN
|
||||||
-- No place found in OSM that matches. Make it at least searchable.
|
-- No place found in OSM that matches. Make it at least searchable.
|
||||||
@ -447,14 +446,16 @@ BEGIN
|
|||||||
|
|
||||||
FOR location IN
|
FOR location IN
|
||||||
SELECT (get_address_place(partition, geometry, from_rank, to_rank,
|
SELECT (get_address_place(partition, geometry, from_rank, to_rank,
|
||||||
extent, match_tokens)).*, search_tokens
|
extent, token_info, key)).*, key
|
||||||
FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
|
FROM (SELECT (get_addr_tag_rank(key, country)).*, key
|
||||||
FROM token_get_address_tokens(token_info)) x
|
FROM token_get_address_keys(token_info) as key) x
|
||||||
ORDER BY rank_address, distance, isguess desc
|
ORDER BY rank_address, distance, isguess desc
|
||||||
LOOP
|
LOOP
|
||||||
IF location.place_id is null THEN
|
IF location.place_id is null THEN
|
||||||
{% if not db.reverse_only %}
|
{% if not db.reverse_only %}
|
||||||
nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
|
nameaddress_vector := array_merge(nameaddress_vector,
|
||||||
|
token_get_address_search_tokens(token_info,
|
||||||
|
location.key));
|
||||||
{% endif %}
|
{% endif %}
|
||||||
ELSE
|
ELSE
|
||||||
{% if not db.reverse_only %}
|
{% if not db.reverse_only %}
|
||||||
@ -689,9 +690,6 @@ DECLARE
|
|||||||
parent_address_level SMALLINT;
|
parent_address_level SMALLINT;
|
||||||
place_address_level SMALLINT;
|
place_address_level SMALLINT;
|
||||||
|
|
||||||
addr_street INTEGER[];
|
|
||||||
addr_place INTEGER[];
|
|
||||||
|
|
||||||
max_rank SMALLINT;
|
max_rank SMALLINT;
|
||||||
|
|
||||||
name_vector INTEGER[];
|
name_vector INTEGER[];
|
||||||
@ -860,8 +858,6 @@ BEGIN
|
|||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
|
NEW.housenumber := token_normalized_housenumber(NEW.token_info);
|
||||||
addr_street := token_addr_street_match_tokens(NEW.token_info);
|
|
||||||
addr_place := token_addr_place_match_tokens(NEW.token_info);
|
|
||||||
|
|
||||||
NEW.postcode := null;
|
NEW.postcode := null;
|
||||||
|
|
||||||
@ -907,7 +903,7 @@ BEGIN
|
|||||||
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
|
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
|
||||||
NEW.partition,
|
NEW.partition,
|
||||||
ST_Envelope(NEW.geometry),
|
ST_Envelope(NEW.geometry),
|
||||||
addr_street, addr_place,
|
NEW.token_info,
|
||||||
is_place_address);
|
is_place_address);
|
||||||
|
|
||||||
-- If we found the road take a shortcut here.
|
-- If we found the road take a shortcut here.
|
||||||
|
@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
|
|||||||
|
|
||||||
-- Find the parent of an address with addr:street/addr:place tag.
|
-- Find the parent of an address with addr:street/addr:place tag.
|
||||||
--
|
--
|
||||||
-- \param street Value of addr:street or NULL if tag is missing.
|
-- \param token_info Naming info with the address information.
|
||||||
-- \param place Value of addr:place or NULL if tag is missing.
|
|
||||||
-- \param partition Partition where to search the parent.
|
-- \param partition Partition where to search the parent.
|
||||||
-- \param centroid Location of the address.
|
-- \param centroid Location of the address.
|
||||||
--
|
--
|
||||||
-- \return Place ID of the parent if one was found, NULL otherwise.
|
-- \return Place ID of the parent if one was found, NULL otherwise.
|
||||||
CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
|
CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
|
||||||
partition SMALLINT,
|
partition SMALLINT,
|
||||||
centroid GEOMETRY)
|
centroid GEOMETRY)
|
||||||
RETURNS BIGINT
|
RETURNS BIGINT
|
||||||
@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
|
|||||||
DECLARE
|
DECLARE
|
||||||
parent_place_id BIGINT;
|
parent_place_id BIGINT;
|
||||||
BEGIN
|
BEGIN
|
||||||
IF street is not null THEN
|
-- Check for addr:street attributes
|
||||||
-- Check for addr:street attributes
|
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
|
||||||
-- Note that addr:street links can only be indexed, once the street itself is indexed
|
IF parent_place_id is not null THEN
|
||||||
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
|
{% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
|
||||||
IF parent_place_id is not null THEN
|
RETURN parent_place_id;
|
||||||
{% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
|
|
||||||
RETURN parent_place_id;
|
|
||||||
END IF;
|
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
-- Check for addr:place attributes.
|
-- Check for addr:place attributes.
|
||||||
IF place is not null THEN
|
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
|
||||||
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
|
{% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
|
||||||
IF parent_place_id is not null THEN
|
RETURN parent_place_id;
|
||||||
{% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
|
|
||||||
RETURN parent_place_id;
|
|
||||||
END IF;
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
RETURN NULL;
|
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql STABLE;
|
LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
|
CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
|
||||||
RETURNS BOOLEAN
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
|
@ -14,7 +14,6 @@ DECLARE
|
|||||||
out_partition INTEGER;
|
out_partition INTEGER;
|
||||||
out_parent_place_id BIGINT;
|
out_parent_place_id BIGINT;
|
||||||
location RECORD;
|
location RECORD;
|
||||||
address_street_word_ids INTEGER[];
|
|
||||||
|
|
||||||
BEGIN
|
BEGIN
|
||||||
|
|
||||||
@ -54,13 +53,9 @@ BEGIN
|
|||||||
|
|
||||||
place_centroid := ST_Centroid(linegeo);
|
place_centroid := ST_Centroid(linegeo);
|
||||||
out_partition := get_partition('us');
|
out_partition := get_partition('us');
|
||||||
out_parent_place_id := null;
|
|
||||||
|
|
||||||
address_street_word_ids := token_addr_street_match_tokens(token_info);
|
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
|
||||||
IF address_street_word_ids IS NOT NULL THEN
|
token_info);
|
||||||
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
|
|
||||||
address_street_word_ids);
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
IF out_parent_place_id IS NULL THEN
|
IF out_parent_place_id IS NULL THEN
|
||||||
SELECT getNearestParallelRoadFeature(out_partition, linegeo)
|
SELECT getNearestParallelRoadFeature(out_partition, linegeo)
|
||||||
|
@ -34,40 +34,59 @@ AS $$
|
|||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
|
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
|
||||||
RETURNS INTEGER[]
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
SELECT (info->>'street')::INTEGER[]
|
SELECT info->>'street' is not null;
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT info->>'place' is not null;
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT (info->>'street')::INTEGER[] <@ street_tokens
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
|
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
|
||||||
RETURNS INTEGER[]
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
SELECT (info->>'place_match')::INTEGER[]
|
SELECT (info->>'place')::INTEGER[] <@ place_tokens
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
|
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
|
||||||
RETURNS INTEGER[]
|
RETURNS INTEGER[]
|
||||||
AS $$
|
AS $$
|
||||||
SELECT (info->>'place_search')::INTEGER[]
|
SELECT (info->>'place')::INTEGER[]
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
DROP TYPE IF EXISTS token_addresstoken CASCADE;
|
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
|
||||||
CREATE TYPE token_addresstoken AS (
|
RETURNS SETOF TEXT
|
||||||
key TEXT,
|
|
||||||
match_tokens INT[],
|
|
||||||
search_tokens INT[]
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
|
|
||||||
RETURNS SETOF token_addresstoken
|
|
||||||
AS $$
|
AS $$
|
||||||
SELECT key, (value->>1)::int[] as match_tokens,
|
SELECT * FROM jsonb_object_keys(info->'addr');
|
||||||
(value->>0)::int[] as search_tokens
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
FROM jsonb_each(info->'addr');
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
|
||||||
|
RETURNS INTEGER[]
|
||||||
|
AS $$
|
||||||
|
SELECT (info->'addr'->>key)::INTEGER[];
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
@ -127,15 +146,34 @@ BEGIN
|
|||||||
VALUES (term_id, term, 'w', json_build_object('count', term_count));
|
VALUES (term_id, term, 'w', json_build_object('count', term_count));
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF term_count < {{ max_word_freq }} THEN
|
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
|
||||||
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
|
|
||||||
END IF;
|
|
||||||
END LOOP;
|
END LOOP;
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql;
|
LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
|
||||||
|
RETURNS INTEGER
|
||||||
|
AS $$
|
||||||
|
DECLARE
|
||||||
|
token INTEGER;
|
||||||
|
BEGIN
|
||||||
|
SELECT min(word_id) INTO token
|
||||||
|
FROM word WHERE word_token = partial and type = 'w';
|
||||||
|
|
||||||
|
IF token IS NULL THEN
|
||||||
|
token := nextval('seq_word');
|
||||||
|
INSERT INTO word (word_id, word_token, type, info)
|
||||||
|
VALUES (token, partial, 'w', json_build_object('count', 0));
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
RETURN token;
|
||||||
|
END;
|
||||||
|
$$
|
||||||
|
LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
|
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
|
||||||
RETURNS INTEGER
|
RETURNS INTEGER
|
||||||
AS $$
|
AS $$
|
||||||
|
@ -34,17 +34,31 @@ AS $$
|
|||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
|
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
|
||||||
RETURNS INTEGER[]
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
SELECT (info->>'street')::INTEGER[]
|
SELECT info->>'street' is not null;
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT info->>'place_match' is not null;
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT (info->>'street')::INTEGER[] && street_tokens
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
|
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
|
||||||
RETURNS INTEGER[]
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
SELECT (info->>'place_match')::INTEGER[]
|
SELECT (info->>'place_match')::INTEGER[] && place_tokens
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
@ -55,19 +69,24 @@ AS $$
|
|||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
DROP TYPE IF EXISTS token_addresstoken CASCADE;
|
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
|
||||||
CREATE TYPE token_addresstoken AS (
|
RETURNS SETOF TEXT
|
||||||
key TEXT,
|
|
||||||
match_tokens INT[],
|
|
||||||
search_tokens INT[]
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
|
|
||||||
RETURNS SETOF token_addresstoken
|
|
||||||
AS $$
|
AS $$
|
||||||
SELECT key, (value->>1)::int[] as match_tokens,
|
SELECT * FROM jsonb_object_keys(info->'addr');
|
||||||
(value->>0)::int[] as search_tokens
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
FROM jsonb_each(info->'addr');
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
|
||||||
|
RETURNS INTEGER[]
|
||||||
|
AS $$
|
||||||
|
SELECT (info->'addr'->key->>0)::INTEGER[];
|
||||||
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
|
||||||
|
RETURNS BOOLEAN
|
||||||
|
AS $$
|
||||||
|
SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
|
||||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
|||||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
|
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
|
||||||
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
|
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
|
||||||
|
|
||||||
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
|
|
||||||
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
|
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
|
||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
self.data_dir = data_dir
|
self.data_dir = data_dir
|
||||||
self.naming_rules = None
|
self.naming_rules = None
|
||||||
self.term_normalization = None
|
self.term_normalization = None
|
||||||
self.max_word_frequency = None
|
|
||||||
|
|
||||||
|
|
||||||
def init_new_db(self, config, init_db=True):
|
def init_new_db(self, config, init_db=True):
|
||||||
@ -52,10 +50,9 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
config='TOKENIZER_CONFIG'))
|
config='TOKENIZER_CONFIG'))
|
||||||
self.naming_rules = ICUNameProcessorRules(loader=loader)
|
self.naming_rules = ICUNameProcessorRules(loader=loader)
|
||||||
self.term_normalization = config.TERM_NORMALIZATION
|
self.term_normalization = config.TERM_NORMALIZATION
|
||||||
self.max_word_frequency = config.MAX_WORD_FREQUENCY
|
|
||||||
|
|
||||||
self._install_php(config.lib_dir.php)
|
self._install_php(config.lib_dir.php)
|
||||||
self._save_config(config)
|
self._save_config()
|
||||||
|
|
||||||
if init_db:
|
if init_db:
|
||||||
self.update_sql_functions(config)
|
self.update_sql_functions(config)
|
||||||
@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
self.naming_rules = ICUNameProcessorRules(conn=conn)
|
self.naming_rules = ICUNameProcessorRules(conn=conn)
|
||||||
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
|
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
|
||||||
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
|
|
||||||
|
|
||||||
|
|
||||||
def finalize_import(self, _):
|
def finalize_import(self, _):
|
||||||
@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
""" Reimport the SQL functions for this tokenizer.
|
""" Reimport the SQL functions for this tokenizer.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
|
|
||||||
sqlp = SQLPreprocessor(conn, config)
|
sqlp = SQLPreprocessor(conn, config)
|
||||||
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
|
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
|
||||||
max_word_freq=max_word_freq)
|
|
||||||
|
|
||||||
|
|
||||||
def check_database(self):
|
def check_database(self):
|
||||||
@ -122,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
php_file = self.data_dir / "tokenizer.php"
|
php_file = self.data_dir / "tokenizer.php"
|
||||||
php_file.write_text(dedent(f"""\
|
php_file.write_text(dedent(f"""\
|
||||||
<?php
|
<?php
|
||||||
@define('CONST_Max_Word_Frequency', {self.max_word_frequency});
|
@define('CONST_Max_Word_Frequency', 10000000);
|
||||||
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
|
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
|
||||||
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
|
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
|
||||||
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
|
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
|
||||||
|
|
||||||
|
|
||||||
def _save_config(self, config):
|
def _save_config(self):
|
||||||
""" Save the configuration that needs to remain stable for the given
|
""" Save the configuration that needs to remain stable for the given
|
||||||
database as database properties.
|
database as database properties.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
self.naming_rules.save_rules(conn)
|
self.naming_rules.save_rules(conn)
|
||||||
|
|
||||||
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
|
|
||||||
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
|
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
|
||||||
|
|
||||||
|
|
||||||
@ -424,12 +417,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
||||||
hnrs.append(value)
|
hnrs.append(value)
|
||||||
elif key == 'street':
|
elif key == 'street':
|
||||||
token_info.add_street(*self._compute_name_tokens({'name': value}))
|
token_info.add_street(self._compute_partial_tokens(value))
|
||||||
elif key == 'place':
|
elif key == 'place':
|
||||||
token_info.add_place(*self._compute_name_tokens({'name': value}))
|
token_info.add_place(self._compute_partial_tokens(value))
|
||||||
elif not key.startswith('_') and \
|
elif not key.startswith('_') and \
|
||||||
key not in ('country', 'full'):
|
key not in ('country', 'full'):
|
||||||
addr_terms.append((key, *self._compute_name_tokens({'name': value})))
|
addr_terms.append((key, self._compute_partial_tokens(value)))
|
||||||
|
|
||||||
if hnrs:
|
if hnrs:
|
||||||
hnrs = self._split_housenumbers(hnrs)
|
hnrs = self._split_housenumbers(hnrs)
|
||||||
@ -438,6 +431,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
if addr_terms:
|
if addr_terms:
|
||||||
token_info.add_address_terms(addr_terms)
|
token_info.add_address_terms(addr_terms)
|
||||||
|
|
||||||
|
def _compute_partial_tokens(self, name):
    """ Normalize the given term, split it into partial words and return
        the token list for them.

        The result holds one token per partial word of the normalized
        name: first the tokens that were already in the partial-word
        cache, then the tokens that had to be fetched from the database.
        Callers should not rely on the order of the tokens. Newly
        fetched tokens are added to the cache.
    """
    norm_name = self.name_processor.get_search_normalized(name)

    tokens = []
    need_lookup = []
    for partial in norm_name.split():
        token = self._cache.partials.get(partial)
        # Compare against None: a cached token id of 0 is still a hit.
        if token is not None:
            tokens.append(token)
        else:
            need_lookup.append(partial)

    if need_lookup:
        fetched = {}
        with self.conn.cursor() as cur:
            # Query every distinct word only once, even when it is repeated
            # in the name. dict.fromkeys() keeps the first-seen order.
            cur.execute("""SELECT word, getorcreate_partial_word(word)
                           FROM unnest(%s) word""",
                        (list(dict.fromkeys(need_lookup)), ))

            for partial, token in cur:
                fetched[partial] = token
                self._cache.partials[partial] = token

        # Still emit one token per occurrence, as a warm cache would.
        tokens.extend(fetched[partial] for partial in need_lookup
                      if partial in fetched)

    return tokens
|
||||||
|
|
||||||
def _compute_name_tokens(self, names):
|
def _compute_name_tokens(self, names):
|
||||||
""" Computes the full name and partial name tokens for the given
|
""" Computes the full name and partial name tokens for the given
|
||||||
@ -551,30 +570,25 @@ class _TokenInfo:
|
|||||||
self.data['hnr'] = ';'.join(hnrs)
|
self.data['hnr'] = ';'.join(hnrs)
|
||||||
|
|
||||||
|
|
||||||
def add_street(self, fulls, _):
|
def add_street(self, tokens):
|
||||||
""" Add addr:street match terms.
|
""" Add addr:street match terms.
|
||||||
"""
|
"""
|
||||||
if fulls:
|
if tokens:
|
||||||
self.data['street'] = self._mk_array(fulls)
|
self.data['street'] = self._mk_array(tokens)
|
||||||
|
|
||||||
|
|
||||||
def add_place(self, fulls, partials):
|
def add_place(self, tokens):
|
||||||
""" Add addr:place search and match terms.
|
""" Add addr:place search and match terms.
|
||||||
"""
|
"""
|
||||||
if fulls:
|
if tokens:
|
||||||
self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
|
self.data['place'] = self._mk_array(tokens)
|
||||||
self.data['place_match'] = self._mk_array(fulls)
|
|
||||||
|
|
||||||
|
|
||||||
def add_address_terms(self, terms):
|
def add_address_terms(self, terms):
|
||||||
""" Add additional address terms.
|
""" Add additional address terms.
|
||||||
"""
|
"""
|
||||||
tokens = {}
|
tokens = {key: self._mk_array(partials)
|
||||||
|
for key, partials in terms if partials}
|
||||||
for key, fulls, partials in terms:
|
|
||||||
if fulls:
|
|
||||||
tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
|
|
||||||
self._mk_array(fulls)]
|
|
||||||
|
|
||||||
if tokens:
|
if tokens:
|
||||||
self.data['addr'] = tokens
|
self.data['addr'] = tokens
|
||||||
@ -588,6 +602,7 @@ class _TokenCache:
|
|||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.names = {}
|
self.names = {}
|
||||||
|
self.partials = {}
|
||||||
self.postcodes = set()
|
self.postcodes = set()
|
||||||
self.housenumbers = {}
|
self.housenumbers = {}
|
||||||
|
|
||||||
|
@ -125,9 +125,6 @@ Feature: Creation of search terms
|
|||||||
Then placex contains
|
Then placex contains
|
||||||
| object | parent_place_id |
|
| object | parent_place_id |
|
||||||
| N1 | N2 |
|
| N1 | N2 |
|
||||||
Then search_name contains
|
|
||||||
| object | name_vector | nameaddress_vector |
|
|
||||||
| N1 | #Walltown | Strange, Town |
|
|
||||||
When sending search query "23 Rose Street"
|
When sending search query "23 Rose Street"
|
||||||
Then exactly 1 results are returned
|
Then exactly 1 results are returned
|
||||||
And results contain
|
And results contain
|
||||||
@ -156,9 +153,6 @@ Feature: Creation of search terms
|
|||||||
| W1 | highway | residential | Rose Street | :w-north |
|
| W1 | highway | residential | Rose Street | :w-north |
|
||||||
| N2 | place | city | Strange Town | :p-N1 |
|
| N2 | place | city | Strange Town | :p-N1 |
|
||||||
When importing
|
When importing
|
||||||
Then search_name contains
|
|
||||||
| object | name_vector | nameaddress_vector |
|
|
||||||
| N1 | #Walltown, #Blue house | Walltown, Strange, Town |
|
|
||||||
When sending search query "23 Walltown, Strange Town"
|
When sending search query "23 Walltown, Strange Town"
|
||||||
Then results contain
|
Then results contain
|
||||||
| osm | display_name |
|
| osm | display_name |
|
||||||
@ -190,9 +184,6 @@ Feature: Creation of search terms
|
|||||||
| W1 | highway | residential | Rose Street | :w-north |
|
| W1 | highway | residential | Rose Street | :w-north |
|
||||||
| N2 | place | city | Strange Town | :p-N1 |
|
| N2 | place | city | Strange Town | :p-N1 |
|
||||||
When importing
|
When importing
|
||||||
Then search_name contains
|
|
||||||
| object | name_vector | nameaddress_vector |
|
|
||||||
| N1 | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
|
|
||||||
When sending search query "23 Moon Sun, Strange Town"
|
When sending search query "23 Moon Sun, Strange Town"
|
||||||
Then results contain
|
Then results contain
|
||||||
| osm | display_name |
|
| osm | display_name |
|
||||||
@ -212,9 +203,6 @@ Feature: Creation of search terms
|
|||||||
| W1 | highway | residential | Rose Street | Walltown | :w-north |
|
| W1 | highway | residential | Rose Street | Walltown | :w-north |
|
||||||
| N2 | place | suburb | Strange Town | Walltown | :p-N1 |
|
| N2 | place | suburb | Strange Town | Walltown | :p-N1 |
|
||||||
When importing
|
When importing
|
||||||
Then search_name contains
|
|
||||||
| object | name_vector | nameaddress_vector |
|
|
||||||
| N1 | #Walltown | Strange, Town |
|
|
||||||
When sending search query "23 Rose Street, Walltown"
|
When sending search query "23 Rose Street, Walltown"
|
||||||
Then exactly 1 result is returned
|
Then exactly 1 result is returned
|
||||||
And results contain
|
And results contain
|
||||||
@ -303,9 +291,6 @@ Feature: Creation of search terms
|
|||||||
| W1 | highway | residential | Rose Street | :w-north |
|
| W1 | highway | residential | Rose Street | :w-north |
|
||||||
| N2 | place | suburb | Strange Town | :p-N1 |
|
| N2 | place | suburb | Strange Town | :p-N1 |
|
||||||
When importing
|
When importing
|
||||||
Then search_name contains
|
|
||||||
| object | name_vector | nameaddress_vector |
|
|
||||||
| N1 | #Green Moss | Walltown |
|
|
||||||
When sending search query "Green Moss, Rose Street, Walltown"
|
When sending search query "Green Moss, Rose Street, Walltown"
|
||||||
Then exactly 0 result is returned
|
Then exactly 0 result is returned
|
||||||
When sending search query "Green Moss, Walltown"
|
When sending search query "Green Moss, Walltown"
|
||||||
|
@ -10,6 +10,7 @@ from nominatim.tokenizer import icu_tokenizer
|
|||||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
|
from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
|
||||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||||
from nominatim.db import properties
|
from nominatim.db import properties
|
||||||
|
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||||
|
|
||||||
from mock_icu_word_table import MockIcuWordTable
|
from mock_icu_word_table import MockIcuWordTable
|
||||||
|
|
||||||
@ -76,6 +77,15 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
|||||||
|
|
||||||
return _mk_analyser
|
return _mk_analyser
|
||||||
|
|
||||||
|
@pytest.fixture
def sql_functions(temp_db_conn, def_config, src_dir):
    """ Load the SQL helper functions and the ICU tokenizer functions
        from the source tree into the test database.

        The SQL library directory of the configuration is redirected
        to 'lib-sql' in the source directory while the files are run and
        restored afterwards, even when running one of them fails.
    """
    orig_sql = def_config.lib_dir.sql
    def_config.lib_dir.sql = src_dir / 'lib-sql'
    try:
        sqlproc = SQLPreprocessor(temp_db_conn, def_config)
        sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
        sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
    finally:
        # Never leave the configuration pointing at the source tree.
        def_config.lib_dir.sql = orig_sql
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def getorcreate_full_word(temp_db_cursor):
|
def getorcreate_full_word(temp_db_cursor):
|
||||||
@ -144,7 +154,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
|
|||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
|
|
||||||
assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
|
assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
|
||||||
assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
||||||
@ -163,7 +172,6 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
|
|||||||
|
|
||||||
def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
|
def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
|
||||||
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
|
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
|
||||||
monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
|
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
monkeypatch.undo()
|
monkeypatch.undo()
|
||||||
@ -173,23 +181,18 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
|
|||||||
|
|
||||||
assert tok.naming_rules is not None
|
assert tok.naming_rules is not None
|
||||||
assert tok.term_normalization == ':: lower();'
|
assert tok.term_normalization == ':: lower();'
|
||||||
assert tok.max_word_frequency == '90300'
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_sql_functions(db_prop, temp_db_cursor,
|
def test_update_sql_functions(db_prop, temp_db_cursor,
|
||||||
tokenizer_factory, test_config, table_factory,
|
tokenizer_factory, test_config, table_factory,
|
||||||
monkeypatch):
|
monkeypatch):
|
||||||
monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
|
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
tok.init_new_db(test_config)
|
tok.init_new_db(test_config)
|
||||||
monkeypatch.undo()
|
|
||||||
|
|
||||||
assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
|
|
||||||
|
|
||||||
table_factory('test', 'txt TEXT')
|
table_factory('test', 'txt TEXT')
|
||||||
|
|
||||||
func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
|
func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
|
||||||
func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
|
func_file.write_text("""INSERT INTO test VALUES (1133)""")
|
||||||
|
|
||||||
tok.update_sql_functions(test_config)
|
tok.update_sql_functions(test_config)
|
||||||
|
|
||||||
@ -304,7 +307,7 @@ def test_add_country_names_extend(analyzer, word_table):
|
|||||||
class TestPlaceNames:
|
class TestPlaceNames:
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def setup(self, analyzer, getorcreate_full_word):
|
def setup(self, analyzer, sql_functions):
|
||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
self.analyzer = anl
|
self.analyzer = anl
|
||||||
yield anl
|
yield anl
|
||||||
@ -351,7 +354,7 @@ class TestPlaceNames:
|
|||||||
class TestPlaceAddress:
|
class TestPlaceAddress:
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def setup(self, analyzer, getorcreate_full_word):
|
def setup(self, analyzer, sql_functions):
|
||||||
with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
|
with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
|
||||||
self.analyzer = anl
|
self.analyzer = anl
|
||||||
yield anl
|
yield anl
|
||||||
@ -424,7 +427,7 @@ class TestPlaceAddress:
|
|||||||
def test_process_place_street(self):
|
def test_process_place_street(self):
|
||||||
info = self.process_address(street='Grand Road')
|
info = self.process_address(street='Grand Road')
|
||||||
|
|
||||||
assert eval(info['street']) == self.name_token_set('#GRAND ROAD')
|
assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_street_empty(self):
|
def test_process_place_street_empty(self):
|
||||||
@ -436,16 +439,13 @@ class TestPlaceAddress:
|
|||||||
def test_process_place_place(self):
|
def test_process_place_place(self):
|
||||||
info = self.process_address(place='Honu Lulu')
|
info = self.process_address(place='Honu Lulu')
|
||||||
|
|
||||||
assert eval(info['place_search']) == self.name_token_set('#HONU LULU',
|
assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
|
||||||
'HONU', 'LULU')
|
|
||||||
assert eval(info['place_match']) == self.name_token_set('#HONU LULU')
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_place_empty(self):
|
def test_process_place_place_empty(self):
|
||||||
info = self.process_address(place='🜵')
|
info = self.process_address(place='🜵')
|
||||||
|
|
||||||
assert 'place_search' not in info
|
assert 'place' not in info
|
||||||
assert 'place_match' not in info
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_address_terms(self):
|
def test_process_place_address_terms(self):
|
||||||
@ -453,16 +453,12 @@ class TestPlaceAddress:
|
|||||||
suburb='Zwickau', street='Hauptstr',
|
suburb='Zwickau', street='Hauptstr',
|
||||||
full='right behind the church')
|
full='right behind the church')
|
||||||
|
|
||||||
city_full = self.name_token_set('#ZWICKAU')
|
city = self.name_token_set('ZWICKAU')
|
||||||
city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU')
|
state = self.name_token_set('SACHSEN')
|
||||||
state_full = self.name_token_set('#SACHSEN')
|
|
||||||
state_all = self.name_token_set('#SACHSEN', 'SACHSEN')
|
|
||||||
|
|
||||||
result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()}
|
result = {k: eval(v) for k,v in info['addr'].items()}
|
||||||
|
|
||||||
assert result == {'city': [city_all, city_full],
|
assert result == {'city': city, 'suburb': city, 'state': state}
|
||||||
'suburb': [city_all, city_full],
|
|
||||||
'state': [state_all, state_full]}
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_place_address_terms_empty(self):
|
def test_process_place_address_terms_empty(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user