simplify weigh_search() function

Use JSON arrays, which can hold mixed types and therefore give the
rankings a more logical structure than separate parallel arrays.
Avoid JSON dicts because of their verbosity.
This commit is contained in:
Sarah Hoffmann 2023-12-05 16:07:56 +01:00
parent 05e47fbb28
commit c41f2fed21
3 changed files with 20 additions and 15 deletions

View File

@ -287,21 +287,19 @@ LANGUAGE plpgsql IMMUTABLE;
CREATE OR REPLACE FUNCTION weigh_search(search_vector INT[],
term_vectors TEXT[],
weight_vectors FLOAT[],
rankings TEXT,
def_weight FLOAT)
RETURNS FLOAT
AS $$
DECLARE
pos INT := 1;
terms TEXT;
rank JSON;
BEGIN
FOREACH terms IN ARRAY term_vectors
FOR rank IN
SELECT * FROM json_array_elements(rankings::JSON)
LOOP
IF search_vector @> terms::INTEGER[] THEN
RETURN weight_vectors[pos];
IF true = ALL(SELECT x::int = ANY(search_vector) FROM json_array_elements_text(rank->1) as x) THEN
RETURN (rank->>0)::float;
END IF;
pos := pos + 1;
END LOOP;
RETURN def_weight;
END;

View File

@ -14,6 +14,7 @@ import sqlalchemy as sa
from nominatim.typing import SaFromClause, SaColumn, SaExpression
from nominatim.api.search.query import Token
from nominatim.utils.json_writer import JsonWriter
@dataclasses.dataclass
class WeightedStrings:
@ -128,11 +129,17 @@ class FieldRanking:
"""
assert self.rankings
return sa.func.weigh_search(table.c[self.column],
[f"{{{','.join((str(s) for s in r.tokens))}}}"
for r in self.rankings],
[r.penalty for r in self.rankings],
self.default)
rout = JsonWriter().start_array()
for rank in self.rankings:
rout.start_array().value(rank.penalty).next()
rout.start_array()
for token in rank.tokens:
rout.value(token).next()
rout.end_array()
rout.end_array().next()
rout.end_array()
return sa.func.weigh_search(table.c[self.column], rout(), self.default)
@dataclasses.dataclass

View File

@ -76,8 +76,8 @@ class JsonWriter:
def end_array(self) -> 'JsonWriter':
""" Write the closing bracket of a JSON array.
"""
assert self.pending in (',', '[', '')
if self.pending == '[':
assert self.pending in (',', '[', ']', ')', '')
if self.pending not in (',', ''):
self.data.write(self.pending)
self.pending = ']'
return self