diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index 230cb2ea..6092319a 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -44,28 +44,28 @@ $$ LANGUAGE SQL IMMUTABLE;
 CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
   RETURNS BOOLEAN
 AS $$
-  SELECT info->>'place_match' is not null;
+  SELECT info->>'place' is not null;
 $$ LANGUAGE SQL IMMUTABLE;
 
 
 CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[] && street_tokens
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[] && place_tokens
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -79,14 +79,14 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
 CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->'addr'->key->>0)::INTEGER[];
+  SELECT (info->'addr'->>key)::INTEGER[];
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -146,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;
 
-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
   END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
 
 
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+      VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61263678..22f5e78f 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.data_dir = data_dir
         self.naming_rules = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -52,7 +50,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                              config='TOKENIZER_CONFIG'))
         self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
         self._save_config(config)
@@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.naming_rules = ICUNameProcessorRules(conn=conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Reimport the SQL functions for this tokenizer.
        """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
     def check_database(self):
@@ -122,7 +116,7 @@
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
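
Note on the matching semantics (editorial remark, not part of the patch): the SQL functions
above switch from PostgreSQL's array overlap operator (&&) to the containment operator (<@),
so a stored token list now matches only when every one of its tokens appears in the candidate
list, rather than when the two lists merely share one token. The max_word_freq cutoff is
dropped at the same time, since partial terms are now always collected. A minimal sketch of
the operator difference, using made-up token ids:

    -- Overlap: true as soon as the two arrays share at least one element.
    SELECT ARRAY[280, 281] && ARRAY[281, 999];       -- true
    -- Containment: true only if every left-hand element occurs on the right.
    SELECT ARRAY[280, 281] <@ ARRAY[281, 999];       -- false
    SELECT ARRAY[280, 281] <@ ARRAY[280, 281, 999];  -- true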