diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index cbbf240a..ccce99ca 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -157,7 +157,8 @@ class Tokenizer $sSQL = 'SELECT word_id, word_token, type, word,'; $sSQL .= " info->>'op' as operator,"; $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; - $sSQL .= " info->>'count' as count"; + $sSQL .= " info->>'count' as count,"; + $sSQL .= " info->>'lookup' as lookup"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -179,7 +180,8 @@ class Tokenizer } break; case 'H': // house number tokens - $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); + $sLookup = $aWord['lookup'] ?? $aWord['word_token']; + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup)); break; case 'P': // postcode tokens // Postcodes are not normalized, so they may have content diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 03408b4a..a3dac8dd 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -200,3 +200,26 @@ BEGIN END; $$ LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[]) + RETURNS INTEGER + AS $$ +DECLARE + return_id INTEGER; +BEGIN + SELECT min(word_id) INTO return_id + FROM word WHERE word = norm_term and type = 'H'; + + IF return_id IS NULL THEN + return_id := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type, word, info) + SELECT return_id, lookup_term, 'H', norm_term, + json_build_object('lookup', lookup_terms[1]) + FROM unnest(lookup_terms) as lookup_term; + END IF; + + RETURN return_id; +END; +$$ +LANGUAGE plpgsql; diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index 58965b57..509f6f65 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -28,6 +28,10 @@ CREATE INDEX idx_word_postcodes ON word CREATE INDEX idx_word_full_word ON word USING btree(word) {{db.tablespace.address_index}} WHERE type = 'W'; +-- Used when inserting analyzed housenumbers (exclude old-style entries). +CREATE INDEX idx_word_housenumbers ON word + USING btree(word) {{db.tablespace.address_index}} + WHERE type = 'H' and word is not null; GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 3ce4895b..7bc4720e 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -485,18 +485,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Normalize the housenumber and return the word token and the canonical form. """ - norm_name = self._search_normalized(hnr.name) - if not norm_name: - return None, None + analyzer = self.token_analysis.analysis.get('@housenumber') + result = None, None - token = self._cache.housenumbers.get(norm_name) - if token is None: - with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - token = cur.fetchone()[0] - self._cache.housenumbers[norm_name] = token + if analyzer is None: + # When no custom analyzer is set, simply normalize and transliterate + norm_name = self._search_normalized(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + with self.conn.cursor() as cur: + cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + result = cur.fetchone()[0], norm_name + self._cache.housenumbers[norm_name] = result + else: + # Otherwise use the analyzer to determine the canonical name. + # Per convention we use the first variant as the 'lookup name', the + # name that gets saved in the housenumber field of the place. + norm_name = analyzer.normalize(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + variants = analyzer.get_variants_ascii(norm_name) + if variants: + with self.conn.cursor() as cur: + cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", + (norm_name, list(variants))) + result = cur.fetchone()[0], variants[0] + self._cache.housenumbers[norm_name] = result - return token, norm_name + return result def _compute_partial_tokens(self, name):