diff --git a/lib-php/init-website.php b/lib-php/init-website.php
index d6cc8a24..6f3b5545 100644
--- a/lib-php/init-website.php
+++ b/lib-php/init-website.php
@@ -12,7 +12,7 @@ require_once(CONST_Debug ? 'DebugHtml.php' : 'DebugNone.php');
 
 function userError($sMsg)
 {
-    throw new Exception($sMsg, 400);
+    throw new \Exception($sMsg, 400);
 }
 
 
@@ -37,7 +37,7 @@ function shutdown_exception_handler_xml()
 {
     $error = error_get_last();
     if ($error !== null && $error['type'] === E_ERROR) {
-        exception_handler_xml(new Exception($error['message'], 500));
+        exception_handler_xml(new \Exception($error['message'], 500));
     }
 }
 
@@ -45,7 +45,7 @@ function shutdown_exception_handler_json()
 {
     $error = error_get_last();
     if ($error !== null && $error['type'] === E_ERROR) {
-        exception_handler_json(new Exception($error['message'], 500));
+        exception_handler_json(new \Exception($error['message'], 500));
     }
 }
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index eac964e4..3751e821 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -19,13 +19,13 @@ class Tokenizer
 
     public function checkStatus()
     {
-        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
+        $sSQL = 'SELECT word_id FROM word limit 1';
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
+            throw new \Exception('Query failed', 703);
         }
         if (!$iWordID) {
-            throw new Exception('No value', 704);
+            throw new \Exception('No value', 704);
         }
     }
 
@@ -55,9 +55,8 @@ class Tokenizer
     {
         $aResults = array();
 
-        $sSQL = 'SELECT word_id, class, type FROM word ';
-        $sSQL .= ' WHERE word_token = \' \' || :term';
-        $sSQL .= ' AND class is not null AND class not in (\'place\')';
+        $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type ";
+        $sSQL .= ' FROM word WHERE word_token = :term and type = \'S\'';
 
         Debug::printVar('Term', $sTerm);
         Debug::printSQL($sSQL);
 
@@ -146,8 +145,10 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
    {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
-        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
+        $sSQL = 'SELECT word_id, word_token, type, word,';
+        $sSQL .= "       info->>'op' as operator,";
+        $sSQL .= "       info->>'class' as class, info->>'type' as ctype,";
+        $sSQL .= "       info->>'count' as count";
         $sSQL .= ' FROM word WHERE word_token in (';
         $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
 
@@ -156,66 +157,66 @@ class Tokenizer
         $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
 
         foreach ($aDBWords as $aWord) {
-            $oToken = null;
             $iId = (int) $aWord['word_id'];
+            $sTok = $aWord['word_token'];
 
-            if ($aWord['class']) {
-                // Special terms need to appear in their normalized form.
-                // (postcodes are not normalized in the word table)
-                $sNormWord = $this->normalizeString($aWord['word']);
-                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
-                    continue;
-                }
-
-                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
-                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
-                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
-                    if ($aWord['word']
-                        && pg_escape_string($aWord['word']) == $aWord['word']
+            switch ($aWord['type']) {
+                case 'C':  // country name tokens
+                    if ($aWord['word'] !== null
+                        && (!$this->aCountryRestriction
+                            || in_array($aWord['word'], $this->aCountryRestriction))
                     ) {
-                        $oToken = new Token\Postcode(
-                            $iId,
-                            $aWord['word'],
-                            $aWord['country_code']
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Country($iId, $aWord['word'])
                         );
                     }
-                } else {
-                    // near and in operator the same at the moment
-                    $oToken = new Token\SpecialTerm(
+                    break;
+                case 'H':  // house number tokens
+                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
+                    break;
+                case 'P':  // postcode tokens
+                    // Postcodes are not normalized, so they may have content
+                    // that makes SQL injection possible. Reject postcodes
+                    // that would need special escaping.
+                    if ($aWord['word'] !== null
+                        && pg_escape_string($aWord['word']) == $aWord['word']
+                    ) {
+                        $sNormPostcode = $this->normalizeString($aWord['word']);
+                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
+                            $oValidTokens->addToken(
+                                $sTok,
+                                new Token\Postcode($iId, $aWord['word'], null)
+                            );
+                        }
+                    }
+                    break;
+                case 'S':  // tokens for classification terms (special phrases)
+                    if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
+                        $oValidTokens->addToken($sTok, new Token\SpecialTerm(
+                            $iId,
+                            $aWord['class'],
+                            $aWord['ctype'],
+                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
+                        ));
+                    }
+                    break;
+                case 'W':  // full-word tokens
+                    $oValidTokens->addToken($sTok, new Token\Word(
                         $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
-                    );
-                }
-            } elseif ($aWord['country_code']) {
-                // Filter country tokens that do not match restricted countries.
-                if (!$this->aCountryRestriction
-                    || in_array($aWord['country_code'], $this->aCountryRestriction)
-                ) {
-                    $oToken = new Token\Country($iId, $aWord['country_code']);
-                }
-            } elseif ($aWord['word_token'][0] == ' ') {
-                $oToken = new Token\Word(
-                    $iId,
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
-                );
-            } else {
-                $oToken = new Token\Partial(
-                    $iId,
-                    $aWord['word_token'],
-                    (int) $aWord['count']
-                );
-            }
-
-            if ($oToken) {
-                // remove any leading spaces
-                if ($aWord['word_token'][0] == ' ') {
-                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
-                } else {
-                    $oValidTokens->addToken($aWord['word_token'], $oToken);
-                }
+                        (int) $aWord['count'],
+                        substr_count($aWord['word_token'], ' ')
+                    ));
+                    break;
+                case 'w':  // partial word terms
+                    $oValidTokens->addToken($sTok, new Token\Partial(
+                        $iId,
+                        $aWord['word_token'],
+                        (int) $aWord['count']
+                    ));
+                    break;
+                default:
+                    break;
             }
         }
     }
 
@@ -234,12 +235,10 @@ class Tokenizer
 
         for ($i = 0; $i < $iNumWords; $i++) {
             $sPhrase = $aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
             $aTokens[$sPhrase] = $sPhrase;
 
             for ($j = $i + 1; $j < $iNumWords; $j++) {
                 $sPhrase .= ' '.$aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
                 $aTokens[$sPhrase] = $sPhrase;
             }
         }
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
index 064b4166..570b8828 100644
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -19,20 +19,20 @@ class Tokenizer
     {
         $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
         if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
+            throw new \Exception('Module failed', 701);
         }
 
         if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
+            throw new \Exception('Module call failed', 702);
         }
 
         $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
+            throw new \Exception('Query failed', 703);
         }
         if (!$iWordID) {
-            throw new Exception('No value', 704);
+            throw new \Exception('No value', 704);
         }
     }
diff --git a/lib-php/website/details.php b/lib-php/website/details.php
index c16725e2..0d67ec83 100644
--- a/lib-php/website/details.php
+++ b/lib-php/website/details.php
@@ -83,7 +83,7 @@ if ($sOsmType && $iOsmId > 0) {
     }
 
     if ($sPlaceId === false) {
-        throw new Exception('No place with that OSM ID found.', 404);
+        throw new \Exception('No place with that OSM ID found.', 404);
     }
 } else {
     if ($sPlaceId === false) {
@@ -146,7 +146,7 @@ $sSQL .= " WHERE place_id = $iPlaceID";
 $aPointDetails = $oDB->getRow($sSQL, null, 'Could not get details of place object.');
 
 if (!$aPointDetails) {
-    throw new Exception('No place with that place ID found.', 404);
+    throw new \Exception('No place with that place ID found.', 404);
 }
 
 $aPointDetails['localname'] = $aPointDetails['localname']?$aPointDetails['localname']:$aPointDetails['housenumber'];
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql
new file mode 100644
index 00000000..7ec3c6f8
--- /dev/null
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -0,0 +1,29 @@
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  type text NOT NULL,
+  word text,
+  info jsonb
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+    USING BTREE (word_token) {{db.tablespace.search_index}};
+-- Used when updating country names from the boundary relation.
+CREATE INDEX idx_word_country_names ON word
+    USING btree(word) {{db.tablespace.address_index}}
+    WHERE type = 'C';
+-- Used when inserting new postcodes on updates.
+CREATE INDEX idx_word_postcodes ON word
+    USING btree(word) {{db.tablespace.address_index}}
+    WHERE type = 'P';
+-- Used when inserting full words.
+CREATE INDEX idx_word_full_word ON word
+    USING btree(word) {{db.tablespace.address_index}}
+    WHERE type = 'W';
+
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index 686137de..ffe6648c 100644
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -98,12 +98,14 @@ DECLARE
   term_count INTEGER;
 BEGIN
   SELECT min(word_id) INTO full_token
-    FROM word WHERE word = norm_term and class is null and country_code is null;
+    FROM word WHERE word = norm_term and type = 'W';
 
   IF full_token IS NULL THEN
     full_token := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, word, search_name_count)
-      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT full_token, lookup_term, 'W', norm_term,
+             json_build_object('count', 0)
+        FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
   FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
@@ -115,14 +117,14 @@ BEGIN
 
   partial_tokens := '{}'::INT[];
   FOR term IN SELECT unnest(partial_terms) LOOP
-    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
-      FROM word WHERE word_token = term and class is null and country_code is null;
+    SELECT min(word_id), max(info->>'count') INTO term_id, term_count
+      FROM word WHERE word_token = term and type = 'w';
 
     IF term_id IS NULL THEN
      term_id := nextval('seq_word');
       term_count := 0;
-      INSERT INTO word (word_id, word_token, search_name_count)
-        VALUES (term_id, term, 0);
+      INSERT INTO word (word_id, word_token, type, info)
+        VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;
 
     IF term_count < {{ max_word_freq }} THEN
@@ -140,15 +142,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
 DECLARE
   return_id INTEGER;
 BEGIN
-  SELECT min(word_id) INTO return_id
-    FROM word
-   WHERE word_token = ' ' || lookup_term
-         and class = 'place' and type = 'house';
+  SELECT min(word_id) INTO return_id FROM word
+    WHERE word_token = lookup_term and type = 'H';
 
   IF return_id IS NULL THEN
     return_id := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, class, type, search_name_count)
-      VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0);
+    INSERT INTO word (word_id, word_token, type)
+      VALUES (return_id, lookup_term, 'H');
   END IF;
 
   RETURN return_id;
diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py
index 9a4a41a5..bb7faa25 100644
--- a/nominatim/db/utils.py
+++ b/nominatim/db/utils.py
@@ -65,6 +65,7 @@
 _SQL_TRANSLATION = {ord(u'\\'): u'\\\\',
                     ord(u'\t'): u'\\t',
                     ord(u'\n'): u'\\n'}
 
+
 class CopyBuffer:
     """ Data collector for the copy_from command. """
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 6d3d11c1..a887ae28 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module.
""" from collections import Counter import itertools +import json import logging import re from textwrap import dedent @@ -74,13 +75,10 @@ class LegacyICUTokenizer: self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) - def finalize_import(self, config): + def finalize_import(self, _): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') def update_sql_functions(self, config): @@ -121,18 +119,17 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - # pylint: disable=missing-format-attribute + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'class' = in_class and info->>'type' = in_type + and ((op = '-' and info->>'op' is null) or op = info->>'op') + """, to_delete) return len(to_delete) @@ -371,22 +378,28 @@ class LegacyICUNameAnalyzer: """ word_tokens = set() for name in self._compute_full_names(names): - if name: - word_tokens.add(' ' + self.name_processor.get_search_normalized(name)) + norm_name = self.name_processor.get_search_normalized(name) + if norm_name: + word_tokens.add(norm_name) with self.conn.cursor() as cur: # Get existing names - cur.execute("SELECT word_token FROM word WHERE country_code = %s", + cur.execute("""SELECT word_token FROM word + WHERE type = 'C' and word = %s""", (country_code, )) word_tokens.difference_update((t[0] for t in cur)) + # Only add those names that are not yet in the list. if word_tokens: - cur.execute("""INSERT INTO word (word_id, word_token, country_code, - search_name_count) - (SELECT nextval('seq_word'), token, %s, 0 + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT token, 'C', %s FROM unnest(%s) as token) """, (country_code, list(word_tokens))) + # No names are deleted at the moment. + # If deletion is made possible, then the static names from the + # initial 'country_name' table should be kept. + def process_place(self, place): """ Determine tokenizer information about the given place. @@ -497,14 +510,12 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and word = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) @@ -595,7 +606,8 @@ class _TokenCache: def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. 
""" tokens = [] askdb = [] diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 6102e99b..4c839db0 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -134,9 +134,7 @@ Feature: Import of postcodes Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Different postcodes with the same normalization can both be found Given the places diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature index 94550ffd..c2fb30ce 100644 --- a/test/bdd/db/update/postcode.feature +++ b/test/bdd/db/update/postcode.feature @@ -18,10 +18,7 @@ Feature: Update of postcode | country | postcode | geometry | | de | 01982 | country:de | | ch | 4567 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | - | 4567 | place | postcode | + And there are word tokens for postcodes 01982,4567 Scenario: When the last postcode is deleted, it is deleted from postcode and word Given the places @@ -34,12 +31,8 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 4567 | country:ch | - And word contains not - | word | class | type | - | 01982 | place | postcode | - And word contains - | word | class | type | - | 4567 | place | postcode | + And there are word tokens for postcodes 4567 + And there are no word tokens for postcodes 01982 Scenario: A postcode is not deleted from postcode and word when it exist in another country Given the places @@ -52,9 +45,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 01982 | country:ch | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 Scenario: Updating a postcode is reflected in postcode table Given the places @@ -68,9 +59,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing from a postcode type, the entry appears in placex When importing @@ -91,9 +80,7 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And word contains - | word | class | type | - | 20453 | place | postcode | + And there are word tokens for postcodes 20453 Scenario: When changing to a postcode type, the entry disappears from placex When importing @@ -114,6 +101,4 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And word contains - | word | class | type | - | 01982 | place | postcode | + And there are word tokens for postcodes 01982 diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index b4f0d853..ac61fc67 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -266,20 +266,36 @@ def check_location_postcode(context): db_row.assert_row(row, ('country', 'postcode')) -@then("word contains(?P not)?") -def check_word_table(context, exclude): - """ Check the contents of the word table. Each row represents a table row - and all data must match. 
Data not present in the expected table, may - be arbitry. The rows are identified via all given columns. +@then("there are(?P no)? word tokens for postcodes (?P.*)") +def check_word_table_for_postcodes(context, exclude, postcodes): + """ Check that the tokenizer produces postcode tokens for the given + postcodes. The postcodes are a comma-separated list of postcodes. + Whitespace matters. """ + nctx = context.nominatim + tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config()) + with tokenizer.name_analyzer() as ana: + plist = [ana.normalize_postcode(p) for p in postcodes.split(',')] + + plist.sort() + with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - for row in context.table: - wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings]) - cur.execute("SELECT * from word WHERE " + wheres, list(row.cells)) - if exclude: - assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values) - else: - assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values) + if nctx.tokenizer == 'legacy_icu': + cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)", + (plist,)) + else: + cur.execute("""SELECT word FROM word WHERE word = any(%s) + and class = 'place' and type = 'postcode'""", + (plist,)) + + found = [row[0] for row in cur] + assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}" + + if exclude: + assert len(found) == 0, f"Unexpected postcodes: {found}" + else: + assert set(found) == set(plist), \ + f"Missing postcodes {set(plist) - set(found)}. Found: {found}" @then("place_addressline contains") def check_place_addressline(context): diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py new file mode 100644 index 00000000..cde5e770 --- /dev/null +++ b/test/python/mock_icu_word_table.py @@ -0,0 +1,84 @@ +""" +Legacy word table for testing with functions to prefil and test contents +of the table. +""" + +class MockIcuWordTable: + """ A word table for testing using legacy word table structure. 
+ """ + def __init__(self, conn): + self.conn = conn + with conn.cursor() as cur: + cur.execute("""CREATE TABLE word (word_id INTEGER, + word_token text NOT NULL, + type text NOT NULL, + word text, + info jsonb)""") + + conn.commit() + + def add_special(self, word_token, word, cls, typ, oper): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, type, word, info) + VALUES (%s, 'S', %s, + json_build_object('class', %s, + 'type', %s, + 'op', %s)) + """, (word_token, word, cls, typ, oper)) + self.conn.commit() + + + def add_country(self, country_code, word_token): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, type, word) + VALUES(%s, 'C', %s)""", + (word_token, country_code)) + self.conn.commit() + + + def add_postcode(self, word_token, postcode): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, type, word) + VALUES (%s, 'P', %s) + """, (word_token, postcode)) + self.conn.commit() + + + def count(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word") + + + def count_special(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'") + + + def get_special(self): + with self.conn.cursor() as cur: + cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'") + result = set(((row[0], row[2], row[1]['class'], + row[1]['type'], row[1]['op']) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_country(self): + with self.conn.cursor() as cur: + cur.execute("SELECT word, word_token FROM word WHERE type = 'C'") + result = set((tuple(row) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_postcodes(self): + with self.conn.cursor() as cur: + cur.execute("SELECT word FROM word WHERE type = 'P'") + return set((row[0] for row in cur)) + + + def get_partial_words(self): + with self.conn.cursor() as cur: + cur.execute("SELECT word_token, info FROM word WHERE type ='w'") + return set(((row[0], row[1]['count']) for row in cur)) + diff --git a/test/python/mock_legacy_word_table.py b/test/python/mock_legacy_word_table.py new file mode 100644 index 00000000..8baf3adc --- /dev/null +++ b/test/python/mock_legacy_word_table.py @@ -0,0 +1,86 @@ +""" +Legacy word table for testing with functions to prefil and test contents +of the table. +""" + +class MockLegacyWordTable: + """ A word table for testing using legacy word table structure. 
+ """ + def __init__(self, conn): + self.conn = conn + with conn.cursor() as cur: + cur.execute("""CREATE TABLE word (word_id INTEGER, + word_token text, + word text, + class text, + type text, + country_code varchar(2), + search_name_count INTEGER, + operator TEXT)""") + + conn.commit() + + def add_special(self, word_token, word, cls, typ, oper): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (%s, %s, %s, %s, %s) + """, (word_token, word, cls, typ, oper)) + self.conn.commit() + + + def add_country(self, country_code, word_token): + with self.conn.cursor() as cur: + cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)", + (word_token, country_code)) + self.conn.commit() + + + def add_postcode(self, word_token, postcode): + with self.conn.cursor() as cur: + cur.execute("""INSERT INTO word (word_token, word, class, type) + VALUES (%s, %s, 'place', 'postcode') + """, (word_token, postcode)) + self.conn.commit() + + + def count(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word") + + + def count_special(self): + with self.conn.cursor() as cur: + return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'") + + + def get_special(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word_token, word, class, type, operator + FROM word WHERE class != 'place'""") + result = set((tuple(row) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_country(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT country_code, word_token + FROM word WHERE country_code is not null""") + result = set((tuple(row) for row in cur)) + assert len(result) == cur.rowcount, "Word table has duplicates." + return result + + + def get_postcodes(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word FROM word + WHERE class = 'place' and type = 'postcode'""") + return set((row[0] for row in cur)) + + def get_partial_words(self): + with self.conn.cursor() as cur: + cur.execute("""SELECT word_token, search_name_count FROM word + WHERE class is null and country_code is null + and not word_token like ' %'""") + return set((tuple(row) for row in cur)) + diff --git a/test/python/mocks.py b/test/python/mocks.py index f9faaa93..7f7aaafc 100644 --- a/test/python/mocks.py +++ b/test/python/mocks.py @@ -7,6 +7,9 @@ import psycopg2.extras from nominatim.db import properties +# This must always point to the mock word table for the default tokenizer. +from mock_legacy_word_table import MockLegacyWordTable as MockWordTable + class MockParamCapture: """ Mock that records the parameters with which a function was called as well as the number of calls. @@ -24,88 +27,6 @@ class MockParamCapture: return self.return_value -class MockWordTable: - """ A word table for testing. 
- """ - def __init__(self, conn): - self.conn = conn - with conn.cursor() as cur: - cur.execute("""CREATE TABLE word (word_id INTEGER, - word_token text, - word text, - class text, - type text, - country_code varchar(2), - search_name_count INTEGER, - operator TEXT)""") - - conn.commit() - - def add_special(self, word_token, word, cls, typ, oper): - with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (%s, %s, %s, %s, %s) - """, (word_token, word, cls, typ, oper)) - self.conn.commit() - - - def add_country(self, country_code, word_token): - with self.conn.cursor() as cur: - cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)", - (word_token, country_code)) - self.conn.commit() - - - def add_postcode(self, word_token, postcode): - with self.conn.cursor() as cur: - cur.execute("""INSERT INTO word (word_token, word, class, type) - VALUES (%s, %s, 'place', 'postcode') - """, (word_token, postcode)) - self.conn.commit() - - - def count(self): - with self.conn.cursor() as cur: - return cur.scalar("SELECT count(*) FROM word") - - - def count_special(self): - with self.conn.cursor() as cur: - return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'") - - - def get_special(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") - result = set((tuple(row) for row in cur)) - assert len(result) == cur.rowcount, "Word table has duplicates." - return result - - - def get_country(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT country_code, word_token - FROM word WHERE country_code is not null""") - result = set((tuple(row) for row in cur)) - assert len(result) == cur.rowcount, "Word table has duplicates." - return result - - - def get_postcodes(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class = 'place' and type = 'postcode'""") - return set((row[0] for row in cur)) - - def get_partial_words(self): - with self.conn.cursor() as cur: - cur.execute("""SELECT word_token, search_name_count FROM word - WHERE class is null and country_code is null - and not word_token like ' %'""") - return set((tuple(row) for row in cur)) - - class MockPlacexTable: """ A placex table for testing. 
""" diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 545cc58f..9eea7ed1 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -1,6 +1,8 @@ """ Tests for DB utility functions in db.utils """ +import json + import pytest import nominatim.db.utils as db_utils @@ -115,3 +117,38 @@ class TestCopyBuffer: +class TestCopyBufferJson: + TABLE_NAME = 'copytable' + + @pytest.fixture(autouse=True) + def setup_test_table(self, table_factory): + table_factory(self.TABLE_NAME, 'colA INT, colB JSONB') + + + def table_rows(self, cursor): + cursor.execute('SELECT * FROM ' + self.TABLE_NAME) + results = {k: v for k,v in cursor} + + assert len(results) == cursor.rowcount + + return results + + + def test_json_object(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'test': 'value', 'number': 1})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'test': 'value', 'number': 1}} + + + def test_json_object_special_chras(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'te\tst': 'va\nlue', 'nu"mber': None})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'te\tst': 'va\nlue', 'nu"mber': None}} diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py index 39fc9fb4..ed489662 100644 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@ -11,6 +11,12 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties +from mock_icu_word_table import MockIcuWordTable + +@pytest.fixture +def word_table(temp_db_conn): + return MockIcuWordTable(temp_db_conn) + @pytest.fixture def test_config(def_config, tmp_path): @@ -21,8 +27,8 @@ def test_config(def_config, tmp_path): sqldir.mkdir() (sqldir / 'tokenizer').mkdir() (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'") - shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'), - str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql')) + shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'), + str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql')) def_config.lib_dir.sql = sqldir @@ -88,12 +94,14 @@ DECLARE term_count INTEGER; BEGIN SELECT min(word_id) INTO full_token - FROM word WHERE word = norm_term and class is null and country_code is null; + FROM word WHERE info->>'word' = norm_term and type = 'W'; IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, word, search_name_count) - SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; + INSERT INTO word (word_id, word_token, type, info) + SELECT full_token, lookup_term, 'W', + json_build_object('word', norm_term, 'count', 0) + FROM unnest(lookup_terms) as lookup_term; END IF; FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP @@ -105,18 +113,18 @@ BEGIN partial_tokens := '{}'::INT[]; FOR term IN SELECT unnest(partial_terms) LOOP - SELECT min(word_id), max(search_name_count) INTO term_id, term_count - FROM word WHERE word_token = term and class is null and country_code is null; + SELECT min(word_id), max(info->>'count') INTO term_id, term_count + FROM word WHERE word_token = term and type = 'w'; IF term_id IS 
NULL THEN term_id := nextval('seq_word'); term_count := 0; - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (term_id, term, 0); + INSERT INTO word (word_id, word_token, type, info) + VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; IF NOT (ARRAY[term_id] <@ partial_tokens) THEN - partial_tokens := partial_tokens || term_id; + partial_tokens := partial_tokens || term_id; END IF; END LOOP; END; @@ -232,14 +240,14 @@ def test_update_special_phrase_empty_table(analyzer, word_table): ], True) assert word_table.get_special() \ - == {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'), - (' KÖNIGE', 'Könige', 'amenity', 'royal', None), - (' STREET', 'street', 'highway', 'primary', 'in')} + == {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'), + ('KÖNIGE', 'Könige', 'amenity', 'royal', None), + ('STREET', 'street', 'highway', 'primary', 'in')} def test_update_special_phrase_delete_all(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -250,8 +258,8 @@ def test_update_special_phrase_delete_all(analyzer, word_table): def test_update_special_phrases_no_replace(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -262,8 +270,8 @@ def test_update_special_phrases_no_replace(analyzer, word_table): def test_update_special_phrase_modify(analyzer, word_table): - word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' BAR', 'bar', 'highway', 'road', None) + word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in') + word_table.add_special('BAR', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 @@ -275,25 +283,25 @@ def test_update_special_phrase_modify(analyzer, word_table): ], True) assert word_table.get_special() \ - == {(' PRISON', 'prison', 'amenity', 'prison', 'in'), - (' BAR', 'bar', 'highway', 'road', None), - (' GARDEN', 'garden', 'leisure', 'garden', 'near')} + == {('PRISON', 'prison', 'amenity', 'prison', 'in'), + ('BAR', 'bar', 'highway', 'road', None), + ('GARDEN', 'garden', 'leisure', 'garden', 'near')} def test_add_country_names_new(analyzer, word_table): with analyzer() as anl: anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'}) - assert word_table.get_country() == {('es', ' ESPAGÑA'), ('es', ' SPAIN')} + assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')} def test_add_country_names_extend(analyzer, word_table): - word_table.add_country('ch', ' SCHWEIZ') + word_table.add_country('ch', 'SCHWEIZ') with analyzer() as anl: anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'}) - assert word_table.get_country() == {('ch', ' SCHWEIZ'), ('ch', ' SUISSE')} + assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')} class TestPlaceNames: @@ -307,6 +315,7 @@ class TestPlaceNames: def expect_name_terms(self, info, *expected_terms): tokens = self.analyzer.get_word_token_info(expected_terms) + print (tokens) for token in tokens: assert token[2] is not None, "No token for 
{0}".format(token) @@ -316,7 +325,7 @@ class TestPlaceNames: def test_simple_names(self): info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) - self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34') + self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) @@ -339,7 +348,7 @@ class TestPlaceNames: 'country_feature': 'no'}) self.expect_name_terms(info, '#norge', 'norge') - assert word_table.get_country() == {('no', ' NORGE')} + assert word_table.get_country() == {('no', 'NORGE')} class TestPlaceAddress: