switch word tokens to new word table layout

This commit is contained in:
Sarah Hoffmann 2021-07-21 10:41:38 +02:00
parent 4342b28882
commit 70f154be8b
4 changed files with 58 additions and 86 deletions

View File

@ -148,8 +148,9 @@ class Tokenizer
// Check which tokens we have, get the ID numbers
$sSQL = 'SELECT word_id, word_token, type';
$sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
$sSQL .= " info->>'word' as word, info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as type";
$sSQL .= " info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as type,";
$sSQL .= " info->>'count' as count";
$sSQL .= ' FROM word WHERE word_token in (';
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
@ -190,8 +191,6 @@ class Tokenizer
break;
'S': // tokens for classification terms (special phrases)
if ($aWord['class'] === null || $aWord['type'] === null
|| $aWord['word'] === null
|| strpos($sNormQuery, $aWord['word']) === false
) {
continue;
}
@ -202,58 +201,23 @@ class Tokenizer
$aWord['op'] ? Operator::NEAR : Operator::NONE
);
break;
'W': // full-word tokens
$oToken = new Token\Word(
$iId,
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
);
break;
'w': // partial word terms
$oToken = new Token\Partial(
$iId,
$aWord['word_token'],
(int) $aWord['count']
);
break;
default:
continue;
}
/* if ($aWord['class']) {
// Special terms need to appear in their normalized form.
// (postcodes are not normalized in the word table)
$sNormWord = $this->normalizeString($aWord['word']);
if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
continue;
}
if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
$oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
} elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
if ($aWord['word']
&& pg_escape_string($aWord['word']) == $aWord['word']
) {
$oToken = new Token\Postcode(
$iId,
$aWord['word'],
$aWord['country_code']
);
}
} else {
// near and in operator the same at the moment
$oToken = new Token\SpecialTerm(
$iId,
$aWord['class'],
$aWord['type'],
$aWord['operator'] ? Operator::NEAR : Operator::NONE
);
}
} elseif ($aWord['country_code']) {
// Filter country tokens that do not match restricted countries.
if (!$this->aCountryRestriction
|| in_array($aWord['country_code'], $this->aCountryRestriction)
) {
$oToken = new Token\Country($iId, $aWord['country_code']);
}
} elseif ($aWord['word_token'][0] == ' ') {
$oToken = new Token\Word(
$iId,
(int) $aWord['count'],
substr_count($aWord['word_token'], ' ')
);
} else {
$oToken = new Token\Partial(
$iId,
$aWord['word_token'],
(int) $aWord['count']
);
}*/
$oValidTokens->addToken($aWord['word_token'], $oToken);
}

View File

@ -15,7 +15,12 @@ CREATE INDEX idx_word_country_names ON word
-- Used when inserting new postcodes on updates.
CREATE INDEX idx_word_postcodes ON word
USING btree((info->>'postcode')) {{db.tablespace.address_index}}
WHERE type = 'P'
WHERE type = 'P';
-- Used when inserting full words.
CREATE INDEX idx_word_full_word ON word
USING btree((info->>'word')) {{db.tablespace.address_index}}
WHERE type = 'W';
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;

View File

@ -98,12 +98,14 @@ DECLARE
term_count INTEGER;
BEGIN
SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null;
FROM word WHERE info->>'word' = norm_term and type = 'W';
IF full_token IS NULL THEN
full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
INSERT INTO word (word_id, word_token, info)
SELECT full_token, lookup_term,
json_build_object('word', norm_term, 'count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
@ -115,14 +117,14 @@ BEGIN
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null;
SELECT min(word_id), max(info->>'count') INTO term_id, term_count
FROM word WHERE word_token = term and type = 'w';
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (term_id, term, 0);
INSERT INTO word (word_id, word_token, info)
VALUES (term_id, term, json_build_object('count', term_count));
END IF;
IF term_count < {{ max_word_freq }} THEN

View File

@ -74,13 +74,11 @@ class LegacyICUTokenizer:
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config):
def finalize_import(self, _):
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
pass
def update_sql_functions(self, config):
@ -121,18 +119,17 @@ class LegacyICUTokenizer:
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
# pylint: disable=missing-format-attribute
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {0.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, phpdir)))
@define('CONST_Max_Word_Frequency', {self.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
def _save_config(self, config):
@ -175,14 +172,14 @@ class LegacyICUTokenizer:
# copy them back into the word table
with CopyBuffer() as copystr:
for args in words.items():
copystr.add(*args)
for k, v in words.items():
copystr.add('w', k, {'count': v})
with conn.cursor() as cur:
copystr.copy_out(cur, 'word',
columns=['word_token', 'search_name_count'])
columns=['type', 'word_token', 'info'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null""")
WHERE word_id is null and type = 'w'""")
conn.commit()
@ -229,22 +226,26 @@ class LegacyICUNameAnalyzer:
The function is used for testing and debugging only
and not necessarily efficient.
"""
tokens = {}
full_tokens = {}
partial_tokens = {}
for word in words:
if word.startswith('#'):
tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
else:
tokens[word] = self.name_processor.get_search_normalized(word)
partial_tokens[word] = self.name_processor.get_search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = t.term
and class is null and country_code is null""",
(list(tokens.values()), ))
cur.execute("""(SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'W')
UNION
(SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'w')""",
(list(full_tokens.values()),
list(partial_tokens.values())))
ids = {r[0]: r[1] for r in cur}
return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
@staticmethod
@ -290,7 +291,7 @@ class LegacyICUNameAnalyzer:
if to_delete:
cur.execute("""DELETE FROM WORD
WHERE class ='P' and info->>'postcode' = any(%s)
WHERE type ='P' and info->>'postcode' = any(%s)
""", (to_delete, ))
copystr.copy_out(cur, 'word',