add query analyser for legacy tokenizer

Sarah Hoffmann 2023-05-22 11:07:14 +02:00
parent 2448cf2a14
commit d8240f9ee4
4 changed files with 514 additions and 10 deletions

@@ -23,6 +23,7 @@ from nominatim.api.types import Point, Bbox, LookupDetails
 from nominatim.api.connection import SearchConnection
 from nominatim.api.logging import log
 from nominatim.api.localization import Locales
+from nominatim.api.search.query_analyzer_factory import make_query_analyzer

 # This file defines complex result data classes.
 # pylint: disable=too-many-instance-attributes
@@ -420,10 +421,12 @@ async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
     result.name_keywords = []
     result.address_keywords = []
-    for name_tokens, address_tokens in await conn.execute(sql):
-        t = conn.t.word
-        sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
+    await make_query_analyzer(conn)
+    t = conn.t.meta.tables['word']
+    sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
+
+    for name_tokens, address_tokens in await conn.execute(sql):
         for row in await conn.execute(sel.where(t.c.word_id == sa.any_(name_tokens))):
             result.name_keywords.append(WordInfo(*row))
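# Editorial sketch, not part of the diff: the 'word' table layout now depends on the
# tokenizer, so the statically declared conn.t.word goes away (see the SearchTables
# hunk further down) and the table is reflected on demand instead:
#
#     await make_query_analyzer(conn)        # registers the tokenizer's 'word' table
#     word = conn.t.meta.tables['word']
#     sel = sa.select(word.c.word_id, word.c.word_token, word.c.word)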

@@ -0,0 +1,263 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses
import sqlalchemy as sa
from nominatim.typing import SaRow
from nominatim.api.connection import SearchConnection
from nominatim.api.logging import log
from nominatim.api.search import query as qmod
from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
""" Return all combinations of words in the terms list after the
given position.
"""
total = len(terms)
for first in range(start, total):
word = terms[first]
yield word, qmod.TokenRange(first, first + 1)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last]))
yield word, qmod.TokenRange(first, last + 1)
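# Editorial example, not part of the commit: what yield_words() produces for a small
# term list; each word combination comes with the half-open slot range it covers.
def _example_yield_words() -> None:
    expansions = list(yield_words(['new', 'york', 'city'], 0))
    assert [w for w, _ in expansions] == ['new', 'new york', 'new york city',
                                          'york', 'york city', 'city']
    # accompanying ranges: (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)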
@dataclasses.dataclass
class LegacyToken(qmod.Token):
""" Specialised token for legacy tokenizer.
"""
word_token: str
category: Optional[Tuple[str, str]]
country: Optional[str]
operator: Optional[str]
@property
def info(self) -> Dict[str, Any]:
""" Dictionary of additional propoerties of the token.
Should only be used for debugging purposes.
"""
return {'category': self.category,
'country': self.country,
'operator': self.operator}
def get_category(self) -> Tuple[str, str]:
assert self.category
return self.category
class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by a legacy tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('class', sa.Text),
sa.Column('type', sa.Text),
sa.Column('country_code', sa.Text),
sa.Column('search_name_count', sa.Integer),
sa.Column('operator', sa.Text))
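# Editorial sketch, not part of the commit: after setup() the reflected legacy 'word'
# table carries the classification columns that the static definition (removed from
# SearchTables below) never had; 'conn' is assumed to point at a legacy-tokenizer DB.
async def _example_reflected_word_table(conn: SearchConnection) -> None:
    ana = LegacyQueryAnalyzer(conn)
    await ana.setup()
    word = conn.t.meta.tables['word']
    assert {'class', 'type', 'country_code',
            'search_name_count', 'operator'} <= set(word.columns.keys())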
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using Legacy tokenizer)')
normalized = []
if phrases:
for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
for p in phrases))):
normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
break
query = qmod.QueryStruct(normalized)
log().var_dump('Normalized query', query.source)
if not query.source:
return query
parts, words = self.split_query(query)
lookup_words = list(words.keys())
log().var_dump('Split query', parts)
log().var_dump('Extracted words', lookup_words)
for row in await self.lookup_in_db(lookup_words):
for trange in words[row.word_token.strip()]:
token, ttype = self.make_token(row)
if ttype == qmod.TokenType.CATEGORY:
if trange.start == 0:
query.add_token(trange, qmod.TokenType.CATEGORY, token)
elif ttype == qmod.TokenType.QUALIFIER:
query.add_token(trange, qmod.TokenType.QUALIFIER, token)
if trange.start == 0 or trange.end == query.num_token_slots():
token = copy(token)
token.penalty += 0.1 * (query.num_token_slots())
query.add_token(trange, qmod.TokenType.CATEGORY, token)
elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
query.add_token(trange, ttype, token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query)
log().table_dump('Word tokens', _dump_word_tokens(query))
return query
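# Editorial usage sketch, not part of the commit; assumes an open SearchConnection
# 'conn' on a database imported with the legacy tokenizer.
async def _example_analyze(conn: SearchConnection) -> qmod.QueryStruct:
    ana = await create_query_analyzer(conn)
    query = await ana.analyze_query([qmod.Phrase(qmod.PhraseType.NONE, 'Main Street 45'),
                                     qmod.Phrase(qmod.PhraseType.NONE, 'Berlin')])
    for i, node in enumerate(query.nodes):
        for tlist in node.starting:
            log().var_dump(f'slot {i} ({tlist.ttype.name})', tlist.tokens)
    return query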
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
Dict[str, List[qmod.TokenRange]]]:
""" Transliterate the phrases and split them into tokens.
Returns a list of transliterated tokens and a dictionary
of words for lookup together with their position.
"""
parts: List[str] = []
phrase_start = 0
words = defaultdict(list)
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for trans in phrase.text.split(' '):
if trans:
for term in trans.split(' '):
if term:
parts.append(trans)
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BreakType.END
return parts, words
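# Editorial illustration, not part of the commit: for the normalized phrases
# 'foo bar' and 'baz', split_query() returns
#     parts == ['foo', 'bar', 'baz']
#     words == {'foo': [TokenRange(0, 1)], 'foo bar': [TokenRange(0, 2)],
#               'bar': [TokenRange(1, 2)], 'baz': [TokenRange(2, 3)]}
# Multi-word lookups never cross a phrase boundary because yield_words() only starts
# combinations at or after phrase_start.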
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
"""
t = self.conn.t.meta.tables['word']
sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))
return await self.conn.execute(sql)
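# Editorial note, not part of the commit: every extracted word is looked up both as a
# partial term and, with the legacy leading blank, as a full word, e.g.
#     await self.lookup_in_db(['foo', 'foo bar'])
# roughly translates to
#     SELECT * FROM word WHERE word_token IN ('foo', 'foo bar', ' foo', ' foo bar')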
def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
""" Create a LegacyToken from the row of the word table.
Also determines the type of token.
"""
penalty = 0.0
is_indexed = True
rowclass = getattr(row, 'class')
if row.country_code is not None:
ttype = qmod.TokenType.COUNTRY
lookup_word = row.country_code
elif rowclass is not None:
if rowclass == 'place' and row.type == 'house':
ttype = qmod.TokenType.HOUSENUMBER
lookup_word = row.word_token[1:]
elif rowclass == 'place' and row.type == 'postcode':
ttype = qmod.TokenType.POSTCODE
lookup_word = row.word_token[1:]
else:
ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
else qmod.TokenType.QUALIFIER
lookup_word = row.word
elif row.word_token.startswith(' '):
ttype = qmod.TokenType.WORD
lookup_word = row.word or row.word_token[1:]
else:
ttype = qmod.TokenType.PARTIAL
lookup_word = row.word_token
penalty = 0.21
if row.search_name_count > self.max_word_freq:
is_indexed = False
return LegacyToken(penalty=penalty, token=row.word_id,
count=row.search_name_count or 1,
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
country=row.country_code,
operator=row.operator,
is_indexed=is_indexed),\
ttype
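# Editorial sketch, not part of the commit: how a word-table row is classified;
# 'ana' is assumed to be a LegacyQueryAnalyzer that has completed setup().
def _example_make_token(ana: LegacyQueryAnalyzer) -> None:
    from types import SimpleNamespace
    row = SimpleNamespace(word_id=42, word_token=' main street', word='main street',
                          type=None, country_code=None, search_name_count=10,
                          operator=None)
    setattr(row, 'class', None)              # 'class' is a keyword, hence setattr
    token, ttype = ana.make_token(row)       # type: ignore[arg-type]
    assert ttype == qmod.TokenType.WORD      # leading blank marks a full word
    assert token.lookup_word == 'main street' and token.is_indexed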
def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for _, node, tlist in query.iter_token_lists():
if tlist.ttype == qmod.TokenType.POSTCODE:
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) > 4):
repl.add_penalty(0.39)
elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
for repl in node.starting:
if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
and (repl.ttype != qmod.TokenType.HOUSENUMBER
or len(tlist.tokens[0].lookup_word) <= 3):
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
for tlist in node.starting:
for token in tlist.tokens:
t = cast(LegacyToken, token)
yield [tlist.ttype.name, t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create and set up a new query analyzer for a database based
on the legacy tokenizer.
"""
out = LegacyQueryAnalyzer(conn)
await out.setup()
return out
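# Editorial end-to-end sketch, not part of the commit; the project directory and
# query text are placeholders, the flow mirrors the tests added below.
async def _example_end_to_end(project_dir: str, text: str) -> qmod.QueryStruct:
    from pathlib import Path
    from nominatim.api import NominatimAPIAsync

    api = NominatimAPIAsync(Path(project_dir), {})
    try:
        async with api.begin() as conn:
            ana = await create_query_analyzer(conn)
            phrases = [qmod.Phrase(qmod.PhraseType.NONE, p) for p in text.split(',')]
            return await ana.analyze_query(phrases)
    finally:
        await api.close()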

@@ -113,13 +113,6 @@ class SearchTables:
                                         sa.Column('postcode', sa.Text),
                                         sa.Column('country_code', sa.String(2)))

-        self.word = sa.Table('word', meta,
-                             sa.Column('word_id', sa.Integer),
-                             sa.Column('word_token', sa.Text, nullable=False),
-                             sa.Column('type', sa.Text, nullable=False),
-                             sa.Column('word', sa.Text),
-                             sa.Column('info', self.types.Json))
-
         self.country_name = sa.Table('country_name', meta,
                                      sa.Column('country_code', sa.String(2)),
                                      sa.Column('name', self.types.Composite),

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for query analyzer for legacy tokenizer.
"""
from pathlib import Path
import pytest
import pytest_asyncio
from nominatim.api import NominatimAPIAsync
from nominatim.api.search.query import Phrase, PhraseType, TokenType, BreakType
import nominatim.api.search.legacy_tokenizer as tok
from nominatim.api.logging import set_log_output, get_and_disable
async def add_word(conn, word_id, word_token, word, count):
t = conn.t.meta.tables['word']
await conn.execute(t.insert(), {'word_id': word_id,
'word_token': word_token,
'search_name_count': count,
'word': word})
async def add_housenumber(conn, word_id, hnr):
t = conn.t.meta.tables['word']
await conn.execute(t.insert(), {'word_id': word_id,
'word_token': ' ' + hnr,
'word': hnr,
'class': 'place',
'type': 'house'})
async def add_postcode(conn, word_id, postcode):
t = conn.t.meta.tables['word']
await conn.execute(t.insert(), {'word_id': word_id,
'word_token': ' ' + postcode,
'word': postcode,
'class': 'place',
'type': 'postcode'})
async def add_special_term(conn, word_id, word_token, cls, typ, op):
t = conn.t.meta.tables['word']
await conn.execute(t.insert(), {'word_id': word_id,
'word_token': word_token,
'word': word_token,
'class': cls,
'type': typ,
'operator': op})
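# Editorial note, not part of the commit: the helpers above mimic the legacy word-table
# conventions that make_token() in the analyzer relies on:
#     word_token without a leading blank      -> partial term
#     word_token with a leading blank         -> full word
#     class='place', type='house'             -> house number token
#     class='place', type='postcode'          -> postcode token
#     other class, operator 'in' or 'near'    -> category (near-search) term
#     other class, any other operator         -> qualifier term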
def make_phrase(query):
return [Phrase(PhraseType.NONE, s) for s in query.split(',')]
@pytest_asyncio.fixture
async def conn(table_factory, temp_db_cursor):
""" Create an asynchronous SQLAlchemy engine for the test DB.
"""
table_factory('nominatim_properties',
definition='property TEXT, value TEXT',
content=(('tokenizer_maxwordfreq', '10000'), ))
table_factory('word',
definition="""word_id INT, word_token TEXT, word TEXT,
class TEXT, type TEXT, country_code TEXT,
search_name_count INT, operator TEXT
""")
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
RETURNS TEXT AS $$ SELECT lower(name); $$ LANGUAGE SQL;""")
api = NominatimAPIAsync(Path('/invalid'), {})
async with api.begin() as conn:
yield conn
await api.close()
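# Editorial note, not part of the commit: the fixture replaces the database-side
# make_standard_name() used by the legacy tokenizer with a plain lower(), so the
# normalized phrases in the tests below are simply the lower-cased input.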
@pytest.mark.asyncio
async def test_empty_phrase(conn):
ana = await tok.create_query_analyzer(conn)
query = await ana.analyze_query([])
assert len(query.source) == 0
assert query.num_token_slots() == 0
@pytest.mark.asyncio
async def test_single_phrase_with_unknown_terms(conn):
ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, 'foo', 'FOO', 3)
query = await ana.analyze_query(make_phrase('foo BAR'))
assert len(query.source) == 1
assert query.source[0].ptype == PhraseType.NONE
assert query.source[0].text == 'foo bar'
assert query.num_token_slots() == 2
assert len(query.nodes[0].starting) == 1
assert not query.nodes[1].starting
@pytest.mark.asyncio
async def test_multiple_phrases(conn):
ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, 'one', 'one', 13)
await add_word(conn, 2, 'two', 'two', 45)
await add_word(conn, 100, 'one two', 'one two', 3)
await add_word(conn, 3, 'three', 'three', 4584)
query = await ana.analyze_query(make_phrase('one two,three'))
assert len(query.source) == 2
@pytest.mark.asyncio
async def test_housenumber_token(conn):
ana = await tok.create_query_analyzer(conn)
await add_housenumber(conn, 556, '45 a')
query = await ana.analyze_query(make_phrase('45 A'))
assert query.num_token_slots() == 2
assert len(query.nodes[0].starting) == 2
query.nodes[0].starting.sort(key=lambda tl: tl.end)
hn1 = query.nodes[0].starting[0]
assert hn1.ttype == TokenType.HOUSENUMBER
assert hn1.end == 1
assert hn1.tokens[0].token == 0
hn2 = query.nodes[0].starting[1]
assert hn2.ttype == TokenType.HOUSENUMBER
assert hn2.end == 2
assert hn2.tokens[0].token == 556
@pytest.mark.asyncio
async def test_postcode_token(conn):
ana = await tok.create_query_analyzer(conn)
await add_postcode(conn, 34, '45ax')
query = await ana.analyze_query(make_phrase('45AX'))
assert query.num_token_slots() == 1
assert [tl.ttype for tl in query.nodes[0].starting] == [TokenType.POSTCODE]
@pytest.mark.asyncio
async def test_partial_tokens(conn):
ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, ' foo', 'foo', 99)
await add_word(conn, 1, 'foo', 'FOO', 99)
await add_word(conn, 1, 'bar', 'FOO', 990000)
query = await ana.analyze_query(make_phrase('foo bar'))
assert query.num_token_slots() == 2
first = query.nodes[0].starting
first.sort(key=lambda tl: tl.tokens[0].penalty)
assert [tl.ttype for tl in first] == [TokenType.WORD, TokenType.PARTIAL]
assert all(tl.tokens[0].lookup_word == 'foo' for tl in first)
second = query.nodes[1].starting
assert [tl.ttype for tl in second] == [TokenType.PARTIAL]
assert not second[0].tokens[0].is_indexed
@pytest.mark.asyncio
@pytest.mark.parametrize('term,order', [('23456', ['POSTCODE', 'HOUSENUMBER', 'WORD', 'PARTIAL']),
('3', ['HOUSENUMBER', 'POSTCODE', 'WORD', 'PARTIAL'])
])
async def test_penalty_postcodes_and_housenumbers(conn, term, order):
ana = await tok.create_query_analyzer(conn)
await add_postcode(conn, 1, term)
await add_housenumber(conn, 2, term)
await add_word(conn, 3, term, term, 5)
await add_word(conn, 4, ' ' + term, term, 1)
query = await ana.analyze_query(make_phrase(term))
assert query.num_token_slots() == 1
torder = [(tl.tokens[0].penalty, tl.ttype) for tl in query.nodes[0].starting]
print(query.nodes[0].starting)
torder.sort()
assert [t[1] for t in torder] == [TokenType[o] for o in order]
@pytest.mark.asyncio
async def test_category_words_only_at_beginning(conn):
ana = await tok.create_query_analyzer(conn)
await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', 'in')
await add_word(conn, 2, ' bar', 'BAR', 1)
query = await ana.analyze_query(make_phrase('foo BAR foo'))
assert query.num_token_slots() == 3
assert len(query.nodes[0].starting) == 1
assert query.nodes[0].starting[0].ttype == TokenType.CATEGORY
assert not query.nodes[2].starting
@pytest.mark.asyncio
async def test_qualifier_words(conn):
ana = await tok.create_query_analyzer(conn)
await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', '-')
await add_word(conn, 2, ' bar', 'w', None)
query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo'))
assert query.num_token_slots() == 5
assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.CATEGORY, TokenType.QUALIFIER}
assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER}
assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.CATEGORY, TokenType.QUALIFIER}
@pytest.mark.asyncio
@pytest.mark.parametrize('logtype', ['text', 'html'])
async def test_log_output(conn, logtype):
ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, 'foo', 'FOO', 99)
set_log_output(logtype)
await ana.analyze_query(make_phrase('foo'))
assert get_and_disable()