From 3bf489cd7c5eec14e56ea6e95156f2209762828a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 22 May 2023 15:49:03 +0200 Subject: [PATCH] implement token assignment --- nominatim/api/search/query.py | 18 + nominatim/api/search/token_assignment.py | 345 ++++++++++++++++++ .../api/search/test_token_assignment.py | 327 +++++++++++++++++ 3 files changed, 690 insertions(+) create mode 100644 nominatim/api/search/token_assignment.py create mode 100644 test/python/api/search/test_token_assignment.py diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py index 4e28d365..2ba49bbe 100644 --- a/nominatim/api/search/query.py +++ b/nominatim/api/search/query.py @@ -114,6 +114,24 @@ class TokenRange(NamedTuple): start: int end: int + def replace_start(self, new_start: int) -> 'TokenRange': + """ Return a new token range with the new start. + """ + return TokenRange(new_start, self.end) + + + def replace_end(self, new_end: int) -> 'TokenRange': + """ Return a new token range with the new end. + """ + return TokenRange(self.start, new_end) + + + def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']: + """ Split the span into two spans at the given index. + The index must be within the span. + """ + return self.replace_end(index), self.replace_start(index) + @dataclasses.dataclass class TokenList: diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py new file mode 100644 index 00000000..e9c03d3f --- /dev/null +++ b/nominatim/api/search/token_assignment.py @@ -0,0 +1,345 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Create query interpretations where each vertex in the query is assigned +a specific function (expressed as a token type). 
+""" +from typing import Optional, List, Iterator +import dataclasses + +import nominatim.api.search.query as qmod +from nominatim.api.logging import log + +# pylint: disable=too-many-return-statements,too-many-branches + +@dataclasses.dataclass +class TypedRange: + """ A token range for a specific type of tokens. + """ + ttype: qmod.TokenType + trange: qmod.TokenRange + + +PENALTY_TOKENCHANGE = { + qmod.BreakType.START: 0.0, + qmod.BreakType.END: 0.0, + qmod.BreakType.PHRASE: 0.0, + qmod.BreakType.WORD: 0.1, + qmod.BreakType.PART: 0.2, + qmod.BreakType.TOKEN: 0.4 +} + +TypedRangeSeq = List[TypedRange] + +@dataclasses.dataclass +class TokenAssignment: # pylint: disable=too-many-instance-attributes + """ Representation of a possible assignment of token types + to the tokens in a tokenized query. + """ + penalty: float = 0.0 + name: Optional[qmod.TokenRange] = None + address: List[qmod.TokenRange] = dataclasses.field(default_factory=list) + housenumber: Optional[qmod.TokenRange] = None + postcode: Optional[qmod.TokenRange] = None + country: Optional[qmod.TokenRange] = None + category: Optional[qmod.TokenRange] = None + qualifier: Optional[qmod.TokenRange] = None + + + @staticmethod + def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment': + """ Create a new token assignment from a sequence of typed spans. + """ + out = TokenAssignment() + for token in ranges: + if token.ttype == qmod.TokenType.PARTIAL: + out.address.append(token.trange) + elif token.ttype == qmod.TokenType.HOUSENUMBER: + out.housenumber = token.trange + elif token.ttype == qmod.TokenType.POSTCODE: + out.postcode = token.trange + elif token.ttype == qmod.TokenType.COUNTRY: + out.country = token.trange + elif token.ttype == qmod.TokenType.CATEGORY: + out.category = token.trange + elif token.ttype == qmod.TokenType.QUALIFIER: + out.qualifier = token.trange + return out + + +class _TokenSequence: + """ Working state used to put together the token assignements. 
+ + Represents an intermediate state while traversing the tokenized + query. + """ + def __init__(self, seq: TypedRangeSeq, + direction: int = 0, penalty: float = 0.0) -> None: + self.seq = seq + self.direction = direction + self.penalty = penalty + + + def __str__(self) -> str: + seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq) + return f'{seq} (dir: {self.direction}, penalty: {self.penalty})' + + + @property + def end_pos(self) -> int: + """ Return the index of the global end of the current sequence. + """ + return self.seq[-1].trange.end if self.seq else 0 + + + def has_types(self, *ttypes: qmod.TokenType) -> bool: + """ Check if the current sequence contains any typed ranges of + the given types. + """ + return any(s.ttype in ttypes for s in self.seq) + + + def is_final(self) -> bool: + """ Return true when the sequence cannot be extended by any + form of token anymore. + """ + # Country and category must be the final term for left-to-right + return len(self.seq) > 1 and \ + self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.CATEGORY) + + + def appendable(self, ttype: qmod.TokenType) -> Optional[int]: + """ Check if the given token type is appendable to the existing sequence. + + Returns None if the token type is not appendable, otherwise the + new direction of the sequence after adding such a type. The + token is not added. 
+ """ + if ttype == qmod.TokenType.WORD: + return None + + if not self.seq: + # Append unconditionally to the empty list + if ttype == qmod.TokenType.COUNTRY: + return -1 + if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + return 1 + return self.direction + + # Name tokens are always acceptable and don't change direction + if ttype == qmod.TokenType.PARTIAL: + return self.direction + + # Other tokens may only appear once + if self.has_types(ttype): + return None + + if ttype == qmod.TokenType.HOUSENUMBER: + if self.direction == 1: + if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER: + return None + if len(self.seq) > 2 \ + or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): + return None # direction left-to-right: housenumber must come before anything + elif self.direction == -1 \ + or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): + return -1 # force direction right-to-left if after other terms + + return self.direction + + if ttype == qmod.TokenType.POSTCODE: + if self.direction == -1: + if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + return None + return -1 + if self.direction == 1: + return None if self.has_types(qmod.TokenType.COUNTRY) else 1 + if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + return 1 + return self.direction + + if ttype == qmod.TokenType.COUNTRY: + return None if self.direction == -1 else 1 + + if ttype == qmod.TokenType.CATEGORY: + return self.direction + + if ttype == qmod.TokenType.QUALIFIER: + if self.direction == 1: + if (len(self.seq) == 1 + and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.CATEGORY)) \ + or (len(self.seq) == 2 + and self.seq[0].ttype == qmod.TokenType.CATEGORY + and self.seq[1].ttype == qmod.TokenType.PARTIAL): + return 1 + return None + if self.direction == -1: + return -1 + + tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.CATEGORY else self.seq + if 
len(tempseq) == 0: + return 1 + if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER: + return None + if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): + return -1 + return 0 + + return None + + + def advance(self, ttype: qmod.TokenType, end_pos: int, + btype: qmod.BreakType) -> Optional['_TokenSequence']: + """ Return a new token sequence state with the given token type + extended. + """ + newdir = self.appendable(ttype) + if newdir is None: + return None + + if not self.seq: + newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))] + new_penalty = 0.0 + else: + last = self.seq[-1] + if btype != qmod.BreakType.PHRASE and last.ttype == ttype: + # extend the existing range + newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))] + new_penalty = 0.0 + else: + # start a new range + newseq = list(self.seq) + [TypedRange(ttype, + qmod.TokenRange(last.trange.end, end_pos))] + new_penalty = PENALTY_TOKENCHANGE[btype] + + return _TokenSequence(newseq, newdir, self.penalty + new_penalty) + + + def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool: + if priors == 2: + self.penalty += 1.0 + elif priors > 2: + if self.direction == 0: + self.direction = new_dir + else: + return False + + return True + + + def recheck_sequence(self) -> bool: + """ Check that the sequence is a fully valid token assignment + and adapt direction and penalties further if necessary. + + This function catches some impossible assignments that need + forward context and can therefore not be excluded when building + the assignment. + """ + # housenumbers may not be further than 2 words from the beginning. + # If there are two words in front, give it a penalty. 
+ hnrpos = next((i for i, tr in enumerate(self.seq) + if tr.ttype == qmod.TokenType.HOUSENUMBER), + None) + if hnrpos is not None: + if self.direction != -1: + priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL) + if not self._adapt_penalty_from_priors(priors, -1): + return False + if self.direction != 1: + priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL) + if not self._adapt_penalty_from_priors(priors, 1): + return False + + return True + + + def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]: + """ Yield possible assignments for the current sequence. + + This function splits up general name assignments into name + and address and yields all possible variants of that. + """ + base = TokenAssignment.from_ranges(self.seq) + + # Postcode search (postcode-only search is covered in next case) + if base.postcode is not None and base.address: + if (base.postcode.start == 0 and self.direction != -1)\ + or (base.postcode.end == query.num_token_slots() and self.direction != 1): + log().comment('postcode search') + yield dataclasses.replace(base, penalty=self.penalty) + + # Postcode or country-only search + if not base.address: + if not base.housenumber and (base.postcode or base.country or base.category): + log().comment('postcode/country search') + yield dataclasses.replace(base, penalty=self.penalty) + else: + # Use entire first word as name + if self.direction != -1: + log().comment('first word = name') + yield dataclasses.replace(base, name=base.address[0], + penalty=self.penalty, + address=base.address[1:]) + + # Use entire last word as name + if self.direction == -1 or (self.direction == 0 and len(base.address) > 1): + log().comment('last word = name') + yield dataclasses.replace(base, name=base.address[-1], + penalty=self.penalty, + address=base.address[:-1]) + + # Use beginning of first word as name + if self.direction != -1: + first = base.address[0] + if (not base.housenumber 
or first.end >= base.housenumber.start)\ + and (not base.qualifier or first.start >= base.qualifier.end): + for i in range(first.start + 1, first.end): + name, addr = first.split(i) + penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype] + log().comment(f'split first word = name ({i - first.start})') + yield dataclasses.replace(base, name=name, penalty=penalty, + address=[addr] + base.address[1:]) + + # Use end of last word as name + if self.direction != 1: + last = base.address[-1] + if (not base.housenumber or last.start <= base.housenumber.end)\ + and (not base.qualifier or last.end <= base.qualifier.start): + for i in range(last.start + 1, last.end): + addr, name = last.split(i) + penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype] + log().comment(f'split last word = name ({i - last.start})') + yield dataclasses.replace(base, name=name, penalty=penalty, + address=base.address[:-1] + [addr]) + + + +def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]: + """ Return possible word type assignments to word positions. + + The assignments are computed from the concrete tokens listed + in the tokenized query. + + The result includes the penalty for transitions from one word type to + another. It does not include penalties for transitions within a + type. 
+ """ + todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)] + + while todo: + state = todo.pop() + node = query.nodes[state.end_pos] + + for tlist in node.starting: + newstate = state.advance(tlist.ttype, tlist.end, node.btype) + if newstate is not None: + if newstate.end_pos == query.num_token_slots(): + if newstate.recheck_sequence(): + log().var_dump('Assignment', newstate) + yield from newstate.get_assignments(query) + elif not newstate.is_final(): + todo.append(newstate) diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py new file mode 100644 index 00000000..8cbcccb9 --- /dev/null +++ b/test/python/api/search/test_token_assignment.py @@ -0,0 +1,327 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Test for creation of token assignments from tokenized queries. 
+""" +import pytest + +from nominatim.api.search.query import QueryStruct, Phrase, PhraseType, BreakType, TokenType, TokenRange, Token +from nominatim.api.search.token_assignment import yield_token_assignments, TokenAssignment, PENALTY_TOKENCHANGE + +class MyToken(Token): + def get_category(self): + return 'this', 'that' + + +def make_query(*args): + q = None + dummy = MyToken(3.0, 45, 1, 'foo', True) + + for btype, ptype, tlist in args: + if q is None: + q = QueryStruct([Phrase(ptype, '')]) + else: + q.add_node(btype, ptype) + + start = len(q.nodes) - 1 + for end, ttype in tlist: + q.add_token(TokenRange(start, end), ttype, [dummy]) + + q.add_node(BreakType.END, PhraseType.NONE) + + return q + + +def check_assignments(actual, *expected): + todo = list(expected) + for assignment in actual: + assert assignment in todo, f"Unexpected assignment: {assignment}" + todo.remove(assignment) + + assert not todo, f"Missing assignments: {expected}" + + +def test_query_with_missing_tokens(): + q = QueryStruct([Phrase(PhraseType.NONE, '')]) + q.add_node(BreakType.END, PhraseType.NONE) + + assert list(yield_token_assignments(q)) == [] + + +def test_one_word_query(): + q = make_query((BreakType.START, PhraseType.NONE, + [(1, TokenType.PARTIAL), + (1, TokenType.WORD), + (1, TokenType.HOUSENUMBER)])) + + res = list(yield_token_assignments(q)) + assert res == [TokenAssignment(name=TokenRange(0, 1))] + + +def test_single_postcode(): + q = make_query((BreakType.START, PhraseType.NONE, + [(1, TokenType.POSTCODE)])) + + res = list(yield_token_assignments(q)) + assert res == [TokenAssignment(postcode=TokenRange(0, 1))] + + +def test_single_country_name(): + q = make_query((BreakType.START, PhraseType.NONE, + [(1, TokenType.COUNTRY)])) + + res = list(yield_token_assignments(q)) + assert res == [TokenAssignment(country=TokenRange(0, 1))] + + +def test_single_word_poi_search(): + q = make_query((BreakType.START, PhraseType.NONE, + [(1, TokenType.CATEGORY), + (1, TokenType.QUALIFIER)])) + + 
res = list(yield_token_assignments(q)) + assert res == [TokenAssignment(category=TokenRange(0, 1))] + + +@pytest.mark.parametrize('btype', [BreakType.WORD, BreakType.PART, BreakType.TOKEN]) +def test_multiple_simple_words(btype): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (btype, PhraseType.NONE, [(2, TokenType.PARTIAL)]), + (btype, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + penalty = PENALTY_TOKENCHANGE[btype] + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(0, 3)), + TokenAssignment(penalty=penalty, name=TokenRange(0, 2), + address=[TokenRange(2, 3)]), + TokenAssignment(penalty=penalty, name=TokenRange(0, 1), + address=[TokenRange(1, 3)]), + TokenAssignment(penalty=penalty, name=TokenRange(1, 3), + address=[TokenRange(0, 1)]), + TokenAssignment(penalty=penalty, name=TokenRange(2, 3), + address=[TokenRange(0, 2)]) + ) + + +def test_multiple_words_respect_phrase_break(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(0, 1), + address=[TokenRange(1, 2)]), + TokenAssignment(name=TokenRange(1, 2), + address=[TokenRange(0, 1)])) + + +def test_housenumber_and_street(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(1, 2), + housenumber=TokenRange(0, 1))) + + +def test_housenumber_and_street_backwards(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(0, 1), + housenumber=TokenRange(1, 2))) + + +def test_housenumber_and_postcode(): + q = 
make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=pytest.approx(0.3), + name=TokenRange(0, 1), + housenumber=TokenRange(1, 2), + address=[TokenRange(2, 3)], + postcode=TokenRange(3, 4))) + +def test_postcode_and_housenumber(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.POSTCODE)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=pytest.approx(0.3), + name=TokenRange(2, 3), + housenumber=TokenRange(3, 4), + address=[TokenRange(0, 1)], + postcode=TokenRange(1, 2))) + + +def test_country_housenumber_postcode(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)])) + + check_assignments(yield_token_assignments(q)) + + +@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.COUNTRY, + TokenType.CATEGORY, TokenType.QUALIFIER]) +def test_housenumber_with_only_special_terms(ttype): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(2, ttype)])) + + check_assignments(yield_token_assignments(q)) + + +@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.HOUSENUMBER, TokenType.COUNTRY]) +def test_multiple_special_tokens(ttype): + q = make_query((BreakType.START, PhraseType.NONE, [(1, ttype)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, 
TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(3, ttype)])) + + check_assignments(yield_token_assignments(q)) + + +def test_housenumber_many_phrases(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(3, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, + name=TokenRange(4, 5), + housenumber=TokenRange(3, 4),\ + address=[TokenRange(0, 1), TokenRange(1, 2), + TokenRange(2, 3)])) + + +def test_country_at_beginning(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, name=TokenRange(1, 2), + country=TokenRange(0, 1))) + + +def test_country_at_end(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, name=TokenRange(0, 1), + country=TokenRange(1, 2))) + + +def test_country_in_middle(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q)) + + +def test_postcode_with_designation(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.POSTCODE)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(1, 2), + postcode=TokenRange(0, 1)), + TokenAssignment(postcode=TokenRange(0, 1), + 
address=[TokenRange(1, 2)])) + + +def test_postcode_with_designation_backwards(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.POSTCODE)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(name=TokenRange(0, 1), + postcode=TokenRange(1, 2)), + TokenAssignment(postcode=TokenRange(1, 2), + address=[TokenRange(0, 1)])) + + +def test_category_at_beginning(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.CATEGORY)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, name=TokenRange(1, 2), + category=TokenRange(0, 1))) + + +def test_category_at_end(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.CATEGORY)])) + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, name=TokenRange(0, 1), + category=TokenRange(1, 2))) + + +def test_category_in_middle(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.CATEGORY)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q)) + + +def test_qualifier_at_beginning(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.1, name=TokenRange(1, 3), + qualifier=TokenRange(0, 1)), + TokenAssignment(penalty=0.2, name=TokenRange(1, 2), + qualifier=TokenRange(0, 1), + address=[TokenRange(2, 3)])) + + +def test_qualifier_after_name(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(2, 
TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]), + (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]), + (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)])) + + + check_assignments(yield_token_assignments(q), + TokenAssignment(penalty=0.2, name=TokenRange(0, 2), + qualifier=TokenRange(2, 3), + address=[TokenRange(3, 5)]), + TokenAssignment(penalty=0.2, name=TokenRange(3, 5), + qualifier=TokenRange(2, 3), + address=[TokenRange(0, 2)])) + + +def test_qualifier_before_housenumber(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q)) + + +def test_qualifier_after_housenumber(): + q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]), + (BreakType.WORD, PhraseType.NONE, [(2, TokenType.QUALIFIER)]), + (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)])) + + check_assignments(yield_token_assignments(q))