complete tests for rule loader

This commit is contained in:
Sarah Hoffmann 2021-06-10 10:06:49 +02:00
parent a0a7b05c9f
commit 2e81084f35
2 changed files with 63 additions and 12 deletions

View File

@ -3,7 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module. libICU instead of the PostgreSQL module.
""" """
from collections import Counter from collections import Counter
import io
import itertools import itertools
import logging import logging
import re import re
@ -178,7 +177,7 @@ class LegacyICUTokenizer:
with conn.cursor() as cur: with conn.cursor() as cur:
copystr.copy_out(cur, 'word', copystr.copy_out(cur, 'word',
columns=['word_token', 'search_name_count']) columns=['word_token', 'search_name_count'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word') cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null""") WHERE word_id is null""")

View File

@ -63,6 +63,22 @@ def test_missing_normalization(tmp_path, section):
with pytest.raises(UsageError): with pytest.raises(UsageError):
ICURuleLoader(fpath) ICURuleLoader(fpath)
@pytest.mark.parametrize("abbr", ["simple",
"double => arrow => bad",
"bad = > arrow"])
def test_bad_abbreviation_syntax(tmp_path, abbr):
fpath = tmp_path / ('test_config.yaml')
fpath.write_text(dedent("""\
normalization:
transliteration:
compound_suffixes:
abbreviations:
- {}
""".format(abbr)))
with pytest.raises(UsageError):
rules = ICURuleLoader(fpath)
def test_get_search_rules(cfgfile): def test_get_search_rules(cfgfile):
fpath = cfgfile(['strasse', 'straße', 'weg'], fpath = cfgfile(['strasse', 'straße', 'weg'],
@ -105,18 +121,54 @@ def test_get_transliteration_rules(cfgfile):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def test_get_synonym_pairs(cfgfile): def test_get_replacement_pairs_multi_to(cfgfile):
fpath = cfgfile(['Weg', 'Strasse'], fpath = cfgfile(['Pfad', 'Strasse'],
['Strasse => str,st']) ['Strasse => str,st'])
loader = ICURuleLoader(fpath) repl = ICURuleLoader(fpath).get_replacement_pairs()
repl = loader.get_replacement_pairs() assert [(a, sorted(b)) for a, b in repl] == \
[(' strasse ', [' st ', ' str ', ' strasse ']),
('strasse ', [' st ', ' str ', ' strasse ']),
('pfad ', [' pfad ']),
('str ' , [' str ']),
('st ' , [' st '])]
assert sorted(((a, sorted(b)) for a, b in repl)) == \
sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
('strasse ', [' st ', ' str ', ' strasse ']),
('st ' , [' st ']),
('str ' , [' str ']),
('weg ', [' weg '])])
def test_get_replacement_pairs_multi_from(cfgfile):
fpath = cfgfile([], ['saint,Sainte => st'])
repl = ICURuleLoader(fpath).get_replacement_pairs()
assert [(a, sorted(b)) for a, b in repl] == \
[(' sainte ', [' sainte ', ' st ']),
(' saint ', [' saint ', ' st '])]
def test_get_replacement_pairs_cross_abbreviations(cfgfile):
fpath = cfgfile([], ['saint,Sainte => st',
'sainte => ste'])
repl = ICURuleLoader(fpath).get_replacement_pairs()
assert [(a, sorted(b)) for a, b in repl] == \
[(' sainte ', [' sainte ', ' st ', ' ste ']),
(' saint ', [' saint ', ' st '])]
@pytest.mark.parametrize("abbr", ["missing to =>",
" => missing from",
"=>"])
def test_bad_abbreviation_syntax(tmp_path, abbr):
fpath = tmp_path / ('test_config.yaml')
fpath.write_text(dedent("""\
normalization:
transliteration:
compound_suffixes:
abbreviations:
- {}
""".format(abbr)))
repl = ICURuleLoader(fpath).get_replacement_pairs()
assert repl == []