complete tests for rule loader

2024-11-22 21:28:10 +03:00 · 2021-06-10 10:06:49 +02:00 · 2021-06-10 10:06:49 +02:00 · 2e81084f35
commit 2e81084f35
parent a0a7b05c9f
2 changed files with 63 additions and 12 deletions
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,7 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 import io
 import itertools
 import logging
 import re
@@ -178,7 +177,7 @@ class LegacyICUTokenizer:
                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
-                                      columns=['word_token', 'search_name_count'])
+                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -63,6 +63,22 @@ def test_missing_normalization(tmp_path, section):
    with pytest.raises(UsageError):
        ICURuleLoader(fpath)
@pytest.mark.parametrize("abbr", ["simple",
                                  "double => arrow => bad",
                                  "bad = > arrow"])
 def test_bad_abbreviation_syntax(tmp_path, abbr):
    fpath = tmp_path / ('test_config.yaml')
    fpath.write_text(dedent("""\
        normalization:
        transliteration:
        compound_suffixes:
        abbreviations:
         - {}
        """.format(abbr)))
    with pytest.raises(UsageError):
        rules = ICURuleLoader(fpath)
 def test_get_search_rules(cfgfile):
    fpath = cfgfile(['strasse', 'straße', 'weg'],
@@ -105,18 +121,54 @@ def test_get_transliteration_rules(cfgfile):
    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
-def test_get_synonym_pairs(cfgfile):
+def test_get_replacement_pairs_multi_to(cfgfile):
-    fpath = cfgfile(['Weg', 'Strasse'],
+    fpath = cfgfile(['Pfad', 'Strasse'],
                    ['Strasse => str,st'])
-    loader = ICURuleLoader(fpath)
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
-    repl = loader.get_replacement_pairs()
+    assert [(a, sorted(b)) for a, b in repl] == \
             [(' strasse ', [' st ', ' str ', ' strasse ']),
              ('strasse ', [' st ', ' str ', ' strasse ']),
              ('pfad ', [' pfad ']),
              ('str ' , [' str ']),
              ('st ' , [' st '])]
    assert sorted(((a, sorted(b)) for a, b in repl)) == \
             sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
                     ('strasse ', [' st ', ' str ', ' strasse ']),
                     ('st ' , [' st ']),
                     ('str ' , [' str ']),
                     ('weg ', [' weg '])])
 def test_get_replacement_pairs_multi_from(cfgfile):
    fpath = cfgfile([], ['saint,Sainte => st'])
    repl = ICURuleLoader(fpath).get_replacement_pairs()
    assert [(a, sorted(b)) for a, b in repl] == \
             [(' sainte ', [' sainte ', ' st ']),
              (' saint ', [' saint ', ' st '])]
 def test_get_replacement_pairs_cross_abbreviations(cfgfile):
    fpath = cfgfile([], ['saint,Sainte => st',
                         'sainte => ste'])
    repl = ICURuleLoader(fpath).get_replacement_pairs()
    assert [(a, sorted(b)) for a, b in repl] == \
             [(' sainte ', [' sainte ', ' st ', ' ste ']),
              (' saint ', [' saint ', ' st '])]
@pytest.mark.parametrize("abbr", ["missing to =>",
                                  "  => missing from",
                                  "=>"])
 def test_bad_abbreviation_syntax(tmp_path, abbr):
    fpath = tmp_path / ('test_config.yaml')
    fpath.write_text(dedent("""\
        normalization:
        transliteration:
        compound_suffixes:
        abbreviations:
         - {}
        """.format(abbr)))
    repl = ICURuleLoader(fpath).get_replacement_pairs()
    assert repl == []