From ae83ceab5ef97988cf9ea375ae4cf7afd1c05110 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 10 Apr 2018 22:48:17 +0200
Subject: [PATCH] ignore Unicode format characters for normalization

Also adds tests.

Fixes #1007.
---
 settings/defaults.php                   |  2 +-
 test/bdd/db/import/postcodes.feature    |  1 -
 test/bdd/db/query/normalization.feature | 10 ++++++++++
 test/bdd/osm2pgsql/import/tags.feature  |  9 +++++++++
 test/bdd/steps/db_ops.py                |  2 ++
 5 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/settings/defaults.php b/settings/defaults.php
index 81c19c74..2d8f47d0 100644
--- a/settings/defaults.php
+++ b/settings/defaults.php
@@ -20,7 +20,7 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
 // Rules for normalizing terms for comparison before doing comparisons.
 // The default is to remove accents and punctuation and to lower-case the
 // term. Spaces are kept but collapsed to one standard space.
-@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
+@define('CONST_Term_Normalization_Rules', ":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
 
 // Set to false to avoid importing extra postcodes for the US.
 @define('CONST_Use_Extra_US_Postcodes', true);
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature
index 4c49dc5b..7fde34d3 100644
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -95,7 +95,6 @@ Feature: Import of postcodes
             | object | postcode |
             | W93    | 445023   |
 
-    @wip
     Scenario: Postcodes from admin boundaries are preferred over estimated postcodes
         Given the scene admin-areas
         And the named places
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature
index 1ef1fcbe..32052647 100644
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -136,3 +136,13 @@ Feature: Import and search of names
         Then results contain
          | ID | osm_type | osm_id |
          | 0  | R        | 1 |
+
+     Scenario: Unprintable characters in postcodes are ignored
+        Given the named places
+            | osm  | class   | type   | address |
+            | N234 | amenity | prison | 'postcode' : u'1234\u200e' |
+        When importing
+        And searching for "1234"
+        Then results contain
+         | ID | osm_type |
+         | 0  | P        |
diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature
index 7db8d629..e2594343 100644
--- a/test/bdd/osm2pgsql/import/tags.feature
+++ b/test/bdd/osm2pgsql/import/tags.feature
@@ -96,6 +96,15 @@ Feature: Tag evaluation
          | N3     | 'name: de' : 'Foo', 'name:\\\\' : 'real3' |
          | N4     | 'name: de' : 'Foo', 'name' : 'rea\\l3' |
 
+    Scenario: Unprintable characters in address tags are maintained
+        When loading osm data
+         """
+         n23 Tamenity=yes,name=foo,addr:postcode=1234%200e%
+         """
+        Then place contains
+         | object | address |
+         | N23    | 'postcode' : u'1234\u200e' |
+
     Scenario Outline: Included places
         When loading osm data
          """
diff --git a/test/bdd/steps/db_ops.py b/test/bdd/steps/db_ops.py
index be2211fa..87babdad 100644
--- a/test/bdd/steps/db_ops.py
+++ b/test/bdd/steps/db_ops.py
@@ -22,6 +22,8 @@ class PlaceColumn:
             self.add_hstore('extratags', key[6:], value)
         elif key.startswith('addr+'):
             self.add_hstore('address', key[5:], value)
+        elif key in ('name', 'address', 'extratags'):
+            self.columns[key] = eval('{' + value + '}')
         else:
             assert_in(key, ('class', 'type'))
             self.columns[key] = None if value == '' else value