diff --git a/.travis.yml b/.travis.yml index 68d5be4e..5efc9f08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ script: - cd $TRAVIS_BUILD_DIR/build - if [[ $TEST_SUITE == "monaco" ]]; then wget --no-verbose --output-document=../data/monaco.osm.pbf http://download.geofabrik.de/europe/monaco-latest.osm.pbf; fi - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/setup.php --osm-file ../data/monaco.osm.pbf --osm2pgsql-cache 1000 --all 2>&1 | grep -v 'ETA (seconds)'; fi + - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/specialphrases.php --wiki-import | psql -d test_api_nominatim >/dev/null; fi - cd $TRAVIS_BUILD_DIR/test/php - if [[ $TEST_SUITE == "tests" ]]; then phpunit ./ ; fi - if [[ $TEST_SUITE == "tests" ]]; then phpcs --report-width=120 */**.php ; fi diff --git a/Vagrantfile b/Vagrantfile index 15d66e9e..b9d618e2 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -23,7 +23,16 @@ Vagrant.configure("2") do |config| end end - config.vm.define "centos" do |sub| + config.vm.define "travis" do |sub| + sub.vm.box = "bento/ubuntu-14.04" + sub.vm.provision :shell do |s| + s.path = "vagrant/install-on-travis-ci.sh" + s.privileged = false + s.args = [checkout] + end + end + + config.vm.define "centos" do |sub| sub.vm.box = "bento/centos-7.2" sub.vm.provision :shell do |s| s.path = "vagrant/install-on-centos-7.sh" diff --git a/docs/Installation.md b/docs/Installation.md index 41f76df1..88f32ada 100644 --- a/docs/Installation.md +++ b/docs/Installation.md @@ -39,6 +39,7 @@ For running Nominatim: * [PostGIS](http://postgis.refractions.net) (2.0 or later) * [PHP](http://php.net) (5.4 or later) * PHP-pgsql + * PHP-intl (bundled with PHP) * [PEAR::DB](http://pear.php.net/package/DB) * a webserver (apache or nginx are recommended) diff --git a/lib/Geocode.php b/lib/Geocode.php index ec8eb348..17aaf826 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -653,7 +653,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, 
$aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases) + public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -752,13 +752,19 @@ class Geocode */ } } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) { - if ($aSearch['sClass'] === '') { - $aSearch['sOperator'] = $aSearchTerm['operator']; + // require a normalized exact match of the term + // if we have the normalized version of the query + // available + if ($aSearch['sClass'] === '' + && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) { $aSearch['sClass'] = $aSearchTerm['class']; $aSearch['sType'] = $aSearchTerm['type']; - if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name'; - else $aSearch['sOperator'] = 'near'; // near = in for the moment - if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1; + if ($aSearchTerm['operator'] == '') { + $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 
'name' : 'near'; + $aSearch['iSearchRank'] += 2; + } else { + $aSearch['sOperator'] = 'near'; // near = in for the moment + } if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -913,6 +919,13 @@ class Geocode { if (!$this->sQuery && !$this->aStructuredQuery) return array(); + $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); + if ($oNormalizer !== null) { + $sNormQuery = $oNormalizer->transliterate($this->sQuery); + } else { + $sNormQuery = null; + } + $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]"; $sCountryCodesSQL = false; if ($this->aCountryCodes) { @@ -1139,7 +1152,7 @@ class Geocode // array with: placeid => -1 | tiger-housenumber $aResultPlaceIDs = array(); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1151,7 +1164,7 @@ class Geocode $aFinalPhrase = end($aPhrases); $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/settings/defaults.php b/settings/defaults.php index 16711542..9f694c89 100644 --- a/settings/defaults.php +++ b/settings/defaults.php @@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true); // codes, to restrict import to a subset of 
languages. // Currently only affects the import of country names and special phrases. @define('CONST_Languages', false); +// Rules for normalizing terms before comparing them. +// The default is to remove accents and punctuation and to lower-case the +// term. Spaces are kept but collapsed to one standard space. +@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); // Set to false to avoid importing extra postcodes for the US. @define('CONST_Use_Extra_US_Postcodes', true); diff --git a/sql/functions.sql b/sql/functions.sql index 6cc42803..da496a10 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -101,7 +101,7 @@ END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, lookup_class text, lookup_type text) +CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text) RETURNS INTEGER AS $$ DECLARE @@ -109,17 +109,17 @@ DECLARE return_word_id INTEGER; BEGIN lookup_token := ' '||trim(lookup_word); - SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type into return_word_id; + SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type into return_word_id; IF return_word_id IS NULL THEN return_word_id := nextval('seq_word'); - INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0); + INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0); END IF; RETURN return_word_id; END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, lookup_class text, lookup_type text, op text) +CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text, op 
text) RETURNS INTEGER AS $$ DECLARE @@ -127,10 +127,10 @@ DECLARE return_word_id INTEGER; BEGIN lookup_token := ' '||trim(lookup_word); - SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type and operator = op into return_word_id; + SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type and operator = op into return_word_id; IF return_word_id IS NULL THEN return_word_id := nextval('seq_word'); - INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0, op); + INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0, op); END IF; RETURN return_word_id; END; diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 50522fc2..1a4a51d7 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -19,6 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true); include(CONST_InstallPath.'/settings/phrase_settings.php'); if ($aCMDResult['wiki-import']) { + $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules); $aPairs = array(); $sLanguageIn = CONST_Languages ? 
CONST_Languages : @@ -31,6 +32,11 @@ if ($aCMDResult['wiki-import']) { if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) { foreach ($aMatches as $aMatch) { $sLabel = trim($aMatch[1]); + if ($oNormalizer !== null) { + $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + } else { + $sTrans = null; + } $sClass = trim($aMatch[2]); $sType = trim($aMatch[3]); // hack around a bug where building=yes was imported with @@ -57,13 +63,13 @@ if ($aCMDResult['wiki-import']) { switch (trim($aMatch[4])) { case 'near': - echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'near');\n"; + echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'near');\n"; break; case 'in': - echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'in');\n"; + echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'in');\n"; break; default: - echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType');\n"; + echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType');\n"; break; } } diff --git a/vagrant/install-on-centos-7.sh b/vagrant/install-on-centos-7.sh index 8aeedcc6..8b283ef6 100755 --- a/vagrant/install-on-centos-7.sh +++ b/vagrant/install-on-centos-7.sh @@ -21,7 +21,7 @@ sudo yum install -y postgresql-server postgresql-contrib postgresql-devel postgis postgis-utils \ git cmake make gcc gcc-c++ libtool policycoreutils-python \ - php-pgsql php php-pear php-pear-DB libpqxx-devel proj-epsg \ + php-pgsql php php-pear php-pear-DB php-intl libpqxx-devel proj-epsg \ bzip2-devel proj-devel geos-devel libxml2-devel boost-devel 
expat-devel zlib-devel # If you want to run the test suite, you need to install the following diff --git a/vagrant/install-on-travis-ci.sh b/vagrant/install-on-travis-ci.sh index 44faa614..ec0a92da 100755 --- a/vagrant/install-on-travis-ci.sh +++ b/vagrant/install-on-travis-ci.sh @@ -16,7 +16,7 @@ sudo apt-get install -y -qq libboost-dev libboost-system-dev \ libboost-filesystem-dev libexpat1-dev zlib1g-dev libxml2-dev\ libbz2-dev libpq-dev libgeos-c1 libgeos++-dev libproj-dev \ postgresql-server-dev-9.6 postgresql-9.6-postgis-2.3 postgresql-contrib-9.6 \ - apache2 php5 php5-pgsql php-pear php-db + apache2 php5 php5-pgsql php-pear php-db php5-intl sudo apt-get install -y -qq python3-dev python3-pip python3-psycopg2 phpunit php5-cgi diff --git a/vagrant/install-on-ubuntu-16.sh b/vagrant/install-on-ubuntu-16.sh index c347923f..11f80a3e 100755 --- a/vagrant/install-on-ubuntu-16.sh +++ b/vagrant/install-on-ubuntu-16.sh @@ -28,7 +28,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: libbz2-dev libpq-dev libgeos-dev libgeos++-dev libproj-dev \ postgresql-server-dev-9.5 postgresql-9.5-postgis-2.2 postgresql-contrib-9.5 \ apache2 php php-pgsql libapache2-mod-php php-pear php-db \ - git + php-intl git # If you want to run the test suite, you need to install the following # additional packages: