diff --git a/docs/admin/Import-and-Update.md b/docs/admin/Import-and-Update.md index 757dab69..246fcdda 100644 --- a/docs/admin/Import-and-Update.md +++ b/docs/admin/Import-and-Update.md @@ -33,7 +33,7 @@ the directory exists. There should be at least 40GB of free space. ## Downloading additional data -### Wikipedia rankings +### Wikipedia/Wikidata rankings Wikipedia can be used as an optional auxiliary data source to help indicate the importance of OSM features. Nominatim will work without this information @@ -41,15 +41,13 @@ but it will improve the quality of the results if this is installed. This data is available as a binary download: cd $NOMINATIM_SOURCE_DIR/data - wget https://www.nominatim.org/data/wikipedia_article.sql.bin - wget https://www.nominatim.org/data/wikipedia_redirect.sql.bin + wget https://www.nominatim.org/data/wikimedia-importance.sql.gz -Combined the 2 files are around 1.5GB and add around 30GB to the install -size of Nominatim. They also increase the install time by an hour or so. +The file is about 400MB and adds around 4GB to the Nominatim database. -*NOTE:* you'll need to download the Wikipedia rankings before performing -the initial import of the data if you want the rankings applied to the -loaded data. +*NOTE:* if you forgot to download the Wikipedia rankings, you can also add +them after the import by running `./utils/setup.php --import-wikipedia-articles` +and then `./utils/update.php --recompute-importance`. ### Great Britain, USA postcodes diff --git a/docs/admin/Migration.md b/docs/admin/Migration.md index f3668357..e6b6d102 100644 --- a/docs/admin/Migration.md +++ b/docs/admin/Migration.md @@ -6,6 +6,21 @@ to newer versions of Nominatim. SQL statements should be executed from the PostgreSQL commandline. Execute `psql nominatim` to enter command line mode. +## 3.4.0 -> master + +### New Wikipedia/Wikidata importance tables + +The `wikipedia_*` tables have a new format that also includes references to +Wikidata. 
You need to update the computation functions and the tables as +follows: + + * download the new Wikipedia tables as described in the import section + * reimport the tables: `./utils/setup.php --import-wikipedia-articles` + * update the functions: `./utils/setup.php --create-functions --enable-diff-updates` + * compute importance: `./utils/update.php --recompute-importance` + +The last step takes about 10 hours on the full planet. + ## 3.3.0 -> 3.4.0 ### Reorganisation of location_area_country table diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index 818aeeb7..2fdb3926 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -160,13 +160,6 @@ class SetupFunctions if ($this->bNoPartitions) { $this->pgsqlRunScript('update country_name set partition = 0'); } - - // the following will be needed by createFunctions later but - // is only defined in the subsequently called createTables - // Create dummies here that will be overwritten by the proper - // versions in create-tables. 
- $this->pgsqlRunScript('CREATE TABLE IF NOT EXISTS place_boundingbox ()'); - $this->pgsqlRunScript('CREATE TYPE wikipedia_article_match AS ()', false); } public function importData($sOSMFile) @@ -323,19 +316,14 @@ class SetupFunctions public function importWikipediaArticles() { - $sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikipedia_article.sql.bin'; - $sWikiRedirectsFile = CONST_Wikipedia_Data_Path.'/wikipedia_redirect.sql.bin'; + $sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikimedia-importance.sql.gz'; if (file_exists($sWikiArticlesFile)) { - info('Importing wikipedia articles'); - $this->pgsqlRunDropAndRestore($sWikiArticlesFile); + info('Importing wikipedia articles and redirects'); + $this->pgExec('DROP TABLE IF EXISTS wikipedia_article'); + $this->pgExec('DROP TABLE IF EXISTS wikipedia_redirect'); + $this->pgsqlRunScriptFile($sWikiArticlesFile); } else { - warn('wikipedia article dump file not found - places will have default importance'); - } - if (file_exists($sWikiRedirectsFile)) { - info('Importing wikipedia redirects'); - $this->pgsqlRunDropAndRestore($sWikiRedirectsFile); - } else { - warn('wikipedia redirect dump file not found - some place importance values may be missing'); + warn('wikipedia importance dump file not found - places will have default importance'); } } @@ -351,8 +339,6 @@ class SetupFunctions echo '.'; $this->pgExec('TRUNCATE place_addressline'); echo '.'; - $this->pgExec('TRUNCATE place_boundingbox'); - echo '.'; $this->pgExec('TRUNCATE location_area'); echo '.'; if (!$this->dbReverseOnly()) { @@ -744,25 +730,6 @@ class SetupFunctions } } - private function pgsqlRunDropAndRestore($sDumpFile) - { - $sCMD = 'pg_restore' - .' -p '.escapeshellarg($this->aDSNInfo['port']) - .' -d '.escapeshellarg($this->aDSNInfo['database']) - .' 
--no-owner -Fc --clean '.escapeshellarg($sDumpFile); - if ($this->oDB->getPostgresVersion() >= 9.04) { - $sCMD .= ' --if-exists'; - } - if (isset($this->aDSNInfo['hostspec'])) { - $sCMD .= ' -h '.escapeshellarg($this->aDSNInfo['hostspec']); - } - if (isset($this->aDSNInfo['username'])) { - $sCMD .= ' -U '.escapeshellarg($this->aDSNInfo['username']); - } - - $this->runWithPgEnv($sCMD); - } - private function pgsqlRunScript($sScript, $bfatal = true) { runSQLScript( diff --git a/sql/functions.sql b/sql/functions.sql index f3bff89c..ad2007cb 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -1358,10 +1358,9 @@ BEGIN END LOOP; NEW.importance := null; - select language||':'||title,importance from get_wikipedia_match(NEW.extratags, NEW.country_code) INTO NEW.wikipedia,NEW.importance; - IF NEW.importance IS NULL THEN - select language||':'||title,importance from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance; - END IF; + SELECT wikipedia, importance + FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id) + INTO NEW.wikipedia,NEW.importance; --DEBUG: RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance; @@ -1600,9 +1599,10 @@ BEGIN -- mark the linked place (excludes from search results) UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; - -- keep a note of the node id in case we need it for wikipedia in a bit - linked_node_id := linkedPlacex.osm_id; - select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; + select wikipedia, importance + FROM compute_importance(linkedPlacex.extratags, NEW.country_code, + 'N', linkedPlacex.osm_id) + INTO linked_wikipedia,linked_importance; --DEBUG: RAISE WARNING 'Linked label member'; END LOOP; @@ -1639,9 +1639,10 @@ BEGIN -- mark the linked place (excludes from search 
results) UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; - -- keep a note of the node id in case we need it for wikipedia in a bit - linked_node_id := linkedPlacex.osm_id; - select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; + select wikipedia, importance + FROM compute_importance(linkedPlacex.extratags, NEW.country_code, + 'N', linkedPlacex.osm_id) + INTO linked_wikipedia,linked_importance; --DEBUG: RAISE WARNING 'Linked admin_center'; END IF; @@ -1684,9 +1685,10 @@ BEGIN -- mark the linked place (excludes from search results) UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; - -- keep a note of the node id in case we need it for wikipedia in a bit - linked_node_id := linkedPlacex.osm_id; - select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; + select wikipedia, importance + FROM compute_importance(linkedPlacex.extratags, NEW.country_code, + 'N', linkedPlacex.osm_id) + INTO linked_wikipedia,linked_importance; --DEBUG: RAISE WARNING 'Linked named place'; END LOOP; END IF; @@ -1714,13 +1716,6 @@ BEGIN (NEW.importance is null or NEW.importance < linked_importance) THEN NEW.importance = linked_importance; END IF; - - -- Still null? 
how about looking it up by the node id - IF NEW.importance IS NULL THEN - --DEBUG: RAISE WARNING 'Looking up importance by linked node id'; - select language||':'||title,importance from wikipedia_article where osm_type = 'N'::char(1) and osm_id = linked_node_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance; - END IF; - END IF; -- make sure all names are in the word table @@ -2627,7 +2622,7 @@ END; $$ LANGUAGE plpgsql IMMUTABLE; -DROP TYPE wikipedia_article_match CASCADE; +DROP TYPE IF EXISTS wikipedia_article_match CASCADE; create type wikipedia_article_match as ( language TEXT, title TEXT, @@ -2684,6 +2679,42 @@ END; $$ LANGUAGE plpgsql; +DROP TYPE IF EXISTS place_importance CASCADE; +create type place_importance as ( + importance FLOAT, + wikipedia TEXT +); + +CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT) + RETURNS place_importance + AS $$ +DECLARE + match RECORD; + result place_importance; +BEGIN + FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code) + WHERE language is not NULL + LOOP + result.importance := match.importance; + result.wikipedia := match.language || ':' || match.title; + RETURN result; + END LOOP; + + IF extratags ? 
'wikidata' THEN + FOR match IN SELECT * FROM wikipedia_article + WHERE wd_page_title = extratags->'wikidata' + ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP + result.importance := match.importance; + result.wikipedia := match.language || ':' || match.title; + RETURN result; + END LOOP; + END IF; + + RETURN null; +END; +$$ +LANGUAGE plpgsql; + CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER) RETURNS SETOF GEOMETRY AS $$ diff --git a/sql/tables.sql b/sql/tables.sql index 0559abd4..0245e3c3 100644 --- a/sql/tables.sql +++ b/sql/tables.sql @@ -268,7 +268,9 @@ CREATE TABLE wikipedia_article ( lon double precision, importance double precision, osm_type character(1), - osm_id bigint + osm_id bigint, + wd_page_title text, + instance_of text ); ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title); CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id); diff --git a/test/bdd/api/search/params.feature b/test/bdd/api/search/params.feature index 23a86705..cdea3f69 100644 --- a/test/bdd/api/search/params.feature +++ b/test/bdd/api/search/params.feature @@ -119,13 +119,13 @@ Feature: Search queries | en | Then result addresses contain | ID | state | - | 0 | Salto | + | 0 | Florida | When sending json search query "25 de Mayo" with address | accept-language | viewbox | - | en | -56.35879,-34.18330,-56.31618,-34.20815 | + | en | -57.95468,-31.39261,-57.94741,-31.39490 | Then result addresses contain | ID | state | - | 0 | Florida | + | 0 | Salto | Scenario: viewboxes cannot be points When sending json search query "foo" diff --git a/test/testdb/wikimedia-importance.sql.gz b/test/testdb/wikimedia-importance.sql.gz new file mode 100644 index 00000000..1024f725 Binary files /dev/null and b/test/testdb/wikimedia-importance.sql.gz differ diff --git a/test/testdb/wikipedia_article.sql.bin b/test/testdb/wikipedia_article.sql.bin deleted 
file mode 100644 index 628e2af4..00000000 Binary files a/test/testdb/wikipedia_article.sql.bin and /dev/null differ diff --git a/test/testdb/wikipedia_redirect.sql.bin b/test/testdb/wikipedia_redirect.sql.bin deleted file mode 100644 index 9c4b513d..00000000 Binary files a/test/testdb/wikipedia_redirect.sql.bin and /dev/null differ diff --git a/utils/update.php b/utils/update.php index c3620b06..4317367a 100644 --- a/utils/update.php +++ b/utils/update.php @@ -42,6 +42,7 @@ $aCMDOptions array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'), array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'), array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'), + array('recompute-importance', '', 0, 1, 0, 0, 'bool', 'Recompute place importances'), array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'), ); @@ -320,6 +321,23 @@ if ($aResult['update-address-levels']) { $oAlParser->createTable($oDB, 'address_levels'); } +if ($aResult['recompute-importance']) { + echo "Updating importance values for database.\n"; + $oDB = new Nominatim\DB(); + $oDB->connect(); + + $sSQL = 'ALTER TABLE placex DISABLE TRIGGER ALL;'; + $sSQL .= 'UPDATE placex SET (wikipedia, importance) ='; + $sSQL .= ' (SELECT wikipedia, importance'; + $sSQL .= ' FROM compute_importance(extratags, country_code, osm_type, osm_id));'; + $sSQL .= 'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance'; + $sSQL .= ' FROM placex d'; + $sSQL .= ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null'; + $sSQL .= ' and (s.wikipedia is null or s.importance < d.importance);'; + $sSQL .= 'ALTER TABLE placex ENABLE TRIGGER ALL;'; + $oDB->exec($sSQL); +} + if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { // if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) {