Merge pull request #1570 from lonvia/wikipedia-importance-updates

Wikipedia importance updates
This commit is contained in:
Sarah Hoffmann 2019-11-20 11:25:23 +01:00 committed by GitHub
commit f180f99a95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 103 additions and 72 deletions

View File

@ -33,7 +33,7 @@ the directory exists. There should be at least 40GB of free space.
## Downloading additional data ## Downloading additional data
### Wikipedia rankings ### Wikipedia/Wikidata rankings
Wikipedia can be used as an optional auxiliary data source to help indicate Wikipedia can be used as an optional auxiliary data source to help indicate
the importance of OSM features. Nominatim will work without this information the importance of OSM features. Nominatim will work without this information
@ -41,15 +41,13 @@ but it will improve the quality of the results if this is installed.
This data is available as a binary download: This data is available as a binary download:
cd $NOMINATIM_SOURCE_DIR/data cd $NOMINATIM_SOURCE_DIR/data
wget https://www.nominatim.org/data/wikipedia_article.sql.bin wget https://www.nominatim.org/data/wikimedia-importance.sql.gz
wget https://www.nominatim.org/data/wikipedia_redirect.sql.bin
Combined the 2 files are around 1.5GB and add around 30GB to the install The file is about 400MB and adds around 4GB to the Nominatim database.
size of Nominatim. They also increase the install time by an hour or so.
*NOTE:* you'll need to download the Wikipedia rankings before performing *NOTE:* if you forgot to download the wikipedia rankings, you can also add
the initial import of the data if you want the rankings applied to the them after the import by running `./utils/setup.php --import-wikipedia-articles`
loaded data. and then `./utils/update.php --recompute-importance`.
### Great Britain, USA postcodes ### Great Britain, USA postcodes

View File

@ -6,6 +6,21 @@ to newer versions of Nominatim.
SQL statements should be executed from the PostgreSQL commandline. Execute SQL statements should be executed from the PostgreSQL commandline. Execute
`psql nominatim` to enter command line mode. `psql nominatim` to enter command line mode.
## 3.4.0 -> master
### New Wikipedia/Wikidata importance tables
The `wikipedia_*` tables have a new format that also includes references to
Wikidata. You need to update the computation functions and the tables as
follows:
* download the new Wikipedia tables as described in the import section
* reimport the tables: `./utils/setup.php --import-wikipedia-articles`
* update the functions: `./utils/setup.php --create-functions --enable-diff-updates`
* compute importance: `./utils/update.php --recompute-importance`
The last step takes about 10 hours on the full planet.
## 3.3.0 -> 3.4.0 ## 3.3.0 -> 3.4.0
### Reorganisation of location_area_country table ### Reorganisation of location_area_country table

View File

@ -160,13 +160,6 @@ class SetupFunctions
if ($this->bNoPartitions) { if ($this->bNoPartitions) {
$this->pgsqlRunScript('update country_name set partition = 0'); $this->pgsqlRunScript('update country_name set partition = 0');
} }
// the following will be needed by createFunctions later but
// is only defined in the subsequently called createTables
// Create dummies here that will be overwritten by the proper
// versions in create-tables.
$this->pgsqlRunScript('CREATE TABLE IF NOT EXISTS place_boundingbox ()');
$this->pgsqlRunScript('CREATE TYPE wikipedia_article_match AS ()', false);
} }
public function importData($sOSMFile) public function importData($sOSMFile)
@ -323,19 +316,14 @@ class SetupFunctions
public function importWikipediaArticles() public function importWikipediaArticles()
{ {
$sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikipedia_article.sql.bin'; $sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikimedia-importance.sql.gz';
$sWikiRedirectsFile = CONST_Wikipedia_Data_Path.'/wikipedia_redirect.sql.bin';
if (file_exists($sWikiArticlesFile)) { if (file_exists($sWikiArticlesFile)) {
info('Importing wikipedia articles'); info('Importing wikipedia articles and redirects');
$this->pgsqlRunDropAndRestore($sWikiArticlesFile); $this->pgExec('DROP TABLE IF EXISTS wikipedia_article');
$this->pgExec('DROP TABLE IF EXISTS wikipedia_redirect');
$this->pgsqlRunScriptFile($sWikiArticlesFile);
} else { } else {
warn('wikipedia article dump file not found - places will have default importance'); warn('wikipedia importance dump file not found - places will have default importance');
}
if (file_exists($sWikiRedirectsFile)) {
info('Importing wikipedia redirects');
$this->pgsqlRunDropAndRestore($sWikiRedirectsFile);
} else {
warn('wikipedia redirect dump file not found - some place importance values may be missing');
} }
} }
@ -351,8 +339,6 @@ class SetupFunctions
echo '.'; echo '.';
$this->pgExec('TRUNCATE place_addressline'); $this->pgExec('TRUNCATE place_addressline');
echo '.'; echo '.';
$this->pgExec('TRUNCATE place_boundingbox');
echo '.';
$this->pgExec('TRUNCATE location_area'); $this->pgExec('TRUNCATE location_area');
echo '.'; echo '.';
if (!$this->dbReverseOnly()) { if (!$this->dbReverseOnly()) {
@ -744,25 +730,6 @@ class SetupFunctions
} }
} }
/**
 * Restore a dump file into the configured database using pg_restore.
 *
 * The command is run with --clean so that objects contained in the dump
 * are dropped before they are recreated, and with --no-owner so that
 * ownership recorded in the dump is not restored.
 *
 * @param string $sDumpFile Path to the dump file (read with -Fc, i.e. the
 *                          pg_dump custom format).
 */
private function pgsqlRunDropAndRestore($sDumpFile)
{
    // Mandatory part of the command line; every value is shell-escaped.
    $aCmdParts = array(
        'pg_restore',
        '-p '.escapeshellarg($this->aDSNInfo['port']),
        '-d '.escapeshellarg($this->aDSNInfo['database']),
        '--no-owner -Fc --clean '.escapeshellarg($sDumpFile),
    );

    // --if-exists is only added from this version threshold on.
    // NOTE(review): 9.04 presumably encodes PostgreSQL 9.4 - confirm
    // against getPostgresVersion().
    if ($this->oDB->getPostgresVersion() >= 9.04) {
        $aCmdParts[] = '--if-exists';
    }

    // Host and user are optional in the DSN; pass them only when present.
    if (isset($this->aDSNInfo['hostspec'])) {
        $aCmdParts[] = '-h '.escapeshellarg($this->aDSNInfo['hostspec']);
    }
    if (isset($this->aDSNInfo['username'])) {
        $aCmdParts[] = '-U '.escapeshellarg($this->aDSNInfo['username']);
    }

    $this->runWithPgEnv(implode(' ', $aCmdParts));
}
private function pgsqlRunScript($sScript, $bfatal = true) private function pgsqlRunScript($sScript, $bfatal = true)
{ {
runSQLScript( runSQLScript(

View File

@ -1358,10 +1358,9 @@ BEGIN
END LOOP; END LOOP;
NEW.importance := null; NEW.importance := null;
select language||':'||title,importance from get_wikipedia_match(NEW.extratags, NEW.country_code) INTO NEW.wikipedia,NEW.importance; SELECT wikipedia, importance
IF NEW.importance IS NULL THEN FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
select language||':'||title,importance from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance; INTO NEW.wikipedia,NEW.importance;
END IF;
--DEBUG: RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance; --DEBUG: RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;
@ -1600,9 +1599,10 @@ BEGIN
-- mark the linked place (excludes from search results) -- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit select wikipedia, importance
linked_node_id := linkedPlacex.osm_id; FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; 'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked label member'; --DEBUG: RAISE WARNING 'Linked label member';
END LOOP; END LOOP;
@ -1639,9 +1639,10 @@ BEGIN
-- mark the linked place (excludes from search results) -- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit select wikipedia, importance
linked_node_id := linkedPlacex.osm_id; FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; 'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked admin_center'; --DEBUG: RAISE WARNING 'Linked admin_center';
END IF; END IF;
@ -1684,9 +1685,10 @@ BEGIN
-- mark the linked place (excludes from search results) -- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id; UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit select wikipedia, importance
linked_node_id := linkedPlacex.osm_id; FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance; 'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked named place'; --DEBUG: RAISE WARNING 'Linked named place';
END LOOP; END LOOP;
END IF; END IF;
@ -1714,13 +1716,6 @@ BEGIN
(NEW.importance is null or NEW.importance < linked_importance) THEN (NEW.importance is null or NEW.importance < linked_importance) THEN
NEW.importance = linked_importance; NEW.importance = linked_importance;
END IF; END IF;
-- Still null? how about looking it up by the node id
IF NEW.importance IS NULL THEN
--DEBUG: RAISE WARNING 'Looking up importance by linked node id';
select language||':'||title,importance from wikipedia_article where osm_type = 'N'::char(1) and osm_id = linked_node_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance;
END IF;
END IF; END IF;
-- make sure all names are in the word table -- make sure all names are in the word table
@ -2627,7 +2622,7 @@ END;
$$ $$
LANGUAGE plpgsql IMMUTABLE; LANGUAGE plpgsql IMMUTABLE;
DROP TYPE wikipedia_article_match CASCADE; DROP TYPE IF EXISTS wikipedia_article_match CASCADE;
create type wikipedia_article_match as ( create type wikipedia_article_match as (
language TEXT, language TEXT,
title TEXT, title TEXT,
@ -2684,6 +2679,42 @@ END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;
-- Composite result type of compute_importance(): an importance value
-- together with the wikipedia reference it was taken from.
-- The type is dropped first (IF EXISTS, so a missing type is not an error);
-- CASCADE also removes objects that depend on it.
DROP TYPE IF EXISTS place_importance CASCADE;
create type place_importance as (
-- importance value copied from the matched wikipedia article
importance FLOAT,
-- wikipedia reference; compute_importance() fills it as 'language:title'
wikipedia TEXT
);
-- Determine the importance and wikipedia reference for a place.
--
-- Lookup order:
--   1. the article returned by get_wikipedia_match() for the given
--      extratags and country code, if it has a language set;
--   2. otherwise, if extratags has a 'wikidata' key, the wikipedia_article
--      row whose wd_page_title equals that value, preferring the English
--      article and then the highest langcount.
-- Returns NULL when neither lookup yields a row.
--
-- osm_type and osm_id are part of the signature but are not used by
-- this function body.
CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT)
RETURNS place_importance
AS $$
DECLARE
  hit RECORD;
  result place_importance;
BEGIN
  -- Step 1: match via get_wikipedia_match().
  SELECT * INTO hit
    FROM get_wikipedia_match(extratags, country_code)
   WHERE language is not NULL
   LIMIT 1;
  IF FOUND THEN
    result.wikipedia := hit.language || ':' || hit.title;
    result.importance := hit.importance;
    RETURN result;
  END IF;

  -- Step 2: fall back to the wikidata tag, if there is one.
  IF extratags ? 'wikidata' THEN
    SELECT * INTO hit
      FROM wikipedia_article
     WHERE wd_page_title = extratags->'wikidata'
     ORDER BY language = 'en' DESC, langcount DESC
     LIMIT 1;
    IF FOUND THEN
      result.wikipedia := hit.language || ':' || hit.title;
      result.importance := hit.importance;
      RETURN result;
    END IF;
  END IF;

  -- Nothing matched.
  RETURN null;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER) CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER)
RETURNS SETOF GEOMETRY RETURNS SETOF GEOMETRY
AS $$ AS $$

View File

@ -268,7 +268,9 @@ CREATE TABLE wikipedia_article (
lon double precision, lon double precision,
importance double precision, importance double precision,
osm_type character(1), osm_type character(1),
osm_id bigint osm_id bigint,
wd_page_title text,
instance_of text
); );
ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title); ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id); CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);

View File

@ -119,13 +119,13 @@ Feature: Search queries
| en | | en |
Then result addresses contain Then result addresses contain
| ID | state | | ID | state |
| 0 | Salto | | 0 | Florida |
When sending json search query "25 de Mayo" with address When sending json search query "25 de Mayo" with address
| accept-language | viewbox | | accept-language | viewbox |
| en | -56.35879,-34.18330,-56.31618,-34.20815 | | en | -57.95468,-31.39261,-57.94741,-31.39490 |
Then result addresses contain Then result addresses contain
| ID | state | | ID | state |
| 0 | Florida | | 0 | Salto |
Scenario: viewboxes cannot be points Scenario: viewboxes cannot be points
When sending json search query "foo" When sending json search query "foo"

Binary file not shown.

View File

@ -42,6 +42,7 @@ $aCMDOptions
array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'), array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'),
array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'), array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'),
array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'), array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'),
array('recompute-importance', '', 0, 1, 0, 0, 'bool', 'Recompute place importances'),
array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'), array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'),
); );
@ -320,6 +321,23 @@ if ($aResult['update-address-levels']) {
$oAlParser->createTable($oDB, 'address_levels'); $oAlParser->createTable($oDB, 'address_levels');
} }
// --recompute-importance: recalculate the wikipedia reference and the
// importance of all rows in placex.
if ($aResult['recompute-importance']) {
    echo "Updating importance values for database.\n";
    $oDB = new Nominatim\DB();
    $oDB->connect();

    // The fragments below are concatenated without separator and sent to
    // the database in a single exec() call, in exactly this order.
    $sSQL = implode('', array(
        // Switch off all triggers on placex for the duration of the update.
        'ALTER TABLE placex DISABLE TRIGGER ALL;',
        // Pass 1: set both columns of every row from compute_importance().
        'UPDATE placex SET (wikipedia, importance) =',
        '   (SELECT wikipedia, importance',
        '    FROM compute_importance(extratags, country_code, osm_type, osm_id));',
        // Pass 2: a place takes over the values of a place linked to it
        // when it has no wikipedia entry itself or a lower importance.
        'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance',
        ' FROM placex d',
        ' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null',
        '       and (s.wikipedia is null or s.importance < d.importance);',
        // Switch the triggers back on.
        'ALTER TABLE placex ENABLE TRIGGER ALL;',
    ));

    $oDB->exec($sSQL);
}
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) { if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
// //
if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) { if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) {