Merge pull request #1570 from lonvia/wikipedia-importance-updates

Wikipedia importance updates
This commit is contained in:
Sarah Hoffmann 2019-11-20 11:25:23 +01:00 committed by GitHub
commit f180f99a95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 103 additions and 72 deletions

View File

@ -33,7 +33,7 @@ the directory exists. There should be at least 40GB of free space.
## Downloading additional data
### Wikipedia rankings
### Wikipedia/Wikidata rankings
Wikipedia can be used as an optional auxiliary data source to help indicate
the importance of OSM features. Nominatim will work without this information
@ -41,15 +41,13 @@ but it will improve the quality of the results if this is installed.
This data is available as a binary download:
cd $NOMINATIM_SOURCE_DIR/data
wget https://www.nominatim.org/data/wikipedia_article.sql.bin
wget https://www.nominatim.org/data/wikipedia_redirect.sql.bin
wget https://www.nominatim.org/data/wikimedia-importance.sql.gz
Combined the 2 files are around 1.5GB and add around 30GB to the install
size of Nominatim. They also increase the install time by an hour or so.
The file is about 400MB and adds around 4GB to the Nominatim database.
*NOTE:* you'll need to download the Wikipedia rankings before performing
the initial import of the data if you want the rankings applied to the
loaded data.
*NOTE:* if you forgot to download the Wikipedia rankings, you can also add
them after the import by running `./utils/setup.php --import-wikipedia-articles`
and then `./utils/update.php --recompute-importance`.
### Great Britain, USA postcodes

View File

@ -6,6 +6,21 @@ to newer versions of Nominatim.
SQL statements should be executed from the PostgreSQL command line. Execute
`psql nominatim` to enter command line mode.
## 3.4.0 -> master
### New Wikipedia/Wikidata importance tables
The `wikipedia_*` tables have a new format that also includes references to
Wikidata. You need to update the computation functions and the tables as
follows:
* download the new Wikipedia tables as described in the import section
* reimport the tables: `./utils/setup.php --import-wikipedia-articles`
* update the functions: `./utils/setup.php --create-functions --enable-diff-updates`
* compute importance: `./utils/update.php --recompute-importance`
The last step takes about 10 hours on the full planet.
## 3.3.0 -> 3.4.0
### Reorganisation of location_area_country table

View File

@ -160,13 +160,6 @@ class SetupFunctions
if ($this->bNoPartitions) {
$this->pgsqlRunScript('update country_name set partition = 0');
}
// the following will be needed by createFunctions later but
// is only defined in the subsequently called createTables
// Create dummies here that will be overwritten by the proper
// versions in create-tables.
$this->pgsqlRunScript('CREATE TABLE IF NOT EXISTS place_boundingbox ()');
$this->pgsqlRunScript('CREATE TYPE wikipedia_article_match AS ()', false);
}
public function importData($sOSMFile)
@ -323,19 +316,14 @@ class SetupFunctions
/**
 * Import Wikipedia importance data from files below CONST_Wikipedia_Data_Path.
 *
 * Emits a warning instead of failing when an expected dump file is missing.
 *
 * NOTE(review): this span comes from a diff view and appears to list the
 * previous and the new version of the body interleaved (binary
 * wikipedia_article/wikipedia_redirect dumps restored through
 * pgsqlRunDropAndRestore vs. a single wikimedia-importance.sql.gz script).
 * Confirm against the actual file which statements are live.
 */
public function importWikipediaArticles()
{
// Read literally, $sWikiArticlesFile is assigned twice; the later
// wikimedia-importance.sql.gz path is the one tested below.
$sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikipedia_article.sql.bin';
$sWikiRedirectsFile = CONST_Wikipedia_Data_Path.'/wikipedia_redirect.sql.bin';
$sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikimedia-importance.sql.gz';
if (file_exists($sWikiArticlesFile)) {
info('Importing wikipedia articles');
$this->pgsqlRunDropAndRestore($sWikiArticlesFile);
info('Importing wikipedia articles and redirects');
// Remove any existing tables before the SQL script file is run.
$this->pgExec('DROP TABLE IF EXISTS wikipedia_article');
$this->pgExec('DROP TABLE IF EXISTS wikipedia_redirect');
$this->pgsqlRunScriptFile($sWikiArticlesFile);
} else {
warn('wikipedia article dump file not found - places will have default importance');
}
// Separate redirect dump, handled the same way: restore if present, warn otherwise.
if (file_exists($sWikiRedirectsFile)) {
info('Importing wikipedia redirects');
$this->pgsqlRunDropAndRestore($sWikiRedirectsFile);
} else {
warn('wikipedia redirect dump file not found - some place importance values may be missing');
warn('wikipedia importance dump file not found - places will have default importance');
}
}
@ -351,8 +339,6 @@ class SetupFunctions
echo '.';
$this->pgExec('TRUNCATE place_addressline');
echo '.';
$this->pgExec('TRUNCATE place_boundingbox');
echo '.';
$this->pgExec('TRUNCATE location_area');
echo '.';
if (!$this->dbReverseOnly()) {
@ -744,25 +730,6 @@ class SetupFunctions
}
}
/**
 * Restore a custom-format dump with pg_restore, dropping existing objects first.
 *
 * The command line is assembled from the connection settings in
 * $this->aDSNInfo and handed to runWithPgEnv().
 *
 * @param string $sDumpFile Path of the dump file passed to pg_restore.
 */
private function pgsqlRunDropAndRestore($sDumpFile)
{
    // Mandatory part: port, target database and the dump itself.
    $aParts = array(
        'pg_restore',
        ' -p '.escapeshellarg($this->aDSNInfo['port']),
        ' -d '.escapeshellarg($this->aDSNInfo['database']),
        ' --no-owner -Fc --clean '.escapeshellarg($sDumpFile),
    );

    // --if-exists is only appended when the server version check passes.
    if ($this->oDB->getPostgresVersion() >= 9.04) {
        $aParts[] = ' --if-exists';
    }

    // Host and user are optional DSN entries; add their flags only when set.
    foreach (array('hostspec' => ' -h ', 'username' => ' -U ') as $sKey => $sFlag) {
        if (isset($this->aDSNInfo[$sKey])) {
            $aParts[] = $sFlag.escapeshellarg($this->aDSNInfo[$sKey]);
        }
    }

    $this->runWithPgEnv(implode('', $aParts));
}
private function pgsqlRunScript($sScript, $bfatal = true)
{
runSQLScript(

View File

@ -1358,10 +1358,9 @@ BEGIN
END LOOP;
NEW.importance := null;
select language||':'||title,importance from get_wikipedia_match(NEW.extratags, NEW.country_code) INTO NEW.wikipedia,NEW.importance;
IF NEW.importance IS NULL THEN
select language||':'||title,importance from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance;
END IF;
SELECT wikipedia, importance
FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
INTO NEW.wikipedia,NEW.importance;
--DEBUG: RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;
@ -1600,9 +1599,10 @@ BEGIN
-- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit
linked_node_id := linkedPlacex.osm_id;
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
select wikipedia, importance
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked label member';
END LOOP;
@ -1639,9 +1639,10 @@ BEGIN
-- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit
linked_node_id := linkedPlacex.osm_id;
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
select wikipedia, importance
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked admin_center';
END IF;
@ -1684,9 +1685,10 @@ BEGIN
-- mark the linked place (excludes from search results)
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
-- keep a note of the node id in case we need it for wikipedia in a bit
linked_node_id := linkedPlacex.osm_id;
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
select wikipedia, importance
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
'N', linkedPlacex.osm_id)
INTO linked_wikipedia,linked_importance;
--DEBUG: RAISE WARNING 'Linked named place';
END LOOP;
END IF;
@ -1714,13 +1716,6 @@ BEGIN
(NEW.importance is null or NEW.importance < linked_importance) THEN
NEW.importance = linked_importance;
END IF;
-- Still null? how about looking it up by the node id
IF NEW.importance IS NULL THEN
--DEBUG: RAISE WARNING 'Looking up importance by linked node id';
select language||':'||title,importance from wikipedia_article where osm_type = 'N'::char(1) and osm_id = linked_node_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance;
END IF;
END IF;
-- make sure all names are in the word table
@ -2627,7 +2622,7 @@ END;
$$
LANGUAGE plpgsql IMMUTABLE;
DROP TYPE wikipedia_article_match CASCADE;
DROP TYPE IF EXISTS wikipedia_article_match CASCADE;
create type wikipedia_article_match as (
language TEXT,
title TEXT,
@ -2684,6 +2679,42 @@ END;
$$
LANGUAGE plpgsql;
-- Composite return type of compute_importance(): the importance value of a
-- place plus the Wikipedia article it was taken from (stored as
-- 'language:title').
-- The type is dropped first so that the definition can be re-run; CASCADE
-- also removes objects that depend on the old type.
DROP TYPE IF EXISTS place_importance CASCADE;
create type place_importance as (
importance FLOAT,
wikipedia TEXT
);
-- Look up the importance and the Wikipedia article for a place.
--
-- Lookup order:
--   1. the article returned by get_wikipedia_match() for the given tags and
--      country code, if it has a language set;
--   2. otherwise, when the tags contain a 'wikidata' key, the row of
--      wikipedia_article whose wd_page_title equals that value, preferring
--      the English article and then the highest langcount.
-- Returns NULL when neither lookup yields a row.
--
-- osm_type and osm_id are part of the signature but are not referenced in
-- the function body.
CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT)
  RETURNS place_importance
  AS $$
DECLARE
  article RECORD;
  outcome place_importance;
BEGIN
  -- First choice: article derived from the place's tags.
  SELECT * INTO article
    FROM get_wikipedia_match(extratags, country_code)
    WHERE language is not NULL;

  IF FOUND THEN
    outcome.importance := article.importance;
    outcome.wikipedia := article.language || ':' || article.title;
    RETURN outcome;
  END IF;

  -- Second choice: article linked through the wikidata tag.
  IF extratags ? 'wikidata' THEN
    SELECT * INTO article
      FROM wikipedia_article
      WHERE wd_page_title = extratags->'wikidata'
      ORDER BY language = 'en' DESC, langcount DESC
      LIMIT 1;

    IF FOUND THEN
      outcome.importance := article.importance;
      outcome.wikipedia := article.language || ':' || article.title;
      RETURN outcome;
    END IF;
  END IF;

  RETURN null;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER)
RETURNS SETOF GEOMETRY
AS $$

View File

@ -268,7 +268,9 @@ CREATE TABLE wikipedia_article (
lon double precision,
importance double precision,
osm_type character(1),
osm_id bigint
osm_id bigint,
wd_page_title text,
instance_of text
);
ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);

View File

@ -119,13 +119,13 @@ Feature: Search queries
| en |
Then result addresses contain
| ID | state |
| 0 | Salto |
| 0 | Florida |
When sending json search query "25 de Mayo" with address
| accept-language | viewbox |
| en | -56.35879,-34.18330,-56.31618,-34.20815 |
| en | -57.95468,-31.39261,-57.94741,-31.39490 |
Then result addresses contain
| ID | state |
| 0 | Florida |
| 0 | Salto |
Scenario: viewboxes cannot be points
When sending json search query "foo"

Binary file not shown.

View File

@ -42,6 +42,7 @@ $aCMDOptions
array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'),
array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'),
array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'),
array('recompute-importance', '', 0, 1, 0, 0, 'bool', 'Recompute place importances'),
array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'),
);
@ -320,6 +321,23 @@ if ($aResult['update-address-levels']) {
$oAlParser->createTable($oDB, 'address_levels');
}
// --recompute-importance: recalculate wikipedia/importance for all of placex.
// All statements are sent in a single exec() call, in this order:
//   1. disable the triggers on placex,
//   2. set wikipedia/importance of every row from compute_importance(),
//   3. copy wikipedia/importance from a row d to the row s that
//      d.linked_place_id points to, when d has a wikipedia entry and s has
//      none or a lower importance,
//   4. re-enable the triggers.
if ($aResult['recompute-importance']) {
    echo "Updating importance values for database.\n";

    $oDB = new Nominatim\DB();
    $oDB->connect();

    $sSQL = implode('', array(
        'ALTER TABLE placex DISABLE TRIGGER ALL;',
        'UPDATE placex SET (wikipedia, importance) ='
            .' (SELECT wikipedia, importance'
            .' FROM compute_importance(extratags, country_code, osm_type, osm_id));',
        'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance'
            .' FROM placex d'
            .' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null'
            .' and (s.wikipedia is null or s.importance < d.importance);',
        'ALTER TABLE placex ENABLE TRIGGER ALL;',
    ));

    $oDB->exec($sSQL);
}
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
//
if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) {