mirror of
https://github.com/osm-search/Nominatim.git
synced 2025-01-08 06:34:44 +03:00
Merge pull request #1570 from lonvia/wikipedia-importance-updates
Wikipedia importance updates
This commit is contained in:
commit
f180f99a95
@ -33,7 +33,7 @@ the directory exists. There should be at least 40GB of free space.
|
||||
|
||||
## Downloading additional data
|
||||
|
||||
### Wikipedia rankings
|
||||
### Wikipedia/Wikidata rankings
|
||||
|
||||
Wikipedia can be used as an optional auxiliary data source to help indicate
|
||||
the importance of OSM features. Nominatim will work without this information
|
||||
@ -41,15 +41,13 @@ but it will improve the quality of the results if this is installed.
|
||||
This data is available as a binary download:
|
||||
|
||||
cd $NOMINATIM_SOURCE_DIR/data
|
||||
wget https://www.nominatim.org/data/wikipedia_article.sql.bin
|
||||
wget https://www.nominatim.org/data/wikipedia_redirect.sql.bin
|
||||
wget https://www.nominatim.org/data/wikimedia-importance.sql.gz
|
||||
|
||||
Combined, the 2 files are around 1.5GB and add around 30GB to the install
|
||||
size of Nominatim. They also increase the install time by an hour or so.
|
||||
The file is about 400MB and adds around 4GB to the Nominatim database.
|
||||
|
||||
*NOTE:* you'll need to download the Wikipedia rankings before performing
|
||||
the initial import of the data if you want the rankings applied to the
|
||||
loaded data.
|
||||
*NOTE:* if you forgot to download the Wikipedia rankings, you can also add
|
||||
them after the import by running `./utils/setup.php --import-wikipedia-articles`
|
||||
and then `./utils/update.php --recompute-importance`.
|
||||
|
||||
### Great Britain, USA postcodes
|
||||
|
||||
|
@ -6,6 +6,21 @@ to newer versions of Nominatim.
|
||||
SQL statements should be executed from the PostgreSQL command line. Execute
|
||||
`psql nominatim` to enter command line mode.
|
||||
|
||||
## 3.4.0 -> master
|
||||
|
||||
### New Wikipedia/Wikidata importance tables
|
||||
|
||||
The `wikipedia_*` tables have a new format that also includes references to
|
||||
Wikidata. You need to update the computation functions and the tables as
|
||||
follows:
|
||||
|
||||
* download the new Wikipedia tables as described in the import section
|
||||
* reimport the tables: `./utils/setup.php --import-wikipedia-articles`
|
||||
* update the functions: `./utils/setup.php --create-functions --enable-diff-updates`
|
||||
* compute importance: `./utils/update.php --recompute-importance`
|
||||
|
||||
The last step takes about 10 hours on the full planet.
|
||||
|
||||
## 3.3.0 -> 3.4.0
|
||||
|
||||
### Reorganisation of location_area_country table
|
||||
|
@ -160,13 +160,6 @@ class SetupFunctions
|
||||
if ($this->bNoPartitions) {
|
||||
$this->pgsqlRunScript('update country_name set partition = 0');
|
||||
}
|
||||
|
||||
// the following will be needed by createFunctions later but
|
||||
// is only defined in the subsequently called createTables
|
||||
// Create dummies here that will be overwritten by the proper
|
||||
// versions in create-tables.
|
||||
$this->pgsqlRunScript('CREATE TABLE IF NOT EXISTS place_boundingbox ()');
|
||||
$this->pgsqlRunScript('CREATE TYPE wikipedia_article_match AS ()', false);
|
||||
}
|
||||
|
||||
public function importData($sOSMFile)
|
||||
@ -323,19 +316,14 @@ class SetupFunctions
|
||||
|
||||
public function importWikipediaArticles()
|
||||
{
|
||||
$sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikipedia_article.sql.bin';
|
||||
$sWikiRedirectsFile = CONST_Wikipedia_Data_Path.'/wikipedia_redirect.sql.bin';
|
||||
$sWikiArticlesFile = CONST_Wikipedia_Data_Path.'/wikimedia-importance.sql.gz';
|
||||
if (file_exists($sWikiArticlesFile)) {
|
||||
info('Importing wikipedia articles');
|
||||
$this->pgsqlRunDropAndRestore($sWikiArticlesFile);
|
||||
info('Importing wikipedia articles and redirects');
|
||||
$this->pgExec('DROP TABLE IF EXISTS wikipedia_article');
|
||||
$this->pgExec('DROP TABLE IF EXISTS wikipedia_redirect');
|
||||
$this->pgsqlRunScriptFile($sWikiArticlesFile);
|
||||
} else {
|
||||
warn('wikipedia article dump file not found - places will have default importance');
|
||||
}
|
||||
if (file_exists($sWikiRedirectsFile)) {
|
||||
info('Importing wikipedia redirects');
|
||||
$this->pgsqlRunDropAndRestore($sWikiRedirectsFile);
|
||||
} else {
|
||||
warn('wikipedia redirect dump file not found - some place importance values may be missing');
|
||||
warn('wikipedia importance dump file not found - places will have default importance');
|
||||
}
|
||||
}
|
||||
|
||||
@ -351,8 +339,6 @@ class SetupFunctions
|
||||
echo '.';
|
||||
$this->pgExec('TRUNCATE place_addressline');
|
||||
echo '.';
|
||||
$this->pgExec('TRUNCATE place_boundingbox');
|
||||
echo '.';
|
||||
$this->pgExec('TRUNCATE location_area');
|
||||
echo '.';
|
||||
if (!$this->dbReverseOnly()) {
|
||||
@ -744,25 +730,6 @@ class SetupFunctions
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Restore a PostgreSQL dump file into the configured database via pg_restore.
 *
 * The command is built with `--no-owner -Fc --clean`; `--if-exists` is
 * appended when the server reports version 9.04 or later. Host and user
 * options are only added when the corresponding DSN entries are set.
 * The finished command line is handed to runWithPgEnv().
 *
 * @param string $sDumpFile Path of the dump file given to pg_restore.
 */
private function pgsqlRunDropAndRestore($sDumpFile)
{
    $aDSN = $this->aDSNInfo;

    // Mandatory part of the command line, in the order pg_restore receives it.
    $aCmdParts = array(
        'pg_restore',
        ' -p '.escapeshellarg($aDSN['port']),
        ' -d '.escapeshellarg($aDSN['database']),
        ' --no-owner -Fc --clean '.escapeshellarg($sDumpFile),
    );

    if ($this->oDB->getPostgresVersion() >= 9.04) {
        $aCmdParts[] = ' --if-exists';
    }

    // Optional connection settings: host first, then user.
    foreach (array('hostspec' => ' -h ', 'username' => ' -U ') as $sKey => $sFlag) {
        if (isset($aDSN[$sKey])) {
            $aCmdParts[] = $sFlag.escapeshellarg($aDSN[$sKey]);
        }
    }

    $this->runWithPgEnv(implode('', $aCmdParts));
}
|
||||
|
||||
private function pgsqlRunScript($sScript, $bfatal = true)
|
||||
{
|
||||
runSQLScript(
|
||||
|
@ -1358,10 +1358,9 @@ BEGIN
|
||||
END LOOP;
|
||||
|
||||
NEW.importance := null;
|
||||
select language||':'||title,importance from get_wikipedia_match(NEW.extratags, NEW.country_code) INTO NEW.wikipedia,NEW.importance;
|
||||
IF NEW.importance IS NULL THEN
|
||||
select language||':'||title,importance from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance;
|
||||
END IF;
|
||||
SELECT wikipedia, importance
|
||||
FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
|
||||
INTO NEW.wikipedia,NEW.importance;
|
||||
|
||||
--DEBUG: RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;
|
||||
|
||||
@ -1600,9 +1599,10 @@ BEGIN
|
||||
-- mark the linked place (excludes from search results)
|
||||
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
|
||||
|
||||
-- keep a note of the node id in case we need it for wikipedia in a bit
|
||||
linked_node_id := linkedPlacex.osm_id;
|
||||
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
|
||||
select wikipedia, importance
|
||||
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
|
||||
'N', linkedPlacex.osm_id)
|
||||
INTO linked_wikipedia,linked_importance;
|
||||
--DEBUG: RAISE WARNING 'Linked label member';
|
||||
END LOOP;
|
||||
|
||||
@ -1639,9 +1639,10 @@ BEGIN
|
||||
-- mark the linked place (excludes from search results)
|
||||
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
|
||||
|
||||
-- keep a note of the node id in case we need it for wikipedia in a bit
|
||||
linked_node_id := linkedPlacex.osm_id;
|
||||
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
|
||||
select wikipedia, importance
|
||||
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
|
||||
'N', linkedPlacex.osm_id)
|
||||
INTO linked_wikipedia,linked_importance;
|
||||
--DEBUG: RAISE WARNING 'Linked admin_center';
|
||||
END IF;
|
||||
|
||||
@ -1684,9 +1685,10 @@ BEGIN
|
||||
-- mark the linked place (excludes from search results)
|
||||
UPDATE placex set linked_place_id = NEW.place_id where place_id = linkedPlacex.place_id;
|
||||
|
||||
-- keep a note of the node id in case we need it for wikipedia in a bit
|
||||
linked_node_id := linkedPlacex.osm_id;
|
||||
select language||':'||title,importance from get_wikipedia_match(linkedPlacex.extratags, NEW.country_code) INTO linked_wikipedia,linked_importance;
|
||||
select wikipedia, importance
|
||||
FROM compute_importance(linkedPlacex.extratags, NEW.country_code,
|
||||
'N', linkedPlacex.osm_id)
|
||||
INTO linked_wikipedia,linked_importance;
|
||||
--DEBUG: RAISE WARNING 'Linked named place';
|
||||
END LOOP;
|
||||
END IF;
|
||||
@ -1714,13 +1716,6 @@ BEGIN
|
||||
(NEW.importance is null or NEW.importance < linked_importance) THEN
|
||||
NEW.importance = linked_importance;
|
||||
END IF;
|
||||
|
||||
-- Still null? how about looking it up by the node id
|
||||
IF NEW.importance IS NULL THEN
|
||||
--DEBUG: RAISE WARNING 'Looking up importance by linked node id';
|
||||
select language||':'||title,importance from wikipedia_article where osm_type = 'N'::char(1) and osm_id = linked_node_id order by importance desc limit 1 INTO NEW.wikipedia,NEW.importance;
|
||||
END IF;
|
||||
|
||||
END IF;
|
||||
|
||||
-- make sure all names are in the word table
|
||||
@ -2627,7 +2622,7 @@ END;
|
||||
$$
|
||||
LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
DROP TYPE wikipedia_article_match CASCADE;
|
||||
DROP TYPE IF EXISTS wikipedia_article_match CASCADE;
|
||||
create type wikipedia_article_match as (
|
||||
language TEXT,
|
||||
title TEXT,
|
||||
@ -2684,6 +2679,42 @@ END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
-- Composite result type of compute_importance(): an importance value plus
-- the wikipedia reference it was derived from.
-- The type is dropped first so the definition can be re-run; CASCADE also
-- removes objects that depend on the type.
DROP TYPE IF EXISTS place_importance CASCADE;
CREATE TYPE place_importance AS (
  importance FLOAT,
  wikipedia TEXT
);
|
||||
|
||||
-- Determine importance and wikipedia reference for a place.
--
-- Lookup order:
--   1. the result of get_wikipedia_match(extratags, country_code), if it
--      yields a row with a non-NULL language;
--   2. otherwise, if extratags has a 'wikidata' key, the wikipedia_article
--      row whose wd_page_title equals that value, preferring language 'en'
--      and then the highest langcount.
-- Returns NULL when neither lookup finds a row.
-- The osm_type and osm_id parameters are not used in the body.
CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT)
  RETURNS place_importance
  AS $$
DECLARE
  article RECORD;
  outcome place_importance;
BEGIN
  -- 1. match through the wikipedia tags
  SELECT * INTO article
    FROM get_wikipedia_match(extratags, country_code)
   WHERE language is not NULL
   LIMIT 1;

  IF FOUND THEN
    outcome.importance := article.importance;
    outcome.wikipedia := article.language || ':' || article.title;
    RETURN outcome;
  END IF;

  -- 2. fall back to the wikidata tag
  IF extratags ? 'wikidata' THEN
    SELECT * INTO article
      FROM wikipedia_article
     WHERE wd_page_title = extratags->'wikidata'
     ORDER BY language = 'en' DESC, langcount DESC
     LIMIT 1;

    IF FOUND THEN
      outcome.importance := article.importance;
      outcome.wikipedia := article.language || ':' || article.title;
      RETURN outcome;
    END IF;
  END IF;

  RETURN null;
END;
$$
LANGUAGE plpgsql;
|
||||
|
||||
CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER)
|
||||
RETURNS SETOF GEOMETRY
|
||||
AS $$
|
||||
|
@ -268,7 +268,9 @@ CREATE TABLE wikipedia_article (
|
||||
lon double precision,
|
||||
importance double precision,
|
||||
osm_type character(1),
|
||||
osm_id bigint
|
||||
osm_id bigint,
|
||||
wd_page_title text,
|
||||
instance_of text
|
||||
);
|
||||
ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
|
||||
CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);
|
||||
|
@ -119,13 +119,13 @@ Feature: Search queries
|
||||
| en |
|
||||
Then result addresses contain
|
||||
| ID | state |
|
||||
| 0 | Salto |
|
||||
| 0 | Florida |
|
||||
When sending json search query "25 de Mayo" with address
|
||||
| accept-language | viewbox |
|
||||
| en | -56.35879,-34.18330,-56.31618,-34.20815 |
|
||||
| en | -57.95468,-31.39261,-57.94741,-31.39490 |
|
||||
Then result addresses contain
|
||||
| ID | state |
|
||||
| 0 | Florida |
|
||||
| 0 | Salto |
|
||||
|
||||
Scenario: viewboxes cannot be points
|
||||
When sending json search query "foo"
|
||||
|
BIN
test/testdb/wikimedia-importance.sql.gz
Normal file
BIN
test/testdb/wikimedia-importance.sql.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -42,6 +42,7 @@ $aCMDOptions
|
||||
array('deduplicate', '', 0, 1, 0, 0, 'bool', 'Deduplicate tokens'),
|
||||
array('recompute-word-counts', '', 0, 1, 0, 0, 'bool', 'Compute frequency of full-word search terms'),
|
||||
array('update-address-levels', '', 0, 1, 0, 0, 'bool', 'Reimport address level configuration (EXPERT)'),
|
||||
array('recompute-importance', '', 0, 1, 0, 0, 'bool', 'Recompute place importances'),
|
||||
array('no-npi', '', 0, 1, 0, 0, 'bool', '(obsolete)'),
|
||||
);
|
||||
|
||||
@ -320,6 +321,23 @@ if ($aResult['update-address-levels']) {
|
||||
$oAlParser->createTable($oDB, 'address_levels');
|
||||
}
|
||||
|
||||
// --recompute-importance: refresh the wikipedia and importance columns of placex.
if ($aResult['recompute-importance']) {
    echo "Updating importance values for database.\n";
    $oDB = new Nominatim\DB();
    $oDB->connect();

    // The statements are concatenated and sent in a single exec() call.
    // Triggers on placex are switched off around the two updates.
    $aStatements = array(
        'ALTER TABLE placex DISABLE TRIGGER ALL;',
        // Step 1: importance computed from each row's own tags.
        'UPDATE placex SET (wikipedia, importance) ='
            .' (SELECT wikipedia, importance'
            .' FROM compute_importance(extratags, country_code, osm_type, osm_id));',
        // Step 2: a row s takes the values of a row d that links to it
        // (d.linked_place_id = s.place_id) when d has a wikipedia value and
        // s has none or a lower importance.
        'UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance'
            .' FROM placex d'
            .' WHERE s.place_id = d.linked_place_id and d.wikipedia is not null'
            .' and (s.wikipedia is null or s.importance < d.importance);',
        'ALTER TABLE placex ENABLE TRIGGER ALL;',
    );

    $oDB->exec(implode('', $aStatements));
}
|
||||
|
||||
if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
|
||||
//
|
||||
if (strpos(CONST_Replication_Url, 'download.geofabrik.de') !== false && CONST_Replication_Update_Interval < 86400) {
|
||||
|
Loading…
Reference in New Issue
Block a user