calculate importance from wikipedia

This commit is contained in:
Brian Quinion 2012-04-30 01:05:06 +01:00
parent 709ab1bb03
commit 1c0fa81d88
3 changed files with 152 additions and 5 deletions

View File

@ -1225,6 +1225,10 @@ DECLARE
name_vector INTEGER[];
nameaddress_vector INTEGER[];
wiki_article TEXT;
wiki_article_title TEXT;
wiki_article_language TEXT;
result BOOLEAN;
BEGIN
@ -1308,6 +1312,34 @@ BEGIN
address_havelevel[i] := false;
END LOOP;
NEW.importance := null;
-- WARNING: see duplicate of code below (yuk!)
IF NEW.extratags?'wikipedia' THEN
wiki_article := replace(regexp_replace(NEW.extratags->'wikipedia',E'(.*?)([a-z]+).wikipedia.org/wiki/',E'\\2:'),' ','_');
wiki_article_title := split_part(wiki_article, ':', 2);
IF wiki_article_title IS NULL OR wiki_article_title = '' THEN
wiki_article_title := wiki_article;
wiki_article_language := 'en';
ELSE
wiki_article_language := lower(split_part(wiki_article, ':', 1));
END IF;
--RAISE WARNING '% %', wiki_article_language, wiki_article_title;
select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title
from wikipedia_article
where language = wiki_article_language and
(title = wiki_article_title OR title = decode_url_part(wiki_article_title) OR title = replace(decode_url_part(wiki_article_title),E'\\',''))
UNION ALL
select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title
from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title)
where wikipedia_redirect.language = wiki_article_language and
(from_title = wiki_article_title OR from_title = decode_url_part(wiki_article_title) OR from_title = replace(decode_url_part(wiki_article_title),E'\\',''))
order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia;
ELSE
select importance,language||':'||title from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia;
END IF;
--RAISE WARNING '% %', NEW.place_id, NEW.rank_search;
-- For low level elements we inherit from our parent road
@ -1535,7 +1567,7 @@ BEGIN
END IF;
-- not found one yet? how about doing a name search
IF NEW.centroid IS NULL AND NEW.name->'name' is not null and make_standard_name(NEW.name->'name') != '' THEN
IF NEW.centroid IS NULL AND (NEW.name->'name') is not null and make_standard_name(NEW.name->'name') != '' THEN
FOR linkedPlacex IN select placex.* from placex WHERE
make_standard_name(name->'name') = make_standard_name(NEW.name->'name')
@ -1567,6 +1599,31 @@ BEGIN
place_centroid := NEW.centroid;
END IF;
-- Did we gain a wikipedia tag in the process? then we need to recalculate our importance
-- WARNING: duplicate of code above (yuk!)
IF NEW.importance is null AND NEW.extratags?'wikipedia' THEN
wiki_article := replace(regexp_replace(NEW.extratags->'wikipedia',E'(.*?)([a-z]+).wikipedia.org/wiki/',E'\\2:'),' ','_');
wiki_article_title := split_part(wiki_article, ':', 2);
IF wiki_article_title IS NULL OR wiki_article_title = '' THEN
wiki_article_title := wiki_article;
wiki_article_language := 'en';
ELSE
wiki_article_language := lower(split_part(wiki_article, ':', 1));
END IF;
select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title
from wikipedia_article
where language = wiki_article_language and
(title = wiki_article_title OR title = decode_url_part(wiki_article_title) OR title = replace(decode_url_part(wiki_article_title),E'\\',''))
UNION ALL
select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title
from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title)
where wikipedia_redirect.language = wiki_article_language and
(from_title = wiki_article_title OR from_title = decode_url_part(wiki_article_title) OR from_title = replace(decode_url_part(wiki_article_title),E'\\',''))
order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia;
END IF;
END IF;
NEW.parent_place_id = 0;
@ -2706,4 +2763,10 @@ END;
$$
LANGUAGE plpgsql;
-- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql
CREATE OR REPLACE FUNCTION decode_url_part(p varchar) RETURNS varchar
AS $$
SELECT convert_from(CAST(E'\\x' || string_agg(CASE WHEN length(r.m[1]) = 1 THEN encode(convert_to(r.m[1], 'SQL_ASCII'), 'hex') ELSE substring(r.m[1] from 2 for 2) END, '') AS bytea), 'UTF8')
FROM regexp_matches($1, '%[0-9a-f][0-9a-f]|.', 'gi') AS r(m);
$$
LANGUAGE SQL IMMUTABLE STRICT;

View File

@ -210,6 +210,7 @@ CREATE TABLE placex (
importance FLOAT,
indexed_status INTEGER,
indexed_date TIMESTAMP,
wikipedia TEXT, -- calculated wikipedia article name (language:title)
geometry_sector INTEGER
);
SELECT AddGeometryColumn('placex', 'geometry', 4326, 'GEOMETRY', 2);
@ -301,3 +302,28 @@ CREATE INDEX idx_import_polygon_delete_osmid ON import_polygon_delete USING BTRE
drop sequence file;
CREATE SEQUENCE file start 1;
-- null table so it won't error
-- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
CREATE TABLE wikipedia_article (
language text NOT NULL,
title text NOT NULL,
langcount integer,
othercount integer,
totalcount integer,
lat double precision,
lon double precision,
importance double precision,
osm_type character(1),
osm_id bigint
);
ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);
CREATE TABLE wikipedia_redirect (
language text,
from_title text,
to_title text
);
ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title);

View File

@ -23,6 +23,7 @@
array('create-minimal-tables', '', 0, 1, 0, 0, 'bool', 'Create minimal main tables'),
array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create main tables'),
array('create-partitions', '', 0, 1, 0, 0, 'bool', 'Create required partition tables and triggers'),
array('import-wikipedia-articles', '', 0, 1, 0, 0, 'bool', 'Import wikipedia article dump'),
array('load-data', '', 0, 1, 0, 0, 'bool', 'Copy data to live tables from import table'),
array('import-tiger-data', '', 0, 1, 0, 0, 'bool', 'Import tiger data (not included in \'all\')'),
array('calculate-postcodes', '', 0, 1, 0, 0, 'bool', 'Calculate postcode centroids'),
@ -207,6 +208,34 @@
pgsqlRunScript($sTemplate);
}
if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all'])
{
$bDidSomething = true;
$sWikiArticlesFile = CONST_BasePath.'/data/wikipedia_article.sql.bin';
$sWikiRedirectsFile = CONST_BasePath.'/data/wikipedia_redirect.sql.bin';
if (file_exists($sWikiArticlesFile))
{
echo "Importing wikipedia articles...";
pgsqlRunRestoreData($sWikiArticlesFile);
echo "...done\n";
}
else
{
echo "WARNING: wikipedia article dump file not found - places will have default importance\n";
}
if (file_exists($sWikiRedirectsFile))
{
echo "Importing wikipedia redirects...";
pgsqlRunRestoreData($sWikiRedirectsFile);
echo "...done\n";
}
else
{
echo "WARNING: wikipedia redirect dump file not found - some place importance values may be missing\n";
}
}
if ($aCMDResult['load-data'] || $aCMDResult['all'])
{
echo "Load Data\n";
@ -326,7 +355,7 @@
}
fclose($hFile);
$bAnyBusy = true;
while($bAnyBusy)
{
@ -451,7 +480,8 @@
// Convert database DSN to psql paramaters
$aDSNInfo = DB::parseDSN(CONST_Database_DSN);
$sCMD = 'psql -f '.$sFilename.' '.$aDSNInfo['database'];
if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432;
$sCMD = 'psql -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database'].' -f '.$sFilename;
$aDescriptors = array(
0 => array('pipe', 'r'),
@ -479,7 +509,7 @@
// Convert database DSN to psql paramaters
$aDSNInfo = DB::parseDSN(CONST_Database_DSN);
if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432;
$sCMD = 'psql -p '.$aDSNInfo['port'].' '.$aDSNInfo['database'];
$sCMD = 'psql -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database'];
$aDescriptors = array(
0 => array('pipe', 'r'),
1 => STDOUT,
@ -497,3 +527,31 @@
fclose($ahPipes[0]);
proc_close($hProcess);
}
function pgsqlRunRestoreData($sDumpFile)
{
// Convert database DSN to psql paramaters
$aDSNInfo = DB::parseDSN(CONST_Database_DSN);
if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432;
$sCMD = 'pg_restore -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database'].' -Fc -a '.$sDumpFile;
$aDescriptors = array(
0 => array('pipe', 'r'),
1 => array('pipe', 'w'),
2 => array('file', '/dev/null', 'a')
);
$ahPipes = null;
$hProcess = proc_open($sCMD, $aDescriptors, $ahPipes);
if (!is_resource($hProcess)) fail('unable to start pg_restore');
fclose($ahPipes[0]);
// TODO: error checking
while(!feof($ahPipes[1]))
{
echo fread($ahPipes[1], 4096);
}
fclose($ahPipes[1]);
proc_close($hProcess);
}