remove old wikidata script. See data-sources/wikipedia-wikidata/ for new process

This commit is contained in:
marc tobias 2019-12-01 11:14:28 +01:00
parent f8bd4f5133
commit 2051a84a09
6 changed files with 0 additions and 928 deletions

View File

@ -109,7 +109,6 @@ set(WEBSITESCRIPTS
set(CUSTOMSCRIPTS
utils/country_languages.php
utils/importWikipedia.php
utils/export.php
utils/query.php
utils/setup.php

View File

@ -1,557 +0,0 @@
<?php
// Entry section of the Wikipedia import helper script: loads the shared
// command-line bootstrap, raises the memory limit and parses the options.
require_once(CONST_BasePath.'/lib/init-cmd.php');
ini_set('memory_limit', '800M');
// Option table passed to getCmdOpt(). The first element is the help banner;
// every following array describes one option (it appears to be: long name,
// short name, several numeric settings, value type, help text).
// NOTE(review): the banner text talks about setting up the search system and
// does not describe this script - presumably copied from elsewhere; confirm.
// NOTE(review): code further down reads $aCMDResult['parse-wikipedia'], which
// is not declared in this table, while 'parse-articles' is declared here but
// not read by the code visible in this file - confirm which name is intended.
$aCMDOptions
= array(
'Create and setup nominatim search system',
array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);
// Parse $_SERVER['argv'] against the table above; the result is stored in $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
/*
$sTestPageText = <<<EOD
{{Coord|47|N|2|E|type:country_region:FR|display=title}}
{{ Infobox Amusement park
| name = Six Flags Great Adventure
| image = [[File:SixFlagsGreatAdventure logo.png]]
| caption = Six Flags Great Adventure logo
| location = [[Jackson, New Jersey|Jackson]]
| location2 = New Jersey
| location3 = United States
| address = 1 Six Flags Boulevard<ref name="drivedir"/>
| season = March/April through October/November
| opening_date = July 1, 1974
| previous_names = Great Adventure
| area_acre = 2200
| rides = 45 park admission rides
| coasters = 12
| water_rides = 2
| owner = [[Six Flags]]
| general_manager =
| homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
}}
EOD;
var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
exit;
//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
*/
/*
$a = array();
$a[] = 'test';
$oDB = new Nominatim\DB();
$oDB->connect();
if ($aCMDResult['drop-tables'])
{
$oDB->query('DROP TABLE wikipedia_article');
$oDB->query('DROP TABLE wikipedia_link');
}
*/
// --create-tables: create the wikipedia_article and wikipedia_link tables.
if ($aCMDResult['create-tables']) {
    // No active code before this point assigns $oDB, so open a connection
    // here unless something else already provided one.
    if (!isset($oDB)) {
        $oDB = new Nominatim\DB();
        $oDB->connect();
    }

    // Each statement lives in its own nowdoc. The closing marker stays at
    // column 0 so the script also parses on PHP versions older than 7.3.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add the geometry column "location" (SRID 4326, 2 dimensions).
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
    from_id INTEGER,
    to_name text
);
EOD;
    $oDB->query($sSQL);
}
/**
 * Convert a degrees / minutes / seconds coordinate to signed decimal degrees.
 *
 * Every numeric argument is cast to float, so numeric strings and null
 * (treated as 0) are accepted.
 *
 * @param mixed $iDegrees Degrees.
 * @param mixed $iMinutes Minutes, optional.
 * @param mixed $fSeconds Seconds, optional.
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive. 'S' and 'W'
 *                        give a negative result; anything else (including
 *                        null or an empty string) gives a positive one.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers pass null when the template has no hemisphere parameter; cast
    // first so strtoupper() never receives null (deprecated since PHP 8.1).
    $sNSEW = strtoupper(trim((string)$sNSEW));
    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
/**
 * Extract the templates ({{...}}) contained in a piece of wiki markup.
 *
 * Newlines, HTML comments and <math> sections are removed first. The text is
 * then split on "{{", "}}", "[[", "]]" and "|" and fed through a small state
 * machine. The first element of $aState is the current state: 'body',
 * 'template' (reading a template name), 'templateparam', 'link' (reading a
 * link target) or 'linksynonim' (reading a link's display text).
 *
 * @param string $sPageText Raw wiki markup.
 *
 * @return array List of templates in the order their closing "}}" was seen,
 *               so nested templates precede their parent. Each entry is
 *               array(0 => name, 1 => list of raw parameter strings); the
 *               parameter list is in REVERSE order of appearance because new
 *               parameters are prepended.
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    // $sPageBody and $aLinks are collected while parsing but are not part of
    // the return value.
    $sPageBody = '';
    $aTemplates = array();
    $aLinks = array();
    $aTemplateStack = array();
    $aState = array('body');
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                // Open a new template: empty name, no parameters yet.
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // Only closes a template when one is the innermost open
                // construct; a stray "}}" is dropped.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplate = array_shift($aTemplateStack);
                    array_shift($aState);
                    $aTemplates[] = $aTemplate;
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // A link without display text is shown under its target.
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
                    $aLinks[] = array($sLinkPage, $sLinkSyn);
                    array_shift($aState);
                    // Hand the link text to whatever construct encloses it.
                    // NOTE(review): in the 'template', 'link' and
                    // 'linksynonim' cases the literal "]]" token is appended
                    // rather than the link text; this is kept as found -
                    // confirm whether $sLinkSyn was intended.
                    switch ($aState[0]) {
                        case 'template':
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            $sPageBody .= $sLinkSyn;
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Start a new template parameter (prepended to the list).
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                // Plain text: append to whatever is currently being read.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        $sPageBody .= $sPart;
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }

    return $aTemplates;
}
/**
 * Derive page-level properties from the templates of a wiki page.
 *
 * For every property the first template that provides a usable value wins.
 *
 * @param array $aTemplates Templates as returned by _parseWikipediaContent():
 *                          each entry is array(name, parameter strings in
 *                          reverse order of appearance).
 *
 * @return array Any subset of the keys 'sOfficialName', 'iPopulation',
 *               'sWebsite', 'sTopLevelDomain', 'sInfoboxType', 'fLat', 'fLon'.
 *               When no "Infobox ..." template exists, 'sInfoboxType' falls
 *               back to '#' plus the name of the first template with more
 *               than 10 parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    foreach ($aTemplates as $aTemplate) {
        // Restore source order and split "key = value" parameters; parameters
        // without "=" become positional (0, 1, 2, ...).
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            if (($iPos = strpos($sParam, '=')) === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: try the known parameter names in order of preference.
        // The value must start with a digit, '.' or ','; both separators are
        // stripped before the integer cast.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accepts a bare URL or the "[url label]" external-link form.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: once a latitude has been found, later templates are
        // not consulted any more.
        if (!isset($aPageProperties['fLat'])) {
            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']);
            }
            // Both halves are required; the second check used to repeat
            // 'lat_degrees' and then read a possibly missing 'long_degrees'.
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']);
            }
            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }
            if (strtolower($aTemplate[0]) == 'coord') {
                // The position of the N/S letter tells the notation apart.
                // A truncated template (letter present, longitude part
                // missing) is ignored instead of producing a bogus longitude.
                if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                    // d|m|s|N|d|m|s|E
                    if (isset($aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                    // d|m|N|d|m|E
                    if (isset($aParams[3], $aParams[4], $aParams[5])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                    // d|N|d|E
                    if (isset($aParams[2], $aParams[3])) {
                        $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                        $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                    }
                } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                    // signed decimal lat|lon
                    $aPageProperties['fLat'] = (float)$aParams[0];
                    $aPageProperties['fLon'] = (float)$aParams[1];
                }
            }
            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                $aParams['Latitude'] = str_replace('&nbsp;', ' ', $aParams['Latitude']);
                $aParams['Longitude'] = str_replace('&nbsp;', ' ', $aParams['Longitude']);
                // "A to B" ranges are reduced to the midpoint of both ends.
                if (preg_match('#^([0-9]+)°(([0-9]+))? ([NS]) to ([0-9]+)°(([0-9]+))? ([NS])#', $aParams['Latitude'], $aMatch)) {
                    $aPageProperties['fLat'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°(([0-9]+))? ([NS])#', $aParams['Latitude'], $aMatch)) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
                if (preg_match('#^([0-9]+)°(([0-9]+))? ([EW]) to ([0-9]+)°(([0-9]+))? ([EW])#', $aParams['Longitude'], $aMatch)) {
                    $aPageProperties['fLon'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°(([0-9]+))? ([EW])#', $aParams['Longitude'], $aMatch)) {
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
            }
        }
    }

    // The fallback type is only used when no real infobox was found and is
    // marked with a leading '#'; the helper key itself is never returned.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// --parse-wikipedia=<n>: parse the stored article text of one tenth of the
// pages (page_id % 10 = n) and copy the extracted properties into
// wikipedia_article.
// NOTE(review): the option table at the top of this script declares
// 'parse-articles', not 'parse-wikipedia' - confirm which name is intended.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB = new Nominatim\DB();
    $oDB->connect();

    // The option value goes straight into the query, so force it to an
    // integer before it gets anywhere near the SQL text.
    $iPageGroup = (int)$aCMDResult['parse-wikipedia'];

    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
    $sSQL .= $iPageGroup;
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    $aArticleNames = $oDB->getCol($sSQL);

    foreach ($aArticleNames as $sArticleName) {
        $sEscapedTitle = pg_escape_string($sArticleName);
        // Every update below targets the English article with this title.
        $sWhere = ' where language = \'en\' and title = \''.$sEscapedTitle.'\';';

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.$sEscapedTitle.'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        if (isset($aP['sInfoboxType'])) {
            // Collapse whitespace runs in the type name.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        // A latitude can be present without a longitude (the two are parsed
        // separately for some templates); only store complete pairs, and
        // skip the 0/0 position.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'lat = \''.pg_escape_string((string)$aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
    }
}
/**
 * Start-element handler for the XML parser: appends the attribute array of
 * every PLACE element to the global list $aNominatRecords. All other
 * elements are ignored.
 *
 * @param mixed  $hParser Parser that invokes the handler (not used).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
/**
 * End-element handler passed to xml_set_element_handler() together with
 * nominatimXMLStart(). It deliberately does nothing: the records are
 * collected entirely by the start handler.
 */
function nominatimXMLEnd($hParser, $sName)
{
}
// --link: try to attach an OSM object to every English article that has a
// position but no OSM link yet. Each article is searched for by name on a
// Nominatim server; the first result that lies close enough to the article's
// position is stored as osm_type/osm_id.
if (isset($aCMDResult['link'])) {
    $oDB = new Nominatim\DB();
    $oDB->connect();

    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    // NOTE(review): the host below looks like a placeholder that has to be
    // edited before the script is run - confirm.
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // OSM_TYPE values of the search response and their one-letter codes.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    // Builds the "&viewbox=" parameter: a square reaching $fHalfSize degrees
    // in every direction from the article's position.
    $fnViewbox = function ($aRecord, $fHalfSize) {
        return '&viewbox='.($aRecord['lon']-$fHalfSize).','.($aRecord['lat']+$fHalfSize).','.($aRecord['lon']+$fHalfSize).','.($aRecord['lat']-$fHalfSize);
    };

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
        echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";

        // $fMaxDist is the largest accepted distance (in degrees) between the
        // article position and a search result. $bUnknown means the distance
        // is instead derived from the rank of each result further down.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower((string)$aRecord['infobox_type'])) {
            case 'former country':
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= '&featuretype=country';
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= '&featuretype=state';
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= '&featuretype=settlement';
                $sURL .= $fnViewbox($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= $fnViewbox($aRecord, 0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= $fnViewbox($aRecord, 0.01);
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= $fnViewbox($aRecord, 0.01);
                $sURL .= '&bounded=1';
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= $fnViewbox($aRecord, 0.01);
                // $sURL .= "&bounded=1";
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                echo '-- Unknown: '.$aRecord['infobox_type']."\n";
                break;
        }

        // Search for the full name first. If that finds nothing and the name
        // contains "(" or ",", retry with the part in front of it.
        $aQueries = array($aRecord['name']);
        $aNameParts = preg_split('#[(,]#', $aRecord['name']);
        if (count($aNameParts) > 1) {
            $aQueries[] = trim($aNameParts[0]);
        }
        $aNominatRecords = array();
        foreach ($aQueries as $sQuery) {
            $sNameURL = $sURL.'&q='.urlencode($sQuery);
            var_dump($sNameURL);
            // nominatimXMLStart() appends to the global $aNominatRecords.
            $aNominatRecords = array();
            $sXML = file_get_contents($sNameURL);
            if ($sXML === false) {
                echo "-- Request failed\n";
                continue;
            }
            $hXMLParser = xml_parser_create();
            xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
            xml_parse($hXMLParser, $sXML, true);
            xml_parser_free($hXMLParser);
            if (isset($aNominatRecords[0])) {
                break;
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aFound) {
            // Straight-line distance in degrees between article and result.
            $fDiff = ($aRecord['lat']-$aFound['LAT']) * ($aRecord['lat']-$aFound['LAT']);
            $fDiff += ($aRecord['lon']-$aFound['LON']) * ($aRecord['lon']-$aFound['LON']);
            $fDiff = sqrt($fDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aFound['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }

            echo '-- FOUND "'.substr($aFound['DISPLAY_NAME'], 0, 50);
            echo '", '.$aFound['CLASS'].', '.$aFound['TYPE'];
            echo ', '.$aFound['PLACE_RANK'].', '.$aFound['OSM_TYPE'];
            echo " (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aFound['LAT'].' & '.$aRecord['lon'].','.$aFound['LON']." \n";
                continue;
            }

            // Both values come from an HTTP response and end up in SQL text:
            // accept only a known type and an all-digit id, otherwise move on
            // to the next result instead of sending a malformed statement.
            $sOsmType = isset($aFound['OSM_TYPE']) ? $aFound['OSM_TYPE'] : '';
            $sOsmId = isset($aFound['OSM_ID']) ? $aFound['OSM_ID'] : '';
            if (!isset($aOsmTypeCodes[$sOsmType]) || !preg_match('#^[0-9]+$#D', $sOsmId)) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = 'update wikipedia_article set osm_type=';
            $sSQL .= "'".$aOsmTypeCodes[$sOsmType]."'";
            $sSQL .= ', osm_id='.$sOsmId." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}

View File

@ -1,75 +0,0 @@
-- Schema for the Wikidata import: drops and recreates every table that the
-- import fills. IF EXISTS keeps the first run on an empty database from
-- failing, and every table created below is also dropped, so the script can
-- be re-run.
DROP TABLE IF EXISTS entity;
DROP TABLE IF EXISTS entity_label;
DROP TABLE IF EXISTS entity_description;
DROP TABLE IF EXISTS entity_alias;
DROP TABLE IF EXISTS entity_link;
DROP TABLE IF EXISTS entity_link_hit;
DROP TABLE IF EXISTS link_hit;
DROP TABLE IF EXISTS entity_property;
DROP TABLE IF EXISTS import_link_hit;

-- One row per entity; pid and qid are alternative numeric ids of the entity.
CREATE TABLE entity (
    entity_id bigint,
    title text,
    pid bigint,
    qid bigint,
    datatype text,
    CONSTRAINT pk_entity PRIMARY KEY(entity_id)
);

-- Label of an entity, one per language.
CREATE TABLE entity_label (
    entity_id bigint,
    language text,
    label text,
    CONSTRAINT pk_entity_label PRIMARY KEY(entity_id,language)
);

-- Description of an entity, one per language.
CREATE TABLE entity_description (
    entity_id bigint,
    language text,
    description text,
    CONSTRAINT pk_entity_description PRIMARY KEY(entity_id,language)
);

-- Aliases of an entity, any number per language.
CREATE TABLE entity_alias (
    entity_id bigint,
    language text,
    alias text,
    CONSTRAINT pk_entity_alias PRIMARY KEY(entity_id,language,alias)
);

-- Link of an entity to a target, one value per target.
CREATE TABLE entity_link (
    entity_id bigint,
    target text,
    value text,
    CONSTRAINT pk_entity_link PRIMARY KEY(entity_id,target)
);

-- entity_link extended by a hit count.
CREATE TABLE entity_link_hit (
    entity_id bigint,
    target text,
    value text,
    hits bigint,
    CONSTRAINT pk_entity_link_hit PRIMARY KEY(entity_id,target)
);

-- Hit count per (target, value).
CREATE TABLE link_hit (
    target text,
    value text,
    hits bigint,
    CONSTRAINT pk_link_hit PRIMARY KEY(target,value)
);

-- Property values of an entity. Each row carries one of string, toqid,
-- location or datetime. The geometry type requires PostGIS.
CREATE TABLE entity_property (
    entity_id bigint,
    order_id bigint,
    pid bigint,
    string text,
    toqid bigint,
    location geometry,
    datetime timestamp with time zone,
    CONSTRAINT pk_entity_property PRIMARY KEY(entity_id, order_id)
);

-- Staging table for raw hit counts; no key.
CREATE TABLE import_link_hit (
    target text,
    value text,
    hits bigint
);

View File

@ -1,28 +0,0 @@
# Loads the CSV files and hit totals into the "wikidata" database and builds
# the derived tables and indexes.

# The value must be quoted: unquoted, the shell would treat "-d wikidata" as a
# command to run with PSQL set only in its environment. $PSQL is expanded
# unquoted below on purpose so that it splits into the program and its
# arguments.
PSQL="/usr/lib/postgresql/9.2/bin/psql -d wikidata"

# (Re)create the schema, then bulk-load the CSV files.
cat create.sql | $PSQL
cat entity.csv | $PSQL -c "COPY entity from STDIN WITH CSV"
cat entity_label.csv | $PSQL -c "COPY entity_label from STDIN WITH CSV"
cat entity_description.csv | $PSQL -c "COPY entity_description from STDIN WITH CSV"
cat entity_alias.csv | $PSQL -c "COPY entity_alias from STDIN WITH CSV"
cat entity_link.csv | $PSQL -c "COPY entity_link from STDIN WITH CSV"
cat entity_property.csv | $PSQL -c "COPY entity_property from STDIN WITH CSV"

$PSQL -c "create index idx_entity_link_target on entity_link using btree (target,value)"
$PSQL -c "create index idx_entity_qid on entity using btree (qid)"

# English label of every property.
$PSQL -c "create table property_label_en as select pid,null::text as label from entity where pid is not null"
$PSQL -c "update property_label_en set label = x.label from (select pid,label,language from entity join entity_label using (entity_id) where pid is not null and language = 'en') as x where x.pid = property_label_en.pid"
$PSQL -c "create unique index idx_property_label_en on property_label_en using btree (pid)"

# Copy the English label and description onto the entity rows.
$PSQL -c "alter table entity add column label_en text"
$PSQL -c "update entity set label_en = label from entity_label where entity.entity_id = entity_label.entity_id and language = 'en'"
$PSQL -c "alter table entity add column description_en text"
$PSQL -c "update entity set description_en = description from entity_description where entity.entity_id = entity_description.entity_id and language = 'en'"

# Hit counts: load the raw totals, aggregate them per decoded title, then
# attach them to the entity links and sum them up per entity.
cat totals.txt | $PSQL -c "COPY import_link_hit from STDIN WITH CSV DELIMITER ' '"
$PSQL -c "truncate link_hit"
# The GROUP BY must use the same expression as the select list.
$PSQL -c "insert into link_hit select target||'wiki', replace(catch_decode_url_part(value), '_', ' '), sum(hits) from import_link_hit where replace(catch_decode_url_part(value), '_', ' ') is not null group by target||'wiki', replace(catch_decode_url_part(value), '_', ' ')"
$PSQL -c "truncate entity_link_hit"
$PSQL -c "insert into entity_link_hit select entity_id, target, value, coalesce(hits,0) from entity_link left outer join link_hit using (target, value)"
$PSQL -c "create table entity_hit as select entity_id,sum(hits) as hits from entity_link_hit group by entity_id"
$PSQL -c "create unique index idx_entity_hit on entity_hit using btree (entity_id)"

View File

@ -1,188 +0,0 @@
#!/usr/bin/php -Cq
<?php
/**
 * Converts a Wikidata pages-articles XML dump into CSV files, one per table:
 * entity, entity_label, entity_description, entity_alias, entity_link and
 * entity_property.
 *
 * The dump is read line by line without an XML parser. Elements are
 * recognised by their exact leading indentation, and the text of a page is
 * expected to be a JSON document on a single line.
 */

$sDumpFile = 'wikidatawiki-20130623-pages-articles.xml';
$hFile = @fopen($sDumpFile, 'r');
if (!$hFile) {
    // Stop before the output files are opened: opening them for writing
    // would truncate the results of an earlier run.
    fwrite(STDERR, 'Cannot open '.$sDumpFile."\n");
    exit(1);
}

$hFileEntity = fopen('entity.csv', 'w');
$hFileEntityLabel = fopen('entity_label.csv', 'w');
$hFileEntityDescription = fopen('entity_description.csv', 'w');
$hFileEntityAlias = fopen('entity_alias.csv', 'w');
$hFileEntityLink = fopen('entity_link.csv', 'w');
$hFileEntityProperty = fopen('entity_property.csv', 'w');

// Line prefixes including their indentation. They are built with
// str_repeat() so the number of spaces stays explicit: 4 for <title>, <ns>
// and <id>, 6 for <text>.
$sTitlePrefix = str_repeat(' ', 4).'<title>';
$sNSPrefix = str_repeat(' ', 4).'<ns>';
$sIDPrefix = str_repeat(' ', 4).'<id>';
$sTextPrefix = str_repeat(' ', 6).'<text xml:space="preserve">';

// Pages in these namespaces are skipped.
$aSkippedNamespaces = array(-2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 121, 123, 829, 1198, 1199);

$aKnownKeys = array('label', 'description', 'aliases', 'links', 'entity', 'claims', 'datatype');

$sTitle = '';
$iNS = false;
$iID = false;

while (($sLine = fgets($hFile, 4000000)) !== false) {
    // The negative lengths strip the closing tag plus the newline.
    if (substr($sLine, 0, strlen($sTitlePrefix)) == $sTitlePrefix) {
        $sTitle = substr($sLine, strlen($sTitlePrefix), -9);
        continue;
    }
    if (substr($sLine, 0, strlen($sNSPrefix)) == $sNSPrefix) {
        $iNS = (int)substr($sLine, strlen($sNSPrefix), -6);
        continue;
    }
    if (substr($sLine, 0, strlen($sIDPrefix)) == $sIDPrefix) {
        $iID = (int)substr($sLine, strlen($sIDPrefix), -6);
        continue;
    }
    if (substr($sLine, 0, strlen($sTextPrefix)) != $sTextPrefix) {
        continue;
    }
    if (in_array($iNS, $aSkippedNamespaces, true)) {
        continue;
    }

    $sText = html_entity_decode(substr($sLine, strlen($sTextPrefix), -8), ENT_COMPAT, 'UTF-8');
    $aArticle = json_decode($sText, true);
    if (!is_array($aArticle)) {
        fwrite(STDERR, 'Skipping page without decodable JSON: '.$sTitle."\n");
        continue;
    }

    if (array_diff(array_keys($aArticle), $aKnownKeys) != array()) {
        // DEBUG: stop at the first document with a key this script does not know.
        var_dump($sTitle);
        var_dump(array_keys($aArticle));
        var_dump($aArticle);
        exit;
    }

    // The entity id is a string starting with 'p' or 'q' followed by the
    // number; everything else is skipped.
    $sEntity = (isset($aArticle['entity']) && is_string($aArticle['entity'])) ? $aArticle['entity'] : '';
    $sEntityKind = substr($sEntity, 0, 1);
    $iPID = $iQID = null;
    if ($sEntityKind == 'p') {
        $iPID = (int) substr($sEntity, 1);
    } elseif ($sEntityKind == 'q') {
        $iQID = (int) substr($sEntity, 1);
    } else {
        continue;
    }

    echo '.';

    fputcsv($hFileEntity, array($iID, $sTitle, $iPID, $iQID, isset($aArticle['datatype']) ? $aArticle['datatype'] : null));

    // Sections missing from the document are treated as empty.
    $aLabels = isset($aArticle['label']) ? $aArticle['label'] : array();
    foreach ($aLabels as $sLang => $sLabel) {
        fputcsv($hFileEntityLabel, array($iID, $sLang, $sLabel));
    }

    $aDescriptions = isset($aArticle['description']) ? $aArticle['description'] : array();
    foreach ($aDescriptions as $sLang => $sLabel) {
        fputcsv($hFileEntityDescription, array($iID, $sLang, $sLabel));
    }

    $aAliases = isset($aArticle['aliases']) ? $aArticle['aliases'] : array();
    foreach ($aAliases as $sLang => $aLangAliases) {
        // Write each non-empty alias once per language.
        $aUniqueAlias = array();
        foreach ($aLangAliases as $sLabel) {
            if (!isset($aUniqueAlias[$sLabel]) && $sLabel) {
                fputcsv($hFileEntityAlias, array($iID, $sLang, $sLabel));
                $aUniqueAlias[$sLabel] = true;
            }
        }
    }

    $aLinks = isset($aArticle['links']) ? $aArticle['links'] : array();
    foreach ($aLinks as $sLang => $sLabel) {
        fputcsv($hFileEntityLink, array($iID, $sLang, $sLabel));
    }

    if (!isset($aArticle['claims'])) {
        continue;
    }

    // Each claim is read from its 'm' array as
    // [0] => snak kind, [1] => property id, [2] => value type, [3] => value.
    foreach ($aArticle['claims'] as $iClaim => $aClaim) {
        $bFail = false;
        if ($aClaim['m'][0] == 'novalue') continue;
        if ($aClaim['m'][0] == 'somevalue') continue;
        $iClaimPID = (int)$aClaim['m'][1];
        if ($aClaim['m'][0] != 'value') $bFail = true;

        // Columns: entity_id, order_id, pid, string, toqid, location, datetime
        if ($aClaim['m'][2] == 'wikibase-entityid') {
            if ($aClaim['m'][3]['entity-type'] != 'item') $bFail = true;
            fputcsv($hFileEntityProperty, array($iID, $iClaim, $iClaimPID, null, $aClaim['m'][3]['numeric-id'], null, null));
        } elseif ($aClaim['m'][2] == 'globecoordinate') {
            // Coordinates on any other globe than this one are reported.
            if ($aClaim['m'][3]['globe'] != 'http://www.wikidata.org/entity/Q2') $bFail = true;
            fputcsv(
                $hFileEntityProperty,
                array(
                    $iID,
                    $iClaim,
                    $iClaimPID,
                    null,
                    null,
                    'SRID=4326;POINT('.((float) $aClaim['m'][3]['longitude']).' '.((float)$aClaim['m'][3]['latitude']).')',
                    null
                )
            );
        } elseif ($aClaim['m'][2] == 'time') {
            // TODO: time values are not exported yet (the datetime column
            // stays unused). An export has to tell the calendar models apart
            // and convert the value into a timestamp string.
        } elseif ($aClaim['m'][2] == 'string') {
            fputcsv($hFileEntityProperty, array($iID, $iClaim, $iClaimPID, $aClaim['m'][3], null, null, null));
        } else {
            $bFail = true;
        }

        // Sources ('refs') are deliberately ignored.
        if ($bFail) {
            // Report claims this script does not understand.
            var_dump($sTitle);
            var_dump($aClaim);
        }
    }
}

fclose($hFile);
fclose($hFileEntity);
fclose($hFileEntityLabel);
fclose($hFileEntityDescription);
fclose($hFileEntityAlias);
fclose($hFileEntityLink);
fclose($hFileEntityProperty);

View File

@ -1,79 +0,0 @@
<?php
// Accumulates hourly page-view counts into totals.txt.
//
// For every hour from 2013-05-01 up to (not including) 2013-06-15 the hourly
// pagecounts file is downloaded if it is not present, filtered into hour.txt
// and merged with the running totals. totals.txt holds lines of the form
// "<language> <title> <count>". The merge walks both files in parallel and
// therefore relies on both being sorted by "<language> <title>".
$iEndTimestamp = mktime(0, 0, 0, 6, 15, 2013);
for ($iTimestamp = mktime(0, 0, 0, 5, 1, 2013); $iTimestamp < $iEndTimestamp; $iTimestamp += 24*60*60) {
    $sYear = date('Y', $iTimestamp);
    $sMonth = date('Y-m', $iTimestamp);
    $sDay = date('Ymd', $iTimestamp);

    for ($iHour = 0; $iHour < 24; $iHour++) {
        $sFilename = sprintf('pagecounts-'.$sDay.'-%02d0000', $iHour);
        echo $sFilename."\n";

        if (!file_exists($sFilename.'.gz')) {
            exec('wget http://dumps.wikimedia.org/other/pagecounts-raw/'.$sYear.'/'.$sMonth.'/'.$sFilename.'.gz');
        }
        // Keep only lines with a two-letter language code and a title
        // without ':'.
        exec('gzip -dc '.$sFilename.'.gz'.' | grep -e "^[a-z]\{2\} [^ :]\+ [0-9]\+" > hour.txt');

        // A missing totals.txt (first run) or hour.txt is read as empty.
        $hPrevTotals = @fopen('totals.txt', 'r');
        $hDayTotals = @fopen('hour.txt', 'r');
        $hNewTotals = @fopen('newtotals.txt', 'w');
        if (!$hNewTotals) {
            fwrite(STDERR, "Cannot open newtotals.txt for writing\n");
            exit(1);
        }

        // Merge state. A key is true before the first read and false once
        // its file is exhausted. A *Line variable set to true means "the
        // current entry has been consumed, read the next one".
        $sPrevKey = $sDayKey = true;
        $sPrevLine = true;
        $sDayLine = true;
        $iPrevValue = 0;
        $iDayValue = 0;

        do {
            if ($sPrevKey === $sDayKey) {
                // Same key on both sides (or nothing read yet): add up.
                if ($sPrevLine !== true) fputs($hNewTotals, "$sPrevKey ".($iPrevValue+$iDayValue)."\n");
                $sPrevLine = true;
                $sDayLine = true;
            } elseif ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) {
                // Key only present in the hourly file.
                fputs($hNewTotals, "$sDayKey ".($iDayValue)."\n");
                $sDayLine = true;
            } elseif ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) {
                // Key only present in the previous totals.
                fputs($hNewTotals, "$sPrevKey ".($iPrevValue)."\n");
                $sPrevLine = true;
            }

            if ($sPrevLine === true) {
                $sPrevLine = $hPrevTotals?fgets($hPrevTotals, 4096):false;
                if ($sPrevLine !== false) {
                    $aPrevLine = explode(' ', $sPrevLine);
                    $sPrevKey = $aPrevLine[0].' '.$aPrevLine[1];
                    $iPrevValue = (int)$aPrevLine[2];
                } else {
                    $sPrevKey = false;
                    $iPrevValue = 0;
                }
            }

            if ($sDayLine === true) {
                // Read on until a line matches the expected format. A line
                // that does not match (for example one cut off by the length
                // limit) is skipped instead of being processed with the
                // match data of an earlier line.
                $sDayLine = false;
                $sDayKey = false;
                $iDayValue = 0;
                if ($hDayTotals) {
                    while (($sCandidate = fgets($hDayTotals, 4096)) !== false) {
                        if (preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sCandidate, $aMatch)) {
                            $sDayLine = $sCandidate;
                            $sDayKey = $aMatch[1].' '.$aMatch[2];
                            $iDayValue = (int)$aMatch[3];
                            break;
                        }
                    }
                }
            }
        } while ($sPrevLine !== false || $sDayLine !== false);

        @fclose($hPrevTotals);
        @fclose($hDayTotals);
        @fclose($hNewTotals);

        // Replace the old totals with the merged result.
        @unlink('totals.txt');
        rename('newtotals.txt', 'totals.txt');
    }
}
// Notes:
/*
gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" |
sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1'
and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = s,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
*/