mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-26 13:27:52 +03:00
remove old wikidata script. See data-sources/wikipedia-wikidata/ for new process
This commit is contained in:
parent
f8bd4f5133
commit
2051a84a09
@ -109,7 +109,6 @@ set(WEBSITESCRIPTS
|
||||
|
||||
set(CUSTOMSCRIPTS
|
||||
utils/country_languages.php
|
||||
utils/importWikipedia.php
|
||||
utils/export.php
|
||||
utils/query.php
|
||||
utils/setup.php
|
||||
|
@ -1,557 +0,0 @@
|
||||
<?php
|
||||
|
||||
require_once(CONST_BasePath.'/lib/init-cmd.php');
|
||||
ini_set('memory_limit', '800M');
|
||||
|
||||
$aCMDOptions
|
||||
= array(
|
||||
'Create and setup nominatim search system',
|
||||
array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
|
||||
array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
|
||||
array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
|
||||
|
||||
array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
|
||||
array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
|
||||
array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
|
||||
);
|
||||
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
|
||||
|
||||
/*
|
||||
$sTestPageText = <<<EOD
|
||||
{{Coord|47|N|2|E|type:country_region:FR|display=title}}
|
||||
{{ Infobox Amusement park
|
||||
| name = Six Flags Great Adventure
|
||||
| image = [[File:SixFlagsGreatAdventure logo.png]]
|
||||
| caption = Six Flags Great Adventure logo
|
||||
| location = [[Jackson, New Jersey|Jackson]]
|
||||
| location2 = New Jersey
|
||||
| location3 = United States
|
||||
| address = 1 Six Flags Boulevard<ref name="drivedir"/>
|
||||
| season = March/April through October/November
|
||||
| opening_date = July 1, 1974
|
||||
| previous_names = Great Adventure
|
||||
| area_acre = 2200
|
||||
| rides = 45 park admission rides
|
||||
| coasters = 12
|
||||
| water_rides = 2
|
||||
| owner = [[Six Flags]]
|
||||
| general_manager =
|
||||
| homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
|
||||
}}
|
||||
EOD;
|
||||
var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
|
||||
exit;
|
||||
//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
|
||||
*/
|
||||
/*
|
||||
|
||||
$a = array();
|
||||
$a[] = 'test';
|
||||
|
||||
$oDB = new Nominatim\DB();
|
||||
$oDB->connect();
|
||||
|
||||
if ($aCMDResult['drop-tables'])
|
||||
{
|
||||
$oDB->query('DROP TABLE wikipedia_article');
|
||||
$oDB->query('DROP TABLE wikipedia_link');
|
||||
}
|
||||
*/
|
||||
|
||||
if ($aCMDResult['create-tables']) {
|
||||
$sSQL = <<<'EOD'
|
||||
CREATE TABLE wikipedia_article (
|
||||
language text NOT NULL,
|
||||
title text NOT NULL,
|
||||
langcount integer,
|
||||
othercount integer,
|
||||
totalcount integer,
|
||||
lat double precision,
|
||||
lon double precision,
|
||||
importance double precision,
|
||||
title_en text,
|
||||
osm_type character(1),
|
||||
osm_id bigint,
|
||||
infobox_type text,
|
||||
population bigint,
|
||||
website text
|
||||
);
|
||||
$oDB->query($sSQL);
|
||||
|
||||
$oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
|
||||
|
||||
$sSQL = <<<'EOD'
|
||||
CREATE TABLE wikipedia_link (
|
||||
from_id INTEGER,
|
||||
to_name text
|
||||
);
|
||||
EOD;
|
||||
$oDB->query($sSQL);
|
||||
}
|
||||
|
||||
|
||||
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
|
||||
{
|
||||
$sNSEW = strtoupper($sNSEW);
|
||||
return ($sNSEW == 'S' || $sNSEW == 'W'?-1:1) * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
|
||||
}
|
||||
|
||||
|
||||
function _parseWikipediaContent($sPageText)
|
||||
{
|
||||
$sPageText = str_replace("\n", ' ', $sPageText);
|
||||
$sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
|
||||
$sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
|
||||
|
||||
$aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
|
||||
|
||||
$aPageProperties = array();
|
||||
$sPageBody = '';
|
||||
$aTemplates = array();
|
||||
$aLinks = array();
|
||||
|
||||
$aTemplateStack = array();
|
||||
$aState = array('body');
|
||||
foreach ($aPageText as $i => $sPart) {
|
||||
switch ($sPart) {
|
||||
case '{{':
|
||||
array_unshift($aTemplateStack, array('', array()));
|
||||
array_unshift($aState, 'template');
|
||||
break;
|
||||
case '}}':
|
||||
if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
|
||||
$aTemplate = array_shift($aTemplateStack);
|
||||
array_shift($aState);
|
||||
|
||||
$aTemplates[] = $aTemplate;
|
||||
}
|
||||
break;
|
||||
case '[[':
|
||||
$sLinkPage = '';
|
||||
$sLinkSyn = '';
|
||||
array_unshift($aState, 'link');
|
||||
break;
|
||||
case ']]':
|
||||
if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
|
||||
if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
|
||||
if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
|
||||
|
||||
$aLinks[] = array($sLinkPage, $sLinkSyn);
|
||||
|
||||
array_shift($aState);
|
||||
switch ($aState[0]) {
|
||||
case 'template':
|
||||
$aTemplateStack[0][0] .= trim($sPart);
|
||||
break;
|
||||
case 'templateparam':
|
||||
$aTemplateStack[0][1][0] .= $sLinkSyn;
|
||||
break;
|
||||
case 'link':
|
||||
$sLinkPage .= trim($sPart);
|
||||
break;
|
||||
case 'linksynonim':
|
||||
$sLinkSyn .= $sPart;
|
||||
break;
|
||||
case 'body':
|
||||
$sPageBody .= $sLinkSyn;
|
||||
break;
|
||||
default:
|
||||
var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
|
||||
fail('unknown state');
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '|':
|
||||
if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
|
||||
// Create a new template paramater
|
||||
$aState[0] = 'templateparam';
|
||||
array_unshift($aTemplateStack[0][1], '');
|
||||
}
|
||||
if ($aState[0] == 'link') $aState[0] = 'linksynonim';
|
||||
break;
|
||||
default:
|
||||
switch ($aState[0]) {
|
||||
case 'template':
|
||||
$aTemplateStack[0][0] .= trim($sPart);
|
||||
break;
|
||||
case 'templateparam':
|
||||
$aTemplateStack[0][1][0] .= $sPart;
|
||||
break;
|
||||
case 'link':
|
||||
$sLinkPage .= trim($sPart);
|
||||
break;
|
||||
case 'linksynonim':
|
||||
$sLinkSyn .= $sPart;
|
||||
break;
|
||||
case 'body':
|
||||
$sPageBody .= $sPart;
|
||||
break;
|
||||
default:
|
||||
var_dump($aState, $aPageText);
|
||||
fail('unknown state');
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return $aTemplates;
|
||||
}
|
||||
|
||||
function _templatesToProperties($aTemplates)
|
||||
{
|
||||
$aPageProperties = array();
|
||||
foreach ($aTemplates as $iTemplate => $aTemplate) {
|
||||
$aParams = array();
|
||||
foreach (array_reverse($aTemplate[1]) as $iParam => $sParam) {
|
||||
if (($iPos = strpos($sParam, '=')) === false) {
|
||||
$aParams[] = trim($sParam);
|
||||
} else {
|
||||
$aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
|
||||
}
|
||||
}
|
||||
$aTemplates[$iTemplate][1] = $aParams;
|
||||
if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) $aPageProperties['sOfficialName'] = $aParams['official_name'];
|
||||
if (!isset($aPageProperties['iPopulation']) && isset($aParams['population']) && $aParams['population'] && preg_match('#^[0-9.,]+#', $aParams['population'])) {
|
||||
$aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population']);
|
||||
}
|
||||
if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_total']) && $aParams['population_total'] && preg_match('#^[0-9.,]+#', $aParams['population_total'])) {
|
||||
$aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_total']);
|
||||
}
|
||||
if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_urban']) && $aParams['population_urban'] && preg_match('#^[0-9.,]+#', $aParams['population_urban'])) {
|
||||
$aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_urban']);
|
||||
}
|
||||
if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_estimate']) && $aParams['population_estimate'] && preg_match('#^[0-9.,]+#', $aParams['population_estimate'])) {
|
||||
$aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_estimate']);
|
||||
}
|
||||
if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
|
||||
if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
|
||||
$aPageProperties['sWebsite'] = $aMatch[1];
|
||||
if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
|
||||
$aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
|
||||
$aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
|
||||
}
|
||||
|
||||
if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
|
||||
$aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
|
||||
// $aPageProperties['aInfoboxParams'] = $aParams;
|
||||
}
|
||||
|
||||
// Assume the first template with lots of params is the type (fallback for infobox)
|
||||
if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
|
||||
$aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
|
||||
// $aPageProperties['aInfoboxParams'] = $aParams;
|
||||
}
|
||||
|
||||
// do we have a lat/lon
|
||||
if (!isset($aPageProperties['fLat'])) {
|
||||
if (isset($aParams['latd']) && isset($aParams['longd'])) {
|
||||
$aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']);
|
||||
$aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']);
|
||||
}
|
||||
if (isset($aParams['lat_degrees']) && isset($aParams['lat_degrees'])) {
|
||||
$aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']);
|
||||
$aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']);
|
||||
}
|
||||
if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
|
||||
if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
|
||||
$aPageProperties['fLat'] = (float)$aParams['latitude'];
|
||||
$aPageProperties['fLon'] = (float)$aParams['longitude'];
|
||||
}
|
||||
}
|
||||
if (strtolower($aTemplate[0]) == 'coord') {
|
||||
if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
|
||||
$aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
|
||||
$aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
|
||||
} elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
|
||||
$aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
|
||||
$aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
|
||||
} elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
|
||||
$aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
|
||||
$aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
|
||||
} elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
|
||||
$aPageProperties['fLat'] = (float)$aParams[0];
|
||||
$aPageProperties['fLon'] = (float)$aParams[1];
|
||||
}
|
||||
}
|
||||
if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
|
||||
$aParams['Latitude'] = str_replace(' ', ' ', $aParams['Latitude']);
|
||||
$aParams['Longitude'] = str_replace(' ', ' ', $aParams['Longitude']);
|
||||
if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
|
||||
$aPageProperties['fLat'] =
|
||||
(degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
|
||||
+degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
|
||||
} elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
|
||||
$aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
|
||||
}
|
||||
|
||||
if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
|
||||
$aPageProperties['fLon'] =
|
||||
(degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
|
||||
+degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
|
||||
} elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
|
||||
$aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (isset($aPageProperties['sPossibleInfoboxType'])) {
|
||||
if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
|
||||
unset($aPageProperties['sPossibleInfoboxType']);
|
||||
}
|
||||
return $aPageProperties;
|
||||
}
|
||||
|
||||
if (isset($aCMDResult['parse-wikipedia'])) {
|
||||
$oDB = new Nominatim\DB();
|
||||
$oDB->connect();
|
||||
|
||||
$sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
|
||||
$sSQL .= $aCMDResult['parse-wikipedia'];
|
||||
$sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
|
||||
$aArticleNames = $oDB->getCol($sSQL);
|
||||
/* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
|
||||
and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
|
||||
and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
|
||||
*/
|
||||
foreach ($aArticleNames as $sArticleName) {
|
||||
$sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
|
||||
$aP = _templatesToProperties(_parseWikipediaContent($sPageText));
|
||||
|
||||
if (isset($aP['sInfoboxType'])) {
|
||||
$aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
|
||||
$sSQL = 'update wikipedia_article set ';
|
||||
$sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
|
||||
$sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
|
||||
$oDB->query($sSQL);
|
||||
}
|
||||
if (isset($aP['iPopulation'])) {
|
||||
$sSQL = 'update wikipedia_article set ';
|
||||
$sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
|
||||
$sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
|
||||
$oDB->query($sSQL);
|
||||
}
|
||||
if (isset($aP['sWebsite'])) {
|
||||
$sSQL = 'update wikipedia_article set ';
|
||||
$sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
|
||||
$sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
|
||||
$oDB->query($sSQL);
|
||||
}
|
||||
if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
|
||||
if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
|
||||
echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
|
||||
$sSQL = 'update wikipedia_article set ';
|
||||
$sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
|
||||
$sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
|
||||
$sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
|
||||
$oDB->query($sSQL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function nominatimXMLStart($hParser, $sName, $aAttr)
|
||||
{
|
||||
global $aNominatRecords;
|
||||
switch ($sName) {
|
||||
case 'PLACE':
|
||||
$aNominatRecords[] = $aAttr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function nominatimXMLEnd($hParser, $sName)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
if (isset($aCMDResult['link'])) {
|
||||
$oDB = new Nominatim\DB();
|
||||
$oDB->connect();
|
||||
|
||||
$aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
|
||||
|
||||
// If you point this script at production OSM you will be blocked
|
||||
$sNominatimBaseURL = 'http://SEVERNAME/search.php';
|
||||
|
||||
foreach ($aWikiArticles as $aRecord) {
|
||||
$aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
|
||||
|
||||
$sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
|
||||
|
||||
echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";
|
||||
$fMaxDist = 0.0000001;
|
||||
$bUnknown = false;
|
||||
switch (strtolower($aRecord['infobox_type'])) {
|
||||
case 'former country':
|
||||
continue 2;
|
||||
case 'sea':
|
||||
$fMaxDist = 60; // effectively turn it off
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
|
||||
break;
|
||||
case 'country':
|
||||
case 'island':
|
||||
case 'islands':
|
||||
case 'continent':
|
||||
$fMaxDist = 60; // effectively turn it off
|
||||
$sURL .= '&featuretype=country';
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
|
||||
break;
|
||||
case 'prefecture japan':
|
||||
$aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
|
||||
// intentionally no break
|
||||
case 'state':
|
||||
case '#us state':
|
||||
case 'county':
|
||||
case 'u.s. state':
|
||||
case 'u.s. state symbols':
|
||||
case 'german state':
|
||||
case 'province or territory of canada':
|
||||
case 'indian jurisdiction':
|
||||
case 'province':
|
||||
case 'french region':
|
||||
case 'region of italy':
|
||||
case 'kommune':
|
||||
case '#australia state or territory':
|
||||
case 'russian federal subject':
|
||||
$fMaxDist = 4;
|
||||
$sURL .= '&featuretype=state';
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
|
||||
break;
|
||||
case 'protected area':
|
||||
$fMaxDist = 1;
|
||||
$sURL .= '&nearlat='.$aRecord['lat'];
|
||||
$sURL .= '&nearlon='.$aRecord['lon'];
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
|
||||
break;
|
||||
case 'settlement':
|
||||
$bUnknown = true;
|
||||
// intentionally no break
|
||||
case 'french commune':
|
||||
case 'italian comune':
|
||||
case 'uk place':
|
||||
case 'italian comune':
|
||||
case 'australian place':
|
||||
case 'german place':
|
||||
case '#geobox':
|
||||
case 'u.s. county':
|
||||
case 'municipality':
|
||||
case 'city japan':
|
||||
case 'russian inhabited locality':
|
||||
case 'finnish municipality/land area':
|
||||
case 'england county':
|
||||
case 'israel municipality':
|
||||
case 'russian city':
|
||||
case 'city':
|
||||
$fMaxDist = 0.2;
|
||||
$sURL .= '&featuretype=settlement';
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
|
||||
break;
|
||||
case 'mountain':
|
||||
case 'mountain pass':
|
||||
case 'river':
|
||||
case 'lake':
|
||||
case 'airport':
|
||||
$fMaxDist = 0.2;
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
|
||||
break;
|
||||
case 'ship begin':
|
||||
$fMaxDist = 0.1;
|
||||
$aTypes = array('wreck');
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
|
||||
$sURL .= '&nearlat='.$aRecord['lat'];
|
||||
$sURL .= '&nearlon='.$aRecord['lon'];
|
||||
break;
|
||||
case 'road':
|
||||
case 'university':
|
||||
case 'company':
|
||||
case 'department':
|
||||
$fMaxDist = 0.005;
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
|
||||
$sURL .= '&bounded=1';
|
||||
$sURL .= '&nearlat='.$aRecord['lat'];
|
||||
$sURL .= '&nearlon='.$aRecord['lon'];
|
||||
break;
|
||||
default:
|
||||
$bUnknown = true;
|
||||
$fMaxDist = 0.005;
|
||||
$sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
|
||||
// $sURL .= "&bounded=1";
|
||||
$sURL .= '&nearlat='.$aRecord['lat'];
|
||||
$sURL .= '&nearlon='.$aRecord['lon'];
|
||||
echo '-- Unknown: '.$aRecord['infobox_type']."\n";
|
||||
break;
|
||||
}
|
||||
$sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
|
||||
|
||||
var_Dump($sNameURL);
|
||||
$sXML = file_get_contents($sNameURL);
|
||||
|
||||
$aNominatRecords = array();
|
||||
$hXMLParser = xml_parser_create();
|
||||
xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
|
||||
xml_parse($hXMLParser, $sXML, true);
|
||||
xml_parser_free($hXMLParser);
|
||||
|
||||
if (!isset($aNominatRecords[0])) {
|
||||
$aNameParts = preg_split('#[(,]#', $aRecord['name']);
|
||||
if (count($aNameParts) > 1) {
|
||||
$sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
|
||||
var_Dump($sNameURL);
|
||||
$sXML = file_get_contents($sNameURL);
|
||||
|
||||
$aNominatRecords = array();
|
||||
$hXMLParser = xml_parser_create();
|
||||
xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
|
||||
xml_parse($hXMLParser, $sXML, true);
|
||||
xml_parser_free($hXMLParser);
|
||||
}
|
||||
}
|
||||
|
||||
// assume first is best/right
|
||||
for ($i = 0; $i < count($aNominatRecords); $i++) {
|
||||
$fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
|
||||
$fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
|
||||
$fDiff = sqrt($fDiff);
|
||||
if ($bUnknown) {
|
||||
// If it was an unknown type base it on the rank of the found result
|
||||
$iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
|
||||
if ($iRank <= 4) $fMaxDist = 2;
|
||||
elseif ($iRank <= 8) $fMaxDist = 1;
|
||||
elseif ($iRank <= 10) $fMaxDist = 0.8;
|
||||
elseif ($iRank <= 12) $fMaxDist = 0.6;
|
||||
elseif ($iRank <= 17) $fMaxDist = 0.2;
|
||||
elseif ($iRank <= 18) $fMaxDist = 0.1;
|
||||
elseif ($iRank <= 22) $fMaxDist = 0.02;
|
||||
elseif ($iRank <= 26) $fMaxDist = 0.001;
|
||||
else $fMaxDist = 0.001;
|
||||
}
|
||||
echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50);
|
||||
echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE'];
|
||||
echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE'];
|
||||
echo " (dist:$fDiff, max:$fMaxDist)\n";
|
||||
if ($fDiff > $fMaxDist) {
|
||||
echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
|
||||
} else {
|
||||
$sSQL = 'update wikipedia_article set osm_type=';
|
||||
switch ($aNominatRecords[$i]['OSM_TYPE']) {
|
||||
case 'relation':
|
||||
$sSQL .= "'R'";
|
||||
break;
|
||||
case 'way':
|
||||
$sSQL .= "'W'";
|
||||
break;
|
||||
case 'node':
|
||||
$sSQL .= "'N'";
|
||||
break;
|
||||
}
|
||||
$sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
|
||||
$oDB->query($sSQL);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
DROP TABLE entity;
|
||||
DROP TABLE entity_label;
|
||||
DROP TABLE entity_description;
|
||||
DROP TABLE entity_alias;
|
||||
DROP TABLE entity_link;
|
||||
DROP TABLE entity_property;
|
||||
|
||||
CREATE TABLE entity (
|
||||
entity_id bigint,
|
||||
title text,
|
||||
pid bigint,
|
||||
qid bigint,
|
||||
datatype text,
|
||||
CONSTRAINT pk_entity PRIMARY KEY(entity_id)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_label (
|
||||
entity_id bigint,
|
||||
language text,
|
||||
label text,
|
||||
CONSTRAINT pk_entity_label PRIMARY KEY(entity_id,language)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_description (
|
||||
entity_id bigint,
|
||||
language text,
|
||||
description text,
|
||||
CONSTRAINT pk_entity_description PRIMARY KEY(entity_id,language)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_alias (
|
||||
entity_id bigint,
|
||||
language text,
|
||||
alias text,
|
||||
CONSTRAINT pk_entity_alias PRIMARY KEY(entity_id,language,alias)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_link (
|
||||
entity_id bigint,
|
||||
target text,
|
||||
value text,
|
||||
CONSTRAINT pk_entity_link PRIMARY KEY(entity_id,target)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_link_hit (
|
||||
entity_id bigint,
|
||||
target text,
|
||||
value text,
|
||||
hits bigint,
|
||||
CONSTRAINT pk_entity_link_hit PRIMARY KEY(entity_id,target)
|
||||
);
|
||||
|
||||
CREATE TABLE link_hit (
|
||||
target text,
|
||||
value text,
|
||||
hits bigint,
|
||||
CONSTRAINT pk_link_hit PRIMARY KEY(target,value)
|
||||
);
|
||||
|
||||
CREATE TABLE entity_property (
|
||||
entity_id bigint,
|
||||
order_id bigint,
|
||||
pid bigint,
|
||||
string text,
|
||||
toqid bigint,
|
||||
location geometry,
|
||||
datetime timestamp with time zone,
|
||||
CONSTRAINT pk_entity_property PRIMARY KEY(entity_id, order_id)
|
||||
);
|
||||
|
||||
CREATE TABLE import_link_hit (
|
||||
target text,
|
||||
value text,
|
||||
hits bigint
|
||||
);
|
@ -1,28 +0,0 @@
|
||||
PSQL=/usr/lib/postgresql/9.2/bin/psql -d wikidata
|
||||
|
||||
cat create.sql | $PSQL
|
||||
|
||||
cat entity.csv | $PSQL -c "COPY entity from STDIN WITH CSV"
|
||||
cat entity_label.csv | $PSQL -c "COPY entity_label from STDIN WITH CSV"
|
||||
cat entity_description.csv | $PSQL -c "COPY entity_description from STDIN WITH CSV"
|
||||
cat entity_alias.csv | $PSQL -c "COPY entity_alias from STDIN WITH CSV"
|
||||
cat entity_link.csv | $PSQL -c "COPY entity_link from STDIN WITH CSV"
|
||||
cat entity_property.csv | $PSQL -c "COPY entity_property from STDIN WITH CSV"
|
||||
|
||||
$PSQL -c "create index idx_entity_link_target on entity_link using btree (target,value)"
|
||||
$PSQL -c "create index idx_entity_qid on entity using btree (qid)"
|
||||
$PSQL -c "create table property_label_en as select pid,null::text as label from entity where pid is not null"
|
||||
$PSQL -c "update property_label_en set label = x.label from (select pid,label,language from entity join entity_label using (entity_id) where pid is not null and language = 'en') as x where x.pid = property_label_en.pid"
|
||||
$PSQL -c "create unique index idx_property_label_en on property_label_en using btree (pid)"
|
||||
$PSQL -c "alter table entity add column label_en text"
|
||||
$PSQL -c "update entity set label_en = label from entity_label where entity.entity_id = entity_label.entity_id and language = 'en'"
|
||||
$PSQL -c "alter table entity add column description_en text"
|
||||
$PSQL -c "update entity set description_en = description from entity_description where entity.entity_id = entity_description.entity_id and language = 'en'"
|
||||
|
||||
cat totals.txt | $PSQL -c "COPY import_link_hit from STDIN WITH CSV DELIMITER ' '"
|
||||
$PSQL -c "truncate link_hit"
|
||||
$PSQL -c "insert into link_hit select target||'wiki', replace(catch_decode_url_part(value), '_', ' '), sum(hits) from import_link_hit where replace(catch_decode_url_part(value), '_', ' ') is not null group by target||'wiki', replace(dcatch_decode_url_part(value), '_', ' ')"
|
||||
$PSQL -c "truncate entity_link_hit"
|
||||
$PSQL -c "insert into entity_link_hit select entity_id, target, value, coalesce(hits,0) from entity_link left outer join link_hit using (target, value)"
|
||||
$PSQL -c "create table entity_hit as select entity_id,sum(hits) as hits from entity_link_hit group by entity_id"
|
||||
$PSQL -c "create unique index idx_entity_hit on entity_hit using btree (entity_id)"
|
@ -1,188 +0,0 @@
|
||||
#!/usr/bin/php -Cq
|
||||
<?php
|
||||
|
||||
$hFile = @fopen('wikidatawiki-20130623-pages-articles.xml', 'r');
|
||||
|
||||
$hFileEntity = fopen('entity.csv', 'w');
|
||||
$hFileEntityLabel = fopen('entity_label.csv', 'w');
|
||||
$hFileEntityDescription = fopen('entity_description.csv', 'w');
|
||||
$hFileEntityAlias = fopen('entity_alias.csv', 'w');
|
||||
$hFileEntityLink = fopen('entity_link.csv', 'w');
|
||||
$hFileEntityProperty = fopen('entity_property.csv', 'w');
|
||||
|
||||
$iCount = 0;
|
||||
|
||||
$sTitle = '';
|
||||
$iNS = false;
|
||||
$iID = false;
|
||||
|
||||
if ($hFile) {
|
||||
while (($sLine = fgets($hFile, 4000000)) !== false) {
|
||||
if (substr($sLine, 0, 11) == ' <title>') {
|
||||
$sTitle = substr($sLine, 11, -9);
|
||||
} elseif (substr($sLine, 0, 8) == ' <ns>') {
|
||||
$iNS = (int)substr($sLine, 8, -6);
|
||||
} elseif (substr($sLine, 0, 8) == ' <id>') {
|
||||
$iID = (int)substr($sLine, 8, -6);
|
||||
} elseif (substr($sLine, 0, 33) == ' <text xml:space="preserve">') {
|
||||
if ($iNS == -2) continue;
|
||||
if ($iNS == -1) continue;
|
||||
if ($iNS == 1) continue;
|
||||
if ($iNS == 2) continue;
|
||||
if ($iNS == 3) continue;
|
||||
if ($iNS == 4) continue;
|
||||
if ($iNS == 5) continue;
|
||||
if ($iNS == 6) continue;
|
||||
if ($iNS == 7) continue;
|
||||
if ($iNS == 8) continue;
|
||||
if ($iNS == 9) continue;
|
||||
if ($iNS == 10) continue;
|
||||
if ($iNS == 11) continue;
|
||||
if ($iNS == 12) continue;
|
||||
if ($iNS == 13) continue;
|
||||
if ($iNS == 14) continue;
|
||||
if ($iNS == 15) continue;
|
||||
if ($iNS == 121) continue;
|
||||
if ($iNS == 123) continue;
|
||||
if ($iNS == 829) continue;
|
||||
if ($iNS == 1198) continue;
|
||||
if ($iNS == 1199) continue;
|
||||
$sText = html_entity_decode(substr($sLine, 33, -8), ENT_COMPAT, 'UTF-8');
|
||||
$aArticle = json_decode($sText, true);
|
||||
|
||||
if (array_diff(array_keys($aArticle), array('label', 'description', 'aliases', 'links', 'entity', 'claims', 'datatype')) != array()) {
|
||||
// DEBUG
|
||||
var_dump($sTitle);
|
||||
var_dump(array_keys($aArticle));
|
||||
var_dump($aArticle);
|
||||
exit;
|
||||
}
|
||||
|
||||
$iPID = $iQID = null;
|
||||
if ($aArticle['entity'][0] == 'p') {
|
||||
$iPID = (int) substr($aArticle['entity'], 1);
|
||||
} elseif ($aArticle['entity'][0] == 'q') {
|
||||
$iQID = (int) substr($aArticle['entity'], 1);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
echo '.';
|
||||
|
||||
fputcsv($hFileEntity, array($iID, $sTitle, $iPID, $iQID, @$aArticle['datatype']));
|
||||
|
||||
foreach ($aArticle['label'] as $sLang => $sLabel) {
|
||||
fputcsv($hFileEntityLabel, array($iID, $sLang, $sLabel));
|
||||
// echo "insert into entity_label values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
|
||||
}
|
||||
|
||||
foreach ($aArticle['description'] as $sLang => $sLabel) {
|
||||
fputcsv($hFileEntityDescription, array($iID, $sLang, $sLabel));
|
||||
// echo "insert into entity_description values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
|
||||
}
|
||||
|
||||
foreach ($aArticle['aliases'] as $sLang => $aLabels) {
|
||||
$aUniqueAlias = array();
|
||||
foreach ($aLabels as $sLabel) {
|
||||
if (!isset($aUniqueAlias[$sLabel]) && $sLabel) {
|
||||
fputcsv($hFileEntityAlias, array($iID, $sLang, $sLabel));
|
||||
// echo "insert into entity_alias values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
|
||||
$aUniqueAlias[$sLabel] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($aArticle['links'] as $sLang => $sLabel) {
|
||||
fputcsv($hFileEntityLink, array($iID, $sLang, $sLabel));
|
||||
// echo "insert into entity_link values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
|
||||
}
|
||||
|
||||
|
||||
if (isset($aArticle['claims'])) {
|
||||
//
|
||||
foreach ($aArticle['claims'] as $iClaim => $aClaim) {
|
||||
//
|
||||
$bFail = false;
|
||||
if ($aClaim['m'][0] == 'novalue') continue;
|
||||
if ($aClaim['m'][0] == 'somevalue') continue;
|
||||
$iPID = (int)$aClaim['m'][1];
|
||||
if ($aClaim['m'][0] != 'value') $bFail = true;
|
||||
if ($aClaim['m'][2]== 'wikibase-entityid') {
|
||||
//
|
||||
if ($aClaim['m'][3]['entity-type'] != 'item') $bFail = true;
|
||||
fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, null, $aClaim['m'][3]['numeric-id'], null, null));
|
||||
// echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",null,".$aClaim['m'][3]['numeric-id'].",null);\n";
|
||||
} elseif ($aClaim['m'][2] == 'globecoordinate') {
|
||||
//
|
||||
if ($aClaim['m'][3]['globe'] != 'http://www.wikidata.org/entity/Q2') $bFail = true;
|
||||
fputcsv(
|
||||
$hFileEntityProperty,
|
||||
array(
|
||||
$iID,
|
||||
$iClaim,
|
||||
$iPID,
|
||||
null,
|
||||
null,
|
||||
'SRID=4326;POINT('.((float) $aClaim['m'][3]['longitude']).' '.((float)$aClaim['m'][3]['latitude']).')', null
|
||||
)
|
||||
);
|
||||
/* echo "insert into entity_property values (nextval('seq_entity_property'),";
|
||||
* echo $iID.",".$iPID.",null,null,ST_SetSRID(ST_MakePoint(".((float)$aClaim['m'][3]['longitude']);
|
||||
* echo ", ".((float)$aClaim['m'][3]['latitude'])."),4326));\n";
|
||||
*/
|
||||
} elseif ($aClaim['m'][2] == 'time') {
|
||||
// TODO!
|
||||
/*
|
||||
if ($aClaim['m'][3]['calendarmodel'] == 'http://www.wikidata.org/entity/Q1985727') {
|
||||
// Gregorian
|
||||
if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) {
|
||||
if ((int)$aMatch[2] < 4700 && ) {
|
||||
$sDateString = $aMatch[2].'-'.$aMatch[3].($aClaim['m'][3]['timezone']>=0?'+':'').$aClaim['m'][3]['timezone'].($aMatch[1]=='-'?' bc':'');
|
||||
fputcsv($hFileEntityProperty, array($iID,$iClaim,$iPID,null,null,null,$sDateString));
|
||||
}
|
||||
} else {
|
||||
// $bFail = true;
|
||||
}
|
||||
} elseif ( $aClaim['m'][3]['calendarmodel'] != 'http://www.wikidata.org/entity/Q1985786') {
|
||||
// Julian
|
||||
if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) {
|
||||
var_dump($aMatch);
|
||||
exit;
|
||||
$iDayCount = juliantojd(2, 11, 1732);
|
||||
var_dump($iDayCount, jdtogregorian($iDayCount));
|
||||
} else {
|
||||
$bFail = true;
|
||||
exit;
|
||||
}
|
||||
exit;
|
||||
} else {
|
||||
// $bFail = true;
|
||||
}
|
||||
*/
|
||||
} elseif ($aClaim['m'][2] == 'string') {
|
||||
// echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",'".pg_escape_string($aClaim['m'][3])."',null,null);\n";
|
||||
fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, $aClaim['m'][3], null, null, null));
|
||||
} else {
|
||||
$bFail = true;
|
||||
}
|
||||
|
||||
// Don't care about sources: if ($aClaim['refs'] != array()) $bFail = true;
|
||||
|
||||
if ($bFail) {
|
||||
var_dump($sTitle);
|
||||
var_dump($aClaim);
|
||||
} else {
|
||||
// process
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose($hFile);
|
||||
fclose($hFileEntity);
|
||||
fclose($hFileEntityLabel);
|
||||
fclose($hFileEntityDescription);
|
||||
fclose($hFileEntityAlias);
|
||||
fclose($hFileEntityLink);
|
||||
fclose($hFileEntityProperty);
|
||||
}
|
@ -1,79 +0,0 @@
|
||||
<?php
|
||||
|
||||
for ($iTimestamp = mktime(0, 0, 0, 5, 1, 2013); $iTimestamp < mktime(0, 0, 0, 6, 15, 2013); $iTimestamp += 24*60*60) {
|
||||
$sYear = date('Y', $iTimestamp);
|
||||
$sMonth = date('Y-m', $iTimestamp);
|
||||
$sDay = date('Ymd', $iTimestamp);
|
||||
|
||||
for ($iHour = 0; $iHour < 24; $iHour++) {
|
||||
$sFilename = sprintf('pagecounts-'.$sDay.'-%02d0000', $iHour);
|
||||
echo $sFilename."\n";
|
||||
if (!file_exists($sFilename.'.gz')) {
|
||||
exec('wget http://dumps.wikimedia.org/other/pagecounts-raw/'.$sYear.'/'.$sMonth.'/'.$sFilename.'.gz');
|
||||
}
|
||||
|
||||
exec('gzip -dc '.$sFilename.'.gz'.' | grep -e "^[a-z]\{2\} [^ :]\+ [0-9]\+" > hour.txt');
|
||||
|
||||
$hPrevTotals = @fopen('totals.txt', 'r');
|
||||
$hDayTotals = @fopen('hour.txt', 'r');
|
||||
$hNewTotals = @fopen('newtotals.txt', 'w');
|
||||
|
||||
$sPrevKey = $sDayKey = true;
|
||||
$sPrevLine = true;
|
||||
$sDayLine = true;
|
||||
|
||||
do {
|
||||
if ($sPrevKey === $sDayKey) {
|
||||
if ($sPrevLine !== true) fputs($hNewTotals, "$sPrevKey ".($iPrevValue+$iDayValue)."\n");
|
||||
$sPrevLine = true;
|
||||
$sDayLine = true;
|
||||
} elseif ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) {
|
||||
fputs($hNewTotals, "$sDayKey ".($iDayValue)."\n");
|
||||
$sDayLine = true;
|
||||
} elseif ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) {
|
||||
fputs($hNewTotals, "$sPrevKey ".($iPrevValue)."\n");
|
||||
$sPrevLine = true;
|
||||
}
|
||||
|
||||
if ($sPrevLine === true) {
|
||||
$sPrevLine = $hPrevTotals?fgets($hPrevTotals, 4096):false;
|
||||
if ($sPrevLine !== false) {
|
||||
$aPrevLine = explode(' ', $sPrevLine);
|
||||
$sPrevKey = $aPrevLine[0].' '.$aPrevLine[1];
|
||||
$iPrevValue = (int)$aPrevLine[2];
|
||||
} else {
|
||||
$sPrevKey = false;
|
||||
$iPrevValue = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if ($sDayLine === true) {
|
||||
$sDayLine = $hDayTotals?fgets($hDayTotals, 4096):false;
|
||||
if ($sDayLine !== false) {
|
||||
preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sDayLine, $aMatch);
|
||||
$sDayKey = $aMatch[1].' '.$aMatch[2];
|
||||
$iDayValue = (int)$aMatch[3];
|
||||
} else {
|
||||
$sDayKey = false;
|
||||
$iDayValue = 0;
|
||||
}
|
||||
}
|
||||
} while ($sPrevLine !== false || $sDayLine !== false);
|
||||
|
||||
@fclose($hPrevTotals);
|
||||
@fclose($hDayTotals);
|
||||
@fclose($hNewTotals);
|
||||
|
||||
@unlink('totals.txt');
|
||||
rename('newtotals.txt', 'totals.txt');
|
||||
}
|
||||
}
|
||||
|
||||
// Notes:
|
||||
/*
|
||||
gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" |
|
||||
sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1'
|
||||
and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$
|
||||
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = s,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
|
||||
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
|
||||
*/
|
Loading…
Reference in New Issue
Block a user