#!/usr/bin/php -Cq
<?php

// Command line tool that creates the wikipedia helper tables, extracts
// properties from wikipedia article text and links articles to OSM objects.

require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
require_once(CONST_BasePath.'/lib/init-cmd.php');
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt(). The first entry is the usage text; each
// following row is: long name, short flag, four numeric limits, value type,
// help text. (The exact meaning of the numeric fields is defined by
// getCmdOpt() in the included library, not in this file.)
//
// NOTE(review): the article parsing section further down tests
// $aCMDResult['parse-wikipedia'], which is not declared here ('parse-articles'
// is) - confirm which option name is intended.
$aCMDOptions
= array(
   "Create and setup nominatim search system",
   array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
   array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
   array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
   array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
   array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
   array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
  );
// Fills $aCMDResult with the parsed command line.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);

/*
$sTestPageText = <<< EOD
{{ Coord | 47 | N | 2 | E | type : country_region : FR | display = title }}
{{ Infobox Amusement park
| name = Six Flags Great Adventure
| image = [[ File : SixFlagsGreatAdventure logo . png ]]
| caption = Six Flags Great Adventure logo
| location = [[ Jackson , New Jersey | Jackson ]]
| location2 = New Jersey
| location3 = United States
| address = 1 Six Flags Boulevard < ref name = " drivedir " />
| season = March / April through October / November
| opening_date = July 1 , 1974
| previous_names = Great Adventure
| area_acre = 2200
| rides = 45 park admission rides
| coasters = 12
| water_rides = 2
| owner = [[ Six Flags ]]
| general_manager =
| homepage = [ http :// www . sixflags . com / parks / greatadventure / Six Flags Great Adventure ]
}}
EOD ;
var_dump ( _templatesToProperties ( _parseWikipediaContent ( $sTestPageText )));
exit ;
//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
*/
/*
2016-09-04 04:19:48 +03:00
$a = array ();
$a [] = 'test' ;
2013-03-19 02:57:10 +04:00
2016-09-04 04:19:48 +03:00
$oDB &= getDB ();
2013-03-19 02:57:10 +04:00
2016-09-04 04:19:48 +03:00
if ( $aCMDResult [ 'drop-tables' ])
{
$oDB -> query ( 'DROP TABLE wikipedia_article' );
$oDB -> query ( 'DROP TABLE wikipedia_link' );
}
2013-03-19 02:57:10 +04:00
*/
// --create-tables: create the two helper tables used by the rest of the script.
if ($aCMDResult['create-tables']) {
    // The database handle was never obtained on this path (the only earlier
    // assignment lives inside a comment), so fetch it the same way the other
    // sections of this script do.
    $oDB =& getDB();

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
    );
EOD;
    $oDB->query($sSQL);

    // Adds a geometry column 'location' (SRID 4326) to the article table.
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
    from_id INTEGER,
    to_name text
    );
EOD;
    $oDB->query($sSQL);
}

/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * All numeric arguments are cast to float, so numeric strings and null
 * (treated as 0) are accepted.
 *
 * @param mixed $iDegrees Whole degrees.
 * @param mixed $iMinutes Arc minutes (1/60 degree).
 * @param mixed $fSeconds Arc seconds (1/3600 degree).
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive. 'S' and 'W'
 *                        negate the result; anything else (including null)
 *                        leaves it positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers pass null when the template has no hemisphere parameter;
    // cast first so strtoupper() never receives null.
    $sNSEW = strtoupper((string) $sNSEW);
    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;

    return $iSign * ((float) $iDegrees + (float) $iMinutes / 60 + (float) $fSeconds / 3600);
}

/**
 * Extract the templates ("{{name|param|...}}") from raw wikitext.
 *
 * The text is split on the markers "{{", "}}", "[[", "]]" and "|" and walked
 * with a small state stack (body / template / templateparam / link /
 * linksynonim). Links that occur inside a template parameter contribute their
 * display text to that parameter.
 *
 * @param string $sPageText Raw wikitext of one page.
 *
 * @return array List of templates in the order their closing "}}" was seen
 *               (so nested templates come before the one containing them).
 *               Each entry is array(0 => template name, 1 => list of raw
 *               parameter strings with the LAST parameter first). Templates
 *               that are never closed are not returned.
 */
function _parseWikipediaContent($sPageText)
{
    // Flatten the text to one line so the non-greedy patterns below can
    // remove comments and formulas that span several lines.
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();
    $aTemplateStack = array();   // innermost open template first
    $aState = array('body');     // innermost state first; 'body' is never popped
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // Display text defaults to the target; image links use the file name.
                    if (!$sLinkSyn) {
                        $sLinkSyn = $sLinkPage;
                    }
                    if (substr($sLinkPage, 0, 6) == 'Image:') {
                        $sLinkSyn = substr($sLinkPage, 6);
                    }
                    array_shift($aState);
                    // Hand the finished link to whatever construct encloses it.
                    switch ($aState[0]) {
                        case 'template':
                            // NOTE(review): this appends the "]]" marker itself (not the
                            // link text) to the template name; same for the two link
                            // states below. Kept as found - confirm whether $sLinkSyn
                            // was intended.
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not collected.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Start a new template parameter (kept at the front of the list)
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') {
                    $aState[0] = 'linksynonim';
                }
                break;
            default:
                // Plain text: append to whatever is currently being collected.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not collected.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }
    return $aTemplates;
}
/**
 * Return one template parameter, or $mDefault when the template lacks it.
 */
function _templateParam($aParams, $mKey, $mDefault)
{
    return isset($aParams[$mKey]) ? $aParams[$mKey] : $mDefault;
}

/**
 * Derive page properties from the templates found by _parseWikipediaContent().
 *
 * Templates are examined in order and the first one that supplies a property
 * wins. Recognised properties (all optional in the result):
 *   sOfficialName, iPopulation, sWebsite, sTopLevelDomain, sInfoboxType,
 *   fLat, fLon.
 * When no "Infobox ..." template is present, the name of the first template
 * with more than ten parameters is used as sInfoboxType, prefixed with '#'.
 *
 * @param array $aTemplates List of array(name, raw parameters last-first).
 *
 * @return array Property name => value.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Restore the written order and split "key = value" parameters;
        // parameters without '=' stay positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos + 1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: first usable parameter in order of preference. Thousands
        // separators (',' or '.') are stripped before the integer cast.
        if (!isset($aPageProperties['iPopulation'])) {
            foreach (array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey) {
                if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                    $aPageProperties['iPopulation'] = (int) str_replace(array(',', '.'), '', $aParams[$sKey]);
                    break;
                }
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accepts "url", "[url]" or "[url label]" and keeps the url only.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false) {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            // Everything after "Infobox" plus one separator character.
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: only looked for until a template has provided a latitude.
        // Within one template a later notation overrides an earlier one.
        if (!isset($aPageProperties['fLat'])) {
            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['latd'],
                    _templateParam($aParams, 'latm', 0),
                    _templateParam($aParams, 'lats', 0),
                    _templateParam($aParams, 'latNS', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['longd'],
                    _templateParam($aParams, 'longm', 0),
                    _templateParam($aParams, 'longs', 0),
                    _templateParam($aParams, 'longEW', 'E')
                );
            }

            // Both halves must be present before either is used.
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['lat_degrees'],
                    _templateParam($aParams, 'lat_minutes', 0),
                    _templateParam($aParams, 'lat_seconds', 0),
                    _templateParam($aParams, 'lat_direction', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['long_degrees'],
                    _templateParam($aParams, 'long_minutes', 0),
                    _templateParam($aParams, 'long_seconds', 0),
                    _templateParam($aParams, 'long_direction', 'E')
                );
            }

            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float) $aParams['latitude'];
                    $aPageProperties['fLon'] = (float) $aParams['longitude'];
                }
            }

            if (strtolower($aTemplate[0]) == 'coord') {
                // The notation is chosen by where the N/S letter sits. A template
                // that matches a notation but lacks some of its fields is ignored
                // instead of being read with missing indices.
                if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                    // deg|min|sec|NS|deg|min|sec|EW
                    if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                    // deg|min|NS|deg|min|EW
                    if (isset($aParams[3], $aParams[4], $aParams[5])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                    // deg|NS|deg|EW
                    if (isset($aParams[2], $aParams[3])) {
                        $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N' ? 1 : -1) * (float) $aParams[0];
                        $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E' ? 1 : -1) * (float) $aParams[2];
                    }
                } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                    // lat|lon as signed decimals
                    $aPageProperties['fLat'] = (float) $aParams[0];
                    $aPageProperties['fLon'] = (float) $aParams[1];
                }
            }

            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                // Normalise non-breaking spaces (HTML entity or UTF-8 character)
                // so the patterns below only have to deal with plain spaces.
                $aNbsp = array('&nbsp;', "\xC2\xA0");
                $aParams['Latitude'] = str_replace($aNbsp, ' ', $aParams['Latitude']);
                $aParams['Longitude'] = str_replace($aNbsp, ' ', $aParams['Longitude']);

                // "A to B" ranges are reduced to their midpoint.
                if (preg_match('#^([0-9]+)°( ([0-9]+)′ )? ([NS]) to ([0-9]+)°( ([0-9]+)′ )? ([NS])#', $aParams['Latitude'], $aMatch)) {
                    $aPageProperties['fLat'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′ )? ([NS])#', $aParams['Latitude'], $aMatch)) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                if (preg_match('#^([0-9]+)°( ([0-9]+)′ )? ([EW]) to ([0-9]+)°( ([0-9]+)′ )? ([EW])#', $aParams['Longitude'], $aMatch)) {
                    $aPageProperties['fLon'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′ )? ([EW])#', $aParams['Longitude'], $aMatch)) {
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
            }
        }
    }

    // Promote the fallback type only when no real infobox was found.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
// Parse the stored article text of one tenth of the pages (page_id % 10 equals
// the option value) and copy the extracted properties into wikipedia_article.
//
// NOTE(review): the option table at the top of this file declares
// 'parse-articles' (a boolean), not 'parse-wikipedia' - confirm which name
// is intended; as written this section only runs if getCmdOpt() provides a
// 'parse-wikipedia' entry.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
    // The value comes from the command line: force it to an integer before
    // it is placed into the statement.
    $sSQL .= (int) $aCMDResult['parse-wikipedia'];
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    // For debugging a single article, append e.g.: and page_title in ('Virginia')
    $aArticleNames = $oDB->getCol($sSQL);

    foreach ($aArticleNames as $sArticleName) {
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
        if (!is_string($sPageText)) {
            // No text could be read for this title; nothing to parse.
            continue;
        }
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Each property is written with its own UPDATE so that one failing
        // statement does not prevent the others.
        $aSetClauses = array();
        if (isset($aP['sInfoboxType'])) {
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $aSetClauses[] = 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
        }
        if (isset($aP['iPopulation'])) {
            $aSetClauses[] = 'population = \''.pg_escape_string((string) $aP['iPopulation']).'\'';
        }
        if (isset($aP['sWebsite'])) {
            $aSetClauses[] = 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
        }
        // A position is stored only when both coordinates were found and they
        // are not both zero.
        if (isset($aP['fLat'], $aP['fLon']) && ($aP['fLat'] != '-0' || $aP['fLon'] != '-0')) {
            if (!isset($aP['sInfoboxType'])) {
                $aP['sInfoboxType'] = '';
            }
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon']."\n";
            $aSetClauses[] = 'lat = \''.pg_escape_string((string) $aP['fLat']).'\','
                .'lon = \''.pg_escape_string((string) $aP['fLon']).'\'';
        }

        $sWhere = ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
        foreach ($aSetClauses as $sSetClause) {
            $oDB->query('update wikipedia_article set '.$sSetClause.$sWhere);
        }
    }
}

/**
 * XML start-element handler for Nominatim search responses.
 *
 * Appends the attributes of every PLACE element to the global list
 * $aNominatRecords; all other elements are ignored.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Element attributes.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}

/**
 * XML end-element handler for Nominatim search responses.
 *
 * Intentionally empty: it exists only because xml_set_element_handler()
 * is given a start and an end handler together.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
}

/**
 * Build the "&viewbox=" query fragment for a box reaching $fHalfSize degrees
 * in every direction from the article position.
 *
 * @param array $aRecord   Article row; 'lat' and 'lon' are read.
 * @param float $fHalfSize Half the box edge length in degrees.
 *
 * @return string
 */
function _nominatimViewbox($aRecord, $fHalfSize)
{
    return '&viewbox='.($aRecord['lon'] - $fHalfSize).','.($aRecord['lat'] + $fHalfSize)
        .','.($aRecord['lon'] + $fHalfSize).','.($aRecord['lat'] - $fHalfSize);
}

/**
 * Request $sURL and return the attributes of all PLACE elements in the reply.
 *
 * The reply is parsed with nominatimXMLStart()/nominatimXMLEnd(), which
 * collect into the global $aNominatRecords; that global is reset here first.
 *
 * @param string $sURL Complete search URL.
 *
 * @return array List of attribute arrays; empty when the request fails.
 */
function _nominatimSearch($sURL)
{
    global $aNominatRecords;

    var_dump($sURL);
    $aNominatRecords = array();
    $sXML = file_get_contents($sURL);
    if ($sXML === false) {
        echo "-- Request failed\n";
        return $aNominatRecords;
    }
    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: look every positioned, not yet linked English article up in a
// Nominatim instance and store the OSM object of the first result that lies
// close enough to the article position.
// The flag is tested for truthiness (like 'create-tables' above) so that a
// present-but-false entry does not start the run.
if (!empty($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";

        // $fMaxDist: largest accepted distance (in degrees) between article
        // and search result. $bUnknown: derive that limit from the rank of
        // each result instead.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower((string) $aRecord['infobox_type'])) {
            case 'former country':
                // Not expected to exist in OSM: skip the article.
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= "&featuretype=country";
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= "&featuretype=state";
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= "&featuretype=settlement";
                $sURL .= _nominatimViewbox($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= _nominatimViewbox($aRecord, 0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                $sURL .= "&bounded=1";
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                // $sURL .= "&bounded=1";
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                echo "-- Unknown: ".$aRecord['infobox_type']."\n";
                break;
        }

        // Search for the full title first; if that finds nothing, retry with
        // the part before the first '(' or ','.
        $aFoundPlaces = _nominatimSearch($sURL.'&q='.urlencode($aRecord['name']));
        if (!isset($aFoundPlaces[0])) {
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (count($aNameParts) > 1) {
                $aFoundPlaces = _nominatimSearch($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right: stop at the first result that is accepted
        $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');
        foreach ($aFoundPlaces as $aPlace) {
            if (!isset($aPlace['LAT'], $aPlace['LON'])) {
                // Without a position the distance test below is meaningless.
                continue;
            }
            $fLatDiff = $aRecord['lat'] - $aPlace['LAT'];
            $fLonDiff = $aRecord['lon'] - $aPlace['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int) $aPlace['PLACE_RANK'];
                if ($iRank <= 4) {
                    $fMaxDist = 2;
                } elseif ($iRank <= 8) {
                    $fMaxDist = 1;
                } elseif ($iRank <= 10) {
                    $fMaxDist = 0.8;
                } elseif ($iRank <= 12) {
                    $fMaxDist = 0.6;
                } elseif ($iRank <= 17) {
                    $fMaxDist = 0.2;
                } elseif ($iRank <= 18) {
                    $fMaxDist = 0.1;
                } elseif ($iRank <= 22) {
                    $fMaxDist = 0.02;
                } else {
                    $fMaxDist = 0.001;
                }
            }

            echo "-- FOUND \"".substr($aPlace['DISPLAY_NAME'], 0, 50);
            echo "\", ".$aPlace['CLASS'].", ".$aPlace['TYPE'];
            echo ", ".$aPlace['PLACE_RANK'].", ".$aPlace['OSM_TYPE'];
            echo " (dist: $fDiff, max: $fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist) ".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']."\n";
                continue;
            }

            // The reference comes from a remote response: only a known object
            // type and a purely numeric id may reach the statement.
            if (!isset($aPlace['OSM_TYPE'], $aPlace['OSM_ID'])
                || !isset($aOsmTypeCodes[$aPlace['OSM_TYPE']])
                || !preg_match('#^-?[0-9]+$#', $aPlace['OSM_ID'])
            ) {
                echo "-- Skipping result without usable OSM reference\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$aOsmTypeCodes[$aPlace['OSM_TYPE']]."'";
            $sSQL .= ", osm_id=".$aPlace['OSM_ID'];
            $sSQL .= " where language = '".pg_escape_string($aRecord['language'])."'";
            $sSQL .= " and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}