2013-09-06 13:52:55 +04:00
<?php
/**
 * Accumulates hourly Wikipedia pagecount dumps into a running totals file.
 *
 * For every day from 2013-05-01 up to (but not including) 2013-06-15 and
 * every hour of that day, the script:
 *   1. downloads pagecounts-<Ymd>-<HH>0000.gz with wget unless the file is
 *      already present in the working directory,
 *   2. filters the decompressed dump through grep into hour.txt,
 *   3. merges hour.txt with totals.txt into newtotals.txt, summing the
 *      counts of keys present in both, and
 *   4. replaces totals.txt with newtotals.txt.
 *
 * Records in totals.txt have the form "<lang> <title> <count>". The merge is
 * a single forward pass over both inputs, so it relies on both files being
 * ordered consistently with PHP's string comparison of "<lang> <title>".
 */

for ($iTimestamp = mktime(0, 0, 0, 5, 1, 2013); $iTimestamp < mktime(0, 0, 0, 6, 15, 2013); $iTimestamp += 24 * 60 * 60) {
    $sYear = date("Y", $iTimestamp);
    $sMonth = date("Y-m", $iTimestamp);
    $sDay = date("Ymd", $iTimestamp);

    for ($iHour = 0; $iHour < 24; $iHour++) {
        $sFilename = sprintf("pagecounts-" . $sDay . "-%02d0000", $iHour);
        echo $sFilename . "\n";

        // Only fetch dumps that are not already cached locally.
        if (!file_exists($sFilename . '.gz')) {
            exec('wget http://dumps.wikimedia.org/other/pagecounts-raw/' . $sYear . '/' . $sMonth . '/' . $sFilename . '.gz');
        }

        // Keep only lines that start with a two-letter language code, a title
        // without spaces or colons, and a numeric count.
        exec('gzip -dc ' . $sFilename . '.gz' . ' | grep -e "^[a-z]\{2\} [^ :]\+ [0-9]\+" > hour.txt');

        // totals.txt does not exist on the very first pass, hence the @.
        $hPrevTotals = @fopen("totals.txt", "r");
        $hDayTotals = @fopen("hour.txt", "r");
        $hNewTotals = @fopen("newtotals.txt", "w");

        // Without an output file the merge cannot be stored; stop before
        // totals.txt is touched so the accumulated counts are not lost.
        if ($hNewTotals === false) {
            if ($hPrevTotals) {
                fclose($hPrevTotals);
            }
            if ($hDayTotals) {
                fclose($hDayTotals);
            }
            fwrite(STDERR, "Cannot open newtotals.txt for writing\n");
            exit(1);
        }

        // A line variable set to true means "consumed, read the next record";
        // false means "input exhausted". Keys follow the same convention.
        $sPrevKey = $sDayKey = true;
        $sPrevLine = true;
        $sDayLine = true;

        do {
            if ($sPrevKey === $sDayKey) {
                // Same key on both sides: write the summed count. On the very
                // first iteration nothing has been read yet, so nothing is written.
                if ($sPrevLine !== true) {
                    fputs($hNewTotals, $sPrevKey . ' ' . ($iPrevValue + $iDayValue) . "\n");
                }
                $sPrevLine = true;
                $sDayLine = true;
            } elseif ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) {
                // Key only present in the hourly file (so far): copy it over.
                fputs($hNewTotals, $sDayKey . ' ' . ($iDayValue) . "\n");
                $sDayLine = true;
            } elseif ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) {
                // Key only present in the previous totals: carry it forward.
                fputs($hNewTotals, $sPrevKey . ' ' . ($iPrevValue) . "\n");
                $sPrevLine = true;
            }

            if ($sPrevLine === true) {
                $sPrevLine = $hPrevTotals ? fgets($hPrevTotals, 4096) : false;
                if ($sPrevLine !== false) {
                    $aPrevLine = explode(' ', $sPrevLine);
                    $sPrevKey = $aPrevLine[0] . ' ' . $aPrevLine[1];
                    $iPrevValue = (int) $aPrevLine[2];
                } else {
                    $sPrevKey = false;
                    $iPrevValue = 0;
                }
            }

            if ($sDayLine === true) {
                $sDayKey = false;
                $iDayValue = 0;
                // The grep filter is not anchored at the end of the line and
                // fgets() may split over-long lines, so a line can reach this
                // point without having the exact four-field shape. Skip such
                // lines instead of building a key from missing captures.
                do {
                    $sDayLine = $hDayTotals ? fgets($hDayTotals, 4096) : false;
                    $bParsed = $sDayLine !== false
                        && preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sDayLine, $aMatch) === 1;
                } while ($sDayLine !== false && !$bParsed);

                if ($bParsed) {
                    $sDayKey = $aMatch[1] . ' ' . $aMatch[2];
                    $iDayValue = (int) $aMatch[3];
                }
            }
        } while ($sPrevLine !== false || $sDayLine !== false);

        // Either input handle may be false (file missing); fclose() on a
        // non-resource throws a TypeError on PHP 8 that @ cannot silence.
        if ($hPrevTotals) {
            fclose($hPrevTotals);
        }
        if ($hDayTotals) {
            fclose($hDayTotals);
        }
        fclose($hNewTotals);

        // rename() overwrites an existing target, so totals.txt is only
        // replaced once the new file is complete.
        if (!rename("newtotals.txt", "totals.txt")) {
            fwrite(STDERR, "Cannot replace totals.txt with newtotals.txt\n");
            exit(1);
        }
    }
}
2013-09-06 13:52:55 +04:00
// Notes:
/*
2016-10-14 01:01:16 +03:00
gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" |
sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1'
and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$
2013-09-06 13:52:55 +04:00
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
*/