mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-23 05:35:13 +03:00
refactor import_wiki* script for readability
This commit is contained in:
parent
7db0da40ad
commit
9587fc9909
@ -1,37 +1,66 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
psqlcmd() {
|
psqlcmd() {
|
||||||
psql wikiprocessingdb
|
psql --quiet wikiprocessingdb
|
||||||
}
|
}
|
||||||
|
|
||||||
mysql2pgsqlcmd() {
|
mysql2pgsqlcmd() {
|
||||||
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download() {
|
||||||
|
echo "Downloading $1"
|
||||||
|
wget --quiet --no-clobber --tries 3
|
||||||
|
}
|
||||||
|
|
||||||
# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
||||||
|
# requires Bash 4.0
|
||||||
language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
|
readarray -t LANGUAGES < languages.txt
|
||||||
|
|
||||||
|
|
||||||
# get a few wikidata dump tables
|
|
||||||
|
|
||||||
wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
|
echo "====================================================================="
|
||||||
wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
|
echo "Download wikidata dump tables"
|
||||||
wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
|
echo "====================================================================="
|
||||||
|
|
||||||
|
# 114M wikidatawiki-latest-geo_tags.sql.gz
|
||||||
|
# 1.7G wikidatawiki-latest-page.sql.gz
|
||||||
|
# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
|
||||||
|
|
||||||
|
|
||||||
# import wikidata tables
|
|
||||||
|
|
||||||
gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
|
|
||||||
gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
|
echo "====================================================================="
|
||||||
|
echo "Import wikidata dump tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "Importing wikidatawiki-latest-geo_tags"
|
||||||
|
gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
|
||||||
|
|
||||||
|
echo "Importing wikidatawiki-latest-page"
|
||||||
|
gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
|
||||||
|
|
||||||
|
echo "Importing wikidatawiki-latest-wb_items_per_site"
|
||||||
gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
|
gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
# get wikidata places from wikidata query API
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Get wikidata places from wikidata query API"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "Number of place types:"
|
||||||
|
wc -l wikidata_place_types.txt
|
||||||
|
|
||||||
while read F ; do
|
while read F ; do
|
||||||
wget "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
|
echo "Querying for place type $F..."
|
||||||
|
wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
|
||||||
jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
|
jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
|
||||||
awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
|
awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
|
||||||
cat $F.csv >> wikidata_place_dump.csv
|
cat $F.csv >> wikidata_place_dump.csv
|
||||||
@ -39,57 +68,207 @@ while read F ; do
|
|||||||
done < wikidata_place_types.txt
|
done < wikidata_place_types.txt
|
||||||
|
|
||||||
|
|
||||||
# import wikidata places
|
|
||||||
|
|
||||||
echo "CREATE TABLE wikidata_place_dump (item text, instance_of text);" | psqlcmd
|
|
||||||
echo "COPY wikidata_place_dump (item, instance_of) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' DELIMITER ',' CSV;" | psqlcmd
|
|
||||||
|
|
||||||
echo "CREATE TABLE wikidata_place_type_levels (place_type text, level integer);" | psqlcmd
|
|
||||||
echo "COPY wikidata_place_type_levels (place_type, level) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' DELIMITER ',' CSV HEADER;" | psqlcmd
|
|
||||||
|
|
||||||
|
|
||||||
# create derived tables
|
echo "====================================================================="
|
||||||
|
echo "Import wikidata places"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
echo "CREATE TABLE geo_earth_primary AS SELECT gt_page_id, gt_lat, gt_lon FROM geo_tags WHERE gt_globe = 'earth' AND gt_primary = 1 AND NOT( gt_lat < -90 OR gt_lat > 90 OR gt_lon < -180 OR gt_lon > 180 OR gt_lat=0 OR gt_lon=0) ;" | psqlcmd
|
echo "CREATE TABLE wikidata_place_dump (
|
||||||
echo "CREATE TABLE geo_earth_wikidata AS SELECT DISTINCT geo_earth_primary.gt_page_id, geo_earth_primary.gt_lat, geo_earth_primary.gt_lon, page.page_title, page.page_namespace FROM geo_earth_primary LEFT OUTER JOIN page ON (geo_earth_primary.gt_page_id = page.page_id) ORDER BY geo_earth_primary.gt_page_id;" | psqlcmd
|
item text,
|
||||||
|
instance_of text
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
echo "ALTER TABLE wikidata_place_dump ADD COLUMN ont_level integer, ADD COLUMN lat numeric(11,8), ADD COLUMN lon numeric(11,8);" | psqlcmd
|
echo "COPY wikidata_place_dump (item, instance_of)
|
||||||
echo "UPDATE wikidata_place_dump SET ont_level = wikidata_place_type_levels.level FROM wikidata_place_type_levels WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type;" | psqlcmd
|
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv'
|
||||||
|
DELIMITER ','
|
||||||
|
CSV
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
echo "CREATE TABLE wikidata_places AS SELECT DISTINCT ON (item) item, instance_of, MAX(ont_level) AS ont_level, lat, lon FROM wikidata_place_dump GROUP BY item, instance_of, ont_level, lat, lon ORDER BY item;" | psqlcmd
|
echo "CREATE TABLE wikidata_place_type_levels (
|
||||||
echo "UPDATE wikidata_places SET lat = geo_earth_wikidata.gt_lat, lon = geo_earth_wikidata.gt_lon FROM geo_earth_wikidata WHERE wikidata_places.item = geo_earth_wikidata.page_title" | psqlcmd
|
place_type text,
|
||||||
|
level integer
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
|
echo "COPY wikidata_place_type_levels (place_type, level)
|
||||||
|
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv'
|
||||||
|
DELIMITER ','
|
||||||
|
CSV
|
||||||
|
HEADER
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
# process language pages
|
|
||||||
|
|
||||||
echo "CREATE TABLE wikidata_pages (item text, instance_of text, lat numeric(11,8), lon numeric(11,8), ips_site_page text, language text );" | psqlcmd
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
echo "====================================================================="
|
||||||
|
echo "Create derived tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "CREATE TABLE geo_earth_primary AS
|
||||||
|
SELECT gt_page_id,
|
||||||
|
gt_lat,
|
||||||
|
gt_lon
|
||||||
|
FROM geo_tags
|
||||||
|
WHERE gt_globe = 'earth'
|
||||||
|
AND gt_primary = 1
|
||||||
|
AND NOT( gt_lat < -90
|
||||||
|
OR gt_lat > 90
|
||||||
|
OR gt_lon < -180
|
||||||
|
OR gt_lon > 180
|
||||||
|
OR gt_lat=0
|
||||||
|
OR gt_lon=0)
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "CREATE TABLE geo_earth_wikidata AS
|
||||||
|
SELECT DISTINCT geo_earth_primary.gt_page_id,
|
||||||
|
geo_earth_primary.gt_lat,
|
||||||
|
geo_earth_primary.gt_lon,
|
||||||
|
page.page_title,
|
||||||
|
page.page_namespace
|
||||||
|
FROM geo_earth_primary
|
||||||
|
LEFT OUTER JOIN page
|
||||||
|
ON (geo_earth_primary.gt_page_id = page.page_id)
|
||||||
|
ORDER BY geo_earth_primary.gt_page_id
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "ALTER TABLE wikidata_place_dump
|
||||||
|
ADD COLUMN ont_level integer,
|
||||||
|
ADD COLUMN lat numeric(11,8),
|
||||||
|
ADD COLUMN lon numeric(11,8)
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "UPDATE wikidata_place_dump
|
||||||
|
SET ont_level = wikidata_place_type_levels.level
|
||||||
|
FROM wikidata_place_type_levels
|
||||||
|
WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "CREATE TABLE wikidata_places
|
||||||
|
AS
|
||||||
|
SELECT DISTINCT ON (item) item,
|
||||||
|
instance_of,
|
||||||
|
MAX(ont_level) AS ont_level,
|
||||||
|
lat,
|
||||||
|
lon
|
||||||
|
FROM wikidata_place_dump
|
||||||
|
GROUP BY item,
|
||||||
|
instance_of,
|
||||||
|
ont_level,
|
||||||
|
lat,
|
||||||
|
lon
|
||||||
|
ORDER BY item
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "UPDATE wikidata_places
|
||||||
|
SET lat = geo_earth_wikidata.gt_lat,
|
||||||
|
lon = geo_earth_wikidata.gt_lon
|
||||||
|
FROM geo_earth_wikidata
|
||||||
|
WHERE wikidata_places.item = geo_earth_wikidata.page_title
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Process language pages"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
|
||||||
|
echo "CREATE TABLE wikidata_pages (
|
||||||
|
item text,
|
||||||
|
instance_of text,
|
||||||
|
lat numeric(11,8),
|
||||||
|
lon numeric(11,8),
|
||||||
|
ips_site_page text,
|
||||||
|
language text
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
echo "CREATE TABLE wikidata_${i}_pages as select wikidata_places.item, wikidata_places.instance_of, wikidata_places.lat, wikidata_places.lon, wb_items_per_site.ips_site_page FROM wikidata_places LEFT JOIN wb_items_per_site ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) WHERE ips_site_id = '${i}wiki' AND LEFT(wikidata_places.item,1) = 'Q' order by wikidata_places.item;" | psqlcmd
|
echo "CREATE TABLE wikidata_${i}_pages AS
|
||||||
echo "ALTER TABLE wikidata_${i}_pages ADD COLUMN language text;" | psqlcmd
|
SELECT wikidata_places.item,
|
||||||
echo "UPDATE wikidata_${i}_pages SET language = '${i}';" | psqlcmd
|
wikidata_places.instance_of,
|
||||||
echo "INSERT INTO wikidata_pages SELECT item, instance_of, lat, lon, ips_site_page, language FROM wikidata_${i}_pages;" | psqlcmd
|
wikidata_places.lat,
|
||||||
|
wikidata_places.lon,
|
||||||
|
wb_items_per_site.ips_site_page
|
||||||
|
FROM wikidata_places
|
||||||
|
LEFT JOIN wb_items_per_site
|
||||||
|
ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id)
|
||||||
|
WHERE ips_site_id = '${i}wiki'
|
||||||
|
AND LEFT(wikidata_places.item,1) = 'Q'
|
||||||
|
ORDER BY wikidata_places.item
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "ALTER TABLE wikidata_${i}_pages
|
||||||
|
ADD COLUMN language text
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "UPDATE wikidata_${i}_pages
|
||||||
|
SET language = '${i}'
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "INSERT INTO wikidata_pages
|
||||||
|
SELECT item,
|
||||||
|
instance_of,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
ips_site_page,
|
||||||
|
language
|
||||||
|
FROM wikidata_${i}_pages
|
||||||
|
;" | psqlcmd
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "ALTER TABLE wikidata_pages ADD COLUMN wp_page_title text;" | psqlcmd
|
echo "ALTER TABLE wikidata_pages
|
||||||
echo "UPDATE wikidata_pages SET wp_page_title = REPLACE(ips_site_page, ' ', '_');" | psqlcmd
|
ADD COLUMN wp_page_title text
|
||||||
echo "ALTER TABLE wikidata_pages DROP COLUMN ips_site_page;" | psqlcmd
|
;" | psqlcmd
|
||||||
|
echo "UPDATE wikidata_pages
|
||||||
|
SET wp_page_title = REPLACE(ips_site_page, ' ', '_')
|
||||||
|
;" | psqlcmd
|
||||||
|
echo "ALTER TABLE wikidata_pages
|
||||||
|
DROP COLUMN ips_site_page
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
# add wikidata to wikipedia_article table
|
|
||||||
|
|
||||||
echo "UPDATE wikipedia_article SET lat = wikidata_pages.lat, lon = wikidata_pages.lon, wd_page_title = wikidata_pages.item, instance_of = wikidata_pages.instance_of FROM wikidata_pages WHERE wikipedia_article.language = wikidata_pages.language AND wikipedia_article.title = wikidata_pages.wp_page_title;" | psqlcmd
|
|
||||||
echo "CREATE TABLE wikipedia_article_slim AS SELECT * FROM wikipedia_article WHERE wikidata_id IS NOT NULL;" | psqlcmd
|
|
||||||
echo "ALTER TABLE wikipedia_article RENAME TO wikipedia_article_full;" | psqlcmd
|
|
||||||
echo "ALTER TABLE wikipedia_article_slim RENAME TO wikipedia_article;" | psqlcmd
|
|
||||||
|
|
||||||
|
|
||||||
# clean up intermediate tables
|
echo "====================================================================="
|
||||||
|
echo "Add wikidata to wikipedia_article table"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "UPDATE wikipedia_article
|
||||||
|
SET lat = wikidata_pages.lat,
|
||||||
|
lon = wikidata_pages.lon,
|
||||||
|
wd_page_title = wikidata_pages.item,
|
||||||
|
instance_of = wikidata_pages.instance_of
|
||||||
|
FROM wikidata_pages
|
||||||
|
WHERE wikipedia_article.language = wikidata_pages.language
|
||||||
|
AND wikipedia_article.title = wikidata_pages.wp_page_title
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "CREATE TABLE wikipedia_article_slim
|
||||||
|
AS
|
||||||
|
SELECT * FROM wikipedia_article
|
||||||
|
WHERE wikidata_id IS NOT NULL
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "ALTER TABLE wikipedia_article
|
||||||
|
RENAME TO wikipedia_article_full
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "ALTER TABLE wikipedia_article_slim
|
||||||
|
RENAME TO wikipedia_article
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Dropping intermediate tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
echo "DROP TABLE wikidata_place_dump;" | psqlcmd
|
echo "DROP TABLE wikidata_place_dump;" | psqlcmd
|
||||||
echo "DROP TABLE geo_earth_primary;" | psqlcmd
|
echo "DROP TABLE geo_earth_primary;" | psqlcmd
|
||||||
for i in "${language[@]}"
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
|
echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
|
||||||
done
|
done
|
||||||
|
@ -1,81 +1,213 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
psqlcmd() {
|
psqlcmd() {
|
||||||
psql wikiprocessingdb
|
psql --quiet wikiprocessingdb
|
||||||
}
|
}
|
||||||
|
|
||||||
mysql2pgsqlcmd() {
|
mysql2pgsqlcmd() {
|
||||||
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download() {
|
||||||
# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
echo "Downloading $1"
|
||||||
|
wget --quiet --no-clobber --tries 3
|
||||||
language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
|
}
|
||||||
|
|
||||||
|
|
||||||
# create wikipedia calculation tables
|
# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
||||||
|
# requires Bash 4.0
|
||||||
echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);" | psqlcmd
|
readarray -t LANGUAGES < languages.txt
|
||||||
echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd
|
|
||||||
echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | psqlcmd
|
|
||||||
|
|
||||||
|
|
||||||
# download individual wikipedia language tables
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
echo "====================================================================="
|
||||||
|
echo "Create wikipedia calculation tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "CREATE TABLE linkcounts (
|
||||||
|
language text,
|
||||||
|
title text,
|
||||||
|
count integer,
|
||||||
|
sumcount integer,
|
||||||
|
lat double precision,
|
||||||
|
lon double precision
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
|
echo "CREATE TABLE wikipedia_article (
|
||||||
|
language text NOT NULL,
|
||||||
|
title text NOT NULL,
|
||||||
|
langcount integer,
|
||||||
|
othercount integer,
|
||||||
|
totalcount integer,
|
||||||
|
lat double precision,
|
||||||
|
lon double precision,
|
||||||
|
importance double precision,
|
||||||
|
title_en text,
|
||||||
|
osm_type character(1),
|
||||||
|
osm_id bigint
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
|
echo "CREATE TABLE wikipedia_redirect (
|
||||||
|
language text,
|
||||||
|
from_title text,
|
||||||
|
to_title text
|
||||||
|
);" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Download individual wikipedia language tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
|
||||||
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
|
echo "Language: $i"
|
||||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
|
|
||||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
|
# english is the largest
|
||||||
wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
|
# 1.7G enwiki-latest-page.sql.gz
|
||||||
|
# 6.2G enwiki-latest-pagelinks.sql.gz
|
||||||
|
# 355M enwiki-latest-langlinks.sql.gz
|
||||||
|
# 128M enwiki-latest-redirect.sql.gz
|
||||||
|
|
||||||
|
# example of smaller languge turkish
|
||||||
|
# 53M trwiki-latest-page.sql.gz
|
||||||
|
# 176M trwiki-latest-pagelinks.sql.gz
|
||||||
|
# 106M trwiki-latest-langlinks.sql.gz
|
||||||
|
# 3.2M trwiki-latest-redirect.sql.gz
|
||||||
|
|
||||||
|
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
|
||||||
|
download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
# import individual wikipedia language tables
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Import individual wikipedia language tables"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
|
echo "Language: $i"
|
||||||
|
|
||||||
|
echo "Importing ${i}wiki-latest-pagelinks"
|
||||||
gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||||
gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
|
|
||||||
|
echo "Importing ${i}wiki-latest-page"
|
||||||
|
gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||||
|
|
||||||
|
echo "Importing ${i}wiki-latest-langlinks"
|
||||||
gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||||
gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
|
|
||||||
|
echo "Importing ${i}wiki-latest-redirect"
|
||||||
|
gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
# process language tables and associated pagelink counts
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
|
||||||
do
|
|
||||||
echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
|
|
||||||
echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
|
|
||||||
echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd
|
|
||||||
echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd
|
|
||||||
echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd
|
|
||||||
done
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Process language tables and associated pagelink counts"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
|
||||||
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
for j in "${language[@]}"
|
echo "Language: $i"
|
||||||
|
|
||||||
|
echo "CREATE TABLE ${i}pagelinkcount
|
||||||
|
AS
|
||||||
|
SELECT pl_title AS title,
|
||||||
|
COUNT(*) AS count
|
||||||
|
FROM ${i}pagelinks
|
||||||
|
WHERE pl_namespace = 0
|
||||||
|
GROUP BY pl_title
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "INSERT INTO linkcounts
|
||||||
|
SELECT '${i}',
|
||||||
|
pl_title,
|
||||||
|
COUNT(*)
|
||||||
|
FROM ${i}pagelinks
|
||||||
|
WHERE pl_namespace = 0
|
||||||
|
GROUP BY pl_title
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "INSERT INTO wikipedia_redirect
|
||||||
|
SELECT '${i}',
|
||||||
|
page_title,
|
||||||
|
rd_title
|
||||||
|
FROM ${i}redirect
|
||||||
|
JOIN ${i}page ON (rd_from = page_id)
|
||||||
|
WHERE page_namespace = 0
|
||||||
|
AND rd_namespace = 0
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "ALTER TABLE ${i}pagelinkcount
|
||||||
|
ADD COLUMN othercount integer
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
echo "UPDATE ${i}pagelinkcount
|
||||||
|
SET othercount = 0
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
for j in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd
|
echo "UPDATE ${i}pagelinkcount
|
||||||
|
SET othercount = ${i}pagelinkcount.othercount + x.count
|
||||||
|
FROM (
|
||||||
|
SELECT page_title AS title,
|
||||||
|
count
|
||||||
|
FROM ${i}langlinks
|
||||||
|
JOIN ${i}page ON (ll_from = page_id)
|
||||||
|
JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title)
|
||||||
|
) AS x
|
||||||
|
WHERE x.title = ${i}pagelinkcount.title
|
||||||
|
;" | psqlcmd
|
||||||
done
|
done
|
||||||
echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd
|
|
||||||
|
echo "INSERT INTO wikipedia_article
|
||||||
|
SELECT '${i}',
|
||||||
|
title,
|
||||||
|
count,
|
||||||
|
othercount,
|
||||||
|
count + othercount
|
||||||
|
FROM ${i}pagelinkcount
|
||||||
|
;" | psqlcmd
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
# calculate importance score for each wikipedia page
|
|
||||||
|
|
||||||
echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd
|
|
||||||
|
|
||||||
|
|
||||||
# clean up intermediate tables to conserve space
|
|
||||||
|
|
||||||
for i in "${language[@]}"
|
echo "====================================================================="
|
||||||
|
echo "Calculate importance score for each wikipedia page"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
echo "UPDATE wikipedia_article
|
||||||
|
SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article))
|
||||||
|
;" | psqlcmd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "====================================================================="
|
||||||
|
echo "Clean up intermediate tables to conserve space"
|
||||||
|
echo "====================================================================="
|
||||||
|
|
||||||
|
for i in "${LANGUAGES[@]}"
|
||||||
do
|
do
|
||||||
echo "DROP TABLE ${i}pagelinks;" | psqlcmd
|
echo "DROP TABLE ${i}pagelinks;" | psqlcmd
|
||||||
echo "DROP TABLE ${i}page;" | psqlcmd
|
echo "DROP TABLE ${i}page;" | psqlcmd
|
||||||
echo "DROP TABLE ${i}langlinks;" | psqlcmd
|
echo "DROP TABLE ${i}langlinks;" | psqlcmd
|
||||||
echo "DROP TABLE ${i}redirect;" | psqlcmd
|
echo "DROP TABLE ${i}redirect;" | psqlcmd
|
||||||
echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
|
echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
|
||||||
done
|
done
|
||||||
|
39
data-sources/wikipedia-wikidata/languages.txt
Normal file
39
data-sources/wikipedia-wikidata/languages.txt
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
ar
|
||||||
|
bg
|
||||||
|
ca
|
||||||
|
cs
|
||||||
|
da
|
||||||
|
de
|
||||||
|
en
|
||||||
|
es
|
||||||
|
eo
|
||||||
|
eu
|
||||||
|
fa
|
||||||
|
fr
|
||||||
|
ko
|
||||||
|
hi
|
||||||
|
hr
|
||||||
|
id
|
||||||
|
it
|
||||||
|
he
|
||||||
|
lt
|
||||||
|
hu
|
||||||
|
ms
|
||||||
|
nl
|
||||||
|
ja
|
||||||
|
no
|
||||||
|
pl
|
||||||
|
pt
|
||||||
|
kk
|
||||||
|
ro
|
||||||
|
ru
|
||||||
|
sk
|
||||||
|
sl
|
||||||
|
sr
|
||||||
|
fi
|
||||||
|
sv
|
||||||
|
tr
|
||||||
|
uk
|
||||||
|
vi
|
||||||
|
war
|
||||||
|
zh
|
Loading…
Reference in New Issue
Block a user