mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-10-27 11:42:46 +03:00
275 lines
8.2 KiB
Bash
Executable File
275 lines
8.2 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
psqlcmd() {
|
|
psql --quiet wikiprocessingdb
|
|
}
|
|
|
|
mysql2pgsqlcmd() {
|
|
./mysql2pgsql.perl /dev/stdin /dev/stdout
|
|
}
|
|
|
|
download() {
|
|
echo "Downloading $1"
|
|
wget --quiet --no-clobber --tries 3 "$1"
|
|
}
|
|
|
|
# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
|
|
# requires Bash 4.0
|
|
readarray -t LANGUAGES < languages.txt
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Download wikidata dump tables"
|
|
echo "====================================================================="
|
|
|
|
# 114M wikidatawiki-latest-geo_tags.sql.gz
|
|
# 1.7G wikidatawiki-latest-page.sql.gz
|
|
# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz
|
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
|
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
|
|
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Import wikidata dump tables"
|
|
echo "====================================================================="
|
|
|
|
echo "Importing wikidatawiki-latest-geo_tags"
|
|
gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
|
|
|
|
echo "Importing wikidatawiki-latest-page"
|
|
gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
|
|
|
|
echo "Importing wikidatawiki-latest-wb_items_per_site"
|
|
gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Get wikidata places from wikidata query API"
|
|
echo "====================================================================="
|
|
|
|
echo "Number of place types:"
|
|
wc -l wikidata_place_types.txt
|
|
|
|
while read F ; do
|
|
echo "Querying for place type $F..."
|
|
wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
|
|
jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
|
|
awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
|
|
cat $F.csv >> wikidata_place_dump.csv
|
|
rm $F.json $F.txt $F.csv
|
|
done < wikidata_place_types.txt
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Import wikidata places"
|
|
echo "====================================================================="
|
|
|
|
echo "CREATE TABLE wikidata_place_dump (
|
|
item text,
|
|
instance_of text
|
|
);" | psqlcmd
|
|
|
|
echo "COPY wikidata_place_dump (item, instance_of)
|
|
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv'
|
|
DELIMITER ','
|
|
CSV
|
|
;" | psqlcmd
|
|
|
|
echo "CREATE TABLE wikidata_place_type_levels (
|
|
place_type text,
|
|
level integer
|
|
);" | psqlcmd
|
|
|
|
echo "COPY wikidata_place_type_levels (place_type, level)
|
|
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv'
|
|
DELIMITER ','
|
|
CSV
|
|
HEADER
|
|
;" | psqlcmd
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Create derived tables"
|
|
echo "====================================================================="
|
|
|
|
echo "CREATE TABLE geo_earth_primary AS
|
|
SELECT gt_page_id,
|
|
gt_lat,
|
|
gt_lon
|
|
FROM geo_tags
|
|
WHERE gt_globe = 'earth'
|
|
AND gt_primary = 1
|
|
AND NOT( gt_lat < -90
|
|
OR gt_lat > 90
|
|
OR gt_lon < -180
|
|
OR gt_lon > 180
|
|
OR gt_lat=0
|
|
OR gt_lon=0)
|
|
;" | psqlcmd
|
|
|
|
echo "CREATE TABLE geo_earth_wikidata AS
|
|
SELECT DISTINCT geo_earth_primary.gt_page_id,
|
|
geo_earth_primary.gt_lat,
|
|
geo_earth_primary.gt_lon,
|
|
page.page_title,
|
|
page.page_namespace
|
|
FROM geo_earth_primary
|
|
LEFT OUTER JOIN page
|
|
ON (geo_earth_primary.gt_page_id = page.page_id)
|
|
ORDER BY geo_earth_primary.gt_page_id
|
|
;" | psqlcmd
|
|
|
|
echo "ALTER TABLE wikidata_place_dump
|
|
ADD COLUMN ont_level integer,
|
|
ADD COLUMN lat numeric(11,8),
|
|
ADD COLUMN lon numeric(11,8)
|
|
;" | psqlcmd
|
|
|
|
echo "UPDATE wikidata_place_dump
|
|
SET ont_level = wikidata_place_type_levels.level
|
|
FROM wikidata_place_type_levels
|
|
WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type
|
|
;" | psqlcmd
|
|
|
|
echo "CREATE TABLE wikidata_places
|
|
AS
|
|
SELECT DISTINCT ON (item) item,
|
|
instance_of,
|
|
MAX(ont_level) AS ont_level,
|
|
lat,
|
|
lon
|
|
FROM wikidata_place_dump
|
|
GROUP BY item,
|
|
instance_of,
|
|
ont_level,
|
|
lat,
|
|
lon
|
|
ORDER BY item
|
|
;" | psqlcmd
|
|
|
|
echo "UPDATE wikidata_places
|
|
SET lat = geo_earth_wikidata.gt_lat,
|
|
lon = geo_earth_wikidata.gt_lon
|
|
FROM geo_earth_wikidata
|
|
WHERE wikidata_places.item = geo_earth_wikidata.page_title
|
|
;" | psqlcmd
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Process language pages"
|
|
echo "====================================================================="
|
|
|
|
|
|
echo "CREATE TABLE wikidata_pages (
|
|
item text,
|
|
instance_of text,
|
|
lat numeric(11,8),
|
|
lon numeric(11,8),
|
|
ips_site_page text,
|
|
language text
|
|
);" | psqlcmd
|
|
|
|
for i in "${LANGUAGES[@]}"
|
|
do
|
|
echo "CREATE TABLE wikidata_${i}_pages AS
|
|
SELECT wikidata_places.item,
|
|
wikidata_places.instance_of,
|
|
wikidata_places.lat,
|
|
wikidata_places.lon,
|
|
wb_items_per_site.ips_site_page
|
|
FROM wikidata_places
|
|
LEFT JOIN wb_items_per_site
|
|
ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id)
|
|
WHERE ips_site_id = '${i}wiki'
|
|
AND LEFT(wikidata_places.item,1) = 'Q'
|
|
ORDER BY wikidata_places.item
|
|
;" | psqlcmd
|
|
|
|
echo "ALTER TABLE wikidata_${i}_pages
|
|
ADD COLUMN language text
|
|
;" | psqlcmd
|
|
|
|
echo "UPDATE wikidata_${i}_pages
|
|
SET language = '${i}'
|
|
;" | psqlcmd
|
|
|
|
echo "INSERT INTO wikidata_pages
|
|
SELECT item,
|
|
instance_of,
|
|
lat,
|
|
lon,
|
|
ips_site_page,
|
|
language
|
|
FROM wikidata_${i}_pages
|
|
;" | psqlcmd
|
|
done
|
|
|
|
echo "ALTER TABLE wikidata_pages
|
|
ADD COLUMN wp_page_title text
|
|
;" | psqlcmd
|
|
echo "UPDATE wikidata_pages
|
|
SET wp_page_title = REPLACE(ips_site_page, ' ', '_')
|
|
;" | psqlcmd
|
|
echo "ALTER TABLE wikidata_pages
|
|
DROP COLUMN ips_site_page
|
|
;" | psqlcmd
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Add wikidata to wikipedia_article table"
|
|
echo "====================================================================="
|
|
|
|
echo "UPDATE wikipedia_article
|
|
SET lat = wikidata_pages.lat,
|
|
lon = wikidata_pages.lon,
|
|
wd_page_title = wikidata_pages.item,
|
|
instance_of = wikidata_pages.instance_of
|
|
FROM wikidata_pages
|
|
WHERE wikipedia_article.language = wikidata_pages.language
|
|
AND wikipedia_article.title = wikidata_pages.wp_page_title
|
|
;" | psqlcmd
|
|
|
|
echo "CREATE TABLE wikipedia_article_slim
|
|
AS
|
|
SELECT * FROM wikipedia_article
|
|
WHERE wikidata_id IS NOT NULL
|
|
;" | psqlcmd
|
|
|
|
echo "ALTER TABLE wikipedia_article
|
|
RENAME TO wikipedia_article_full
|
|
;" | psqlcmd
|
|
|
|
echo "ALTER TABLE wikipedia_article_slim
|
|
RENAME TO wikipedia_article
|
|
;" | psqlcmd
|
|
|
|
|
|
|
|
|
|
echo "====================================================================="
|
|
echo "Dropping intermediate tables"
|
|
echo "====================================================================="
|
|
|
|
echo "DROP TABLE wikidata_place_dump;" | psqlcmd
|
|
echo "DROP TABLE geo_earth_primary;" | psqlcmd
|
|
for i in "${LANGUAGES[@]}"
|
|
do
|
|
echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
|
|
done
|