From aa41b813b8575aea1657c74e2ee22bdb6fe707b1 Mon Sep 17 00:00:00 2001 From: marc tobias Date: Fri, 28 Sep 2018 20:17:02 +0200 Subject: [PATCH] 2018 TIGER data conversion scripts, add documentation to /docs/data-sources --- CMakeLists.txt | 1 - data-sources/us-tiger/README.md | 29 ++++++++++ data-sources/us-tiger/convert.sh | 48 ++++++++++++++++ .../us-tiger/tiger_address_convert.py | 0 .../us-tiger}/tiger_county_fips.json | 0 docs/CMakeLists.txt | 2 + docs/admin/Import-and-Update.md | 42 ++++---------- docs/data-sources/overview.md | 4 ++ docs/mkdocs.yml | 3 + test/bdd/api/reverse/queries.feature | 2 +- utils/imports.php | 56 ------------------- 11 files changed, 99 insertions(+), 88 deletions(-) create mode 100644 data-sources/us-tiger/README.md create mode 100755 data-sources/us-tiger/convert.sh rename utils/tigerAddressImport.py => data-sources/us-tiger/tiger_address_convert.py (100%) rename {utils => data-sources/us-tiger}/tiger_county_fips.json (100%) create mode 100644 docs/data-sources/overview.md delete mode 100755 utils/imports.php diff --git a/CMakeLists.txt b/CMakeLists.txt index a7c7b395..d6f7d2cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,6 @@ set(CUSTOMFILES website/status.php utils/blocks.php utils/country_languages.php - utils/imports.php utils/importWikipedia.php utils/export.php utils/query.php diff --git a/data-sources/us-tiger/README.md b/data-sources/us-tiger/README.md new file mode 100644 index 00000000..e75a9efa --- /dev/null +++ b/data-sources/us-tiger/README.md @@ -0,0 +1,29 @@ +# US TIGER address data + +Convert [TIGER](https://www.census.gov/geo/maps-data/data/tiger.html)/Line dataset of the US Census Bureau to SQL files which can be imported by Nominatim. The created tables in the Nominatim database are separate from OpenStreetMap tables and get queried at search time separately. + +The dataset gets updated once per year. 
Downloading is prone to be slow (can take a full day) and converting the files can take hours as well. + +Replace '2018' with the current year throughout. + + 1. Install the GDAL library and python bindings and the unzip tool + + # Ubuntu: + sudo apt-get install python-gdal unzip + # CentOS: + sudo yum install gdal-python unzip + + 2. Get the TIGER 2018 data. You will need the EDGES files + (3,233 zip files, 11GB total). + + wget -r ftp://ftp2.census.gov/geo/tiger/TIGER2018/EDGES/ + + 3. Convert the data into SQL statements. Adjust the file paths in the scripts as needed + + cd data-sources/us-tiger + ./convert.sh <input-path> <output-path> + + 4. Maybe: package the created files + + tar -czf tiger2018-nominatim-preprocessed.tar.gz tiger + \ No newline at end of file diff --git a/data-sources/us-tiger/convert.sh b/data-sources/us-tiger/convert.sh new file mode 100755 index 00000000..b94017ea --- /dev/null +++ b/data-sources/us-tiger/convert.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Usage: ./convert.sh <input-path> <output-path> (both must be existing directories; run from this directory) +INPATH=$1 +OUTPATH=$2 + +if [[ ! -d "$INPATH" ]]; then + echo "input path does not exist" + exit 1 +fi + +if [[ ! -d "$OUTPATH" ]]; then + echo "output path does not exist" + exit 1 +fi + +INREGEX='_([0-9]{5})_edges.zip' +WORKPATH="$OUTPATH/tmp-workdir/" +mkdir -p "$WORKPATH" + +# Each *_NNNNN_edges.zip is unzipped into the work dir and converted to <output-path>/NNNNN.sql. +# Expansions are quoted so that paths containing spaces or glob characters are not split. +INFILES=("$INPATH"/*.zip) +echo "Found ${#INFILES[*]} files." + +for F in "${INFILES[@]}"; do + # echo $F + + if [[ "$F" =~ $INREGEX ]]; then + COUNTYID=${BASH_REMATCH[1]} + SHAPEFILE="$WORKPATH/$(basename "$F" '.zip').shp" + SQLFILE="$OUTPATH/$COUNTYID.sql" + + unzip -o -q -d "$WORKPATH" "$F" + if [[ ! -e "$SHAPEFILE" ]]; then + echo "Unzip failed. $SHAPEFILE not found." + exit 1 + fi + + ./tiger_address_convert.py "$SHAPEFILE" "$SQLFILE" + + rm -- "$WORKPATH"/* + fi +done + +OUTFILES=("$OUTPATH"/*.sql) +echo "Wrote ${#OUTFILES[*]} files." 
+ +rmdir "$WORKPATH" diff --git a/utils/tigerAddressImport.py b/data-sources/us-tiger/tiger_address_convert.py similarity index 100% rename from utils/tigerAddressImport.py rename to data-sources/us-tiger/tiger_address_convert.py diff --git a/utils/tiger_county_fips.json b/data-sources/us-tiger/tiger_county_fips.json similarity index 100% rename from utils/tiger_county_fips.json rename to data-sources/us-tiger/tiger_county_fips.json diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index cbe91b91..68af5429 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -10,8 +10,10 @@ ADD_CUSTOM_TARGET(doc COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/admin ${CMAKE_CURRENT_BINARY_DIR}/admin COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/develop ${CMAKE_CURRENT_BINARY_DIR}/develop COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/api ${CMAKE_CURRENT_BINARY_DIR}/api + COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/data-sources ${CMAKE_CURRENT_BINARY_DIR}/data-sources COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/index.md ${CMAKE_CURRENT_BINARY_DIR}/index.md COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/extra.css ${CMAKE_CURRENT_BINARY_DIR}/extra.css + COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/us-tiger/README.md ${CMAKE_CURRENT_BINARY_DIR}/data-sources/US-Tiger.md COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-7.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-7.md COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-16.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-16.md COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md diff --git a/docs/admin/Import-and-Update.md 
b/docs/admin/Import-and-Update.md index 731ff8fa..847aa37d 100644 --- a/docs/admin/Import-and-Update.md +++ b/docs/admin/Import-and-Update.md @@ -101,52 +101,34 @@ Note that this command downloads the phrases from the wiki link above. ## Installing Tiger housenumber data for the US -Nominatim is able to use the official TIGER address set to complement the -OSM house number data in the US. You can add TIGER data to your own Nominatim -instance by following these steps: +Nominatim is able to use the official [TIGER](https://www.census.gov/geo/maps-data/data/tiger.html) +address set to complement the OSM house number data in the US. You can add +TIGER data to your own Nominatim instance by following these steps. The +entire US adds about 10GB to your database. - 1. Install the GDAL library and python bindings and the unzip tool - - * Ubuntu: `sudo apt-get install python-gdal unzip` - * CentOS: `sudo yum install gdal-python unzip` - - 2. Get preprocessed TIGER 2017 data and unpack it into the + 1. Get preprocessed TIGER 2018 data and unpack it into the data directory in your Nominatim sources: cd Nominatim/data - wget https://nominatim.org/data/tiger2017-nominatim-preprocessed.tar.gz - tar xf tiger2017-nominatim-preprocessed.tar.gz + wget https://nominatim.org/data/tiger2018-nominatim-preprocessed.tar.gz + tar xf tiger2018-nominatim-preprocessed.tar.gz - 3. Import the data into your Nominatim database: + `data-sources/us-tiger/README.md` explains how the data was preprocessed. + + 2. Import the data into your Nominatim database: ./utils/setup.php --import-tiger-data - 4. Enable use of the Tiger data in your `settings/local.php` by adding: + 3. Enable use of the Tiger data in your `settings/local.php` by adding: @define('CONST_Use_US_Tiger_Data', true); - 5. Apply the new settings: + 4. Apply the new settings: ```sh ./utils/setup.php --create-functions --enable-diff-updates --create-partition-functions ``` -The entire US adds about 10GB to your database. 
- -You can also process the data from the original TIGER data to create the -SQL files, Nominatim needs for the import: - - 1. Get the TIGER 2017 data. You will need the EDGES files - (3,234 zip files, 11GB total). - - wget -r ftp://ftp2.census.gov/geo/tiger/TIGER2017/EDGES/ - - 2. Convert the data into SQL statements: - - ./utils/imports.php --parse-tiger - -Be warned that this can take quite a long time. After this process is finished, -the same preprocessed files as above are available in `data/tiger`. ## Updates diff --git a/docs/data-sources/overview.md b/docs/data-sources/overview.md new file mode 100644 index 00000000..a6dc0dba --- /dev/null +++ b/docs/data-sources/overview.md @@ -0,0 +1,4 @@ +# Additional Data Sources + +This guide explains how data sources other than OpenStreetMap mentioned in +the install instructions were obtained and converted. diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index b620decf..7c516070 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -20,6 +20,9 @@ pages: - 'Troubleshooting' : 'admin/Faq.md' - 'Developers Guide': - 'Overview' : 'develop/overview.md' + - 'Data Sources': + - 'Overview' : 'data-sources/overview.md' + - 'US Census (Tiger)': 'data-sources/US-Tiger.md' - 'Appendix': - 'Installation on CentOS 7' : 'appendix/Install-on-Centos-7.md' - 'Installation on Ubuntu 16' : 'appendix/Install-on-Ubuntu-16.md' diff --git a/test/bdd/api/reverse/queries.feature b/test/bdd/api/reverse/queries.feature index 1973f0b9..8fbe552c 100644 --- a/test/bdd/api/reverse/queries.feature +++ b/test/bdd/api/reverse/queries.feature @@ -10,7 +10,7 @@ Feature: Reverse geocoding | way | place | house | And result addresses contain | house_number | road | postcode | country_code | - | 906 | West 1st Street | 57274 | us | + | 909 | West 1st Street | 57274 | us | @Tiger Scenario: No TIGER house number for zoom < 18 diff --git a/utils/imports.php b/utils/imports.php deleted file mode 100755 index 9d1085f0..00000000 --- a/utils/imports.php +++ 
/dev/null @@ -1,56 +0,0 @@ -#!@PHP_BIN@ -Cq -