From 66efdc52af6e1bad118a2ddd1fca3ee917911d0e Mon Sep 17 00:00:00 2001 From: liameno Date: Sun, 4 Sep 2022 18:05:28 +0300 Subject: [PATCH] Add Docker --- README.md | 36 +++++++++++++----------------------- cli/CMakeLists.txt | 2 +- cli/main.cpp | 4 ++-- crawler/CMakeLists.txt | 2 +- crawler/Dockerfile | 26 ++++++++++++++++++++++++++ crawler/main.cpp | 7 +++---- docker-compose.yml | 12 ++++++++++++ lib/CMakeLists.txt | 2 +- lib/include/config.h | 6 ++++++ lib/src/config.cpp | 31 +++++++++++++++++++++++++++---- sites.txt | 5 ----- website/CMakeLists.txt | 2 +- website/Dockerfile | 22 ++++++++++++++++++++++ website/main.cpp | 4 ++-- 14 files changed, 117 insertions(+), 44 deletions(-) create mode 100644 crawler/Dockerfile create mode 100644 docker-compose.yml create mode 100644 website/Dockerfile diff --git a/README.md b/README.md index 74c5073..efd41b7 100644 --- a/README.md +++ b/README.md @@ -21,47 +21,37 @@ - Nodes - Rating -```shell -cd scripts && sh install_deps.sh -``` -## Build -```shell -cd scripts && sh build_all.sh -``` +## Usage (Docker) -## Run +Please run the build every time to change the arguments.
+The site is launched by default on port 8080 AND with a Tor proxy (!!!); to change this, edit config.json and rebuild the website image. +The API key for the database must be changed both in the config and when the database is started (--api-key). -#### DB +#### DB - please run before using the other services ```shell -mkdir /tmp/typesense-data && -./typesense-server --data-dir=/tmp/typesense-data --api-key=xyz --enable-cors && -sh scripts/init_db.sh +sudo docker pull typesense/typesense:0.24.0.rcn6 +mkdir /tmp/typesense-data +sudo docker run -p 8108:8108 -v/tmp/typesense-data:/data typesense/typesense:0.24.0.rcn6 --data-dir /data --api-key=xyz ``` #### Crawler ```shell -./crawler ../../sites.txt 5 ../../config.json -#[sites_path] [threads_count] [config path] +sudo docker-compose build crawler --build-arg SITES="$(cat sites.txt)" --build-arg THREADS=1 --build-arg CONFIG="$(cat config.json)" +sudo docker-compose up crawler ``` #### Website ```shell -./website ../../config.json -#[config path] -``` - -#### CLI -###### Run website before! 
-```shell -./cli gnu 1 ../../config.json -#[query] [page] [config path] +sudo docker-compose build website --build-arg CONFIG="$(cat config.json)" +sudo docker-compose up website ``` ## Instances ¯\\_(ツ)_/¯ ## TODO +- [x] Docker - [x] Encryption (assymetric) - [x] Multithreading crawler - [ ] Robots Rules (from headers & html) & crawl-delay diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index 8eb1093..6604e51 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) project(cli LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/cli/main.cpp b/cli/main.cpp index 99dce83..8d04678 100644 --- a/cli/main.cpp +++ b/cli/main.cpp @@ -5,12 +5,12 @@ int main(int argc, char **argv) { using namespace librengine; if (argc <= 2) { - std::cout << "Usage: bin [query] [page] [config_path]\nExample: ./cli \"gnu\" 1 ../../config.json" << std::endl; + std::cout << "Usage: bin [query] [page] [config]\nExample: ./cli \"gnu\" 1 \"$(cat config.json)\"" << std::endl; return 1; } config::all config; - config.load_from_file(argv[3]); + config.load_from_content(argv[3]); std::string query = argv[1]; size_t page = std::stoi(argv[2]); diff --git a/crawler/CMakeLists.txt b/crawler/CMakeLists.txt index 632494d..f11740a 100644 --- a/crawler/CMakeLists.txt +++ b/crawler/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) project(crawler LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/crawler/Dockerfile b/crawler/Dockerfile new file mode 100644 index 0000000..25fe70c --- /dev/null +++ b/crawler/Dockerfile @@ -0,0 +1,26 @@ +FROM gcc:12.2 +FROM ubuntu:20.04 + +ARG SITES +ARG THREADS=1 +ARG CONFIG + +ENV DEBIAN_FRONTEND noninteractive +ENV envSITES=$SITES +ENV envTHREADS=$THREADS +ENV envCONFIG=$CONFIG + +RUN apt-get update && apt-get -y --no-install-recommends install \ + build-essential gcc cmake libcurl4-openssl-dev libssl-dev git 
ca-certificates + +RUN git clone https://github.com/lexbor/lexbor && \ + cd lexbor && cmake . -DCMAKE_INSTALL_PREFIX=/usr && make && make install + +COPY lib /usr/src/librengine/lib +COPY crawler /usr/src/librengine/crawler +WORKDIR /usr/src/librengine/crawler + +RUN rm -rf build && mkdir -p build && cd build && cmake .. && make + +WORKDIR /usr/src/librengine/website/build +CMD ["sh", "-c", "/usr/src/librengine/crawler/build/crawler \"${envSITES}\" ${envTHREADS} \"${envCONFIG}\""] \ No newline at end of file diff --git a/crawler/main.cpp b/crawler/main.cpp index a1b7b5b..b2acc8d 100644 --- a/crawler/main.cpp +++ b/crawler/main.cpp @@ -6,18 +6,17 @@ int main(int argc, char **argv) { using namespace librengine; if (argc <= 3) { - std::cout << "Usage: bin [sites_path] [threads_count] [config_path]\nExample: ./crawler ../../sites.txt 5 ../../config.json" << std::endl; + std::cout << "Usage: bin [sites] [threads_count] [config]\nExample: ./crawler \"$(cat sites.txt)\" 5 \"$(cat config.json)\"" << std::endl; return 1; } config::all config; - config.load_from_file(argv[3]); + config.load_from_content(argv[3]); //https://stackoverflow.com/questions/6087886 curl_global_init(CURL_GLOBAL_ALL); - auto content = config::helper::get_file_content(argv[1]); - auto splited = split(content, "\n"); + auto splited = split(argv[1], "\n"); auto threads_count = std::stoi(argv[2]); diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..78c2feb --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,12 @@ +version: '3' +services: + website: + build: + context: ./ + dockerfile: ./website/Dockerfile + network_mode: "host" + crawler: + build: + context: ./ + dockerfile: ./crawler/Dockerfile + network_mode: "host" \ No newline at end of file diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 1db0b69..115127a 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) 
project(librengine LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/lib/include/config.h b/lib/include/config.h index 11958c0..a76b686 100644 --- a/lib/include/config.h +++ b/lib/include/config.h @@ -28,6 +28,7 @@ namespace librengine::config { std::vector nodes; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; struct crawler { std::string user_agent; @@ -48,17 +49,20 @@ namespace librengine::config { bool is_check_robots_txt; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; struct cli { std::optional proxy; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; struct website { size_t port; std::optional proxy; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; struct db { std::string url; @@ -67,6 +71,7 @@ namespace librengine::config { typesense robots; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; struct all { @@ -77,6 +82,7 @@ namespace librengine::config { db db_; void load_from_file(const std::string &path); + void load_from_content(const std::string &content); }; } diff --git a/lib/src/config.cpp b/lib/src/config.cpp index 88ab68b..5cfb381 100644 --- a/lib/src/config.cpp +++ b/lib/src/config.cpp @@ -15,13 +15,16 @@ namespace librengine::config { void global::load_from_file(const std::string &path) { const std::string content = helper::get_file_content(path); + load_from_content(content); + } + void global::load_from_content(const std::string &content) { nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true); json = json["global"]; auto nodes = json["nodes"]; for (auto node : nodes) { - this->nodes.push_back(node_s{node["name"], node["url"]}); + this->nodes.push_back(node_s { node["name"], node["url"] }); } rsa_key_length = json["rsa_key_length"].get(); @@ -31,6 +34,9 @@ 
namespace librengine::config { void crawler::load_from_file(const std::string &path) { const std::string content = helper::get_file_content(path); + load_from_content(content); + } + void crawler::load_from_content(const std::string &content) { nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true); json = json["crawler"]; @@ -38,7 +44,7 @@ namespace librengine::config { std::string proxy_string = json["proxy"].get(); - if (!proxy_string.empty()) proxy = http::proxy{proxy_string}; + if (!proxy_string.empty()) proxy = http::proxy { proxy_string }; load_page_timeout_s = json["load_page_timeout_s"].get(); update_time_site_info_s_after = json["update_time_site_info_s_after"].get(); @@ -54,16 +60,22 @@ namespace librengine::config { void cli::load_from_file(const std::string &path) { const std::string content = helper::get_file_content(path); + load_from_content(content); + } + void cli::load_from_content(const std::string &content) { nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true); json = json["cli"]; std::string proxy_string = json["proxy"].get(); - if (!proxy_string.empty()) proxy = http::proxy{proxy_string}; + if (!proxy_string.empty()) proxy = http::proxy { proxy_string }; } void website::load_from_file(const std::string &path) { const std::string content = helper::get_file_content(path); + load_from_content(content); + } + void website::load_from_content(const std::string &content) { nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true); json = json["website"]; @@ -71,11 +83,14 @@ namespace librengine::config { std::string proxy_string = json["proxy"].get(); - if (!proxy_string.empty()) proxy = http::proxy{proxy_string}; + if (!proxy_string.empty()) proxy = http::proxy { proxy_string }; } void db::load_from_file(const std::string &path) { const std::string content = helper::get_file_content(path); + load_from_content(content); + } + void db::load_from_content(const std::string &content) { 
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true); json = json["db"]; @@ -93,4 +108,12 @@ namespace librengine::config { website_.load_from_file(path); db_.load_from_file(path); } + + void all::load_from_content(const std::string &content) { + global_.load_from_content(content); + crawler_.load_from_content(content); + cli_.load_from_content(content); + website_.load_from_content(content); + db_.load_from_content(content); + } } \ No newline at end of file diff --git a/sites.txt b/sites.txt index 2a2ed26..f0be159 100644 --- a/sites.txt +++ b/sites.txt @@ -367,7 +367,6 @@ leboncoin.fr sourceforge.net namasha.com grammarly.com -.com.tw friv.com livedoor.com cambridge.org @@ -420,7 +419,6 @@ weblio.jp lenta.ru ptt.cc google.gr -.com albawabhnews.com verystream.com repubblica.it @@ -593,7 +591,6 @@ cloudflare.com indiamart.com beeg.com drom.ru -.com mileroticos.com costco.com archiveofourown.org @@ -725,7 +722,6 @@ kizlarsoruyor.com concursolutions.com fast.com nature.com -.com binance.com rutor.info itmedia.co.jp @@ -759,7 +755,6 @@ southwest.com teamviewer.com chron.com caixa.gov.br -.com amazon.com.mx creditonebank.com sci-hub.tw diff --git a/website/CMakeLists.txt b/website/CMakeLists.txt index b3d1806..1e225ef 100644 --- a/website/CMakeLists.txt +++ b/website/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) project(website LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/website/Dockerfile b/website/Dockerfile new file mode 100644 index 0000000..5beb768 --- /dev/null +++ b/website/Dockerfile @@ -0,0 +1,22 @@ +FROM gcc:12.2 +FROM ubuntu:20.04 + +ARG CONFIG + +ENV DEBIAN_FRONTEND noninteractive +ENV envCONFIG=$CONFIG + +RUN apt-get update && apt-get -y --no-install-recommends install \ + build-essential gcc cmake libcurl4-openssl-dev libssl-dev git ca-certificates + +RUN git clone https://github.com/lexbor/lexbor && \ + cd lexbor && cmake . 
-DCMAKE_INSTALL_PREFIX=/usr && make && make install + +COPY lib /usr/src/librengine/lib +COPY website /usr/src/librengine/website +WORKDIR /usr/src/librengine/website + +RUN rm -rf build && mkdir -p build && cd build && cmake .. && make + +WORKDIR /usr/src/librengine/website/build +CMD ["sh", "-c", "/usr/src/librengine/website/build/website \"${envCONFIG}\""] \ No newline at end of file diff --git a/website/main.cpp b/website/main.cpp index 997902f..d81ab31 100644 --- a/website/main.cpp +++ b/website/main.cpp @@ -5,12 +5,12 @@ int main(int argc, char **argv) { using namespace httplib; if (argc <= 1) { - std::cout << "Usage: bin [config_path]\nExample: ./backend config.json" << std::endl; + std::cout << "Usage: bin [config]\nExample: ./website \"$(cat config.json)\"" << std::endl; return 1; } config::all config; - config.load_from_file(argv[1]); + config.load_from_content(argv[1]); auto server = std::make_shared(); auto pages = std::make_shared(config);