Add Docker

This commit is contained in:
liameno 2022-09-04 18:05:28 +03:00
parent 2b0fc6efae
commit 66efdc52af
14 changed files with 117 additions and 44 deletions


@ -21,47 +21,37 @@
- Nodes
- Rating
```shell
cd scripts && sh install_deps.sh
```
## Build
```shell
cd scripts && sh build_all.sh
```
## Usage (Docker)
## Run
Re-run the build every time you change the arguments. <br>
By default the website listens on port 8080 and routes requests through a Tor proxy (<b>!!!</b>); to change this, edit config.json and rebuild the website.
The database API key must be set both in the config and when the database is started (--api-key).
#### DB
#### DB - run this before the other components
```shell
mkdir /tmp/typesense-data &&
./typesense-server --data-dir=/tmp/typesense-data --api-key=xyz --enable-cors &&
sh scripts/init_db.sh
sudo docker pull typesense/typesense:0.24.0.rcn6
mkdir /tmp/typesense-data
sudo docker run -p 8108:8108 -v /tmp/typesense-data:/data typesense/typesense:0.24.0.rcn6 --data-dir /data --api-key=xyz
```
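If the container started correctly, Typesense answers on port 8108; a quick sanity check (the health endpoint needs no API key):
```shell
curl http://localhost:8108/health
# expected output: {"ok":true}
```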
#### Crawler
```shell
./crawler ../../sites.txt 5 ../../config.json
#[sites_path] [threads_count] [config path]
sudo docker-compose build crawler --build-arg SITES="$(cat sites.txt)" --build-arg THREADS=1 --build-arg CONFIG="$(cat config.json)"
sudo docker-compose up crawler
```
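SITES, THREADS and CONFIG are build arguments baked into the image as environment variables, so changing any of them means rebuilding; for example, a sketch of the same two commands with THREADS raised to 4:
```shell
sudo docker-compose build crawler --build-arg SITES="$(cat sites.txt)" --build-arg THREADS=4 --build-arg CONFIG="$(cat config.json)"
sudo docker-compose up crawler
```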
#### Website
```shell
./website ../../config.json
#[config path]
```
#### CLI
###### Run the website first!
```shell
./cli gnu 1 ../../config.json
#[query] [page] [config path]
sudo docker-compose build website --build-arg CONFIG="$(cat config.json)"
sudo docker-compose up website
```
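The CLI is not containerized in this commit. A possible way to run a locally built cli against the running website, following the new convention of passing config content instead of a path (the cli/build location is an assumption):
```shell
#[query] [page] [config]
cd cli/build && ./cli "gnu" 1 "$(cat ../../config.json)"
```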
## Instances
¯\\_(ツ)_/¯
## TODO
- [x] Docker
- [x] Encryption (asymmetric)
- [x] Multithreading crawler
- [ ] Robots Rules (from headers & html) & crawl-delay


@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.20)
cmake_minimum_required(VERSION 3.10)
project(cli LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)


@ -5,12 +5,12 @@ int main(int argc, char **argv) {
using namespace librengine;
if (argc <= 2) {
std::cout << "Usage: bin [query] [page] [config_path]\nExample: ./cli \"gnu\" 1 ../../config.json" << std::endl;
std::cout << "Usage: bin [query] [page] [config]\nExample: ./cli \"gnu\" 1 \"$(cat config.json)\"" << std::endl;
return 1;
}
config::all config;
config.load_from_file(argv[3]);
config.load_from_content(argv[3]);
std::string query = argv[1];
size_t page = std::stoi(argv[2]);


@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.20)
cmake_minimum_required(VERSION 3.10)
project(crawler LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)

crawler/Dockerfile (new file)

@ -0,0 +1,26 @@
FROM gcc:12.2
FROM ubuntu:20.04
ARG SITES
ARG THREADS=1
ARG CONFIG
ENV DEBIAN_FRONTEND noninteractive
ENV envSITES=$SITES
ENV envTHREADS=$THREADS
ENV envCONFIG=$CONFIG
RUN apt-get update && apt-get -y --no-install-recommends install \
build-essential gcc cmake libcurl4-openssl-dev libssl-dev git ca-certificates
RUN git clone https://github.com/lexbor/lexbor && \
cd lexbor && cmake . -DCMAKE_INSTALL_PREFIX=/usr && make && make install
COPY lib /usr/src/librengine/lib
COPY crawler /usr/src/librengine/crawler
WORKDIR /usr/src/librengine/crawler
RUN rm -rf build && mkdir -p build && cd build && cmake .. && make
WORKDIR /usr/src/librengine/crawler/build
CMD ["sh", "-c", "/usr/src/librengine/crawler/build/crawler \"${envSITES}\" ${envTHREADS} \"${envCONFIG}\""]


@ -6,18 +6,17 @@ int main(int argc, char **argv) {
using namespace librengine;
if (argc <= 3) {
std::cout << "Usage: bin [sites_path] [threads_count] [config_path]\nExample: ./crawler ../../sites.txt 5 ../../config.json" << std::endl;
std::cout << "Usage: bin [sites] [threads_count] [config]\nExample: ./crawler \"$(cat sites.txt)\" 5 \"$(cat config.json)\"" << std::endl;
return 1;
}
config::all config;
config.load_from_file(argv[3]);
config.load_from_content(argv[3]);
//https://stackoverflow.com/questions/6087886
curl_global_init(CURL_GLOBAL_ALL);
auto content = config::helper::get_file_content(argv[1]);
auto splited = split(content, "\n");
auto splited = split(argv[1], "\n");
auto threads_count = std::stoi(argv[2]);

docker-compose.yml (new file)

@ -0,0 +1,12 @@
version: '3'
services:
  website:
    build:
      context: ./
      dockerfile: ./website/Dockerfile
    network_mode: "host"
  crawler:
    build:
      context: ./
      dockerfile: ./crawler/Dockerfile
    network_mode: "host"
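A quick way to confirm the compose file parses and defines both services:
```shell
sudo docker-compose config --services
# should list both service names (website, crawler)
```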


@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.20)
cmake_minimum_required(VERSION 3.10)
project(librengine LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)


@ -28,6 +28,7 @@ namespace librengine::config {
std::vector<node_s> nodes;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
struct crawler {
std::string user_agent;
@ -48,17 +49,20 @@ namespace librengine::config {
bool is_check_robots_txt;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
struct cli {
std::optional<http::proxy> proxy;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
struct website {
size_t port;
std::optional<http::proxy> proxy;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
struct db {
std::string url;
@ -67,6 +71,7 @@ namespace librengine::config {
typesense robots;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
struct all {
@ -77,6 +82,7 @@ namespace librengine::config {
db db_;
void load_from_file(const std::string &path);
void load_from_content(const std::string &content);
};
}


@ -15,13 +15,16 @@ namespace librengine::config {
void global::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
load_from_content(content);
}
void global::load_from_content(const std::string &content) {
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
json = json["global"];
auto nodes = json["nodes"];
for (auto node : nodes) {
this->nodes.push_back(node_s{node["name"], node["url"]});
this->nodes.push_back(node_s { node["name"], node["url"] });
}
rsa_key_length = json["rsa_key_length"].get<size_t>();
@ -31,6 +34,9 @@ namespace librengine::config {
void crawler::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
load_from_content(content);
}
void crawler::load_from_content(const std::string &content) {
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
json = json["crawler"];
@ -38,7 +44,7 @@ namespace librengine::config {
std::string proxy_string = json["proxy"].get<std::string>();
if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
load_page_timeout_s = json["load_page_timeout_s"].get<size_t>();
update_time_site_info_s_after = json["update_time_site_info_s_after"].get<size_t>();
@ -54,16 +60,22 @@ namespace librengine::config {
void cli::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
load_from_content(content);
}
void cli::load_from_content(const std::string &content) {
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
json = json["cli"];
std::string proxy_string = json["proxy"].get<std::string>();
if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
}
void website::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
load_from_content(content);
}
void website::load_from_content(const std::string &content) {
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
json = json["website"];
@ -71,11 +83,14 @@ namespace librengine::config {
std::string proxy_string = json["proxy"].get<std::string>();
if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
}
void db::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
load_from_content(content);
}
void db::load_from_content(const std::string &content) {
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
json = json["db"];
@ -93,4 +108,12 @@ namespace librengine::config {
website_.load_from_file(path);
db_.load_from_file(path);
}
void all::load_from_content(const std::string &content) {
global_.load_from_content(content);
crawler_.load_from_content(content);
cli_.load_from_content(content);
website_.load_from_content(content);
db_.load_from_content(content);
}
}
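With load_from_content in place, every binary takes the config content itself rather than a path; a sketch of the shared calling pattern (binary locations omitted):
```shell
CONFIG="$(cat config.json)"
./crawler "$(cat sites.txt)" 5 "$CONFIG"
./website "$CONFIG" &
./cli "gnu" 1 "$CONFIG"
```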


@ -367,7 +367,6 @@ leboncoin.fr
sourceforge.net
namasha.com
grammarly.com
.com.tw
friv.com
livedoor.com
cambridge.org
@ -420,7 +419,6 @@ weblio.jp
lenta.ru
ptt.cc
google.gr
.com
albawabhnews.com
verystream.com
repubblica.it
@ -593,7 +591,6 @@ cloudflare.com
indiamart.com
beeg.com
drom.ru
.com
mileroticos.com
costco.com
archiveofourown.org
@ -725,7 +722,6 @@ kizlarsoruyor.com
concursolutions.com
fast.com
nature.com
.com
binance.com
rutor.info
itmedia.co.jp
@ -759,7 +755,6 @@ southwest.com
teamviewer.com
chron.com
caixa.gov.br
.com
amazon.com.mx
creditonebank.com
sci-hub.tw


@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.20)
cmake_minimum_required(VERSION 3.10)
project(website LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)

website/Dockerfile (new file)

@ -0,0 +1,22 @@
FROM gcc:12.2
FROM ubuntu:20.04
ARG CONFIG
ENV DEBIAN_FRONTEND noninteractive
ENV envCONFIG=$CONFIG
RUN apt-get update && apt-get -y --no-install-recommends install \
build-essential gcc cmake libcurl4-openssl-dev libssl-dev git ca-certificates
RUN git clone https://github.com/lexbor/lexbor && \
cd lexbor && cmake . -DCMAKE_INSTALL_PREFIX=/usr && make && make install
COPY lib /usr/src/librengine/lib
COPY website /usr/src/librengine/website
WORKDIR /usr/src/librengine/website
RUN rm -rf build && mkdir -p build && cd build && cmake .. && make
WORKDIR /usr/src/librengine/website/build
CMD ["sh", "-c", "/usr/src/librengine/website/build/website \"${envCONFIG}\""]


@ -5,12 +5,12 @@ int main(int argc, char **argv) {
using namespace httplib;
if (argc <= 1) {
std::cout << "Usage: bin [config_path]\nExample: ./backend config.json" << std::endl;
std::cout << "Usage: bin [config]\nExample: ./website \"$(cat config.json)\"" << std::endl;
return 1;
}
config::all config;
config.load_from_file(argv[1]);
config.load_from_content(argv[1]);
auto server = std::make_shared<Server>();
auto pages = std::make_shared<website::pages>(config);