mirror of
https://github.com/liameno/librengine.git
synced 2024-11-24 07:53:17 +03:00
Add Docker
parent 2b0fc6efae
commit 66efdc52af
36 README.md
@@ -21,47 +21,37 @@
 - Nodes
 - Rating

 ```shell
 cd scripts && sh install_deps.sh
 ```

 ## Build
 ```shell
 cd scripts && sh build_all.sh
 ```

+## Usage (Docker)

 ## Run
+Re-run the build whenever you change the arguments. <br>
+By default the website listens on port 8080 and uses a Tor proxy (<b>!!!</b>); to change this, edit config.json and rebuild the website.
+The database API key must be changed both in the config and when the database is started (--api-key).
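Since the API key and proxy settings live in config.json, it helps to see the file's rough shape. The skeleton below is a hypothetical sketch, not the shipped file: the section and key names mirror what config.cpp in this commit parses (global, crawler, cli, website, db), while the values are placeholders and keys such as "port" and "url" are assumptions; the real config carries more fields.

```shell
# hypothetical config.json skeleton; key names from config.cpp, placeholder values
# an empty "proxy" string disables the proxy (config.cpp skips it when empty)
cat > config.json <<'EOF'
{
  "global":  { "nodes": [{ "name": "local", "url": "http://127.0.0.1:8080" }], "rsa_key_length": 1024 },
  "crawler": { "proxy": "", "load_page_timeout_s": 10, "update_time_site_info_s_after": 86400 },
  "cli":     { "proxy": "" },
  "website": { "port": 8080, "proxy": "" },
  "db":      { "url": "http://localhost:8108" }
}
EOF
```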

-#### DB
+#### DB - please run this before the other services
 ```shell
-mkdir /tmp/typesense-data &&
-./typesense-server --data-dir=/tmp/typesense-data --api-key=xyz --enable-cors &&
-sh scripts/init_db.sh
+sudo docker pull typesense/typesense:0.24.0.rcn6
+mkdir /tmp/typesense-data
+sudo docker run -p 8108:8108 -v /tmp/typesense-data:/data typesense/typesense:0.24.0.rcn6 --data-dir /data --api-key=xyz
 ```
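Typesense should be up before the crawler or website touch it. A minimal liveness check, assuming the container above is running with the default port mapping (the health endpoint needs no API key):

```shell
curl http://localhost:8108/health
# a healthy node replies {"ok":true}
```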

 #### Crawler
 ```shell
-./crawler ../../sites.txt 5 ../../config.json
-#[sites_path] [threads_count] [config path]
+sudo docker-compose build crawler --build-arg SITES="$(cat sites.txt)" --build-arg THREADS=1 --build-arg CONFIG="$(cat config.json)"
+sudo docker-compose up crawler
 ```
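SITES, THREADS, and CONFIG are build args, so they are baked into the image; that is why the instructions above insist on rebuilding whenever an argument changes. While the crawler runs, its output can be followed with:

```shell
sudo docker-compose logs -f crawler
```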

 #### Website
 ```shell
-./website ../../config.json
-#[config path]
+sudo docker-compose build website --build-arg CONFIG="$(cat config.json)"
+sudo docker-compose up website
 ```

-#### CLI
-###### Run website before!
-```shell
-./cli gnu 1 ../../config.json
-#[query] [page] [config path]
-```
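Once the website container is up, a quick probe confirms it is serving; this assumes the default port 8080 from config.json and the host networking set in docker-compose.yml:

```shell
curl -I http://localhost:8080
```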

 ## Instances
 ¯\\_(ツ)_/¯

 ## TODO
 - [x] Docker
 - [x] Encryption (asymmetric)
 - [x] Multithreading crawler
 - [ ] Robots rules (from headers & HTML) & crawl-delay
cli/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.10)
 project(cli LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
cli/main.cpp
@@ -5,12 +5,12 @@ int main(int argc, char **argv) {
     using namespace librengine;
 
     if (argc <= 2) {
-        std::cout << "Usage: bin [query] [page] [config_path]\nExample: ./cli \"gnu\" 1 ../../config.json" << std::endl;
+        std::cout << "Usage: bin [query] [page] [config]\nExample: ./cli \"gnu\" 1 \"$(cat config.json)\"" << std::endl;
         return 1;
     }
 
     config::all config;
-    config.load_from_file(argv[3]);
+    config.load_from_content(argv[3]);
 
     std::string query = argv[1];
     size_t page = std::stoi(argv[2]);
crawler/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.10)
 project(crawler LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
26 crawler/Dockerfile Normal file
@@ -0,0 +1,26 @@
+FROM gcc:12.2
+FROM ubuntu:20.04
+
+ARG SITES
+ARG THREADS=1
+ARG CONFIG
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV envSITES=$SITES
+ENV envTHREADS=$THREADS
+ENV envCONFIG=$CONFIG
+
+RUN apt-get update && apt-get -y --no-install-recommends install \
+    build-essential gcc cmake libcurl4-openssl-dev libssl-dev git ca-certificates
+
+RUN git clone https://github.com/lexbor/lexbor && \
+    cd lexbor && cmake . -DCMAKE_INSTALL_PREFIX=/usr && make && make install
+
+COPY lib /usr/src/librengine/lib
+COPY crawler /usr/src/librengine/crawler
+WORKDIR /usr/src/librengine/crawler
+
+RUN rm -rf build && mkdir -p build && cd build && cmake .. && make
+
+WORKDIR /usr/src/librengine/crawler/build
+CMD ["sh", "-c", "/usr/src/librengine/crawler/build/crawler \"${envSITES}\" ${envTHREADS} \"${envCONFIG}\""]
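Each ARG is copied into an ENV (envSITES and friends) so the value survives into the container's runtime, where CMD reads it. A useful side effect: the ENV can still be overridden per run without a rebuild; a sketch, using the env names defined above:

```shell
sudo docker-compose run -e envTHREADS=4 crawler
```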
crawler/main.cpp
@@ -6,18 +6,17 @@ int main(int argc, char **argv) {
     using namespace librengine;
 
     if (argc <= 3) {
-        std::cout << "Usage: bin [sites_path] [threads_count] [config_path]\nExample: ./crawler ../../sites.txt 5 ../../config.json" << std::endl;
+        std::cout << "Usage: bin [sites] [threads_count] [config]\nExample: ./crawler \"$(cat sites.txt)\" 5 \"$(cat config.json)\"" << std::endl;
         return 1;
     }
 
     config::all config;
-    config.load_from_file(argv[3]);
+    config.load_from_content(argv[3]);
 
     //https://stackoverflow.com/questions/6087886
     curl_global_init(CURL_GLOBAL_ALL);
 
-    auto content = config::helper::get_file_content(argv[1]);
-    auto splited = split(content, "\n");
+    auto splited = split(argv[1], "\n");
 
     auto threads_count = std::stoi(argv[2]);
12 docker-compose.yml Normal file
@@ -0,0 +1,12 @@
+version: '3'
+services:
+  website:
+    build:
+      context: ./
+      dockerfile: ./website/Dockerfile
+    network_mode: "host"
+  crawler:
+    build:
+      context: ./
+      dockerfile: ./crawler/Dockerfile
+    network_mode: "host"
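Both services run with network_mode: "host", so Docker publishes no ports; the binaries bind the host's interfaces directly (8080 for the website, 8108 for Typesense). The resolved configuration can be sanity-checked without building anything:

```shell
# prints the fully resolved compose file, catching YAML/indentation mistakes
sudo docker-compose config
```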
lib/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.10)
 project(librengine LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
config.h
@@ -28,6 +28,7 @@ namespace librengine::config {
         std::vector<node_s> nodes;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
     struct crawler {
         std::string user_agent;
@@ -48,17 +49,20 @@ namespace librengine::config {
         bool is_check_robots_txt;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
     struct cli {
         std::optional<http::proxy> proxy;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
     struct website {
         size_t port;
         std::optional<http::proxy> proxy;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
     struct db {
         std::string url;
@@ -67,6 +71,7 @@ namespace librengine::config {
         typesense robots;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
 
     struct all {
@@ -77,6 +82,7 @@ namespace librengine::config {
         db db_;
 
         void load_from_file(const std::string &path);
+        void load_from_content(const std::string &content);
     };
 }
config.cpp
@@ -15,13 +15,16 @@ namespace librengine::config {
 
     void global::load_from_file(const std::string &path) {
         const std::string content = helper::get_file_content(path);
+        load_from_content(content);
+    }
+    void global::load_from_content(const std::string &content) {
         nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
         json = json["global"];
 
         auto nodes = json["nodes"];
 
         for (auto node : nodes) {
-            this->nodes.push_back(node_s{node["name"], node["url"]});
+            this->nodes.push_back(node_s { node["name"], node["url"] });
         }
 
         rsa_key_length = json["rsa_key_length"].get<size_t>();
@@ -31,6 +34,9 @@ namespace librengine::config {
 
     void crawler::load_from_file(const std::string &path) {
         const std::string content = helper::get_file_content(path);
+        load_from_content(content);
+    }
+    void crawler::load_from_content(const std::string &content) {
         nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
         json = json["crawler"];
 
@@ -38,7 +44,7 @@ namespace librengine::config {
 
         std::string proxy_string = json["proxy"].get<std::string>();
 
-        if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
+        if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
 
         load_page_timeout_s = json["load_page_timeout_s"].get<size_t>();
         update_time_site_info_s_after = json["update_time_site_info_s_after"].get<size_t>();
@@ -54,16 +60,22 @@ namespace librengine::config {
 
     void cli::load_from_file(const std::string &path) {
         const std::string content = helper::get_file_content(path);
+        load_from_content(content);
+    }
+    void cli::load_from_content(const std::string &content) {
         nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
         json = json["cli"];
 
         std::string proxy_string = json["proxy"].get<std::string>();
 
-        if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
+        if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
     }
 
     void website::load_from_file(const std::string &path) {
         const std::string content = helper::get_file_content(path);
+        load_from_content(content);
+    }
+    void website::load_from_content(const std::string &content) {
         nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
         json = json["website"];
 
@@ -71,11 +83,14 @@ namespace librengine::config {
 
         std::string proxy_string = json["proxy"].get<std::string>();
 
-        if (!proxy_string.empty()) proxy = http::proxy{proxy_string};
+        if (!proxy_string.empty()) proxy = http::proxy { proxy_string };
     }
 
     void db::load_from_file(const std::string &path) {
         const std::string content = helper::get_file_content(path);
+        load_from_content(content);
+    }
+    void db::load_from_content(const std::string &content) {
         nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
         json = json["db"];
 
@@ -93,4 +108,12 @@ namespace librengine::config {
         website_.load_from_file(path);
         db_.load_from_file(path);
     }
+
+    void all::load_from_content(const std::string &content) {
+        global_.load_from_content(content);
+        crawler_.load_from_content(content);
+        cli_.load_from_content(content);
+        website_.load_from_content(content);
+        db_.load_from_content(content);
+    }
 }
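Every struct follows the same pattern: load_from_file now reduces to get_file_content plus a call to the new load_from_content, so each binary can accept either a path (bare metal, via the file loader) or the raw JSON text (Docker, where the config arrives as a build arg). After this commit the binaries expect content, per the updated usage strings:

```shell
./crawler "$(cat sites.txt)" 5 "$(cat config.json)"
./website "$(cat config.json)"
./cli gnu 1 "$(cat config.json)"
```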
sites.txt
@@ -367,7 +367,6 @@ leboncoin.fr
 sourceforge.net
 namasha.com
 grammarly.com
-.com.tw
 friv.com
 livedoor.com
 cambridge.org
@@ -420,7 +419,6 @@ weblio.jp
 lenta.ru
 ptt.cc
 google.gr
-.com
 albawabhnews.com
 verystream.com
 repubblica.it
@@ -593,7 +591,6 @@ cloudflare.com
 indiamart.com
 beeg.com
 drom.ru
-.com
 mileroticos.com
 costco.com
 archiveofourown.org
@@ -725,7 +722,6 @@ kizlarsoruyor.com
 concursolutions.com
 fast.com
 nature.com
-.com
 binance.com
 rutor.info
 itmedia.co.jp
@@ -759,7 +755,6 @@ southwest.com
 teamviewer.com
 chron.com
 caixa.gov.br
-.com
 amazon.com.mx
 creditonebank.com
 sci-hub.tw
website/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.10)
 project(website LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
22 website/Dockerfile Normal file
@@ -0,0 +1,22 @@
+FROM gcc:12.2
+FROM ubuntu:20.04
+
+ARG CONFIG
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV envCONFIG=$CONFIG
+
+RUN apt-get update && apt-get -y --no-install-recommends install \
+    build-essential gcc cmake libcurl4-openssl-dev libssl-dev git ca-certificates
+
+RUN git clone https://github.com/lexbor/lexbor && \
+    cd lexbor && cmake . -DCMAKE_INSTALL_PREFIX=/usr && make && make install
+
+COPY lib /usr/src/librengine/lib
+COPY website /usr/src/librengine/website
+WORKDIR /usr/src/librengine/website
+
+RUN rm -rf build && mkdir -p build && cd build && cmake .. && make
+
+WORKDIR /usr/src/librengine/website/build
+CMD ["sh", "-c", "/usr/src/librengine/website/build/website \"${envCONFIG}\""]
website/main.cpp
@@ -5,12 +5,12 @@ int main(int argc, char **argv) {
     using namespace httplib;
 
     if (argc <= 1) {
-        std::cout << "Usage: bin [config_path]\nExample: ./backend config.json" << std::endl;
+        std::cout << "Usage: bin [config]\nExample: ./website \"$(cat config.json)\"" << std::endl;
         return 1;
     }
 
     config::all config;
-    config.load_from_file(argv[1]);
+    config.load_from_content(argv[1]);
 
     auto server = std::make_shared<Server>();
     auto pages = std::make_shared<website::pages>(config);