Update May (opensearch->typesense)

liameno 2022-05-02 03:32:11 -04:00
parent e0866f6e41
commit 61bf47e493
32 changed files with 475 additions and 572 deletions

CMakeLists.txt Normal file
View File

@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.2)
project(librengine)
add_subdirectory("${PROJECT_SOURCE_DIR}/cli" "${PROJECT_SOURCE_DIR}/cli/build")
add_subdirectory("${PROJECT_SOURCE_DIR}/lib" "${PROJECT_SOURCE_DIR}/lib/build")
add_subdirectory("${PROJECT_SOURCE_DIR}/crawler" "${PROJECT_SOURCE_DIR}/crawler/build")
add_subdirectory("${PROJECT_SOURCE_DIR}/website" "${PROJECT_SOURCE_DIR}/website/build")

View File

@ -1,18 +1,17 @@
# Librengine
![GitHub top language](https://img.shields.io/github/languages/top/liameno/librengine) ![GitHub](https://img.shields.io/github/license/liameno/librengine)
Privacy Opensource Web Search Engine
Privacy Web Search Engine
## Website
[![https://raw.githubusercontent.com/liameno/librengine/master/preview.gif](https://raw.githubusercontent.com/liameno/librengine/master/demo.png)]()
## Donate for web hosting
| Cryptocurrency | Address |
| Currency | Address |
| --- | --- |
| Bitcoin (BTC) | bc1qxpu9vfzah3vw5pzanny0zmfsgd64klcj24pa8x |
| Dogecoin (DOGE) | DM8cqzbrW2rrmGk4K6UCD7rfeoqnKjJTum |
| Ethereum (ETH)| 0x1857A1A7a543ED123151ACCAbBF4EB058741e614 |
| Litecoin (LTC) | LLQMiWpF1cxET7p7UMYoWjJ26JuTp14u8K |
| Monero (XMR) | 4AkPUBr4uoFV1K4fSitpGJjRHo4dfSzZ257YR9HxiQi3DvmgLW1rteRQfRRCFYytKugcygfHAvvJu3Tt96mSoVUE6JKJDZL |
## Features
- Crawler
- Proxy
@ -31,19 +30,20 @@ Privacy Opensource Web Search Engine
## Dependencies
- libcurl (https://github.com/curl/curl)
- lexbor (https://github.com/lexbor/lexbor)
- opensearch (https://www.opensearch.org/)
- openssl (https://www.openssl.org/)
- typesense (https://typesense.org)
- openssl (https://www.openssl.org)
Arch:
```shell
yay -S curl lexbor opensearch openssl
yay -S curl lexbor openssl &&
wget https://dl.typesense.org/releases/0.22.2/typesense-server-0.22.2-linux-amd64.tar.gz &&
tar -zxf typesense-server-0.22.2-linux-amd64.tar.gz
```
Debian:
```shell
sudo apt install libcurl4-openssl-dev &&
wget https://artifacts.opensearch.org/releases/bundle/opensearch/1.2.4/opensearch-1.2.4-linux-x64.tar.gz &&
tar -zxf opensearch-1.2.4-linux-x64.tar.gz && cd opensearch-1.2.4 &&
./opensearch-tar-install.sh &&
wget https://dl.typesense.org/releases/0.22.2/typesense-server-0.22.2-linux-amd64.tar.gz &&
tar -zxf typesense-server-0.22.2-linux-amd64.tar.gz &&
git clone https://github.com/lexbor/lexbor &&
cd lexbor &&
cmake . && make && sudo make install &&
@ -51,23 +51,23 @@ sudo apt install libssl-dev
```
## Build
```shell
git clone https://github.com/liameno/librengine
cd librengine
git clone https://github.com/liameno/librengine &&
cd librengine &&
sh scripts/build_all.sh
```
## Run
```shell
opensearch
sh scripts/set_opensearch.sh
./typesense-server --data-dir=/tmp/typesense-data --api-key=xyz --enable-cors &
sh scripts/init_db.sh
```
#### Crawler
```shell
./crawler https://www.gnu.org ../../config.json
#[start_site] [config path]
```
#### Backend
#### Website
```shell
./backend ../../config.json
./website ../../config.json
#[config path]
```
## Config
@ -75,10 +75,11 @@ sh scripts/set_opensearch.sh
//proxy: type://ip:port
//socks5://127.0.0.1:9050
//_s - seconds
{
"crawler": {
"user_agent": "librengine",
"opensearch_url": "http://localhost:9200",
"proxy": "socks5://127.0.0.1:9050",
"load_page_timeout_s": 20,
"update_time_site_info_s_after": 86400, //10 days
@ -103,16 +104,14 @@ sh scripts/set_opensearch.sh
"url": "http://127.0.0.1:8080"
}
]
},
//also edit init_db.sh
"db": {
"url": "http://localhost:8108",
"api_key": "xyz"
}
}
```
#### OpenSearch: Permissions Denied
```shell
sudo chmod -R 777 /usr/share/opensearch/config
sudo chmod -R 777 /usr/share/opensearch/logs
```
## License
GNU General Public License v3.0

View File

@ -1,10 +1,11 @@
//proxy: type://ip:port
//socks5://127.0.0.1:9050
//_s - seconds
{
"crawler": {
"user_agent": "librengine",
"opensearch_url": "http://localhost:9200",
"proxy": "socks5://127.0.0.1:9050",
"load_page_timeout_s": 20,
"update_time_site_info_s_after": 86400, //10 days
@ -29,5 +30,10 @@
"url": "http://127.0.0.1:8080"
}
]
},
//also edit init_db.sh
"db": {
"url": "http://localhost:8108",
"api_key": "xyz"
}
}
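For reference, a minimal sketch of how the two sections of this file are read, following crawler/main.cpp further down (the path is illustrative):
```cpp
#include <librengine/config.h>

int main() {
    // both structs parse the same config.json; db reads the new "db" object
    librengine::config::crawler crawler_config;
    librengine::config::db db;
    crawler_config.load_from_file("config.json"); // illustrative path
    db.load_from_file("config.json");
    return 0;
}
```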

View File

@ -3,35 +3,19 @@
#include <optional>
#include <librengine/config.h>
#include <librengine/opensearch.h>
#include "librengine/http.h"
#include <librengine/typesense.h>
#include <librengine/http.h>
#ifndef HELPER_H
#define HELPER_H
namespace helper {
inline size_t compute_time();
size_t compute_time();
std::optional<std::string> lxb_string_to_std(const lxb_char_t *s);
lxb_char_t *std_string_to_lxb(const std::string &s);
std::optional<lxb_html_document*> parse_html(const std::string &response);
std::string compute_search_website_json(const std::string &field, const std::string &phrase, const librengine::config::crawler &current_config);
std::string compute_search_robots_txt_json(const std::string &field, const std::string &phrase, const librengine::config::crawler &current_config);
std::optional<std::string> compute_website_json(const std::string &title, const std::string &url, const std::string &host, const std::string &desc, const bool &has_ads, const bool &has_analytics);
std::optional<std::string> compute_robots_txt_json(const std::string &body, const std::string &host);
std::string get_desc(const std::string &attribute_name, const std::string &attribute_value, lxb_html_document *document);
std::string compute_desc(const std::string &tag_name, lxb_html_document *document);
std::optional<std::string> get_added_robots_txt(const std::string &host, const librengine::config::crawler &current_config, librengine::opensearch::client &opensearch_client);
size_t hints_count_added(const std::string &field, const std::string &url, const librengine::config::crawler &current_config, librengine::opensearch::client &opensearch_client);
librengine::http::request::result_s site(const librengine::http::url &url, const librengine::config::crawler &current_config);
bool is_allowed_in_robots(const std::string &body, const std::string &url, const librengine::config::crawler &current_config);
std::optional<std::string> get_robots_txt(const librengine::http::url &url, const librengine::config::crawler &current_config);
}
#endif

View File

@ -3,7 +3,7 @@
#include <optional>
#include <librengine/config.h>
#include <librengine/opensearch.h>
#include <librengine/typesense.h>
#ifndef WORKER_H
#define WORKER_H
@ -21,13 +21,27 @@ public:
error,
};
private:
librengine::config::crawler current_config;
librengine::opensearch::client opensearch_client;
librengine::config::crawler config;
librengine::typesense db_website;
librengine::typesense db_robots;
bool is_work = false;
public:
std::optional<std::string> compute_website_json(const std::string &title, const std::string &url, const std::string &host, const std::string &desc, const bool &has_ads, const bool &has_analytics);
std::optional<std::string> compute_robots_txt_json(const std::string &body, const std::string &host);
std::string get_desc(const std::string &attribute_name, const std::string &attribute_value, lxb_html_document *document);
std::string compute_desc(const std::string &tag_name, lxb_html_document *document);
std::optional<std::string> get_added_robots_txt(const std::string &host);
size_t hints_count_added(const std::string &field, const std::string &url);
librengine::http::request::result_s site(const librengine::http::url &url);
bool is_allowed_in_robots(const std::string &body, const std::string &url);
std::optional<std::string> get_robots_txt(const librengine::http::url &url);
bool normalize_url(librengine::http::url &url, const std::optional<std::string> &owner_host = std::nullopt) const;
public:
worker(librengine::config::crawler config, librengine::opensearch::client opensearch_client);
worker(librengine::config::crawler config, const librengine::config::db &db);
result main_thread(const std::string &site_url, int &deep, const std::optional<librengine::http::url> &owner_url = std::nullopt);
};

View File

@ -6,11 +6,11 @@
using namespace librengine;
void easy_start(const config::crawler &config) {
void easy_start(const config::crawler &config, const config::db &db) {
curl_global_init(CURL_GLOBAL_ALL); //https://stackoverflow.com/questions/6087886
int deep = 0;
auto w = std::make_shared<worker>(config, opensearch::client(config.opensearch_url));
auto w = std::make_shared<worker>(config, db);
w->main_thread(config.start_site_url, deep);
curl_global_cleanup(); //https://curl.se/libcurl/c/curl_global_cleanup.html
@ -23,15 +23,20 @@ int main(int argc, char **argv) {
}
config::crawler config;
config.start_site_url = argv[1];
config::db db;
config.load_from_file(argv[2]);
db.load_from_file(argv[2]);
config.start_site_url = argv[1];
std::string line = std::string(25, '=');
std::cout << logger::white << line << logger::green << "CFG" << logger::white << line << std::endl
<< logger::reset << config.to_str() << std::endl
<< logger::white << line << "===" << logger::white << line << std::endl;
easy_start(config);
easy_start(config, db);
return 0;
}

View File

@ -4,12 +4,6 @@
#include <optional>
#include <thread>
#include <librengine/config.h>
#include <librengine/http.h>
#include <librengine/opensearch.h>
#include <librengine/json.hpp>
#include "../third_party/rep-cpp/robots.h"
namespace helper {
using namespace librengine;
@ -36,168 +30,4 @@ namespace helper {
if (document == nullptr) return std::nullopt;
return document;
}
std::string compute_search_website_json(const std::string &field, const std::string &phrase, const config::crawler &current_config) {
nlohmann::json json;
const auto now = compute_time();
json["query"]["bool"]["must"][0]["match"][field] = phrase;
json["query"]["bool"]["must"][1]["range"]["date"]["gte"] = now - current_config.update_time_site_info_s_after;
json["query"]["bool"]["must"][1]["range"]["date"]["lte"] = now;
json["_source"] = false;
return json.dump();
}
std::string compute_search_robots_txt_json(const std::string &field, const std::string &phrase, const config::crawler &current_config) {
nlohmann::json json;
const auto now = compute_time();
json["query"]["bool"]["must"][0]["match"][field] = phrase;
json["query"]["bool"]["must"][1]["range"]["date"]["gte"] = now - current_config.update_time_site_info_s_after;
json["query"]["bool"]["must"][1]["range"]["date"]["lte"] = now;
return json.dump();
}
std::optional<std::string> compute_website_json(const std::string &title, const std::string &url, const std::string &host, const std::string &desc, const bool &has_ads, const bool &has_analytics) {
nlohmann::json json;
json["title"] = title;
json["url"] = url;
json["host"] = host;
json["desc"] = desc;
json["has_ads"] = has_ads;
json["has_analytics"] = has_analytics;
json["rating"] = 100; //def = 100
json["date"] = compute_time();
try {
return json.dump();
} catch (const nlohmann::detail::type_error &e) { //crawler trap
return std::nullopt;
}
}
std::optional<std::string> compute_robots_txt_json(const std::string &body, const std::string &host) {
nlohmann::json json;
json["body"] = body;
json["host"] = host;
json["date"] = compute_time();
try {
return json.dump();
} catch (const nlohmann::detail::type_error &e) { //crawler trap
return std::nullopt;
}
}
std::string get_desc(const std::string &attribute_name, const std::string &attribute_value, lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
lxb_dom_elements_by_attr(lxb_dom_interface_element(document->head), collection, std_string_to_lxb(attribute_name),
attribute_name.length(), std_string_to_lxb(attribute_value), attribute_value.length(), true);
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < c_length; i++) {
auto element = lxb_dom_collection_element(collection, i);
const auto content = lxb_dom_element_get_attribute(element, std_string_to_lxb("content"), 7, nullptr);
if (content != nullptr) {
if (desc.length() > 500) break;
desc.append(lxb_string_to_std(content).value_or(""));
desc.append("\n");
}
}
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
std::string compute_desc(const std::string &tag_name, lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), collection, std_string_to_lxb(tag_name), tag_name.length());
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < c_length; i++) {
if (desc.length() > 500) break;
auto element = lxb_dom_collection_element(collection, i);
const auto text = lxb_string_to_std(lxb_dom_node_text_content(lxb_dom_interface_node(element), nullptr)).value_or("");
desc.append(text);
desc.append("\n");
}
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
std::optional<std::string> get_added_robots_txt(const std::string &host, const config::crawler &current_config, opensearch::client &opensearch_client) {
const auto path = opensearch::client::path_options("robots_txt/_search");
const auto type = opensearch::client::request_type::POST;
const auto json = compute_search_robots_txt_json("host", host, current_config);
const auto search_response = opensearch_client.custom_request(path, type, json);
if (!search_response) return std::nullopt;
nlohmann::json result_json = nlohmann::json::parse(*search_response);
const auto value = result_json["hits"]["total"]["value"];
if (value.is_null()) return std::nullopt;
if (value > 0) {
const auto body = result_json["hits"]["hits"][0]["_source"]["body"];
if (body.is_null()) return std::nullopt;
return body;
}
return std::nullopt;
}
size_t hints_count_added(const std::string &field, const std::string &url, const config::crawler &current_config, opensearch::client &opensearch_client) {
const auto path = opensearch::client::path_options("website/_search");
const auto type = opensearch::client::request_type::POST;
const auto json = compute_search_website_json(field, url, current_config);
const auto search_response = opensearch_client.custom_request(path, type, json);
if (!search_response) return false;
nlohmann::json result_json = nlohmann::json::parse(*search_response);
const auto value = result_json["hits"]["total"]["value"];
if (value.is_null()) return 0;
if (value > 0) {
return value;
}
return 0;
}
http::request::result_s site(const http::url &url, const config::crawler &current_config) {
http::request request(url.text);
request.options.timeout_s = current_config.load_page_timeout_s;
request.options.user_agent = current_config.user_agent;
request.options.proxy = current_config.proxy;
request.perform();
return request.result;
}
bool is_allowed_in_robots(const std::string &body, const std::string &url, const config::crawler &current_config) {
Rep::Robots robots = Rep::Robots(body);
return robots.allowed(url, current_config.user_agent);
}
std::optional<std::string> get_robots_txt(const http::url &url, const config::crawler &current_config) {
http::url url_cp(url.text);
url_cp.set(CURLUPART_PATH, "/robots.txt");
url_cp.parse();
http::request request(url_cp.text);
request.options.timeout_s = current_config.load_page_timeout_s;
request.options.user_agent = current_config.user_agent;
request.options.proxy = current_config.proxy;
request.perform();
if (request.result.code != 200) return std::nullopt;
return request.result.response;
}
}

View File

@ -20,8 +20,145 @@ void if_debug_print(const logger::type &type, const std::string &text, const std
#endif
}
using namespace helper;
using namespace librengine;
std::optional<std::string> worker::compute_website_json(const std::string &title, const std::string &url, const std::string &host, const std::string &desc, const bool &has_ads, const bool &has_analytics) {
nlohmann::json json;
json["title"] = title;
json["url"] = url;
json["host"] = host;
json["desc"] = desc;
json["has_ads"] = has_ads;
json["has_analytics"] = has_analytics;
json["rating"] = 100; //def = 100
json["date"] = compute_time();
try {
return json.dump();
} catch (const nlohmann::detail::type_error &e) { //crawler trap
return std::nullopt;
}
}
std::optional<std::string> worker::compute_robots_txt_json(const std::string &body, const std::string &host) {
nlohmann::json json;
json["body"] = body;
json["host"] = host;
json["date"] = compute_time();
try {
return json.dump();
} catch (const nlohmann::detail::type_error &e) { //crawler trap
return std::nullopt;
}
}
std::string worker::get_desc(const std::string &attribute_name, const std::string &attribute_value, lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
lxb_dom_elements_by_attr(lxb_dom_interface_element(document->head), collection, std_string_to_lxb(attribute_name),
attribute_name.length(), std_string_to_lxb(attribute_value), attribute_value.length(), true);
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < c_length; i++) {
auto element = lxb_dom_collection_element(collection, i);
const auto content = lxb_dom_element_get_attribute(element, std_string_to_lxb("content"), 7, nullptr);
if (content != nullptr) {
if (desc.length() > 500) break;
desc.append(lxb_string_to_std(content).value_or(""));
desc.append("\n");
}
}
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
std::string worker::compute_desc(const std::string &tag_name, lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), collection, std_string_to_lxb(tag_name), tag_name.length());
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < c_length; i++) {
if (desc.length() > 500) break;
auto element = lxb_dom_collection_element(collection, i);
const auto text = lxb_string_to_std(lxb_dom_node_text_content(lxb_dom_interface_node(element), nullptr)).value_or("");
desc.append(text);
desc.append("\n");
}
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
std::optional<std::string> worker::get_added_robots_txt(const std::string &host) {
const auto now = compute_time();
auto filter_by = "date:>" + std::to_string(now - config.update_time_site_info_s_after) + " && date:<" + std::to_string(now);
const auto search_response = db_robots.search(host, "host", {{"filter_by", filter_by}});
nlohmann::json result_json = nlohmann::json::parse(search_response);
const auto value = result_json["found"];
if (value.is_null()) return std::nullopt;
if (value > 0) {
const auto body = result_json["hits"][0]["document"]["body"];
if (body.is_null()) return std::nullopt;
return body;
}
return std::nullopt;
}
size_t worker::hints_count_added(const std::string &field, const std::string &url) {
const auto now = compute_time();
auto filter_by = "date:>" + std::to_string(now - config.update_time_site_info_s_after) + " && date:<" + std::to_string(now);
const auto search_response = db_website.search(url, "url", {{"filter_by", filter_by}});
nlohmann::json result_json = nlohmann::json::parse(search_response);
const auto value = result_json["found"];
if (value.is_null()) return 0;
if (value > 0) return value;
return 0;
}
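The date window that was previously an OpenSearch range query is now passed as Typesense's `filter_by` search parameter. A small sketch of the string the two functions above build, with made-up timestamps (the real code takes `now` from compute_time() and the window from the config):
```cpp
#include <iostream>
#include <string>

int main() {
    // illustrative values; worker.cpp uses compute_time() for `now`
    // and config.update_time_site_info_s_after for the window
    const size_t now = 1651500000;
    const size_t window = 86400;
    const std::string filter_by =
        "date:>" + std::to_string(now - window) + " && date:<" + std::to_string(now);

    // db_robots.search(host, "host", {{"filter_by", filter_by}}) then requests roughly:
    // GET /collections/robots/documents/search?q=<host>&query_by=host&filter_by=<filter_by>
    std::cout << filter_by << std::endl; // date:>1651413600 && date:<1651500000
    return 0;
}
```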
http::request::result_s worker::site(const http::url &url) {
http::request request(url.text);
request.options.timeout_s = config.load_page_timeout_s;
request.options.user_agent = config.user_agent;
request.options.proxy = config.proxy;
request.perform();
return request.result;
}
bool worker::is_allowed_in_robots(const std::string &body, const std::string &url) {
Rep::Robots robots = Rep::Robots(body);
return robots.allowed(url, config.user_agent);
}
std::optional<std::string> worker::get_robots_txt(const http::url &url) {
http::url url_cp(url.text);
url_cp.set(CURLUPART_PATH, "/robots.txt");
url_cp.parse();
http::request request(url_cp.text);
request.options.timeout_s = config.load_page_timeout_s;
request.options.user_agent = config.user_agent;
request.options.proxy = config.proxy;
request.perform();
if (request.result.code != 200) return std::nullopt;
return request.result.response;
}
bool worker::normalize_url(http::url &url, const std::optional<std::string> &owner_host) const {
if (url.text.size() < 3 && !owner_host) {
return false;
@ -68,7 +205,7 @@ bool worker::normalize_url(http::url &url, const std::optional<std::string> &own
url.parse();
}
if (this->current_config.is_http_to_https) {
if (this->config.is_http_to_https) {
if (url.scheme && url.scheme == "http") {
url.set(CURLUPART_SCHEME, "https"); //protocol
}
@ -92,8 +229,10 @@ bool worker::normalize_url(http::url &url, const std::optional<std::string> &own
return true;
}
worker::worker(config::crawler config, opensearch::client opensearch_client) : current_config(std::move(config)), opensearch_client(std::move(opensearch_client)) {
worker::worker(config::crawler config, const config::db &db) : config(std::move(config)) {
this->is_work = true;
this->db_website = typesense(db.url, "websites", db.api_key);
this->db_robots = typesense(db.url, "robots", db.api_key);
}
worker::result worker::main_thread(const std::string &site_url, int &deep, const std::optional<http::url> &owner_url) {
@ -116,49 +255,44 @@ worker::result worker::main_thread(const std::string &site_url, int &deep, const
if_debug_print(logger::type::error, "url == owner", url.text);
return result::already_added;
}
if (current_config.is_one_site && owner_url && url.host != owner_url->host) {
if (config.is_one_site && owner_url && url.host != owner_url->host) {
return result::already_added;
}
if (helper::hints_count_added("url", url.text, current_config, opensearch_client) > 0) {
if (hints_count_added("url", url.text) > 0) {
if_debug_print(logger::type::error, "already added", url.text);
return result::already_added;
}
size_t pages_count = helper::hints_count_added("host", *url.host, current_config, opensearch_client);
size_t pages_count = hints_count_added("host", *url.host);
if (pages_count >= this->current_config.max_pages_site) {
if (pages_count >= this->config.max_pages_site) {
if_debug_print(logger::type::error, "pages count >= limit", url.text);
return result::pages_limit;
}
if (this->current_config.is_check_robots_txt) {
auto robots_txt_body = helper::get_added_robots_txt(*url.host, current_config, opensearch_client).value_or("");
if (this->config.is_check_robots_txt) {
auto robots_txt_body = get_added_robots_txt(*url.host).value_or("");
bool is_checked = true;
if (robots_txt_body.empty()) {
robots_txt_body = helper::get_robots_txt(url, current_config).value_or("");
robots_txt_body = get_robots_txt(url).value_or("");
auto robots_txt_body_length = robots_txt_body.length();
if (robots_txt_body_length > 1 && robots_txt_body_length < this->current_config.max_robots_txt_symbols) {
const auto json = helper::compute_robots_txt_json(robots_txt_body, *url.host);
if (robots_txt_body_length > 1 && robots_txt_body_length < this->config.max_robots_txt_symbols) {
const auto json = compute_robots_txt_json(robots_txt_body, *url.host);
if (!json) return result::null_or_limit;
const auto path = opensearch::client::path_options("robots_txt/_doc");
const auto type = opensearch::client::request_type::POST;
//add a robots_txt to the opensearch
opensearch_client.custom_request(path, type, json);
db_robots.add(*json);
} else {
is_checked = false;
}
}
if (is_checked && !helper::is_allowed_in_robots(robots_txt_body, url.text, current_config)) {
if (is_checked && !is_allowed_in_robots(robots_txt_body, url.text)) {
return result::disallowed_robots;
}
}
auto request_result = helper::site(url, current_config);
auto request_result = site(url);
auto response = request_result.response;
auto response_length = response->length();
@ -170,25 +304,25 @@ worker::result worker::main_thread(const std::string &site_url, int &deep, const
if_debug_print(logger::type::error, "code != 200", url.text);
return result::null_or_limit;
}
if (!response || response_length < 1 || response_length >= this->current_config.max_page_symbols) {
if (!response || response_length < 1 || response_length >= this->config.max_page_symbols) {
if_debug_print(logger::type::error, "response = null || length < 1 || >= limit", url.text);
return result::null_or_limit;
}
auto document = helper::parse_html(*response);
auto document = parse_html(*response);
if (!document) return result::null_or_limit;
auto body = lxb_dom_interface_node((*document)->body);
if (body == nullptr) return result::null_or_limit;
const std::string title = helper::lxb_string_to_std(lxb_html_document_title((*document), nullptr)).value_or("");
const std::string title = lxb_string_to_std(lxb_html_document_title((*document), nullptr)).value_or("");
//const std::string content = lxb_string_to_std(lxb_dom_node_text_content(body, nullptr)).value_or("");
std::string desc = helper::get_desc("name", "description", *document); //by meta tag
std::string desc = get_desc("name", "description", *document); //by meta tag
if (desc.empty()) {
desc = helper::get_desc("http-equiv", "description", *document); //by meta tag
desc = get_desc("http-equiv", "description", *document); //by meta tag
}
if (desc.empty()) {
desc.append(helper::compute_desc("h1", *document)); //from h1 tags
desc.append(compute_desc("h1", *document)); //from h1 tags
}
if (title.empty() && desc.empty()) {
if_debug_print(logger::type::error, "title & desc are empty", url.text);
@ -224,28 +358,24 @@ worker::result worker::main_thread(const std::string &site_url, int &deep, const
}
}
const auto json = helper::compute_website_json(title, url.text, *url.host, desc, has_ads, has_analytics);
const auto json = compute_website_json(title, url.text, *url.host, desc, has_ads, has_analytics);
if (!json) return result::null_or_limit;
const auto path = opensearch::client::path_options("website/_doc");
const auto type = opensearch::client::request_type::POST;
//add a website to the opensearch
opensearch_client.custom_request(path, type, json);
db_website.add(*json);
//print added url
std::cout << logger::yellow << "[" << url.text << "]" << std::endl;
if (deep < this->current_config.max_recursive_deep) {
if (deep < this->config.max_recursive_deep) {
auto collection = lxb_dom_collection_make(&(*document)->dom_document, 16);
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(body), collection, helper::std_string_to_lxb("a"), 1);
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(body), collection, std_string_to_lxb("a"), 1);
const auto a_length = collection->array.length;
std::vector<std::string> pages_limit_hosts;
++deep;
for (size_t i = 0; i < a_length; i++) {
auto element = lxb_dom_collection_element(collection, i);
const auto href_value = helper::lxb_string_to_std(lxb_dom_element_get_attribute(element, helper::std_string_to_lxb("href"), 4, nullptr));
const auto href_value = lxb_string_to_std(lxb_dom_element_get_attribute(element, std_string_to_lxb("href"), 4, nullptr));
if (!href_value || *href_value == url.text || str::starts_with(*href_value, "#")) {
//skip fragment links
@ -265,7 +395,7 @@ worker::result worker::main_thread(const std::string &site_url, int &deep, const
if (!str::starts_with(*href_value, "http")) {
result = main_thread(href_url.text, deep, url);
} else {
if (current_config.is_one_site && href_url.host != url.host) {
if (config.is_one_site && href_url.host != url.host) {
//skip other sites
continue;
}
@ -280,7 +410,7 @@ worker::result worker::main_thread(const std::string &site_url, int &deep, const
pages_limit_hosts.push_back(*href_url.host);
} else if (result == result::added || result == result::disallowed_robots) {
//delay
std::this_thread::sleep_for(std::chrono::seconds(this->current_config.delay_time_s));
std::this_thread::sleep_for(std::chrono::seconds(this->config.delay_time_s));
}
}

View File

@ -3,8 +3,8 @@ project(librengine LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(include include/opensearch.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp)
set(src src/opensearch.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp)
set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp)
set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp)
set(include_all ${include} ${tp_rep_cpp} ${tp_url_cpp})
set(src_all ${src})

View File

@ -17,7 +17,6 @@ namespace librengine::config {
struct crawler {
std::string user_agent;
std::string start_site_url;
std::string opensearch_url;
std::optional<http::proxy> proxy;
@ -51,9 +50,16 @@ namespace librengine::config {
std::string url;
};
size_t port = 8080;
std::optional<http::proxy> proxy = std::nullopt; //socks5://127.0.0.1:9050
std::vector<node_s> nodes = {};
size_t port;
std::optional<http::proxy> proxy;
std::vector<node_s> nodes;
void load_from_file(const std::string &path);
std::string to_str() const;
};
struct db {
std::string url;
std::string api_key;
void load_from_file(const std::string &path);
std::string to_str() const;

View File

@ -1,38 +0,0 @@
#ifndef OPENSEARCH_H
#define OPENSEARCH_H
#include <string>
#include <memory>
#include "http.h"
namespace librengine::opensearch {
class client {
public:
enum class request_type {
GET,
POST,
PUT,
DELETE,
};
struct path_options {
std::string full;
std::string index;
std::string type;
std::string document;
void compute_full();
explicit path_options(const std::string &full);
};
private:
std::string url;
private:
std::string compute_url(const std::string &path);
public:
explicit client(std::string url = "http://localhost:9200");
std::optional<std::string> custom_request(const path_options &path_options, const request_type &request_type,const std::optional<std::string> &data = std::nullopt);
};
}
#endif

lib/include/typesense.h Normal file
View File

@ -0,0 +1,28 @@
#ifndef TYPESENSE_H
#define TYPESENSE_H
#include <string>
#include <memory>
#include <map>
#include <librengine/http.h>
namespace librengine {
class typesense {
private:
std::string url;
std::string collection_name;
std::string api_key;
public:
typesense();
typesense(const std::string &url, const std::string &collection_name, const std::string &api_key);
std::string add(const std::string &json);
std::string update(const std::string &json);
std::string get(const int &id);
std::string search(const std::string &q, const std::string &query_by, const std::map<std::string, std::string> &options = {});
};
}
#endif
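For orientation, a minimal usage sketch of this wrapper, assuming the library is installed, a Typesense server is running at http://localhost:8108 with the api key `xyz`, and the `websites` collection from scripts/init_db.sh exists; the document values and the timestamp are purely illustrative:
```cpp
#include <iostream>
#include <librengine/typesense.h>

int main() {
    // same defaults as config.json and scripts/init_db.sh
    librengine::typesense db("http://localhost:8108", "websites", "xyz");

    // add() POSTs the JSON document to /collections/websites/documents/
    db.add(R"({"title":"GNU","desc":"The GNU Operating System",
               "url":"https://www.gnu.org","host":"www.gnu.org",
               "rating":100,"has_ads":false,"has_analytics":false,
               "date":1651000000})");

    // search() issues GET /collections/websites/documents/search with q,
    // query_by and any extra options appended as query parameters
    std::cout << db.search("gnu", "title,desc", {{"per_page", "5"}}) << std::endl;
    return 0;
}
```
The crawler builds two such clients (`websites` and `robots`) in the worker constructor shown earlier.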

View File

@ -19,7 +19,6 @@ namespace librengine::config {
auto json_crawler = json["crawler"];
this->user_agent = json_crawler["user_agent"].get<std::string>();
this->opensearch_url = json_crawler["opensearch_url"].get<std::string>();
std::string proxy_string = json_crawler["proxy"].get<std::string>();
@ -40,10 +39,10 @@ namespace librengine::config {
}
std::string crawler::to_str() const {
const std::string format = "UA={0}\nStartSiteUrl={1}\nOpenSearchUrl={2}\nProxy={3}\nMaxRecDeep={4}"
"\nLPageTimeoutS={5}\nUpdateTimeSISAfter={6}\nDelayTimeS={7}\nMaxPagesS={8}\nMaxPageSym={9}"
"\nMaxRobotsTSym={10}\nIsOneSite={11}\nIsHttpToHttps={12}\nIsCheckRobots={13}";
return str::format(format, user_agent, start_site_url, opensearch_url,
const std::string format = "UA={0}\nStartSiteUrl={1}\nProxy={2}\nMaxRecDeep={3}"
"\nLPageTimeoutS={4}\nUpdateTimeSISAfter={5}\nDelayTimeS={6}\nMaxPagesS={7}\nMaxPageSym={8}"
"\nMaxRobotsTSym={9}\nIsOneSite={10}\nIsHttpToHttps={11}\nIsCheckRobots={12}";
return str::format(format, user_agent, start_site_url,
(proxy) ? proxy->compute_curl_format() : "null", max_recursive_deep,
load_page_timeout_s, update_time_site_info_s_after, delay_time_s, max_pages_site,
max_page_symbols, max_robots_txt_symbols,
@ -87,4 +86,18 @@ namespace librengine::config {
const std::string format = "Port={0}\nProxy={1}\nNodes={2}";
return str::format(format, port, (proxy) ? proxy->compute_curl_format() : "null", nodes.size());
}
void db::load_from_file(const std::string &path) {
const std::string content = helper::get_file_content(path);
nlohmann::json json = nlohmann::json::parse(content, nullptr, true, true);
auto json_db = json["db"];
this->url = json_db["url"].get<std::string>();
this->api_key = json_db["api_key"].get<std::string>();
}
std::string db::to_str() const {
const std::string format = "Url={0}\nApiKey={1}";
return str::format(format, url, api_key);
}
}

View File

@ -164,6 +164,8 @@ namespace librengine::http {
this->curl = curl_easy_init();
this->options.headers = std::make_shared<std::vector<header>>();
this->url = str::replace(this->url, " ", "%20");
if (is_set_secure_headers) {
this->options.headers->emplace_back("DNT", "1"); //don't track
this->options.headers->emplace_back("Sec-GPC", "1"); //don't sell or share

View File

@ -1,45 +0,0 @@
#include "opensearch.h"
#include <utility>
namespace librengine::opensearch {
void client::path_options::compute_full() {
this->full.append(this->index);
this->full.push_back('/');
this->full.append(this->type);
this->full.push_back('/');
this->full.append(this->document);
}
client::path_options::path_options(const std::string &full) {
this->full = full;
}
client::client(std::string url) : url(std::move(url)) {
}
std::string client::compute_url(const std::string &path) {
std::string result = this->url;
result.push_back('/');
result.append(path);
return result;
}
std::optional<std::string> client::custom_request(const client::path_options &path_options, const request_type &request_type, const std::optional<std::string> &data) {
std::string type;
if (request_type == request_type::GET) type = "GET";
else if (request_type == request_type::POST) type = "POST";
else if (request_type == request_type::PUT) type = "PUT";
else if (request_type == request_type::DELETE) type = "DELETE";
http::request request(compute_url(path_options.full), data.value_or(""), type, false);
request.options.headers->emplace_back("Content-Type: application/json");
request.perform();
return request.result.response;
}
}

lib/src/typesense.cpp Normal file
View File

@ -0,0 +1,55 @@
#include "../include/typesense.h"
namespace librengine {
typesense::typesense() = default;
typesense::typesense(const std::string &url, const std::string &collection_name, const std::string &api_key) {
this->url = url;
this->collection_name = collection_name;
this->api_key = api_key;
}
std::string typesense::add(const std::string &json) {
std::string request_url = this->url + "/collections/" + this->collection_name + "/documents/";
http::request request(request_url, json, "POST", false);
request.options.headers->emplace_back("Content-Type: application/json");
request.options.headers->emplace_back("X-TYPESENSE-API-KEY", api_key);
request.perform();
return request.result.response.value_or("");
}
std::string typesense::update(const std::string &json) {
std::string request_url = this->url + "/collections/" + this->collection_name + "/documents/?action=upsert";
http::request request(request_url, json, "POST", false);
request.options.headers->emplace_back("Content-Type: application/json");
request.options.headers->emplace_back("X-TYPESENSE-API-KEY", api_key);
request.perform();
return request.result.response.value_or("");
}
std::string typesense::get(const int &id) {
std::string request_url = this->url + "/collections/" + this->collection_name + "/documents/" + std::to_string(id);
http::request request(request_url, "", "GET", false);
request.options.headers->emplace_back("X-TYPESENSE-API-KEY", api_key);
request.perform();
return request.result.response.value_or("");
}
std::string typesense::search(const std::string &q, const std::string &query_by, const std::map<std::string, std::string> &options) {
std::string request_url = this->url + "/collections/" + this->collection_name + "/documents/search";
request_url.append("?q=" + q);
request_url.append("&query_by=" + query_by);
for (const auto &option : options) {
request_url.append("&" + option.first + "=" + option.second);
}
http::request request(request_url, "", "GET", false);
request.options.headers->emplace_back("X-TYPESENSE-API-KEY", api_key);
request.perform();
return request.result.response.value_or("");
}
}

View File

@ -4,5 +4,5 @@ mkdir build && cd build && cmake .. && sudo make install
cd ../../crawler
mkdir build && cd build && cmake .. && make
cd ../../website/backend
cd ../../website
mkdir build && cd build && cmake .. && make

scripts/init_db.sh Normal file
View File

@ -0,0 +1,33 @@
export URL=http://localhost:8108
export API_KEY=xyz
curl -XDELETE "$URL/collections/websites" -H "X-TYPESENSE-API-KEY: $API_KEY"
curl -XDELETE "$URL/collections/robots" -H "X-TYPESENSE-API-KEY: $API_KEY"
curl -XPOST "$URL/collections/" -d'
{
"name": "websites",
"fields": [
{"name": "title", "type": "string" },
{"name": "desc", "type": "string" },
{"name": "url", "type": "string" },
{"name": "host", "type": "string" },
{"name": "rating", "type": "int32" },
{"name": "has_ads", "type": "bool" },
{"name": "has_analytics", "type": "bool" },
{"name": "date", "type": "int64" }
],
"default_sorting_field": "date"
}
' -H "X-TYPESENSE-API-KEY: $API_KEY" -H 'Content-Type: application/json'
curl -XPOST "$URL/collections/" -d'
{
"name": "robots",
"fields": [
{"name": "body", "type": "string" },
{"name": "host", "type": "string" },
{"name": "date", "type": "int64" }
],
"default_sorting_field": "date"
}
' -H "X-TYPESENSE-API-KEY: $API_KEY" -H 'Content-Type: application/json'

View File

@ -1,84 +0,0 @@
export ES_URL=localhost:9200
curl -XDELETE "$ES_URL/website"
curl -XPUT "$ES_URL/website" -d'{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase",
"word_delimiter"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
},
"not_analyzed": {
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"token_chars": [
"letter"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete"
},
"content": {
"type": "text",
"analyzer": "autocomplete"
},
"desc": {
"type": "text",
"analyzer": "autocomplete"
},
"url": {
"type": "keyword"
},
"host": {
"type": "keyword"
},
"rating": {
"type": "byte"
},
"has_ads": {
"type": "boolean"
},
"has_analytics": {
"type": "boolean"
},
"date": {
"type": "date"
}
}
}
}
' -H 'Content-Type: application/json'
curl -XDELETE "$ES_URL/robots_txt"
curl -XPUT "$ES_URL/robots_txt" -d'{
"mappings": {
"properties": {
"body": {
"type": "keyword"
},
"host": {
"type": "keyword"
},
"date": {
"type": "date"
}
}
}
}
' -H 'Content-Type: application/json'

View File

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.20)
project(backend)
project(website)
set(CMAKE_CXX_STANDARD 17)
@ -11,5 +11,5 @@ find_package(CURL)
find_package(Threads)
find_package(OpenSSL)
add_executable(${PROJECT_NAME} main.cpp third_party/httplib.h src/pages.cpp include/pages.h src/encryption.cpp include/encryption.h)
add_executable(${PROJECT_NAME} main.cpp third_party/httplib.h src/pages.cpp include/pages.h)
target_link_libraries(${PROJECT_NAME} PRIVATE /usr/lib/liblexbor.so curl OpenSSL::Crypto Threads::Threads /usr/local/lib/liblibrengine.so)

View File

@ -19,7 +19,11 @@ function decrypt() {
item = item.replaceAll("\n", "");
item = item.trim();
result += rsa.decrypt(item);
let decrypted = rsa.decrypt(item);
if (decrypted == null) continue;
result += decrypted;
}
splited = splited.slice(last_i);

View File

@ -1,21 +1,29 @@
let rsa = new JSEncrypt({default_key_size: 1024});
let is_generating = true;
let is_started = true;
rsa.getKey(function() {
let public_key = get_with_expiry("public_key");
let private_key = get_with_expiry("private_key");
let public_key = get_with_expiry("public_key");
let private_key = get_with_expiry("private_key");
if (public_key == null || private_key == null) {
rsa.getKey(function() {
let public_key = get_with_expiry("public_key");
let private_key = get_with_expiry("private_key");
if (public_key == null) {
set_with_expiry("public_key", rsa.getPublicKey(), 3600 * 1000); //1 hour
}
if (private_key == null) {
set_with_expiry("private_key", rsa.getPrivateKey(), 3600 * 1000); //1 hour
}
if (public_key == null) {
set_with_expiry("public_key", rsa.getPublicKey(), 3600 * 1000); //1 hour
}
if (private_key == null) {
set_with_expiry("private_key", rsa.getPrivateKey(), 3600 * 1000); //1 hour
}
is_generating = false;
is_started = false;
});
} else {
is_started = false;
});
}
function submit_form() {
if (is_started) return false;
@ -28,15 +36,15 @@ function submit_form() {
let query_v = query.value;
let key_v = key.value;
let rsa2 = new JSEncrypt({default_key_size: 2048});
rsa2.setPublicKey(key_v);
let rsa2 = new JSEncrypt({default_key_size: 1024});
rsa2.setPublicKey(atob(key_v));
let encrypted = rsa2.encrypt(query_v);
let public_key = get_with_expiry("public_key");
query.value = encrypted;
encryption.value = "1";
key.value = public_key;
key.value = btoa(public_key);
form.submit();
return false;

View File

@ -37,7 +37,7 @@
<div class="input_container">
<input class="input" name="q" id="q" type="search" value="{QUERY}" placeholder="" required>
</div>
<input name="s" value="0" type="hidden">
<input name="p" value="1" type="hidden">
<input name="e" id="e" value="0" type="hidden">
<input name="ek" id="ek" value="{RSA_PUBLIC_KEY}" type="hidden">
<button class="button">

View File

@ -26,13 +26,13 @@
</div>
</header>
<div class="center_container">
<div class="websites counter">
<!--<div class="websites counter">
<div class="content">
<i class="fa fa-globe"></i>
<span class="title">{WEBSITES_COUNT}</span>
<h2 class="title">Websites</h2>
</div>
</div>
</div>-->
<div class="pages counter">
<div class="content">
<i class="fa fa-file"></i>

View File

@ -24,7 +24,7 @@
<div class="top_container">
<div class="top_container_content">
<div class="top_left_container">
<form class="search_widget" action="search" method="GET">
<form autocomplete="off" class="search_widget" action="search" onsubmit="return submit_form();" method="GET">
<div class="input_container">
<input class="input" name="q" id="q" type="search" value="{QUERY}" placeholder="query" required>
</div>

File diff suppressed because one or more lines are too long

View File

@ -1,11 +1,12 @@
#include <optional>
#include <librengine/config.h>
#include <librengine/opensearch.h>
#include <librengine/logger.h>
#include <librengine/json.hpp>
#include <librengine/str.h>
#include <librengine/str_impl.h>
#include <librengine/http.h>
#include <librengine/typesense.h>
#include <librengine/encryption.h>
#include <iostream>
#include <cstring>
#include <thread>
@ -13,7 +14,6 @@
#include <map>
#include "../third_party/httplib.h"
#include "encryption.h"
#ifndef PAGES_H
#define PAGES_H
@ -36,11 +36,11 @@ namespace backend {
private:
encryption::rsa rsa;
config::website config;
opensearch::client client;
typesense db;
std::map<std::string, std::string> rsa_public_keys;
public:
pages(const config::website &config, opensearch::client &client);
pages(const config::website &config, const config::db &db);
void init();
void set_variables(std::string &page_src);
@ -49,7 +49,7 @@ namespace backend {
void update(const std::string &id, const std::string &field, const std::string &value);
size_t get_number_field_value(const std::string &id, const std::string &field);
/*size_t get_last_added_website_date(opensearch::client &client);*/
std::optional<std::vector<search_result>> search(const std::string &q, const size_t &s);
std::optional<std::vector<search_result>> search(const std::string &q, const size_t &p);
size_t get_field_count(const std::string &field);
void home(const Request &request, Response &response);

View File

@ -11,20 +11,22 @@ int main(int argc, char **argv) {
}
config::website config;
config::db db;
config.load_from_file(argv[1]);
db.load_from_file(argv[1]);
std::string line = std::string(25, '=');
std::cout << logger::white << line << logger::green << "CFG" << logger::white << line << std::endl
<< logger::reset << config.to_str() << std::endl
<< logger::white << line << "===" << logger::white << line << std::endl;
auto client = librengine::opensearch::client("http://localhost:9200");
auto server = std::make_shared<Server>();
auto pages = std::make_shared<backend::pages>(config, client);
auto pages = std::make_shared<backend::pages>(config, db);
std::thread server_thread([&] {
server->set_mount_point("/", "../../frontend/");
server->set_mount_point("/", "../frontend/");
server->Get("/home", [&](const Request &req, Response &res) { pages->home(req, res); });
server->Get("/search", [&](const Request &req, Response &res) { pages->search(req, res); });
server->Get("/node/info", [&](const Request &req, Response &res) { pages->node_info(req, res); });

View File

@ -20,10 +20,10 @@ void if_debug_print(const logger::type &type, const std::string &text, const std
}
namespace backend {
pages::pages(const config::website &config, opensearch::client &client) {
pages::pages(const config::website &config, const config::db &db) {
this->rsa = encryption::rsa();
this->config = config;
this->client = client;
this->db = typesense(db.url, "websites", db.api_key);
this->rsa.generate_keys(1024);
}
@ -41,83 +41,37 @@ namespace backend {
const std::string noscript_src = R"(<noscript><span class="noscript">Encryption doesn't work without js</span></noscript>)";
const std::string header_src = R"(<li><a href="/home">Home</a></li><li><a href="/node/info">Node Info</a></li><li><a href="https://github.com/liameno/librengine">Github</a></li>)";
str::replace_ref(page_src, "{RSA_PUBLIC_KEY}", rsa.get_public_key_buffer());
auto key = rsa.get_public_key_buffer();
str::replace_ref(page_src, "{RSA_PUBLIC_KEY}", encryption::base64::easy_encode(key));
str::replace_ref(page_src, "{NOSCRIPT_CONTENT}", noscript_src);
str::replace_ref(page_src, "{HEADER_CONTENT}", header_src);
}
void pages::update(const std::string &id, const std::string &field, const size_t &value) {
const auto path = opensearch::client::path_options("website/_doc/" + id + "/_update");
const auto type = opensearch::client::request_type::POST;
const auto response = db.get(std::stoi(id));
nlohmann::json result_json = nlohmann::json::parse(response);
result_json[field] = value;
nlohmann::json json;
json["doc"][field] = value;
const auto response = client.custom_request(path, type, json.dump());
db.update(result_json.dump());
}
void pages::update(const std::string &id, const std::string &field, const std::string &value) {
const auto path = opensearch::client::path_options("website/_doc/" + id + "/_update");
const auto type = opensearch::client::request_type::POST;
const auto response = db.get(std::stoi(id));
nlohmann::json result_json = nlohmann::json::parse(response);
result_json[field] = value;
nlohmann::json json;
json["doc"][field] = value;
const auto response = client.custom_request(path, type, json.dump());
db.update(result_json.dump());
}
size_t pages::get_number_field_value(const std::string &id, const std::string &field) {
const auto path = opensearch::client::path_options("website/_doc/" + id);
const auto type = opensearch::client::request_type::GET;
const auto response = client.custom_request(path, type);
nlohmann::json result_json = nlohmann::json::parse(*response);
const auto rating = result_json["_source"][field];
if (rating.is_null()) return 0;
if (rating > 0) return (size_t)rating;
return 0;
const auto response = db.get(std::stoi(id));
nlohmann::json result_json = nlohmann::json::parse(response);
return result_json[field];
}
/*size_t pages::get_last_added_website_date(opensearch::client &client) {
const auto path = opensearch::client::path_options("website/_search");
const auto type = opensearch::client::request_type::POST;
std::optional<std::vector<pages::search_result>> pages::search(const std::string &q, const size_t &p) {
const auto response = db.search(q, "url,title,desc", {{"page", std::to_string(p)}});
nlohmann::json result_json = nlohmann::json::parse(response);
nlohmann::json json;
json["size"] = 1;
json["sort"][0]["date"]["order"] = "desc";
const auto response = client.custom_request(path, type, json.dump());
nlohmann::json result_json = nlohmann::json::parse(*response);
const auto value = result_json["hits"]["total"]["value"];
if (value.is_null()) return std::nullopt;
if (value < 0) return std::nullopt;
const auto body = result_json["hits"]["hits"];
if (body.is_null()) return std::nullopt;
auto hit = body[0];
size_t hit_date = hit["_source"]["date"];
size_t current_date = time(nullptr);
return current_date - hit_date;
}*/
std::optional<std::vector<pages::search_result>> pages::search(const std::string &q, const size_t &s) {
const auto path = opensearch::client::path_options("website/_search");
const auto type = opensearch::client::request_type::POST;
nlohmann::json json;
json["query"]["query_string"]["fields"] = {"url", "title", "desc"};
json["query"]["query_string"]["query"] = q;
json["size"] = 10;
json["from"] = s;
const auto response = client.custom_request(path, type, json.dump());
nlohmann::json result_json = nlohmann::json::parse(*response);
const auto body = result_json["hits"]["hits"];
const auto body = result_json["hits"];
if (body.is_null() || body.empty()) return std::nullopt;
size_t value = body.size();
@ -128,15 +82,16 @@ namespace backend {
for (int i = 0; i < value; ++i) {
search_result result;
auto hit = body[i];
auto hit_doc = hit["document"];
try {
result.id = hit["_id"];
result.title = hit["_source"]["title"];
result.url = hit["_source"]["url"];
result.desc = hit["_source"]["desc"];
result.rating = hit["_source"]["rating"];
result.has_ads = hit["_source"]["has_ads"];
result.has_analytics = hit["_source"]["has_analytics"];
result.id = hit_doc["id"];
result.title = hit_doc["title"];
result.url = hit_doc["url"];
result.desc = hit_doc["desc"];
result.rating = hit_doc["rating"];
result.has_ads = hit_doc["has_ads"];
result.has_analytics = hit_doc["has_analytics"];
} catch (const nlohmann::json::exception &e) {
continue;
}
@ -147,25 +102,13 @@ namespace backend {
return results;
}
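For context, a hedged sketch of the response shape this parsing assumes: Typesense reports the hit count in `found` (instead of OpenSearch's `hits.total.value`) and nests the stored fields of each hit under `document` (instead of `_source`). Values are illustrative:
```cpp
#include <iostream>
#include <librengine/json.hpp>

int main() {
    // abridged, illustrative search response of the shape pages::search() parses
    const char *response = R"({
        "found": 1,
        "hits": [
            { "document": { "id": "0", "title": "GNU",
                            "url": "https://www.gnu.org", "desc": "",
                            "rating": 100,
                            "has_ads": false, "has_analytics": false } }
        ]
    })";
    nlohmann::json j = nlohmann::json::parse(response);
    std::cout << j["found"] << " " << j["hits"][0]["document"]["title"] << std::endl;
    return 0;
}
```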
size_t pages::get_field_count(const std::string &field) {
const auto path = opensearch::client::path_options("website/_search");
const auto type = opensearch::client::request_type::POST;
nlohmann::json json;
json["aggs"]["host_uniq"]["terms"]["field"] = field;
json["aggs"]["host_uniq"]["terms"]["size"] = 1;
json["size"] = 0;
const auto response = client.custom_request(path, type, json.dump());
nlohmann::json result_json = nlohmann::json::parse(*response);
const auto value = result_json["aggregations"]["host_uniq"]["sum_other_doc_count"];
if (value.is_null()) return 0;
return (size_t)value + 1;
const auto response = db.search("*", field);
nlohmann::json result_json = nlohmann::json::parse(response);
return result_json["found"];
}
void pages::home(const Request &request, Response &response) {
std::string page_src = config::helper::get_file_content("../../frontend/src/index.html");
std::string page_src = config::helper::get_file_content("../frontend/src/index.html");
const std::string query = request.get_param_value("q");
str::replace_ref(page_src, "{QUERY}", query);
@ -174,12 +117,12 @@ namespace backend {
response.set_content(page_src, "text/html");
}
void pages::search(const Request &request, Response &response) {
std::string page_src = config::helper::get_file_content("../../frontend/src/search.html");
std::string query = request.get_param_value("q");
std::string page_src = config::helper::get_file_content("../frontend/src/search.html");
std::string query = str::replace(request.get_param_value("q"), " ", "+");
std::string e_ = request.get_param_value("e");
std::string ek_ = request.get_param_value("ek");
const std::string s_ = request.get_param_value("s");
const size_t start_index = (!s_.empty()) ? std::stoi(s_) : 0;
const std::string p_ = request.get_param_value("p");
const size_t page = (!p_.empty()) ? std::stoi(p_) : 1;
const std::string center_result_src_format = "<div class=\"center_result\">"
"<div class=\"content\">"
"<a class=\"title\" href=\"{1}\">{0}<span><i class=\"fa fa-ad info_icon info_{6}\"></i><i class=\"fa fa-user-secret info_icon info_{7}\"></i></span></a>"
@ -195,6 +138,9 @@ namespace backend {
"</div>"
"</div>";
std::string params_s = str::format("?q={0}&p={1}&e={2}&ek={3}", query, p_, e_, ek_);
ek_ = encryption::base64::easy_decode(ek_);
if_debug_print(logger::type::info, "query = " + query, request.path);
if (e_ == "1") {
@ -209,10 +155,10 @@ namespace backend {
}
std::string center_results_src;
std::string params_s = str::format("?q={0}&s={1}&e=0", str::replace(query, " ", "+"), s_);
for (const auto &node : config.nodes) {
if_debug_print(logger::type::info, "node = " + node.url, request.path);
std::string params_s2;
if (e_ == "1") {
encryption::rsa rsa_node;
@ -227,10 +173,10 @@ namespace backend {
auto public_key = rsa.get_public_key_buffer();
auto key2 = encryption::base64::easy_encode(public_key); //error of curl (CURLE_URL_MALFORMAT)
params_s = str::format("?q={0}&s={1}&e=1&ek={2}", encrypted_base64, s_, key2);
params_s2 = str::format("?q={0}&p={1}&e=1&ek={2}", encrypted_base64, p_, key2);
}
http::request request_(node.url + "/api/search" + params_s);
http::request request_(node.url + "/api/search" + params_s2);
if (!http::url(node.url).is_localhost()) request_.options.proxy = config.proxy;
request_.perform();
@ -308,15 +254,15 @@ namespace backend {
std::string url = request.path + params_s;
str::replace_ref(page_src, "{CENTER_RESULTS}", center_results_src2);
str::replace_ref(page_src, "{QUERY}", query);
str::replace_ref(page_src, "{PREV_PAGE}", str::replace(url, "&s=" + s_, "&s=" + std::to_string((start_index >= 10) ? start_index - 10 : 0)));
str::replace_ref(page_src, "{NEXT_PAGE}", str::replace(url, "&s=" + s_, "&s=" + std::to_string(start_index + 10)));
str::replace_ref(page_src, "{PREV_PAGE}", str::replace(url, "&p=" + p_, "&p=" + std::to_string((page > 1) ? page - 1 : 1)));
str::replace_ref(page_src, "{NEXT_PAGE}", str::replace(url, "&p=" + p_, "&p=" + std::to_string(page + 1)));
set_variables(page_src);
response.status = 200;
response.set_content(page_src, "text/html");
}
void pages::node_info(const Request &request, Response &response) {
std::string page_src = config::helper::get_file_content("../../frontend/src/node/info.html");
std::string page_src = config::helper::get_file_content("../frontend/src/node/info.html");
str::replace_ref(page_src, "{WEBSITES_COUNT}", std::to_string(get_field_count("host")));
str::replace_ref(page_src, "{PAGES_COUNT}", std::to_string(get_field_count("url")));
@ -326,7 +272,7 @@ namespace backend {
response.set_content(page_src, "text/html");
}
void pages::node_admin_panel(const Request &request, Response &response) {
std::string page_src = config::helper::get_file_content("../../frontend/src/node/admin_panel/index.html");
std::string page_src = config::helper::get_file_content("../frontend/src/node/admin_panel/index.html");
set_variables(page_src);
response.status = 200;
@ -378,8 +324,8 @@ namespace backend {
std::string query = str::replace(request.get_param_value("q"), " ", "+");
std::string e_ = request.get_param_value("e");
std::string ek_ = request.get_param_value("ek");
const std::string s_ = request.get_param_value("s");
const size_t start_index = (!s_.empty()) ? std::stoi(s_) : 0;
const std::string p_ = request.get_param_value("p");
const size_t page = (!p_.empty()) ? std::stoi(p_) : 1;
nlohmann::json page_src;
std::vector<unsigned char> ek_decrypted;
@ -395,7 +341,7 @@ namespace backend {
if_debug_print(logger::type::info, "decrypted query = " + query, request.path);
}
const auto search_results = search(query, start_index);
const auto search_results = search(query, page);
if (search_results) {
auto sr_size = search_results->size();
@ -464,7 +410,7 @@ namespace backend {
void pages::api_node_info(const Request &request, Response &response) {
nlohmann::json page_src;
page_src["websites_count"] = get_field_count("host");
//page_src["websites_count"] = get_field_count("host");
page_src["pages_count"] = get_field_count("url");
response.status = 200;