liameno 2021-12-20 09:52:21 -05:00
parent e00cd19917
commit fa524ec152
8 changed files with 125 additions and 74 deletions

.gitignore vendored (5 changes)
View File

@@ -42,6 +42,9 @@
# CMake
cmake-build-*/
build/
cmake-build-*/*
build/*
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
@@ -71,4 +74,4 @@ fabric.properties
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
.idea/caches/build_file_checksums.ser

View File

@@ -9,6 +9,7 @@ Private Web Search engine
- Node Info
## TODO
- Robots in headers && html, crawl-delay
- Encryption (public key)
- Site Rating
- CLI Search

View File

@@ -36,14 +36,14 @@ int main(int argc, char **argv) {
config.user_agent = "librengine";
config.opensearch_url = "http://localhost:9200"; //no trailing '/'
config.update_time_site_info_s_after = 864000; //10 days
config.limit_page_symbols = 50000;
config.limit_page_symbols = 50000000; //50 mb
config.limit_robots_txt_symbols = 3000;
config.is_http_to_https = true; //https://en.wikipedia.org/wiki/HTTPS
config.is_check_robots_txt = true; //https://en.wikipedia.org/wiki/Robots_exclusion_standard
std::cout << std::string(25, '=') << "CFG" << std::string(25, '=') << std::endl
<< config.to_str() << std::endl
<< std::string(25, '=') << "===" << std::string(25, '=') << std::endl;
std::cout << std::string(25, '=') << "CFG" << std::string(25, '=') << std::endl
<< config.to_str() << std::endl
<< std::string(25, '=') << "===" << std::string(25, '=') << std::endl; //"===" keeps the footer the same width as the "CFG" header
easy_start(config);

View File

@@ -21,7 +21,7 @@ namespace librengine::crawler {
size_t delay_time_s = 3;
size_t limit_pages_site = 399;
size_t limit_page_symbols = 50000;
size_t limit_page_symbols = 50000000; //50 mb
size_t limit_robots_txt_symbols = 3000;
//size_t limit_sitemap_symbols = 10000;

View File

@@ -10,6 +10,16 @@
namespace librengine::crawler {
class worker {
public:
enum class result {
added, //page fetched and indexed
disallowed_robots, //blocked by robots.txt
work_false, //worker stopped (is_work == false)
already_added, //url already in the index, or equals the owner host
pages_limit, //per-site page limit reached
null_or_limit, //missing host, empty/oversized response, or parse failure
error, //url normalization failed
};
private:
config current_config;
opensearch::client opensearch_client;
@@ -36,10 +46,11 @@
bool is_allowed_in_robots(const std::string &body, const std::string &url) const;
std::optional<std::string> get_robots_txt(const http::url &url);
static std::string compute_desc(lxb_html_document *document, lxb_dom_node *body);
static std::string get_desc(lxb_html_document *document);
static std::string compute_desc(const std::string &tag_name, lxb_html_document *document);
public:
worker(config config, opensearch::client opensearch_client);
void main_thread(const std::string &site_url, int &deep, const std::optional<std::string> &owner_host = std::nullopt);
result main_thread(const std::string &site_url, int &deep, const std::optional<std::string> &owner_host = std::nullopt);
};
}
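A minimal sketch, not part of this commit, of how a caller could branch on the new result codes; the start URL, the default-constructed config, and the run_once wrapper are placeholder assumptions:
#include <librengine/crawler/config.h>
#include <librengine/crawler/worker.h>
#include <librengine/opensearch.h>

using namespace librengine;

int run_once() {
    crawler::config cfg; //placeholder: defaults as in config.h
    crawler::worker w(cfg, opensearch::client("http://localhost:9200"));
    int deep = 0;
    const auto r = w.main_thread("https://example.org", deep); //hypothetical start url
    //worker.cpp reacts the same way below: stop on work_false, memo pages_limit hosts
    if (r == crawler::worker::result::work_false) return 1;
    return r == crawler::worker::result::added ? 0 : 2;
}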

View File

@@ -12,6 +12,20 @@
#include "../../third_party/json/json.hpp"
#include "../../third_party/rep-cpp/robots.h"
#define DEBUG true //TODO: false
#define DEBUG_NORMALIZE false //TODO: false
//prints "[TYPE] text [ident]" only when the compile-time DEBUG flag is on
void if_debug_print(const std::string &type, const std::string &text, const std::string &ident) {
#if DEBUG
std::cout << "[" << librengine::str::to_upper(type) << "] " << text << " [" << ident << "]" << std::endl;
#endif
}
//same, gated by DEBUG_NORMALIZE for url-normalization tracing
void if_debug_normalize_print(const std::string &type, const std::string &text, const std::string &ident) {
#if DEBUG_NORMALIZE
std::cout << "[" << librengine::str::to_upper(type) << "] " << text << " [" << ident << "]" << std::endl;
#endif
}
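For example (values invented for illustration), with DEBUG set to true the helper prints:
//if_debug_print("info", "response length = 1234", "https://example.org/");
//-> [INFO] response length = 1234 [https://example.org/]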
namespace librengine::crawler {
size_t worker::compute_time() {
return time(nullptr);
@@ -38,12 +52,13 @@ namespace librengine::crawler {
}
bool worker::normalize_url(http::url &url, const std::optional<std::string> &owner_host) const {
if (url.text.size() < 3 && !owner_host) return false;
if_debug_normalize_print("info", "normalize url", url.text);
if (str::starts_with(url.text, "//")) {
if_debug_normalize_print("info", "url starts with //", url.text);
url.text.insert(0, "http:");
url.parse();
}
if (!url.host && owner_host) {
http::url owner_url(str::to_lower(*owner_host));
owner_url.parse();
@@ -55,20 +70,18 @@
if (f_c == '.') {
str::remove_first_char(url.text);
} else if (f_c != '/') {
if (str::get_last_char(owner_url.text) == '/') str::remove_last_char(owner_url.text);
while(true) {
const char c = str::get_last_char(owner_url.text);
if (c == '/' || c == '\0') {
break;
} else {
str::remove_last_char(owner_url.text);
}
if (c == '/' || c == '\0') break;
else str::remove_last_char(owner_url.text);
}
} else {
owner_url.set(CURLUPART_PATH, "");
}
url.parse();
owner_url.parse();
if (str::get_first_char(url.text) == '/' && str::get_last_char(owner_url.text) == '/') {
@@ -99,6 +112,7 @@
}
url.parse();
if_debug_normalize_print("info", "normalized url", url.text);
return true;
}
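Illustrative before/after pairs for the branches above (inputs invented for clarity, not taken from the commit):
//starts with "//"          : "//example.org/a" -> "http://example.org/a"
//absolute path, with owner : "/docs" + owner "http://example.org/x/y" -> "http://example.org/docs" (owner path cleared)
//relative path, with owner : "img.png" + owner "http://example.org/dir/page" -> "http://example.org/dir/img.png" (owner trimmed to last '/')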
@@ -199,10 +213,7 @@ namespace librengine::crawler {
request.options.proxy = this->current_config.proxy;
request.perform();
if (request.result.code != 200) {
return std::nullopt;
}
if (request.result.code != 200) return std::nullopt;
return request.result.response;
}
bool worker::is_allowed_in_robots(const std::string &body, const std::string &url) const {
@@ -220,27 +231,48 @@
request.options.proxy = this->current_config.proxy;
request.perform();
if (request.result.code != 200) {
return std::nullopt;
}
if (request.result.code != 200) return std::nullopt;
return request.result.response;
}
std::string worker::compute_desc(lxb_html_document *document, lxb_dom_node *body) {
std::string worker::get_desc(lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(body), collection, std_string_to_lxb("p"), 1);
const auto p_length = collection->array.length;
//collect <head> elements whose name attribute equals "description", i.e. <meta name="description" content="...">
lxb_dom_elements_by_attr(lxb_dom_interface_element(document->head), collection, std_string_to_lxb("name"), 4, std_string_to_lxb("description"), 11, false);
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < p_length; i++) {
for (size_t i = 0; i < c_length; i++) {
auto element = lxb_dom_collection_element(collection, i);
const auto content = lxb_dom_element_get_attribute(element, std_string_to_lxb("content"), 7, nullptr);
if (content != nullptr) {
if (desc.length() > 500) break;
desc.append(lxb_string_to_std(content).value_or(""));
desc.append("\n");
}
}
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
std::string worker::compute_desc(const std::string &tag_name, lxb_html_document *document) {
auto collection = lxb_dom_collection_make(&(document)->dom_document, 16);
//collect every <tag_name> element under <body> and append its text content
lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), collection, std_string_to_lxb(tag_name), tag_name.length());
const auto c_length = collection->array.length;
std::string desc;
for (size_t i = 0; i < c_length; i++) {
if (desc.length() > 500) break;
auto element = lxb_dom_collection_element(collection, i);
const auto text = lxb_string_to_std(lxb_dom_node_text_content(lxb_dom_interface_node(element), nullptr)).value_or("");
desc.append(text);
desc.append("\n");
}
if (p_length > 0) lxb_dom_collection_destroy(collection, true);
if (c_length > 0) lxb_dom_collection_destroy(collection, true);
return desc;
}
@@ -248,18 +280,20 @@
this->is_work = true;
}
void worker::main_thread(const std::string &site_url, int &deep, const std::optional<std::string> &owner_host) {
if (!is_work) return;
worker::result worker::main_thread(const std::string &site_url, int &deep, const std::optional<std::string> &owner_host) {
if (!is_work) return result::work_false;
http::url url(str::to_lower(site_url));
url.parse();
if (!normalize_url(url, owner_host.value_or(""))) return;
if (url.text == owner_host) return;
if (!url.host || hints_count_added("url", url.text) > 0) return;
if (!normalize_url(url, owner_host.value_or(""))) { if_debug_print("error", "normalize url", url.text); return result::error; }
if (url.text == owner_host) { if_debug_print("error", "url == owner", url.text); return result::already_added; }
if (!url.host) { if_debug_print("error", "url host == null", url.text); return result::null_or_limit; }
if (hints_count_added("url", url.text) > 0) { if_debug_print("error", "already added", url.text); return result::already_added; }
size_t pages_count = hints_count_added("host", *url.host);
if (pages_count >= this->current_config.limit_pages_site) return;
if (pages_count >= this->current_config.limit_pages_site) { if_debug_print("error", "pages count >= limit", url.text); return result::pages_limit; }
if (this->current_config.is_check_robots_txt) {
auto robots_txt_body = get_added_robots_txt(*url.host).value_or("");
@@ -267,10 +301,11 @@
if (robots_txt_body.empty()) {
robots_txt_body = get_robots_txt(url).value_or("");
auto robots_txt_body_length = robots_txt_body.length();
if (robots_txt_body.length() < this->current_config.limit_robots_txt_symbols) {
if (robots_txt_body_length > 1 && robots_txt_body_length < this->current_config.limit_robots_txt_symbols) {
const auto json = compute_robots_txt_json(robots_txt_body, *url.host);
if (!json) return;
if (!json) return result::null_or_limit;
const auto path = opensearch::client::path_options("robots_txt/_doc");
const auto type = opensearch::client::request_type::POST;
@@ -281,23 +316,32 @@
}
}
if (is_check && !is_allowed_in_robots(robots_txt_body, url.text)) return;
if (is_check && !is_allowed_in_robots(robots_txt_body, url.text)) return result::disallowed_robots;
}
auto response = site(url);
if (!response || response->length() >= this->current_config.limit_page_symbols) return;
if (!response) { if_debug_print("error", "response = null", url.text); return result::null_or_limit; }
auto response_length = response->length(); //safe to dereference: null checked above
if_debug_print("info", "response length = " + str::to_string(response_length), url.text);
if (response_length < 1 || response_length >= this->current_config.limit_page_symbols)
{ if_debug_print("error", "response length < 1 || >= limit", url.text); return result::null_or_limit; }
auto document = parse_html(*response);
if (!document) return;
if (!document) return result::null_or_limit;
auto body = lxb_dom_interface_node((*document)->body);
if (body == nullptr) return;
if (body == nullptr) return result::null_or_limit;
const std::string title = lxb_string_to_std(lxb_html_document_title((*document), nullptr)).value_or("#ERR#");
const std::string content = lxb_string_to_std(lxb_dom_node_text_content(body, nullptr)).value_or("");
const std::string desc = compute_desc(*document, body);
//const std::string content = lxb_string_to_std(lxb_dom_node_text_content(body, nullptr)).value_or("");
std::string desc = get_desc(*document); //by meta tag
const auto json = compute_website_json(title, url.text, *url.host, content, desc);
if (!json) return;
if (desc.empty()) {
desc.append(compute_desc("h1", *document)); //fallback: <h1> text first, then <p> text
desc.append(compute_desc("p", *document));
}
const auto json = compute_website_json(title, url.text, *url.host, ""/*content*/, desc);
if (!json) return result::null_or_limit;
const auto path = opensearch::client::path_options("website/_doc");
const auto type = opensearch::client::request_type::POST;
@@ -314,17 +358,26 @@
std::vector<std::string> pages_limit_hosts; //declared outside the loop so it persists across links: hosts that already hit the pages limit
for (size_t i = 0; i < a_length; i++) {
auto element = lxb_dom_collection_element(collection, i);
const auto href_value = lxb_string_to_std(lxb_dom_element_get_attribute(element, std_string_to_lxb("href"), 4, nullptr));
if (href_value && *href_value != url.text && !str::starts_with(*href_value, "#")) {
http::url href_url(*href_value);
href_url.parse();
std::this_thread::sleep_for(std::chrono::seconds(this->current_config.delay_time_s));
if (!href_url.host || str::contains(pages_limit_hosts, *href_url.host, true)) continue;
if (!str::starts_with(*href_value, "http")) {
main_thread(*href_value, deep, url.text);
} else {
main_thread(*href_value, deep);
std::this_thread::sleep_for(std::chrono::seconds(this->current_config.delay_time_s));
result result;
if (!str::starts_with(*href_value, "http")) result = main_thread(href_url.text, deep, url.text);
else result = main_thread(*href_value, deep);
if (result == result::work_false) {
break;
} else if (result == result::pages_limit) {
pages_limit_hosts.push_back(*href_url.host);
} else if (result == result::added || result == result::disallowed_robots) {
std::this_thread::sleep_for(std::chrono::seconds(this->current_config.delay_time_s));
}
}
}
@@ -334,5 +387,6 @@
}
lxb_html_document_destroy(*document);
return result::added;
}
}
}

View File

@@ -97,8 +97,6 @@ namespace librengine::http {
}
void url::parse() {
compute_text();
//https://curl.se/libcurl/c/parseurl.html
char *path;
curl_url_set(current_curl_url, CURLUPART_URL, text.c_str(), 0/*CURLU_DEFAULT_SCHEME*/);
@@ -140,9 +138,7 @@
if (!c) {
this->text = path;
if (str::get_last_char(this->text) == '#') {
str::remove_last_char(this->text);
}
if (str::get_last_char(this->text) == '#') str::remove_last_char(this->text);
}
curl_free(path);
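For reference, a self-contained sketch of the libcurl URL API that url::parse builds on (see the parseurl link in the code); this is standalone example code, not part of the project:
#include <curl/curl.h>
#include <cstdio>

int main() {
    CURLU *h = curl_url();
    curl_url_set(h, CURLUPART_URL, "https://example.org/a/b?q=1#frag", 0);
    char *host = nullptr;
    if (curl_url_get(h, CURLUPART_HOST, &host, 0) == CURLUE_OK) {
        printf("host: %s\n", host); //-> host: example.org
        curl_free(host);
    }
    curl_url_cleanup(h);
    return 0;
}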

View File

@@ -1,19 +1,12 @@
#include "third_party/httplib.h"
#include <optional>
#include <librengine/crawler/config.h>
#include <librengine/crawler/worker.h>
#include <librengine/opensearch.h>
#include <librengine/third_party/json/json.hpp>
#include <librengine/str.h>
#include <librengine/str_impl.h>
#include <utility>
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <openssl/rsa.h>
#include <openssl/pem.h>
#include <openssl/err.h>
#include <cstring>
using namespace librengine;
@@ -46,7 +39,7 @@ namespace pages {
nlohmann::json json;
json["query"]["query_string"]["fields"] = {"url", "title", "content"};
json["query"]["query_string"]["fields"] = {"url", "title", "desc"};
json["query"]["query_string"]["query"] = q;
json["size"] = 10;
json["from"] = s;
@@ -170,12 +163,8 @@
}
int main(int argc, char **argv) {
if (argc <= 1) {
std::cout << "Usage: bin [port]\nExample: ./backend 8080" << std::endl;
return 1;
}
int port = 0;
if (argc <= 1) { std::cout << "Usage: bin [port]\nExample: ./backend 8080" << std::endl; return 1; }
int port;
try {
port = std::stoi(argv[1]);
@@ -184,10 +173,7 @@ int main(int argc, char **argv) {
return 2;
}
if (port == 0) {
std::cout << "Port == 0" << std::endl;
return 3;
}
if (port == 0) { std::cout << "Port == 0" << std::endl; return 3; }
using namespace httplib;
auto client = opensearch::client("http://localhost:9200");