mirror of
https://github.com/liameno/librengine.git
synced 2024-11-24 07:53:17 +03:00
Replacing the third-party robots.txt parser with our own robots.txt parser
This commit is contained in:
parent
077740ac6c
commit
60eadcce58
@ -21,7 +21,7 @@
|
||||
"proxy": "socks5://127.0.0.1:9050",
|
||||
"load_page_timeout_s": 10,
|
||||
"update_time_site_info_s_after": 864000, //10 days
|
||||
"delay_time_s": 3,
|
||||
"delay_time_s": 1,
|
||||
"max_pages_site": 5,
|
||||
"max_page_symbols": 50000000, //50mb
|
||||
"max_robots_txt_symbols": 3000,
|
||||
|
@ -9,12 +9,5 @@ set(CMAKE_CXX_STANDARD 17)
|
||||
find_package(CURL)
|
||||
find_package(Threads)
|
||||
|
||||
set(tp_rep_cpp third_party/rep-cpp/agent.cpp third_party/rep-cpp/agent.h
|
||||
third_party/rep-cpp/directive.cpp third_party/rep-cpp/directive.h third_party/rep-cpp/robots.cpp
|
||||
third_party/rep-cpp/robots.h) #robots.txt / https://github.com/seomoz/rep-cpp
|
||||
set(tp_url_cpp third_party/url-cpp/psl.cpp third_party/url-cpp/psl.h
|
||||
third_party/url-cpp/punycode.cpp third_party/url-cpp/punycode.h third_party/url-cpp/url.cpp
|
||||
third_party/url-cpp/url.h third_party/url-cpp/utf8.cpp third_party/url-cpp/utf8.h) #https://github.com/seomoz/url-cpp
|
||||
|
||||
add_executable(${PROJECT_NAME} main.cpp src/worker.cpp include/worker.h src/json_generator.cpp include/json_generator.h src/html_helper.cpp include/html_helper.h ${tp_rep_cpp} ${tp_url_cpp})
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE /usr/lib/liblexbor.so curl Threads::Threads /usr/local/lib/liblibrengine.so)
|
@ -41,7 +41,7 @@ public:
|
||||
librengine::http::request::result_s site(const librengine::http::url &url);
|
||||
std::optional<std::string> get_robots_txt(const librengine::http::url &url);
|
||||
|
||||
bool is_allowed_in_robots(const std::string &body, const std::string &url);
|
||||
bool is_allowed_in_robots(const std::string &body, const http::url &url);
|
||||
bool normalize_url(librengine::http::url &url, const std::optional<std::string> &owner_host = std::nullopt) const;
|
||||
public:
|
||||
explicit worker(const librengine::config::all &config);
|
||||
|
@ -2,8 +2,6 @@
|
||||
|
||||
#include "include/worker.h"
|
||||
|
||||
using namespace librengine;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
using namespace librengine;
|
||||
|
||||
|
@ -10,10 +10,10 @@
|
||||
#include <librengine/logger.h>
|
||||
#include <librengine/helper.h>
|
||||
#include <librengine/cache.h>
|
||||
#include <librengine/robots_txt.h>
|
||||
|
||||
#include "../include/json_generator.h"
|
||||
#include "../include/html_helper.h"
|
||||
#include "../third_party/rep-cpp/robots.h"
|
||||
|
||||
#define DEBUG true //TODO: FALSE
|
||||
|
||||
@ -82,8 +82,10 @@ std::optional<std::string> worker::get_robots_txt(const http::url &url) {
|
||||
return request.result.response;
|
||||
}
|
||||
|
||||
bool worker::is_allowed_in_robots(const std::string &body, const std::string &url) {
|
||||
Rep::Robots robots = Rep::Robots(body);
|
||||
bool worker::is_allowed_in_robots(const std::string &body, const http::url &url) {
|
||||
robots_txt robots(body);
|
||||
robots.parse();
|
||||
|
||||
return robots.allowed(url, config.crawler_.user_agent);
|
||||
}
|
||||
bool worker::normalize_url(http::url &url, const std::optional<std::string> &owner_host) const {
|
||||
@ -260,7 +262,8 @@ worker::result worker::work(url &url_) {
|
||||
}
|
||||
}
|
||||
|
||||
if (is_checked && !is_allowed_in_robots(robots_txt_body, url.text)) {
|
||||
if (is_checked && !is_allowed_in_robots(robots_txt_body, url)) {
|
||||
if_debug_print(logger::type::error, "disallowed robots.txt", url.text);
|
||||
return result::disallowed_robots;
|
||||
}
|
||||
}
|
||||
|
138
crawler/third_party/rep-cpp/agent.cpp
vendored
138
crawler/third_party/rep-cpp/agent.cpp
vendored
@ -1,138 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "../url-cpp/url.h"
|
||||
|
||||
#include "agent.h"
|
||||
#include "directive.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
std::string escape_url(Url::Url& url)
|
||||
{
|
||||
return url.defrag().escape().fullpath();
|
||||
}
|
||||
|
||||
/**
 * Return a copy of `str` with every leading occurrence of `chr` removed.
 * An empty string, or one consisting only of `chr`, yields "".
 */
std::string trim_front(const std::string& str, const char chr)
{
    const std::string::size_type first_kept = str.find_first_not_of(chr);
    if (first_kept == std::string::npos)
    {
        return std::string();
    }
    return str.substr(first_kept);
}
|
||||
}
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
/**
 * Register an "Allow" rule for this agent.
 *
 * Rules that refer to a different host than this agent's are ignored. A rule
 * that starts with '*' is recorded twice after escaping: once with the
 * leading wildcards trimmed off and once as written.
 *
 * @param query Value of the Allow line (a path or a full URL).
 * @return *this, so calls can be chained.
 */
Agent& Agent::allow(const std::string& query)
{
    Url::Url url(query);
    // ignore directives for external URLs
    if (is_external(url))
    {
        return *this;
    }
    // leading wildcard? Test for emptiness first: front() on an empty string
    // (a bare "Allow:" line) is undefined behaviour.
    if (!query.empty() && query.front() == '*')
    {
        Url::Url trimmed(trim_front(query, '*'));
        directives_.push_back(Directive(escape_url(trimmed), true));
    }
    directives_.push_back(Directive(escape_url(url), true));
    // The new rule(s) were appended at the back; force a re-sort on next read.
    sorted_ = false;
    return *this;
}
|
||||
|
||||
/**
 * Register a "Disallow" rule for this agent.
 *
 * An empty value is stored as an allow rule with an empty expression.
 * Rules for a different host are ignored. A rule starting with '*' is
 * recorded twice after escaping: with the leading wildcards trimmed and
 * as written.
 *
 * @param query Value of the Disallow line (a path or a full URL).
 * @return *this, so calls can be chained.
 */
Agent& Agent::disallow(const std::string& query)
{
    if (query.empty())
    {
        // Special case: "Disallow:" means "Allow: /"
        directives_.emplace_back(query, true);
        sorted_ = false;
        return *this;
    }

    Url::Url target(query);
    if (is_external(target))
    {
        // Directives aimed at another host do not apply here.
        return *this;
    }

    const bool starts_with_wildcard = (query.front() == '*');
    if (starts_with_wildcard)
    {
        Url::Url without_stars(trim_front(query, '*'));
        directives_.emplace_back(escape_url(without_stars), false);
    }
    directives_.emplace_back(escape_url(target), false);

    sorted_ = false;
    return *this;
}
|
||||
|
||||
const std::vector<Directive>& Agent::directives() const
|
||||
{
|
||||
if (!sorted_)
|
||||
{
|
||||
std::sort(directives_.begin(), directives_.end(),
|
||||
[](const Directive& a, const Directive& b) {
|
||||
return b.priority() < a.priority();
|
||||
});
|
||||
sorted_ = true;
|
||||
}
|
||||
return directives_;
|
||||
}
|
||||
|
||||
bool Agent::allowed(const std::string& query) const
|
||||
{
|
||||
Url::Url url(query);
|
||||
if (is_external(url))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
std::string path(escape_url(url));
|
||||
|
||||
if (path.compare("/robots.txt") == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
for (const auto& directive : directives())
|
||||
{
|
||||
if (directive.match(path))
|
||||
{
|
||||
return directive.allowed();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string Agent::str() const
|
||||
{
|
||||
std::stringstream out;
|
||||
if (delay_ > 0)
|
||||
{
|
||||
out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' ';
|
||||
}
|
||||
out << '[';
|
||||
const auto& d = directives();
|
||||
auto begin = d.begin();
|
||||
auto end = d.end();
|
||||
if (begin != end)
|
||||
{
|
||||
out << "Directive(" << begin->str() << ')';
|
||||
++begin;
|
||||
}
|
||||
for (; begin != end; ++begin)
|
||||
{
|
||||
out << ", Directive(" << begin->str() << ')';
|
||||
}
|
||||
out << ']';
|
||||
return out.str();
|
||||
}
|
||||
|
||||
/**
 * True when both this agent and `url` name a host and the two differ.
 * If either host is empty the URL is not considered external.
 */
bool Agent::is_external(const Url::Url& url) const
{
    if (host_.empty())
    {
        return false;
    }
    const auto& url_host = url.host();
    if (url_host.empty())
    {
        return false;
    }
    return url_host != host_;
}
|
||||
}
|
93
crawler/third_party/rep-cpp/agent.h
vendored
93
crawler/third_party/rep-cpp/agent.h
vendored
@ -1,93 +0,0 @@
|
||||
#ifndef AGENT_CPP_H
|
||||
#define AGENT_CPP_H
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "directive.h"
|
||||
|
||||
// forward declaration
|
||||
namespace Url
|
||||
{
|
||||
struct Url;
|
||||
}
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
class Agent
|
||||
{
|
||||
public:
|
||||
/* The type for the delay. */
|
||||
typedef float delay_t;
|
||||
|
||||
/**
|
||||
* Default constructor
|
||||
*/
|
||||
Agent() : Agent("") {}
|
||||
|
||||
/**
|
||||
* Construct an agent.
|
||||
*/
|
||||
explicit Agent(const std::string& host) :
|
||||
directives_(), delay_(-1.0), sorted_(true), host_(host) {}
|
||||
|
||||
/**
|
||||
* Default copy constructor.
|
||||
*/
|
||||
Agent(const Agent& rhs) = default;
|
||||
|
||||
/**
|
||||
* Default move constructor.
|
||||
*/
|
||||
Agent(Agent&& rhs) = default;
|
||||
|
||||
/**
|
||||
* Add an allowed directive.
|
||||
*/
|
||||
Agent& allow(const std::string& query);
|
||||
|
||||
/**
|
||||
* Add a disallowed directive.
|
||||
*/
|
||||
Agent& disallow(const std::string& query);
|
||||
|
||||
/**
|
||||
* Set the delay for this agent.
|
||||
*/
|
||||
Agent& delay(delay_t value) {
|
||||
delay_ = value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the delay for this agent.
|
||||
*/
|
||||
delay_t delay() const { return delay_; }
|
||||
|
||||
/**
|
||||
* A vector of the directives, in priority-sorted order.
|
||||
*/
|
||||
const std::vector<Directive>& directives() const;
|
||||
|
||||
/**
|
||||
* Return true if the URL (either a full URL or a path) is allowed.
|
||||
*/
|
||||
bool allowed(const std::string& path) const;
|
||||
|
||||
std::string str() const;
|
||||
|
||||
/**
|
||||
* Default copy assignment operator.
|
||||
*/
|
||||
Agent& operator=(const Agent& rhs) = default;
|
||||
|
||||
private:
|
||||
bool is_external(const Url::Url& url) const;
|
||||
|
||||
mutable std::vector<Directive> directives_;
|
||||
delay_t delay_;
|
||||
mutable bool sorted_;
|
||||
std::string host_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
130
crawler/third_party/rep-cpp/directive.cpp
vendored
130
crawler/third_party/rep-cpp/directive.cpp
vendored
@ -1,130 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "../url-cpp/url.h"
|
||||
|
||||
#include "directive.h"
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
/**
 * Build a directive from a rule expression.
 *
 * Runs of consecutive '*' are collapsed to a single '*' and trailing '*'
 * are dropped. The priority is the length of the resulting expression.
 *
 * @param line    Rule text, already stripped of comments and whitespace.
 * @param allowed true for an Allow rule, false for a Disallow rule.
 */
Directive::Directive(const std::string& line, bool allowed)
    : expression_(line)
    , priority_(line.size())
    , allowed_(allowed)
{
    if (line.find('*') == std::string::npos)
    {
        // No wildcard: the expression is the line verbatim.
        return;
    }

    // Collapse consecutive '*'s into one.
    const auto both_stars = [](const char lhs, const char rhs) {
        return lhs == '*' && rhs == '*';
    };
    expression_.erase(
        std::unique(expression_.begin(), expression_.end(), both_stars),
        expression_.end());

    // Drop trailing '*'s.
    const std::string::size_type last_literal = expression_.find_last_not_of('*');
    if (last_literal == std::string::npos)
    {
        expression_.clear();
    }
    else
    {
        expression_.erase(last_literal + 1);
    }

    // Priority is the length of the expression
    priority_ = expression_.size();
}
|
||||
|
||||
bool Directive::match(const std::string::const_iterator& e_begin,
|
||||
const std::string::const_iterator& e_end,
|
||||
const std::string::const_iterator& p_begin,
|
||||
const std::string::const_iterator& p_end) const
|
||||
{
|
||||
std::string::const_iterator expression_it = e_begin;
|
||||
std::string::const_iterator path_it = p_begin;
|
||||
while (expression_it != e_end && path_it != p_end)
|
||||
{
|
||||
if (*expression_it == '*')
|
||||
{
|
||||
// Advance and recurse
|
||||
++expression_it;
|
||||
for (; path_it != p_end; ++path_it)
|
||||
{
|
||||
if (match(expression_it, e_end, path_it, p_end))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
else if (*expression_it == '$')
|
||||
{
|
||||
// This check expects path to be fully consumed. But since one of the
|
||||
// criteria of being in this while loop is that we've not fully consumed
|
||||
// path, return false.
|
||||
return false;
|
||||
}
|
||||
else if (*expression_it != *path_it)
|
||||
{
|
||||
// These characters must match
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Advance both by one
|
||||
++path_it;
|
||||
++expression_it;
|
||||
}
|
||||
}
|
||||
|
||||
// Return true only if we've consumed all of the expression
|
||||
if (expression_it == e_end)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
else if (*expression_it == '$')
|
||||
{
|
||||
return path_it == p_end;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::string Directive::str() const
|
||||
{
|
||||
std::stringstream out;
|
||||
if (allowed_)
|
||||
{
|
||||
out << "Allow: " << expression_;
|
||||
}
|
||||
else {
|
||||
out << "Disallow: " << expression_;
|
||||
}
|
||||
return out.str();
|
||||
}
|
||||
|
||||
bool Directive::match(const std::string& path) const
|
||||
{
|
||||
return match(expression_.begin(), expression_.end(), path.begin(), path.end());
|
||||
}
|
||||
|
||||
}
|
82
crawler/third_party/rep-cpp/directive.h
vendored
82
crawler/third_party/rep-cpp/directive.h
vendored
@ -1,82 +0,0 @@
|
||||
#ifndef DIRECTIVE_CPP_H
|
||||
#define DIRECTIVE_CPP_H
|
||||
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
|
||||
/**
 * A single Allow/Disallow rule: a match expression, its priority, and
 * whether a match permits or forbids access.
 */
class Directive
{
public:
    /**
     * The type of our priority value.
     */
    typedef size_t priority_t;

    /**
     * Default constructor disallowed.
     */
    Directive() = delete;

    /**
     * The input to this constructor must be stripped of comments
     * and trailing whitespace.
     */
    Directive(const std::string& line, bool allowed);

    /**
     * Default copy constructor.
     */
    Directive(const Directive& rhs) = default;

    /**
     * Default move constructor.
     */
    Directive(Directive&& rhs) = default;

    /**
     * The priority of the rule.
     */
    priority_t priority() const
    {
        return priority_;
    }

    /**
     * Whether or not the provided path matches. The path is
     * expected to be properly escaped.
     */
    bool match(const std::string& path) const;

    /**
     * Whether this rule is for an allow or a disallow.
     */
    bool allowed() const
    {
        return allowed_;
    }

    /**
     * Human-readable form of the rule.
     */
    std::string str() const;

    /**
     * Default copy assignment operator.
     */
    Directive& operator=(const Directive& rhs) = default;

    /**
     * Default move assignment operator. Declared explicitly because the
     * user-declared copy operations suppress the implicit one; without it,
     * algorithms that move-assign elements (such as std::sort) copy the
     * expression string instead.
     */
    Directive& operator=(Directive&& rhs) = default;

private:
    std::string expression_;
    priority_t priority_;
    bool allowed_;

    /**
     * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
     */
    bool match(const std::string::const_iterator& e_begin,
               const std::string::const_iterator& e_end,
               const std::string::const_iterator& p_begin,
               const std::string::const_iterator& p_end) const;
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
196
crawler/third_party/rep-cpp/robots.cpp
vendored
196
crawler/third_party/rep-cpp/robots.cpp
vendored
@ -1,196 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <cctype>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "../url-cpp/url.h"
|
||||
|
||||
#include "robots.h"
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
|
||||
/**
 * Remove leading and trailing whitespace from `string` in place.
 *
 * A lambda replaces std::not1/std::ptr_fun (ptr_fun was removed in C++17).
 * It takes unsigned char because std::isspace is undefined for negative
 * values, which plain char can hold for non-ASCII bytes.
 */
void Robots::strip(std::string& string)
{
    const auto not_space = [](const unsigned char c) {
        return std::isspace(c) == 0;
    };
    string.erase(string.begin(),
                 std::find_if(string.begin(), string.end(), not_space));
    string.erase(std::find_if(string.rbegin(), string.rend(), not_space).base(),
                 string.end());
}
|
||||
|
||||
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
|
||||
{
|
||||
while (getline(stream, key))
|
||||
{
|
||||
size_t index = key.find('#');
|
||||
if (index != std::string::npos)
|
||||
{
|
||||
key.resize(index);
|
||||
}
|
||||
|
||||
// Find the colon and divide it into key and value, skipping malformed lines
|
||||
index = key.find(':');
|
||||
if (index == std::string::npos)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
value.assign(key.begin() + index + 1, key.end());
|
||||
key.resize(index);
|
||||
|
||||
// Strip whitespace off of each
|
||||
strip(key);
|
||||
strip(value);
|
||||
|
||||
// Lowercase the key
|
||||
std::transform(key.begin(), key.end(), key.begin(), ::tolower);
|
||||
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Parse robots.txt `content` with no base URL (empty host).
 */
Robots::Robots(const std::string& content)
    : Robots(content, std::string())
{
}
|
||||
|
||||
/**
 * Parse robots.txt `content`, resolving rules against the host of `base_url`.
 *
 * A "*" agent always exists. Consecutive User-agent lines form a group: the
 * first name owns the rules and the remaining names receive a copy of them
 * once the group is complete. A leading UTF-8 byte-order mark is skipped.
 */
Robots::Robots(const std::string& content, const std::string& base_url) :
    host_(Url::Url(base_url).host()),
    agents_(),
    sitemaps_(),
    default_(agents_.emplace("*", Agent(host_)).first->second)
{
    // tolower is undefined for negative values; go through unsigned char.
    const auto to_lower = [](const unsigned char c) {
        return static_cast<char>(std::tolower(c));
    };

    std::string agent_name("*");
    std::istringstream input(content);
    if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
    {
        input.ignore(3);
    }
    std::string key, value;
    std::vector<std::string> group;
    bool last_agent = false;
    // Track the current agent by pointer, not iterator: emplace() may rehash
    // the map, which invalidates iterators but not pointers to elements.
    Agent* current = &default_;
    while (Robots::getpair(input, key, value))
    {
        if (key.compare("user-agent") == 0)
        {
            // Store the user agent string as lowercased
            std::transform(value.begin(), value.end(), value.begin(), to_lower);

            if (last_agent)
            {
                group.push_back(value);
            }
            else
            {
                if (!agent_name.empty())
                {
                    for (const auto& other : group)
                    {
                        agents_.emplace(other, *current);
                    }
                    group.clear();
                }
                agent_name = value;
                current = &agents_.emplace(agent_name, Agent(host_)).first->second;
            }
            last_agent = true;
            continue;
        }
        else
        {
            last_agent = false;
        }

        if (key.compare("sitemap") == 0)
        {
            sitemaps_.push_back(value);
        }
        else if (key.compare("disallow") == 0)
        {
            current->disallow(value);
        }
        else if (key.compare("allow") == 0)
        {
            current->allow(value);
        }
        else if (key.compare("crawl-delay") == 0)
        {
            try
            {
                current->delay(std::stof(value));
            }
            catch (const std::exception&)
            {
                std::cerr << "Could not parse " << value << " as float." << std::endl;
            }
        }
    }

    // Give the members of the final group their copy of the rules.
    if (!agent_name.empty())
    {
        for (const auto& other : group)
        {
            agents_.emplace(other, *current);
        }
    }
}
|
||||
|
||||
/**
 * Look up the agent registered under `name` (case-insensitively), falling
 * back to the default "*" agent when there is no such entry.
 */
const Agent& Robots::agent(const std::string& name) const
{
    // Lowercase the agent. Go through unsigned char: tolower is undefined
    // for negative values, which non-ASCII bytes produce in a plain char.
    std::string lowered(name);
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
        [](const unsigned char c) { return static_cast<char>(std::tolower(c)); });

    const auto it = agents_.find(lowered);
    if (it == agents_.end())
    {
        return default_;
    }
    return it->second;
}
|
||||
|
||||
bool Robots::allowed(const std::string& path, const std::string& name) const
|
||||
{
|
||||
return agent(name).allowed(path);
|
||||
}
|
||||
|
||||
std::string Robots::str() const
|
||||
{
|
||||
std::stringstream out;
|
||||
// TODO: include sitepath info
|
||||
out << '{';
|
||||
auto begin = agents_.begin();
|
||||
auto end = agents_.end();
|
||||
if (begin != end)
|
||||
{
|
||||
out << '"' << begin->first << '"' << ": " << begin->second.str();
|
||||
++begin;
|
||||
}
|
||||
for (; begin != end; ++begin)
|
||||
{
|
||||
out << ", \"" << begin->first << '"' << ": " << begin->second.str();
|
||||
}
|
||||
out << '}';
|
||||
return out.str();
|
||||
}
|
||||
|
||||
std::string Robots::robotsUrl(const std::string& url)
|
||||
{
|
||||
return Url::Url(url)
|
||||
.setUserinfo("")
|
||||
.setPath("robots.txt")
|
||||
.setParams("")
|
||||
.setQuery("")
|
||||
.setFragment("")
|
||||
.remove_default_port()
|
||||
.str();
|
||||
}
|
||||
}
|
66
crawler/third_party/rep-cpp/robots.h
vendored
66
crawler/third_party/rep-cpp/robots.h
vendored
@ -1,66 +0,0 @@
|
||||
#ifndef ROBOTS_CPP_H
|
||||
#define ROBOTS_CPP_H
|
||||
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "agent.h"
|
||||
|
||||
namespace Rep
|
||||
{
|
||||
|
||||
class Robots
|
||||
{
|
||||
public:
|
||||
typedef std::unordered_map<std::string, Agent> agent_map_t;
|
||||
typedef std::vector<std::string> sitemaps_t;
|
||||
|
||||
/**
|
||||
* Create a robots.txt from a utf-8-encoded string.
|
||||
*/
|
||||
explicit Robots(const std::string& content);
|
||||
|
||||
/**
|
||||
* Create a robots.txt from a utf-8-encoded string assuming
|
||||
* the given base_url.
|
||||
*/
|
||||
Robots(const std::string& content, const std::string& base_url);
|
||||
|
||||
/**
|
||||
* Get the sitemaps in this robots.txt
|
||||
*/
|
||||
const sitemaps_t& sitemaps() const { return sitemaps_; }
|
||||
|
||||
/**
|
||||
* Get the agent with the corresponding name.
|
||||
*/
|
||||
const Agent& agent(const std::string& name) const;
|
||||
|
||||
/**
|
||||
* Return true if agent is allowed to fetch the URL (either a
|
||||
* full URL or a path).
|
||||
*/
|
||||
bool allowed(const std::string& path, const std::string& name) const;
|
||||
|
||||
std::string str() const;
|
||||
|
||||
/**
|
||||
* Return the robots.txt URL corresponding to the provided URL.
|
||||
*/
|
||||
static std::string robotsUrl(const std::string& url);
|
||||
|
||||
private:
|
||||
static void strip(std::string& string);
|
||||
|
||||
static bool getpair(
|
||||
std::istringstream& stream, std::string& key, std::string& value);
|
||||
|
||||
std::string host_;
|
||||
agent_map_t agents_;
|
||||
sitemaps_t sitemaps_;
|
||||
Agent& default_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
183
crawler/third_party/url-cpp/psl.cpp
vendored
183
crawler/third_party/url-cpp/psl.cpp
vendored
@ -1,183 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "psl.h"
|
||||
#include "punycode.h"
|
||||
|
||||
namespace Url
|
||||
{
|
||||
const std::string PSL::not_found = "";
|
||||
|
||||
/**
 * Read public-suffix rules from `stream`, one per line.
 *
 * Only the text up to the first whitespace of each line is used. Blank
 * lines and lines starting with "//" are skipped. "*.<host>" is a wildcard
 * rule and "!<host>" an exception rule.
 *
 * @throws std::invalid_argument on a malformed wildcard or exception rule.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. The predicate takes
        // unsigned char because isspace is undefined for negative values,
        // which the bytes of non-ASCII (UTF-8) rules produce in a plain char.
        auto it = std::find_if(line.begin(), line.end(),
            [](const unsigned char c) { return ::isspace(c) != 0; });
        line.resize(it - line.begin());

        // Skip blank lines
        if (line.empty())
        {
            continue;
        }

        // Skip comments
        if (line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }

            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }

            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
|
||||
|
||||
/**
 * Load a PSL from the rules file at `path`.
 *
 * @throws std::invalid_argument if the file cannot be opened for reading.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream input(path);
    if (input.good())
    {
        return PSL(input);
    }

    std::string message("Path '");
    message += path;
    message += "' inaccessible.";
    throw std::invalid_argument(message);
}
|
||||
|
||||
/**
 * Build a PSL from rules held in the string `str`.
 */
PSL PSL::fromString(const std::string& str)
{
    std::istringstream rules(str);
    return PSL(rules);
}
|
||||
|
||||
std::string PSL::getTLD(const std::string& hostname) const
|
||||
{
|
||||
return getLastSegments(hostname, getTLDLength(hostname));
|
||||
}
|
||||
|
||||
std::string PSL::getPLD(const std::string& hostname) const
|
||||
{
|
||||
return getLastSegments(hostname, getTLDLength(hostname) + 1);
|
||||
}
|
||||
|
||||
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
|
||||
{
|
||||
size_t length = getTLDLength(hostname);
|
||||
return std::make_pair(
|
||||
getLastSegments(hostname, length),
|
||||
getLastSegments(hostname, length + 1));
|
||||
}
|
||||
|
||||
size_t PSL::getTLDLength(const std::string& hostname) const
|
||||
{
|
||||
// Reversed copy of hostname
|
||||
std::string tld(hostname.rbegin(), hostname.rend());
|
||||
std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower);
|
||||
|
||||
while (tld.size())
|
||||
{
|
||||
auto it = levels.find(tld);
|
||||
if (it != levels.end())
|
||||
{
|
||||
return it->second;
|
||||
}
|
||||
|
||||
size_t position = tld.rfind('.');
|
||||
if (position == std::string::npos || position == 0)
|
||||
{
|
||||
tld.resize(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
tld.resize(position);
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
|
||||
{
|
||||
size_t position = hostname.size();
|
||||
size_t remaining = segments;
|
||||
while (remaining != 0 && position && position != std::string::npos)
|
||||
{
|
||||
position = hostname.rfind('.', position - 1);
|
||||
remaining -= 1;
|
||||
}
|
||||
|
||||
if (remaining >= 1)
|
||||
{
|
||||
return not_found;
|
||||
}
|
||||
|
||||
// Return the whole string if position == std:string::npos
|
||||
size_t start = (position == std::string::npos) ? 0 : position + 1;
|
||||
|
||||
std::string result(hostname, start);
|
||||
std::transform(result.begin(), result.end(), result.begin(), ::tolower);
|
||||
|
||||
// Leading .'s indicate that the query had an empty segment
|
||||
if (result.size() && result[0] == '.')
|
||||
{
|
||||
std::stringstream message;
|
||||
message << "Empty segment in " << result;
|
||||
throw std::invalid_argument(message.str());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
size_t PSL::countSegments(const std::string& hostname) const
|
||||
{
|
||||
size_t count = 1;
|
||||
size_t position = hostname.find('.');
|
||||
while (position != std::string::npos)
|
||||
{
|
||||
count += 1;
|
||||
position = hostname.find('.', position + 1);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
void PSL::add(std::string& rule, int level_adjust, size_t trim)
|
||||
{
|
||||
// First unpunycoded
|
||||
std::string copy(rule.rbegin(), rule.rend() - trim);
|
||||
size_t length = countSegments(copy) + level_adjust;
|
||||
levels[copy] = length;
|
||||
|
||||
// And now punycoded
|
||||
rule = Punycode::encodeHostname(rule);
|
||||
copy.assign(rule.rbegin(), rule.rend() - trim);
|
||||
levels[copy] = length;
|
||||
}
|
||||
|
||||
};
|
102
crawler/third_party/url-cpp/psl.h
vendored
102
crawler/third_party/url-cpp/psl.h
vendored
@ -1,102 +0,0 @@
|
||||
#ifndef PSL_CPP_H
|
||||
#define PSL_CPP_H
|
||||
|
||||
#include <istream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
/**
|
||||
* Find TLDs and PLDs of a hostname according to a PSL.
|
||||
*/
|
||||
struct PSL
{
    /**
     * Indicates that there is no TLD / PLD
     */
    static const std::string not_found;

    /**
     * Read a PSL from an istream.
     */
    PSL(std::istream& stream);

    /**
     * Create an empty PSL.
     */
    PSL() = default;

    // The only data member is a standard container, so the defaulted special
    // members are correct. Defaulting them (rather than hand-writing the
    // copy operations) also enables moves, which the hand-written copies
    // suppressed.
    PSL(const PSL& other) = default;
    PSL(PSL&& other) = default;
    PSL& operator=(const PSL& other) = default;
    PSL& operator=(PSL&& other) = default;

    /**
     * Read the provided path holding a set of PSL rules.
     */
    static PSL fromPath(const std::string& path);

    /**
     * Create a PSL object from a string.
     */
    static PSL fromString(const std::string& str);

    /**
     * Get just the TLD of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::string getTLD(const std::string& hostname) const;

    /**
     * Get just the PLD of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::string getPLD(const std::string& hostname) const;

    /**
     * Get the (TLD, PLD) of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::pair<std::string, std::string> getBoth(const std::string& hostname) const;

private:
    // Mapping of a string rule to its level
    std::unordered_map<std::string, size_t> levels;

    // Return the number of segments in a hostname
    size_t countSegments(const std::string& hostname) const;

    // Return the number of segments in the TLD of the provided hostname
    size_t getTLDLength(const std::string& hostname) const;

    // Return the last `segments` segments of a hostname
    std::string getLastSegments(const std::string& hostname, size_t segments) const;

    /**
     * Add the provided host with the provided priority, trimming characters off
     * the front, and adjusting the level by the provided number.
     */
    void add(std::string& host, int level_adjust, size_t trim);
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
408
crawler/third_party/url-cpp/punycode.cpp
vendored
408
crawler/third_party/url-cpp/punycode.cpp
vendored
@ -1,408 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "punycode.h"
|
||||
#include "utf8.h"
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
std::string& Punycode::encode(std::string& str)
|
||||
{
|
||||
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
|
||||
//
|
||||
// let n = initial_n
|
||||
// let delta = 0
|
||||
// let bias = initial_bias
|
||||
punycode_uint n = INITIAL_N;
|
||||
punycode_uint delta = 0;
|
||||
punycode_uint bias = INITIAL_BIAS;
|
||||
std::string output;
|
||||
|
||||
// Accumulate the non-basic codepoints
|
||||
std::vector<punycode_uint> codepoints;
|
||||
for (auto it = str.cbegin(); it != str.cend(); )
|
||||
{
|
||||
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
|
||||
if (value < 0x80)
|
||||
{
|
||||
// copy them to the output in order
|
||||
output.append(1, static_cast<char>(value));
|
||||
}
|
||||
codepoints.push_back(value);
|
||||
}
|
||||
|
||||
// let h = b = the number of basic code points in the input
|
||||
size_t h = output.size();
|
||||
size_t b = h;
|
||||
|
||||
// copy a delimiter if b > 0
|
||||
if (b > 0)
|
||||
{
|
||||
output.append(1, '-');
|
||||
}
|
||||
|
||||
// while h < length(input) do begin
|
||||
while (h < codepoints.size())
|
||||
{
|
||||
// let m = the minimum {non-basic} code point >= n in the input
|
||||
punycode_uint m = MAX_PUNYCODE_UINT;
|
||||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
||||
{
|
||||
if ((*it >= n) && (*it < m))
|
||||
{
|
||||
m = *it;
|
||||
}
|
||||
}
|
||||
|
||||
// let delta = delta + (m - n) * (h + 1), fail on overflow
|
||||
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
|
||||
{
|
||||
throw std::invalid_argument("Overflow delta update.");
|
||||
}
|
||||
delta += (m - n) * (h + 1);
|
||||
|
||||
// let n = m
|
||||
n = m;
|
||||
|
||||
// for each code point c in the input (in order) do begin
|
||||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
||||
{
|
||||
// if c < n {or c is basic} then increment delta, fail on overflow
|
||||
if (*it < n)
|
||||
{
|
||||
if (delta == MAX_PUNYCODE_UINT)
|
||||
{
|
||||
throw std::invalid_argument("Overflow delta increment.");
|
||||
}
|
||||
++delta;
|
||||
}
|
||||
|
||||
// if c == n then begin
|
||||
if (*it == n)
|
||||
{
|
||||
// let q = delta
|
||||
punycode_uint q = delta;
|
||||
|
||||
// for k = base to infinity in steps of base do begin
|
||||
for (punycode_uint k = BASE; ; k += BASE)
|
||||
{
|
||||
// let t = tmin if k <= bias {+ tmin}, or
|
||||
// tmax if k >= bias + tmax, or k - bias otherwise
|
||||
punycode_uint t = k <= bias ? TMIN :
|
||||
k >= bias + TMAX ? TMAX : k - bias;
|
||||
|
||||
// if q < t then break
|
||||
if (q < t)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// output the code point for digit t + ((q - t) mod (base - t))
|
||||
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
|
||||
|
||||
// let q = (q - t) div (base - t)
|
||||
q = (q - t) / (BASE - t);
|
||||
}
|
||||
|
||||
// output the code point for digit q
|
||||
output.append(1, DIGIT_TO_BASIC[q]);
|
||||
|
||||
// let bias = adapt(delta, h + 1, test h equals b?)
|
||||
bias = adapt(delta, h + 1, h == b);
|
||||
|
||||
// let delta = 0
|
||||
delta = 0;
|
||||
|
||||
// increment h
|
||||
++h;
|
||||
}
|
||||
}
|
||||
|
||||
// increment delta and n
|
||||
++delta;
|
||||
++n;
|
||||
}
|
||||
|
||||
str.assign(output);
|
||||
return str;
|
||||
}
|
||||
|
||||
/**
 * Non-mutating variant: returns the Punycode encoding of `str` as a new
 * string by running the in-place overload on a copy.
 */
std::string Punycode::encode(const std::string& str)
{
    std::string encoded = str;
    encode(encoded);
    return encoded;
}
|
||||
|
||||
std::string Punycode::encodeHostname(const std::string& hostname)
|
||||
{
|
||||
// Avoid any punycoding at all if none is needed
|
||||
if (!needsPunycoding(hostname))
|
||||
{
|
||||
return hostname;
|
||||
}
|
||||
|
||||
std::string encoded;
|
||||
|
||||
size_t start = 0;
|
||||
size_t end = hostname.find('.');
|
||||
while(true)
|
||||
{
|
||||
std::string segment = hostname.substr(start, end - start);
|
||||
if (needsPunycoding(segment))
|
||||
{
|
||||
encoded.append("xn--");
|
||||
encoded.append(Punycode::encode(segment));
|
||||
}
|
||||
else
|
||||
{
|
||||
encoded.append(segment);
|
||||
}
|
||||
|
||||
if (end == std::string::npos)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
encoded.append(1, '.');
|
||||
start = end + 1;
|
||||
end = hostname.find('.', start);
|
||||
}
|
||||
}
|
||||
|
||||
return encoded;
|
||||
}
|
||||
|
||||
std::string& Punycode::decode(std::string& str)
|
||||
{
|
||||
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
|
||||
//
|
||||
// let n = initial_n
|
||||
// let i = 0
|
||||
// let bias = initial_bias
|
||||
// let output = an empty string indexed from 0
|
||||
punycode_uint n = INITIAL_N;
|
||||
punycode_uint i = 0;
|
||||
punycode_uint bias = INITIAL_BIAS;
|
||||
std::vector<punycode_uint> codepoints;
|
||||
|
||||
size_t index = str.rfind('-');
|
||||
if (index == std::string::npos)
|
||||
{
|
||||
index = 0;
|
||||
}
|
||||
|
||||
// consume all code points before the last delimiter (if there is one)
|
||||
// and copy them to output, fail on any non-basic code point
|
||||
for (auto it = str.begin(); it != (str.begin() + index); ++it)
|
||||
{
|
||||
if (static_cast<unsigned char>(*it) > 127U)
|
||||
{
|
||||
throw std::invalid_argument("Argument has non-basic code points.");
|
||||
}
|
||||
codepoints.push_back(*it);
|
||||
}
|
||||
|
||||
// if more than zero code points were consumed then consume one more
|
||||
// (which will be the last delimiter)
|
||||
if (index > 0)
|
||||
{
|
||||
index += 1;
|
||||
}
|
||||
|
||||
// while the input is not exhausted do begin
|
||||
for (auto it = (str.begin() + index); it != str.end(); ++it)
|
||||
{
|
||||
// let oldi = i
|
||||
// let w = 1
|
||||
punycode_uint oldi = i;
|
||||
punycode_uint w = 1;
|
||||
|
||||
// for k = base to infinity in steps of base do begin
|
||||
for (punycode_uint k = BASE; ; k += BASE, ++it)
|
||||
{
|
||||
// consume a code point, or fail if there was none to consume
|
||||
if (it == str.end())
|
||||
{
|
||||
throw std::invalid_argument("Premature termination");
|
||||
}
|
||||
|
||||
// let digit = the code point's digit-value, fail if it has none
|
||||
int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)];
|
||||
if (lookup == -1)
|
||||
{
|
||||
throw std::invalid_argument("Invalid base 36 character.");
|
||||
}
|
||||
unsigned char digit = static_cast<unsigned char>(lookup);
|
||||
|
||||
// let i = i + digit * w, fail on overflow
|
||||
if (digit > ((MAX_PUNYCODE_UINT - i) / w))
|
||||
{
|
||||
throw std::invalid_argument("Overflow on i.");
|
||||
}
|
||||
i += digit * w;
|
||||
|
||||
// let t = tmin if k <= bias {+ tmin}, or
|
||||
// tmax if k >= bias + tmax, or k - bias otherwise
|
||||
punycode_uint t = k <= bias ? TMIN :
|
||||
k >= bias + TMAX ? TMAX : k - bias;
|
||||
|
||||
// if digit < t then break
|
||||
if (digit < t)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// let w = w * (base - t), fail on overflow
|
||||
if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
|
||||
{
|
||||
// I believe this line is unreachable without first overflowing i.
|
||||
// Since 'i' is updated above as i += digit * w, and w is updated as
|
||||
// w = w * (BASE - t), we should like to keep (BASE - t) > digit to
|
||||
// give 'w' a chance to overflow first. To keep t minimized, we must
|
||||
// have 'bias' maximized. `bias` is driven by the 'adapt' function
|
||||
// below.
|
||||
//
|
||||
// The value returned by 'adapt' increases with the input delta, and
|
||||
// decreases with the input size. The delta is a function of the input
|
||||
// size as well, on the order of (delta_n * input size), and
|
||||
// legitimate delta_n values are limited to 0x10FFFF (the maximum
|
||||
// unicode codepoint). Even setting that aside, the maximum value that
|
||||
// adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
|
||||
//
|
||||
// Using this bias, we could use the input (HERE) to get iterations:
|
||||
//
|
||||
// digit = b = 1, i = 2, k = 36, t = 1, w = 35
|
||||
// digit = b = 1, i = 37, k = 72, t = 1, w = 1225
|
||||
// digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
|
||||
// digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
|
||||
// digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
|
||||
//
|
||||
// At this point, t now becomes TMAX (26) because k exceeds the bias
|
||||
// (since the maximum bias is 204). As such, the minimum continuation
|
||||
// value is 26:
|
||||
//
|
||||
// digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
|
||||
//
|
||||
// However, the next iteration now overflows i before we can get to
|
||||
// the w update.
|
||||
throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
|
||||
}
|
||||
w *= (BASE - t);
|
||||
}
|
||||
|
||||
// let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
|
||||
bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);
|
||||
|
||||
// let n = n + i div (length(output) + 1), fail on overflow
|
||||
if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
|
||||
{
|
||||
throw std::invalid_argument("Overflow on n.");
|
||||
}
|
||||
n += i / (codepoints.size() + 1);
|
||||
|
||||
// let i = i mod (length(output) + 1)
|
||||
i %= (codepoints.size() + 1);
|
||||
|
||||
// insert n into output at position i
|
||||
codepoints.insert(codepoints.begin() + i, n);
|
||||
|
||||
// increment i
|
||||
++i;
|
||||
}
|
||||
|
||||
std::string output;
|
||||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
||||
{
|
||||
Utf8::writeCodepoint(output, *it);
|
||||
}
|
||||
str.assign(output);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
/**
 * Non-mutating variant: returns the decoded form of the Punycode string
 * `str` by running the in-place overload on a copy.
 */
std::string Punycode::decode(const std::string& str)
{
    std::string decoded = str;
    decode(decoded);
    return decoded;
}
|
||||
|
||||
/**
 * Decode a hostname one dot-separated label at a time.
 *
 * Labels that begin with the exact (lower-case) prefix "xn--" have the
 * prefix removed and the remainder Punycode-decoded; every other label is
 * copied unchanged.
 */
std::string Punycode::decodeHostname(const std::string& hostname)
{
    std::string result;
    size_t label_begin = 0;
    size_t dot = hostname.find('.');
    for (;;)
    {
        // When `dot` is npos the count is huge and substr() clamps it, so
        // the last label runs to the end of the hostname.
        std::string label = hostname.substr(label_begin, dot - label_begin);
        if (label.compare(0, 4, "xn--") == 0)
        {
            // Drop the prefix, then decode what is left.
            label.erase(0, 4);
            result += Punycode::decode(label);
        }
        else
        {
            result += label;
        }

        if (dot == std::string::npos)
        {
            return result;
        }

        result.push_back('.');
        label_begin = dot + 1;
        dot = hostname.find('.', label_begin);
    }
}
|
||||
|
||||
/**
 * Report whether `str` contains at least one byte with the high bit set,
 * i.e. a byte outside the 7-bit ASCII range.
 */
bool Punycode::needsPunycoding(const std::string& str)
{
    for (const char c : str)
    {
        if (static_cast<unsigned char>(c) & 0x80)
        {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
/**
 * Bias adaptation function (pseudocode in RFC 3492 section 6.1).
 *
 * `delta` and `numpoints` are taken by value, so the scaling done here never
 * affects the caller's variables.
 *
 * @param delta      the delta just encoded or decoded
 * @param numpoints  divisor used for the proportional increase of delta
 * @param firsttime  selects division by DAMP instead of halving
 * @return the new bias
 */
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // Scale delta down: by DAMP on the first call, by 2 afterwards.
    if (firsttime)
    {
        delta /= DAMP;
    }
    else
    {
        delta >>= 1;
    }

    // delta = delta + (delta div numpoints)
    delta += delta / numpoints;

    // Repeatedly divide delta until it falls within the threshold, adding
    // BASE to k for every division performed.
    punycode_uint k = 0;
    while (delta > ((BASE - TMIN) * TMAX) / 2)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }

    // k + (((base - tmin + 1) * delta) div (delta + skew))
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
|
||||
|
||||
};
|
106
crawler/third_party/url-cpp/punycode.h
vendored
106
crawler/third_party/url-cpp/punycode.h
vendored
@ -1,106 +0,0 @@
|
||||
#ifndef PUNYCODE_CPP_H
|
||||
#define PUNYCODE_CPP_H
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <limits>
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
namespace Punycode
|
||||
{
|
||||
typedef Utf8::codepoint_t punycode_uint;
|
||||
|
||||
const unsigned int BASE = 36;
|
||||
const unsigned int TMIN = 1;
|
||||
const unsigned int TMAX = 26;
|
||||
const unsigned int SKEW = 38;
|
||||
const unsigned int DAMP = 700;
|
||||
const unsigned int INITIAL_BIAS = 72;
|
||||
const unsigned int INITIAL_N = 128;
|
||||
|
||||
// Codepoints to their base-36 value
|
||||
const std::vector<int8_t> BASIC_TO_DIGIT = {
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
|
||||
};
|
||||
const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";
|
||||
|
||||
// The highest codepoint in unicode
|
||||
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
|
||||
//Utf8::MAX_CODEPOINT;
|
||||
//std::numeric_limits<punycode_uint>::max();
|
||||
|
||||
/**
|
||||
* Replace utf-8-encoded str into punycode.
|
||||
*/
|
||||
std::string& encode(std::string& str);
|
||||
|
||||
/**
|
||||
* Create a new punycoded string from utf-8-encoded input.
|
||||
*/
|
||||
std::string encode(const std::string& str);
|
||||
|
||||
/**
|
||||
* Encode a hostname.
|
||||
*/
|
||||
std::string encodeHostname(const std::string& hostname);
|
||||
|
||||
/**
|
||||
* Replace punycoded str into utf-8-encoded.
|
||||
*/
|
||||
std::string& decode(std::string& str);
|
||||
|
||||
/**
|
||||
* Create a new utf-8-encoded string from punycoded input.
|
||||
*/
|
||||
std::string decode(const std::string& str);
|
||||
|
||||
/**
|
||||
* Decode a hostname.
|
||||
*/
|
||||
std::string decodeHostname(const std::string& hostname);
|
||||
|
||||
/**
|
||||
* Determine if a string needs punycoding.
|
||||
*/
|
||||
bool needsPunycoding(const std::string& str);
|
||||
|
||||
/**
|
||||
* Internal function for calculating bias.
|
||||
*/
|
||||
punycode_uint adapt(
|
||||
punycode_uint delta, punycode_uint numpoints, bool firsttime);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
962
crawler/third_party/url-cpp/url.cpp
vendored
962
crawler/third_party/url-cpp/url.cpp
vendored
@ -1,962 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <sstream>
|
||||
|
||||
#include "url.h"
|
||||
#include "punycode.h"
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
/* Character classes */
|
||||
// Character classes. The first four are spelled out literally; most of the
// others are built by concatenating the characters of classes defined above
// them.
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
const CharacterClass Url::DIGIT("0123456789");
const CharacterClass Url::ALPHA("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

// Letters, digits and "-._~".
const CharacterClass Url::UNRESERVED(Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");

// Every general and sub delimiter.
const CharacterClass Url::RESERVED(Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());

// Characters left as-is when escaping each URL component.
const CharacterClass Url::PCHAR(Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
const CharacterClass Url::PATH(Url::PCHAR.chars() + "/");
const CharacterClass Url::QUERY(Url::PCHAR.chars() + "/?");
const CharacterClass Url::FRAGMENT(Url::PCHAR.chars() + "/?");
const CharacterClass Url::USERINFO(Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");

// Upper-case hex digits, indexed by nibble value when writing %XX escapes.
const CharacterClass Url::HEX("0123456789ABCDEF");

// Characters accepted in a would-be scheme.
const CharacterClass Url::SCHEME(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
|
||||
// Maps a byte value (0-255) to the value of the hex digit it represents, or
// -1 when the byte is not a hex digit. Both 'A'-'F' and 'a'-'f' are accepted.
const std::vector<signed char> Url::HEX_TO_DEC = {
    // 0x00 - 0x1F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x20 - 0x3F ('0'-'9' start at 0x30)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
    // 0x40 - 0x5F ('A'-'F' start at 0x41)
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x60 - 0x7F ('a'-'f' start at 0x61)
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x80 - 0xFF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
|
||||
// Port number associated with each scheme listed here.
const std::unordered_map<std::string, int> Url::PORTS = {
    {"http", 80}, {"https", 443}
};
|
||||
// Schemes (including the empty scheme) for which relative_to() resolves a
// URL against a base; any other scheme is returned unchanged.
const std::unordered_set<std::string> Url::USES_RELATIVE = {
    "", "file", "ftp", "gopher", "http", "https", "imap", "mms", "nntp",
    "prospero", "rtsp", "rtspu", "sftp", "shttp", "svn", "svn+ssh", "wais"
};
|
||||
// Schemes that str() serialises with "://" after the scheme name; schemes
// not listed here get a bare ":".
const std::unordered_set<std::string> Url::USES_NETLOC = {
    "", "file", "ftp", "git", "git+ssh", "gopher", "http", "https", "imap",
    "mms", "nfs", "nntp", "prospero", "rsync", "rtsp", "rtspu", "sftp",
    "shttp", "snews", "svn", "svn+ssh", "telnet", "wais"
};
|
||||
// Schemes for which the constructor splits ";params" off the path.
const std::unordered_set<std::string> Url::USES_PARAMS = {
    "", "ftp", "hdl", "http", "https", "imap", "mms", "prospero", "rtsp",
    "rtspu", "sftp", "shttp", "sip", "sips", "tel"
};
|
||||
// Schemes the constructor still accepts as a scheme when everything after
// the first ':' is digits (for example "http:80").
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
    "", "file", "ftp", "git", "git+ssh", "gopher", "hdl", "http", "https",
    "imap", "mms", "nfs", "nntp", "prospero", "rsync", "rtsp", "rtspu",
    "sftp", "shttp", "sip", "sips", "sms", "snews", "svn", "svn+ssh", "tel",
    "telnet", "wais"
};
|
||||
|
||||
/**
 * Parse `url` into scheme, userinfo, host, port, path, params, query and
 * fragment.
 *
 * The scheme and host are lower-cased. Params are only split off the path
 * for schemes listed in USES_PARAMS.
 *
 * Throws UrlParseException when a port is present but is not a number in
 * the range 0-65535.
 */
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    // tolower() has undefined behaviour for negative arguments other than
    // EOF, and a plain char holding a byte >= 0x80 (e.g. a UTF-8 hostname)
    // may be negative, so every byte goes through unsigned char first.
    const auto lower = [](char c)
    {
        return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    };

    size_t position = 0;
    size_t index = url.find(':');

    // All the characters in our would-be scheme must be in SCHEME.
    if (index != std::string::npos
        && std::all_of(
            url.begin(),
            url.begin() + index,
            [](char c) { return SCHEME(c); }))
    {
        scheme_.assign(url, 0, index);
        std::transform(scheme_.begin(), scheme_.end(), scheme_.begin(), lower);

        // True when something follows the ':' and all of it is digits.
        const bool only_digits_follow =
            (index + 1) < url.length()
            && std::all_of(
                url.begin() + index + 1,
                url.end(),
                [](char c) { return DIGIT(c); });

        if (only_digits_follow
            && KNOWN_PROTOCOLS.find(scheme_) == KNOWN_PROTOCOLS.end())
        {
            // Digits only after ':' and not a known protocol: do not treat
            // the prefix as a scheme (presumably a "host:port" form).
            scheme_.clear();
        }
        else
        {
            position = index + 1;
        }
    }

    // Search for the netloc. Both characters of the "//" marker must lie
    // inside the string.
    if ((url.length() - position) >= 2
        && url[position] == '/'
        && url[position + 1] == '/')
    {
        // Skip the '//'
        position += 2;
        index = url.find_first_of("/?#", position);
        host_.assign(url, position, index - position);
        position = index;

        // Extract any userinfo if there is any
        index = host_.find('@');
        if (index != std::string::npos)
        {
            userinfo_.assign(host_, 0, index);
            host_.assign(host_, index + 1, std::string::npos);
        }

        // Lowercase the hostname
        std::transform(host_.begin(), host_.end(), host_.begin(), lower);

        // Try to find a port
        index = host_.find(':');
        if (index != std::string::npos)
        {
            std::string portText(host_, index + 1, std::string::npos);
            host_.resize(index);

            if (portText.empty())
            {
                port_ = 0;
            }
            else
            {
                try
                {
                    size_t consumed = 0;
                    port_ = std::stoi(portText, &consumed);

                    if (consumed != portText.length())
                    {
                        // Malformed port
                        throw UrlParseException("Port not a number: " + portText);
                    }

                    if (port_ > 65535)
                    {
                        throw UrlParseException("Port too high: " + portText);
                    }
                    else if (port_ < 0)
                    {
                        throw UrlParseException("Port negative: " + portText);
                    }
                }
                catch (const std::invalid_argument&)
                {
                    // Malformed port
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }
            }
        }
    }

    // `position` is npos when the netloc ran to the end of the string, in
    // which case there is no path, query or fragment to extract.
    if (position != std::string::npos)
    {
        path_.assign(url, position, std::string::npos);

        // Fragment first, so a '?' inside it is not mistaken for a query.
        index = path_.find('#');
        if (index != std::string::npos)
        {
            fragment_.assign(path_, index + 1, std::string::npos);
            path_.resize(index);
        }

        index = path_.find('?');
        if (index != std::string::npos)
        {
            query_.assign(path_, index + 1, std::string::npos);
            has_query_ = true;
            path_.resize(index);
        }

        if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
        {
            index = path_.find(';');
            if (index != std::string::npos)
            {
                params_.assign(path_, index + 1, std::string::npos);
                has_params_ = true;
                path_.resize(index);
            }
        }
    }
}
|
||||
|
||||
/**
 * Copy every component of `other` into this URL (named alternative to
 * operator=). Returns *this.
 */
Url& Url::assign(const Url& other)
{
    *this = other;
    return *this;
}
|
||||
|
||||
bool Url::operator==(const Url& other) const
|
||||
{
|
||||
return (
|
||||
(scheme_ == other.scheme_ ) &&
|
||||
(userinfo_ == other.userinfo_ ) &&
|
||||
(host_ == other.host_ ) &&
|
||||
(port_ == other.port_ ) &&
|
||||
(path_ == other.path_ ) &&
|
||||
(params_ == other.params_ ) &&
|
||||
(query_ == other.query_ ) &&
|
||||
(fragment_ == other.fragment_ ) &&
|
||||
(has_params_ == other.has_params_) &&
|
||||
(has_query_ == other.has_query_ )
|
||||
);
|
||||
}
|
||||
|
||||
bool Url::operator!=(const Url& other) const
|
||||
{
|
||||
return !operator==(other);
|
||||
}
|
||||
|
||||
bool Url::equiv(const Url& other)
|
||||
{
|
||||
Url self_(*this);
|
||||
Url other_(other);
|
||||
|
||||
self_.strip()
|
||||
.sort_query()
|
||||
.defrag()
|
||||
.deuserinfo()
|
||||
.abspath()
|
||||
.escape()
|
||||
.punycode()
|
||||
.remove_default_port();
|
||||
other_.strip()
|
||||
.sort_query()
|
||||
.defrag()
|
||||
.deuserinfo()
|
||||
.abspath()
|
||||
.escape()
|
||||
.punycode()
|
||||
.remove_default_port();
|
||||
return self_ == other_;
|
||||
}
|
||||
|
||||
/**
 * Collapse every run of `chr` in `str` to a single occurrence and drop any
 * leading or trailing occurrence. Works in place; returns `str`.
 */
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    size_t out = 0;
    // Starting as if a `chr` had just been seen makes leading occurrences
    // count as repeats, so they are dropped too.
    bool previous_was_chr = true;
    for (size_t in = 0; in < str.length(); ++in)
    {
        const char current = str[in];
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            str[out++] = current;
        }
        previous_was_chr = is_chr;
    }

    // At most one trailing `chr` can remain at this point; drop it.
    if (out > 0 && str[out - 1] == chr)
    {
        --out;
    }
    str.resize(out);
    return str;
}
|
||||
|
||||
std::string Url::fullpath() const
|
||||
{
|
||||
std::string result;
|
||||
if (path_.empty() || path_[0] != '/')
|
||||
{
|
||||
result.append(1, '/');
|
||||
}
|
||||
result.append(path_);
|
||||
|
||||
if (has_params_)
|
||||
{
|
||||
result.append(";");
|
||||
result.append(params_);
|
||||
}
|
||||
|
||||
if (has_query_)
|
||||
{
|
||||
result.append("?");
|
||||
result.append(query_);
|
||||
}
|
||||
|
||||
if (!fragment_.empty())
|
||||
{
|
||||
result.append("#");
|
||||
result.append(fragment_);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string Url::str() const
|
||||
{
|
||||
std::string result;
|
||||
|
||||
if (!scheme_.empty())
|
||||
{
|
||||
result.append(scheme_);
|
||||
if (USES_NETLOC.find(scheme_) == USES_NETLOC.end())
|
||||
{
|
||||
result.append(":");
|
||||
}
|
||||
else
|
||||
{
|
||||
result.append("://");
|
||||
}
|
||||
}
|
||||
else if (!host_.empty())
|
||||
{
|
||||
result.append("//");
|
||||
}
|
||||
|
||||
if (!userinfo_.empty())
|
||||
{
|
||||
result.append(userinfo_);
|
||||
result.append("@");
|
||||
}
|
||||
|
||||
if (!host_.empty())
|
||||
{
|
||||
result.append(host_);
|
||||
}
|
||||
|
||||
if (port_)
|
||||
{
|
||||
result.append(":");
|
||||
result.append(std::to_string(port_));
|
||||
}
|
||||
|
||||
if (path_.empty())
|
||||
{
|
||||
if (!result.empty())
|
||||
{
|
||||
result.append("/");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!host_.empty() && path_[0] != '/')
|
||||
{
|
||||
result.append(1, '/');
|
||||
}
|
||||
result.append(path_);
|
||||
}
|
||||
|
||||
if (has_params_)
|
||||
{
|
||||
result.append(";");
|
||||
result.append(params_);
|
||||
}
|
||||
|
||||
if (has_query_)
|
||||
{
|
||||
result.append("?");
|
||||
result.append(query_);
|
||||
}
|
||||
|
||||
if (!fragment_.empty())
|
||||
{
|
||||
result.append("#");
|
||||
result.append(fragment_);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Remove leading '?' characters from the query, then collapse repeated,
 * leading and trailing separators in the query ('&') and params (';').
 * Returns *this.
 */
Url& Url::strip()
{
    const size_t first_kept = query_.find_first_not_of('?');
    if (first_kept == std::string::npos)
    {
        // The query is empty or consists only of '?' characters.
        query_.clear();
    }
    else
    {
        query_.erase(0, first_kept);
    }

    setQuery(remove_repeats(query_, '&'));
    setParams(remove_repeats(params_, ';'));
    return *this;
}
|
||||
|
||||
/**
 * Normalise the path in place: skip empty segments, drop "." segments, and
 * let each ".." segment remove the segment before it. A path whose last
 * segment is empty, "." or ".." ends with '/'. Returns *this.
 */
Url& Url::abspath()
{
    // Classifiers for the segment occupying [begin, end) of path_.
    const auto is_current = [this](size_t begin, size_t end)
    {
        return (end - begin == 1) && path_[begin] == '.';
    };
    const auto is_parent = [this](size_t begin, size_t end)
    {
        return (end - begin == 2)
            && path_[begin] == '.'
            && path_[begin + 1] == '.';
    };

    std::string rebuilt;
    // Offsets into `rebuilt` where each kept segment starts, so that ".."
    // can truncate back to the start of the previous segment.
    std::vector<size_t> starts;

    if (!path_.empty() && path_[0] == '/')
    {
        rebuilt.push_back('/');
        starts.push_back(0);
    }

    bool directory = false;
    size_t begin = 0;
    for (size_t slash = path_.find('/');
         slash != std::string::npos;
         begin = slash + 1, slash = path_.find('/', slash + 1))
    {
        // Skip empty segments
        if (slash == begin)
        {
            continue;
        }

        if (is_parent(begin, slash))
        {
            if (!starts.empty())
            {
                rebuilt.resize(starts.back());
                starts.pop_back();
            }
            directory = true;
        }
        else if (is_current(begin, slash))
        {
            directory = true;
        }
        else
        {
            starts.push_back(rebuilt.length());
            rebuilt.append(path_, begin, slash - begin);
            rebuilt.push_back('/');
            directory = false;
        }
    }

    // Handle the last segment, which runs from `begin` to the end of path_.
    const size_t end = path_.length();
    if (begin == end || is_current(begin, end))
    {
        directory = true;
    }
    else if (is_parent(begin, end))
    {
        if (!starts.empty())
        {
            rebuilt.resize(starts.back());
        }
        directory = true;
    }
    else
    {
        rebuilt.append(path_, begin, end - begin);
        rebuilt.push_back('/');
        directory = false;
    }

    if (!directory && rebuilt.size() >= 1)
    {
        // Not a directory: remove the '/' appended after the last segment.
        rebuilt.resize(rebuilt.size() - 1);
    }
    else if (directory && rebuilt.empty())
    {
        rebuilt.push_back('/');
    }
    path_.assign(rebuilt);

    return *this;
}
|
||||
|
||||
/**
 * Resolve this URL against the base URL `other`, in place. Returns *this.
 *
 * URLs whose scheme is not in USES_RELATIVE are left untouched. An empty
 * scheme is inherited from `other`; URLs that already have a host are
 * otherwise left alone.
 */
Url& Url::relative_to(const Url& other)
{
    // Schemes that do not use relative resolution are returned unchanged.
    if (USES_RELATIVE.count(scheme_) == 0)
    {
        return *this;
    }

    // Scheme-relative URLs inherit the base scheme.
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // Absolute (or scheme-relative) URL: nothing more to do.
    if (!host_.empty())
    {
        return *this;
    }

    // Otherwise take the authority from the base.
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    // An absolute path is kept as it is.
    if (!path_.empty() && path_.front() == '/')
    {
        return *this;
    }

    if (!path_.empty())
    {
        // Relative path: append it to the directory part of the base path.
        const size_t last_slash = other.path_.rfind('/');
        if (last_slash != std::string::npos)
        {
            path_ = other.path_.substr(0, last_slash + 1) + path_;
        }
        else if (!host_.empty())
        {
            path_ = "/" + path_;
        }
        return *this;
    }

    // Empty path from here on.
    if (params_.empty())
    {
        // No params of our own: inherit path and params, and the query too
        // when ours is empty.
        path_ = other.path_;
        params_ = other.params_;
        has_params_ = other.has_params_;
        if (query_.empty())
        {
            query_ = other.query_;
            has_query_ = other.has_query_;
        }
    }
    else
    {
        // Keep our params but take the directory part of the base path
        // (rfind() returning npos makes the count 0, i.e. an empty path).
        path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
    }

    if (fragment_.empty())
    {
        fragment_ = other.fragment_;
    }

    return *this;
}
|
||||
|
||||
/**
 * Escape path, query, params and userinfo in place, each against its own
 * set of safe characters. Params are escaped with the QUERY class.
 * `strict` is forwarded to the per-string escape(). Returns *this.
 */
Url& Url::escape(bool strict)
{
    escape(path_,     PATH,     strict);
    escape(query_,    QUERY,    strict);
    escape(params_,   QUERY,    strict);
    escape(userinfo_, USERINFO, strict);
    return *this;
}
|
||||
|
||||
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
|
||||
{
|
||||
std::string copy(str);
|
||||
size_t dest = 0;
|
||||
// Allocate space pessimistically -- if every entity is expanded, it will take 3x
|
||||
// the space.
|
||||
str.resize(str.length() * 3);
|
||||
for (size_t src = 0; src < copy.length(); ++src)
|
||||
{
|
||||
if (copy[src] == '%' && (copy.length() - src) >= 2)
|
||||
{
|
||||
// Read ahead to see if there's a valid escape sequence. If not, treat
|
||||
// this like a normal character.
|
||||
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
|
||||
{
|
||||
int value = (
|
||||
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
|
||||
|
||||
// In strict mode, we can only unescape parameters if they are both
|
||||
// safe and not reserved
|
||||
if (!strict || (strict && safe(value) && !RESERVED(value)))
|
||||
{
|
||||
// Replace src + 2 with that byte, advance src to consume it and
|
||||
// continue.
|
||||
src += 2;
|
||||
copy[src] = value;
|
||||
}
|
||||
else
|
||||
{
|
||||
str[dest++] = copy[src++];
|
||||
str[dest++] = ::toupper(copy[src++]);
|
||||
str[dest++] = ::toupper(copy[src]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!safe(copy[src]))
|
||||
{
|
||||
// Not safe -- replace with %XX
|
||||
str[dest++] = '%';
|
||||
str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF];
|
||||
str[dest++] = HEX.chars()[copy[src] & 0xF];
|
||||
}
|
||||
else
|
||||
{
|
||||
str[dest++] = copy[src];
|
||||
}
|
||||
}
|
||||
str.resize(dest);
|
||||
return str;
|
||||
}
|
||||
|
||||
/**
 * Decode the %XX entities of the path, query, params and userinfo in place.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::unescape()
{
    // The four components are independent of each other.
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);

    return *this;
}
|
||||
|
||||
std::string& Url::unescape(std::string& str)
|
||||
{
|
||||
std::string copy(str);
|
||||
size_t dest = 0;
|
||||
for (size_t src = 0; src < copy.length(); ++src, ++dest)
|
||||
{
|
||||
if (copy[src] == '%' && (copy.length() - src) >= 2)
|
||||
{
|
||||
// Read ahead to see if there's a valid escape sequence. If not, treat
|
||||
// this like a normal character.
|
||||
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
|
||||
{
|
||||
int value = (
|
||||
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
|
||||
|
||||
// Replace src + 2 with that byte, advance src to consume it and
|
||||
// continue.
|
||||
src += 2;
|
||||
str[dest] = value;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Either not a % or an incomplete entity
|
||||
str[dest] = copy[src];
|
||||
}
|
||||
str.resize(dest);
|
||||
return str;
|
||||
}
|
||||
|
||||
/**
 * Remove every query and params entry whose lower-cased name is in
 * `blacklist`.
 *
 * @param blacklist names to remove; the lookup lower-cases each parameter name
 *                  before searching, so entries should be lower-case.
 * @return          *this, so calls can be chained.
 */
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // Capture the set by reference: the predicate is only used during this
    // call, and capturing by value would copy the whole set (again for every
    // conversion to deparam_predicate below).
    auto predicate = [&blacklist](std::string& name, const std::string& value)
    {
        (void) value;

        // ::tolower has undefined behaviour for negative `char` values, so go
        // through unsigned char.
        std::transform(name.begin(), name.end(), name.begin(),
            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
        return blacklist.find(name) != blacklist.end();
    };

    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
|
||||
|
||||
/**
 * Remove every query ('&'-separated) and params (';'-separated) entry for
 * which `predicate` returns true.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::deparam(const deparam_predicate& predicate)
{
    // remove_params filters the member in place; the setters then refresh the
    // has_query_/has_params_ flags.
    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));

    return *this;
}
|
||||
|
||||
std::string& Url::remove_params(std::string& str,
|
||||
const deparam_predicate& predicate,
|
||||
char sep)
|
||||
{
|
||||
std::string copy;
|
||||
std::string piece;
|
||||
std::string name;
|
||||
std::string value;
|
||||
size_t previous = 0;
|
||||
for (size_t index = str.find(sep)
|
||||
; index != std::string::npos
|
||||
; previous = index + 1, index = str.find(sep, previous))
|
||||
{
|
||||
piece.assign(str, previous, index - previous);
|
||||
size_t position = piece.find('=');
|
||||
name.assign(piece, 0, position);
|
||||
value.clear();
|
||||
if (position != std::string::npos)
|
||||
{
|
||||
value.assign(piece, position + 1, std::string::npos);
|
||||
}
|
||||
|
||||
if (!predicate(name, value))
|
||||
{
|
||||
copy.append(copy.empty() ? 0 : 1, sep);
|
||||
copy.append(piece);
|
||||
}
|
||||
}
|
||||
|
||||
if (previous < str.length())
|
||||
{
|
||||
piece.assign(str, previous, std::string::npos);
|
||||
size_t position = piece.find('=');
|
||||
name.assign(piece, 0, position);
|
||||
value.clear();
|
||||
if (position != std::string::npos)
|
||||
{
|
||||
value.assign(piece, position + 1, std::string::npos);
|
||||
}
|
||||
|
||||
if (!predicate(name, value))
|
||||
{
|
||||
copy.append(copy.empty() ? 0 : 1, sep);
|
||||
copy.append(piece);
|
||||
}
|
||||
}
|
||||
|
||||
str.assign(copy);
|
||||
return str;
|
||||
}
|
||||
|
||||
/**
 * Sort the '&'-separated query pieces and the ';'-separated params pieces.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::sort_query()
{
    split_sort_join(query_, '&');
    split_sort_join(params_, ';');

    return *this;
}
|
||||
|
||||
/**
 * Split `str` at every `glue`, sort the pieces and join them with `glue`
 * again, in place.
 *
 * An empty string, or one that yields a single piece, is returned untouched.
 *
 * @return reference to `str`.
 */
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    // Nothing to sort
    if (str.empty())
    {
        return str;
    }

    // Split
    std::vector<std::string> pieces;
    std::stringstream input(str);
    for (std::string piece; getline(input, piece, glue); )
    {
        pieces.push_back(piece);
    }

    // A single piece is already sorted; leave the string exactly as it was
    if (pieces.size() == 1)
    {
        return str;
    }

    // Sort
    std::sort(pieces.begin(), pieces.end());

    // Join: every piece but the last is followed by the glue character
    std::string joined;
    for (size_t index = 0; index + 1 < pieces.size(); ++index)
    {
        joined += pieces[index];
        joined += glue;
    }
    joined += pieces.back();

    str.assign(joined);
    return str;
}
|
||||
|
||||
/**
 * Reset the port to 0 when it equals the PORTS entry for the current scheme.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::remove_default_port()
{
    // Without a port or a scheme there is nothing to compare against.
    if (!port_ || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    const bool is_default = (known != PORTS.end()) && (port_ == known->second);
    if (is_default)
    {
        port_ = 0;
    }

    return *this;
}
|
||||
|
||||
/**
 * Drop the userinfo component.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::deuserinfo()
{
    userinfo_.clear();

    return *this;
}
|
||||
|
||||
/**
 * Drop the fragment component.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::defrag()
{
    fragment_.clear();

    return *this;
}
|
||||
|
||||
/**
 * Replace the host with the result of Punycode::encodeHostname.
 *
 * The host is validated with check_hostname both before and after encoding,
 * so an exception from either check propagates to the caller.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::punycode()
{
    // Validate the host as given (this may also trim a trailing '.').
    check_hostname(host_);

    // Validate the encoded form before it replaces the stored host.
    std::string encoded(Punycode::encodeHostname(host_));
    check_hostname(encoded);
    host_.assign(encoded);

    return *this;
}
|
||||
|
||||
/**
 * Replace the host with the result of Punycode::decodeHostname.
 *
 * @return *this, so calls can be chained.
 */
Url& Url::unpunycode()
{
    host_ = Punycode::decodeHostname(host_);

    return *this;
}
|
||||
|
||||
/**
 * Reverse the order of the host's dot-separated labels (a.b.c.d => d.c.b.a).
 *
 * @return *this, so calls can be chained.
 */
Url& Url::host_reversed()
{
    // Reversing the whole string puts the labels in the right order but
    // spells each one backwards...
    std::reverse(host_.begin(), host_.end());

    // ...so reverse every label once more to restore its spelling.
    size_t label_begin = 0;
    while (label_begin < host_.size())
    {
        const size_t dot = host_.find('.', label_begin);
        if (dot == std::string::npos)
        {
            // Last label: it runs to the end of the host.
            std::reverse(host_.begin() + label_begin, host_.end());
            break;
        }

        std::reverse(host_.begin() + label_begin, host_.begin() + dot);
        label_begin = dot + 1;
    }

    return *this;
}
|
||||
|
||||
/**
 * Validate the dot-separated labels of `host`.
 *
 * @param host hostname to check; a single trailing '.' is removed in place.
 * @throws std::invalid_argument "Label too long." for a label of more than 63
 *         characters, "Empty label." for an empty label that is followed by a
 *         '.'.
 */
void Url::check_hostname(std::string& host)
{
    // Skip empty hostnames -- they are valid
    if (host.empty())
    {
        return;
    }

    // Every label that is terminated by a '.'
    size_t label_begin = 0;
    for (size_t dot = host.find('.'); dot != std::string::npos; dot = host.find('.', label_begin))
    {
        const size_t label_length = dot - label_begin;
        if (label_length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (label_length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }

        label_begin = dot + 1;
    }

    // The final segment, after the last '.'
    const size_t tail_length = host.size() - label_begin;
    if (tail_length > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (tail_length == 0 && label_begin > 1)
    {
        // Remove a trailing empty segment, i.e. the final '.'
        host.resize(label_begin - 1);
    }
}
|
||||
|
||||
};
|
323
crawler/third_party/url-cpp/url.h
vendored
323
crawler/third_party/url-cpp/url.h
vendored
@ -1,323 +0,0 @@
|
||||
#ifndef URL_CPP_H
|
||||
#define URL_CPP_H
|
||||
|
||||
#include <stdexcept>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
/**
 * Exception type carrying a message about a URL that could not be parsed.
 */
struct UrlParseException : public std::logic_error
{
    /// @param message text later returned by what().
    UrlParseException(const std::string& message)
        : std::logic_error(message)
    {
    }
};
|
||||
|
||||
/**
 * A set of byte values with constant-time membership tests.
 *
 * Built once from a string listing its members; not copyable and not
 * default-constructible.
 */
struct CharacterClass
{
    /// @param chars every character that belongs to the class.
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (auto it = chars_.begin(); it != chars_.end(); ++it)
        {
            // Index by the unsigned byte value, exactly as operator() does.
            // Converting a negative `char` (any byte >= 0x80 where char is
            // signed) straight to size_t would index far outside the table.
            map_[static_cast<unsigned char>(*it)] = true;
        }
    }

    /// @return true if `c` is a member of the class.
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /// @return the members, in the order they were given to the constructor.
    const std::string& chars() const
    {
        return chars_;
    }

private:
    // Private, unimplemented to prevent use
    CharacterClass();
    CharacterClass(const CharacterClass& other);

    std::string chars_;        // members as given
    std::vector<bool> map_;    // 256 flags, one per byte value
};
|
||||
|
||||
struct Url
|
||||
{
|
||||
/* Character classes */
|
||||
const static CharacterClass GEN_DELIMS;
|
||||
const static CharacterClass SUB_DELIMS;
|
||||
const static CharacterClass ALPHA;
|
||||
const static CharacterClass DIGIT;
|
||||
const static CharacterClass UNRESERVED;
|
||||
const static CharacterClass RESERVED;
|
||||
const static CharacterClass PCHAR;
|
||||
const static CharacterClass PATH;
|
||||
const static CharacterClass QUERY;
|
||||
const static CharacterClass FRAGMENT;
|
||||
const static CharacterClass USERINFO;
|
||||
const static CharacterClass HEX;
|
||||
const static CharacterClass SCHEME;
|
||||
const static std::vector<signed char> HEX_TO_DEC;
|
||||
const static std::unordered_map<std::string, int> PORTS;
|
||||
const static std::unordered_set<std::string> USES_RELATIVE;
|
||||
const static std::unordered_set<std::string> USES_NETLOC;
|
||||
const static std::unordered_set<std::string> USES_PARAMS;
|
||||
const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
|
||||
|
||||
// The type of the predicate used for removing parameters
|
||||
typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
|
||||
|
||||
explicit Url(const std::string& url);
|
||||
|
||||
Url(const Url& other)
|
||||
: scheme_(other.scheme_)
|
||||
, host_(other.host_)
|
||||
, port_(other.port_)
|
||||
, path_(other.path_)
|
||||
, params_(other.params_)
|
||||
, query_(other.query_)
|
||||
, fragment_(other.fragment_)
|
||||
, userinfo_(other.userinfo_)
|
||||
, has_params_(other.has_params_)
|
||||
, has_query_(other.has_query_) { }
|
||||
|
||||
/**
|
||||
* Take on the value of the other URL.
|
||||
*/
|
||||
Url& assign(const Url& other);
|
||||
|
||||
/**
|
||||
* To be considered equal, all fields must be equal.
|
||||
*/
|
||||
bool operator==(const Url& other) const;
|
||||
bool operator!=(const Url& other) const;
|
||||
|
||||
/**
|
||||
* Two URLs are considered equivalent if they have the same meaning.
|
||||
*/
|
||||
bool equiv(const Url& other);
|
||||
|
||||
/**************************************
|
||||
* Component-wise access and setting. *
|
||||
**************************************/
|
||||
const std::string& scheme() const { return scheme_; }
|
||||
Url& setScheme(const std::string& s)
|
||||
{
|
||||
scheme_ = s;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& host() const { return host_; }
|
||||
Url& setHost(const std::string& s)
|
||||
{
|
||||
host_ = s;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const int port() const { return port_; }
|
||||
Url& setPort(int i)
|
||||
{
|
||||
port_ = i;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& path() const { return path_; }
|
||||
Url& setPath(const std::string& s)
|
||||
{
|
||||
path_ = s;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& params() const { return params_; }
|
||||
Url& setParams(const std::string& s)
|
||||
{
|
||||
params_ = s;
|
||||
has_params_ = !s.empty();
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& query() const { return query_; }
|
||||
Url& setQuery(const std::string& s)
|
||||
{
|
||||
query_ = s;
|
||||
has_query_ = !s.empty();
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& fragment() const { return fragment_; }
|
||||
Url& setFragment(const std::string& s)
|
||||
{
|
||||
fragment_ = s;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& userinfo() const { return userinfo_; }
|
||||
Url& setUserinfo(const std::string& s)
|
||||
{
|
||||
userinfo_ = s;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a representation of all components of the path, params, query, fragment.
|
||||
*
|
||||
* Always includes a leading /.
|
||||
*/
|
||||
std::string fullpath() const;
|
||||
|
||||
/**
|
||||
* Get a new string representation of the URL.
|
||||
**/
|
||||
std::string str() const;
|
||||
|
||||
/*********************
|
||||
* Chainable methods *
|
||||
*********************/
|
||||
|
||||
/**
|
||||
* Strip semantically meaningless excess '?', '&', and ';' characters from query
|
||||
* and params.
|
||||
*/
|
||||
Url& strip();
|
||||
|
||||
/**
|
||||
* Make the path absolute.
|
||||
*
|
||||
* Evaluate '.', '..', and excessive slashes.
|
||||
*/
|
||||
Url& abspath();
|
||||
|
||||
/**
|
||||
* Evaluate this URL relative fo `other`, placing the result in this object.
|
||||
*/
|
||||
Url& relative_to(const std::string& other)
|
||||
{
|
||||
return relative_to(Url(other));
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate this URL relative fo `other`, placing the result in this object.
|
||||
*/
|
||||
Url& relative_to(const Url& other);
|
||||
|
||||
/**
|
||||
* Ensure that the path, params, query, and userinfo are properly escaped.
|
||||
*
|
||||
* In 'strict' mode, only entities that are both safe and not reserved characters
|
||||
* are unescaped. In non-strict mode, entities that are safe are unescaped.
|
||||
*/
|
||||
Url& escape(bool strict=false);
|
||||
|
||||
/**
|
||||
* Unescape all entities in the path, params, query, and userinfo.
|
||||
*/
|
||||
Url& unescape();
|
||||
|
||||
/**
|
||||
* Remove any params or queries that appear in the blacklist.
|
||||
*
|
||||
* The blacklist should contain only lowercased strings, and the comparison is
|
||||
* done in a case-insensitive way.
|
||||
*/
|
||||
Url& deparam(const std::unordered_set<std::string>& blacklist);
|
||||
|
||||
/**
|
||||
* Filter params subject to a predicate for whether it should be filtered.
|
||||
*
|
||||
* The predicate must accept two string refs -- the key and value (which may be
|
||||
* empty). Return `true` if the parameter should be removed, and `false`
|
||||
* otherwise.
|
||||
*/
|
||||
Url& deparam(const deparam_predicate& predicate);
|
||||
|
||||
/**
|
||||
* Put queries and params in sorted order.
|
||||
*
|
||||
* To ensure consistent comparisons, escape should be called beforehand.
|
||||
*/
|
||||
Url& sort_query();
|
||||
|
||||
/**
|
||||
* Remove the port if it's the default for the scheme.
|
||||
*/
|
||||
Url& remove_default_port();
|
||||
|
||||
/**
|
||||
* Remove the userinfo portion.
|
||||
*/
|
||||
Url& deuserinfo();
|
||||
|
||||
/**
|
||||
* Remove the fragment.
|
||||
*/
|
||||
Url& defrag();
|
||||
|
||||
/**
|
||||
* Punycode the hostname.
|
||||
*/
|
||||
Url& punycode();
|
||||
|
||||
/**
|
||||
* Unpunycode the hostname.
|
||||
*/
|
||||
Url& unpunycode();
|
||||
|
||||
/**
|
||||
* Reverse the hostname (a.b.c.d => d.c.b.a)
|
||||
*/
|
||||
Url& host_reversed();
|
||||
|
||||
private:
|
||||
// Private, unimplemented to prevent use.
|
||||
Url();
|
||||
|
||||
/**
|
||||
* Remove repeated, leading, and trailing instances of chr from the string.
|
||||
*/
|
||||
std::string& remove_repeats(std::string& str, const char chr);
|
||||
|
||||
/**
|
||||
* Ensure all the provided characters are escaped if necessary
|
||||
*/
|
||||
std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
|
||||
|
||||
/**
|
||||
* Unescape entities in the provided string
|
||||
*/
|
||||
std::string& unescape(std::string& str);
|
||||
|
||||
/**
|
||||
* Remove any params that match entries in the blacklist.
|
||||
*/
|
||||
std::string& remove_params(
|
||||
std::string& str, const deparam_predicate& pred, char sep);
|
||||
|
||||
/**
|
||||
* Split the provided string by char, sort, join by char.
|
||||
*/
|
||||
std::string& split_sort_join(std::string& str, const char glue);
|
||||
|
||||
/**
|
||||
* Check that the hostname is valid, removing an optional trailing '.'.
|
||||
*/
|
||||
void check_hostname(std::string& host);
|
||||
|
||||
std::string scheme_;
|
||||
std::string host_;
|
||||
int port_;
|
||||
std::string path_;
|
||||
std::string params_;
|
||||
std::string query_;
|
||||
std::string fragment_;
|
||||
std::string userinfo_;
|
||||
bool has_params_;
|
||||
bool has_query_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
150
crawler/third_party/url-cpp/utf8.cpp
vendored
150
crawler/third_party/url-cpp/utf8.cpp
vendored
@ -1,150 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
/**
 * Decode one UTF-8 sequence starting at `it`.
 *
 * @param it  position of the first byte; advanced past every byte consumed.
 * @param end end of the input.
 * @return    the decoded code point.
 * @throws std::invalid_argument if the input is empty, the start byte is not
 *         a valid lead byte, the sequence is cut off by `end`, or a
 *         continuation byte does not have the form 10xxxxxx.
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Dereferencing `it` when it already equals `end` is undefined behaviour
    // (e.g. toCodepoint on an empty string), so reject empty input up front.
    if (it == end)
    {
        throw std::invalid_argument("UTF-8 sequence terminated early.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);

    // Single-byte (ASCII) sequence: the byte is the code point.
    if (!(current & 0x80))
    {
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;
    // The accumulated value
    Utf8::codepoint_t result = 0;

    if (current < 0xC0)
    {
        // 10xxxxxx is a continuation byte, not a start byte
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // One additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
    }
    else if (current < 0xF0)
    {
        // Two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
    }
    else if (current < 0xF8)
    {
        // Three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        current = static_cast<Utf8::char_t>(*it++);

        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }

        // Each continuation byte contributes its low six bits.
        result = (result << 6) | (current & 0x3F);
    }

    return result;
}
|
||||
|
||||
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
|
||||
{
|
||||
if (value > MAX_CODEPOINT)
|
||||
{
|
||||
throw std::invalid_argument("Code point too high.");
|
||||
}
|
||||
else if (value <= 0x007F)
|
||||
{
|
||||
// Just append the character itself
|
||||
str.append(1, static_cast<char>(value));
|
||||
return str;
|
||||
}
|
||||
|
||||
unsigned int bytes = 0;
|
||||
if (value > 0xFFFF)
|
||||
{
|
||||
/**
|
||||
* 11110xxx + 3 bytes for 21 bits total
|
||||
*
|
||||
* We need to take bits 20-18, which 0x1C0000 masks out. These form the least
|
||||
* significant bits of this byte (so we shift them back down by 18). The 5
|
||||
* most significant bits of this byte are 11110, so we OR this result with
|
||||
* 0xF0 to get this first byte.
|
||||
*
|
||||
* The remaining bits will be consumed from the most-significant end and so
|
||||
* they must be shifted up by (32 - 18) = 14.
|
||||
*/
|
||||
str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0));
|
||||
bytes = 3;
|
||||
value <<= 14;
|
||||
}
|
||||
else if (value > 0x07FF)
|
||||
{
|
||||
/**
|
||||
* 1110xxxx + 2 bytes for 16 bits total
|
||||
*
|
||||
* We need to take bits 15-12, which 0xF000 masks out. These form the least
|
||||
* significant bits of this byte (so we shift them back down by 12). The 4
|
||||
* most significant bits of this byte are 1110, so we OR this result with
|
||||
* 0xE0 to get this first byte.
|
||||
*
|
||||
* The remaining bits will be consumed from the most-significant end and so
|
||||
* they must be shifted up by (32 - 12) = 20.
|
||||
*/
|
||||
str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0));
|
||||
bytes = 2;
|
||||
value <<= 20;
|
||||
}
|
||||
else
|
||||
{
|
||||
/**
|
||||
* 110xxxxx + 1 byte for 11 bits total
|
||||
*
|
||||
* We need to take bits 10-6, which 0x7C0 masks out. These form the least
|
||||
* significant bits of this byte (so we shift them back down by 6). The 3
|
||||
* most significant bits of this byte are 110, so we OR this result with
|
||||
* 0xC0 to get this first byte.
|
||||
*
|
||||
* The remaining bits will be consumed from the most-significant end and so
|
||||
* they must be shifted up by (32 - 6) = 26.
|
||||
*/
|
||||
str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0));
|
||||
bytes = 1;
|
||||
value <<= 26;
|
||||
}
|
||||
|
||||
/**
|
||||
* The remaining bits are to be consumed 6 at a time from the most-significant
|
||||
* end. The mask 0xFC000000 grabs these six bits, which then must be shifted down
|
||||
* by 26, and OR'd with 0x80 to produce the continuation byte.
|
||||
*/
|
||||
for (; bytes > 0; --bytes, value <<= 6)
|
||||
{
|
||||
str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80));
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
};
|
91
crawler/third_party/url-cpp/utf8.h
vendored
91
crawler/third_party/url-cpp/utf8.h
vendored
@ -1,91 +0,0 @@
|
||||
#ifndef UTF8_CPP_H
|
||||
#define UTF8_CPP_H
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace Url
|
||||
{
|
||||
|
||||
/**
|
||||
* Work between unicode code points and their UTF-8-encoded representation.
|
||||
*/
|
||||
/**
 * Convert between Unicode code points and their UTF-8 encoding.
 */
struct Utf8
{
    /// Integral type holding one Unicode code point.
    using codepoint_t = uint32_t;

    /// Integral type used for the value of a single byte.
    using char_t = unsigned char;

    /// The highest allowed code point.
    static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

    /**
     * Consume up to the last byte of the sequence, returning the codepoint.
     */
    static codepoint_t readCodepoint(
        std::string::const_iterator& it, const std::string::const_iterator& end);

    /**
     * Write a codepoint to the provided string.
     */
    static std::string& writeCodepoint(std::string& str, codepoint_t value);

    /**
     * Return the first codepoint stored in the provided string.
     */
    static codepoint_t toCodepoint(const std::string& str)
    {
        std::string::const_iterator cursor = str.begin();
        return readCodepoint(cursor, str.end());
    }

    /**
     * Get a string with the provided codepoint.
     */
    static std::string fromCodepoint(codepoint_t value)
    {
        std::string encoded;
        writeCodepoint(encoded, value);
        return encoded;
    }

    /**
     * Return all the codepoints in the string.
     */
    static std::vector<codepoint_t> toCodepoints(const std::string& str)
    {
        std::vector<codepoint_t> points;

        // readCodepoint advances the cursor past each sequence it decodes.
        std::string::const_iterator cursor = str.begin();
        while (cursor != str.end())
        {
            points.push_back(readCodepoint(cursor, str.end()));
        }
        return points;
    }

    /**
     * Create a string from a vector of codepoints.
     */
    static std::string fromCodepoints(const std::vector<codepoint_t>& points)
    {
        std::string encoded;
        for (const codepoint_t point : points)
        {
            writeCodepoint(encoded, point);
        }
        return encoded;
    }
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -3,8 +3,8 @@ project(librengine LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
|
||||
set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp include/structs.h include/search.h include/helper.h include/cache.h)
|
||||
set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp src/search.cpp src/helper.cpp)
|
||||
set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp include/structs.h include/search.h include/helper.h include/cache.h include/robots_txt.h)
|
||||
set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp src/search.cpp src/helper.cpp src/robots_txt.cpp)
|
||||
|
||||
set(include_all ${include})
|
||||
set(src_all ${src})
|
||||
|
36
lib/include/robots_txt.h
Normal file
36
lib/include/robots_txt.h
Normal file
@ -0,0 +1,36 @@
|
||||
#ifndef ROBOTS_TXT_H
|
||||
#define ROBOTS_TXT_H
|
||||
|
||||
#include "http.h"
|
||||
|
||||
namespace librengine {
|
||||
class user_agent {
|
||||
public:
|
||||
std::string agent;
|
||||
std::vector<std::string> allow_list;
|
||||
std::vector<std::string> disallow_list;
|
||||
float crawl_delay;
|
||||
public:
|
||||
static bool match(const std::string &pattern, const std::string &expression);
|
||||
public:
|
||||
explicit user_agent(const std::string &agent);
|
||||
|
||||
bool allowed(const std::string &path);
|
||||
bool allowed(const http::url &url);
|
||||
};
|
||||
|
||||
class robots_txt {
|
||||
private:
|
||||
std::string text;
|
||||
public:
|
||||
std::vector<user_agent> agents;
|
||||
public:
|
||||
explicit robots_txt(const std::string &text);
|
||||
void parse();
|
||||
|
||||
bool allowed(const std::string &path, const std::string &agent);
|
||||
bool allowed(const http::url &url, const std::string &user_agent);
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
138
lib/src/robots_txt.cpp
Normal file
138
lib/src/robots_txt.cpp
Normal file
@ -0,0 +1,138 @@
|
||||
#include "robots_txt.h"
|
||||
|
||||
#include "str.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace librengine {
|
||||
bool user_agent::match(const std::string &pattern, const std::string &expression) {
|
||||
auto pattern_size = pattern.length();
|
||||
auto expression_size = expression.length();
|
||||
|
||||
std::vector<size_t> vector_pos(expression_size + 1);
|
||||
size_t pos = 1;
|
||||
|
||||
for (int i = 0; i < pattern_size; ++i) {
|
||||
char c = pattern[i];
|
||||
|
||||
if (c == '$' && i + 1 == pattern_size) {
|
||||
return vector_pos[pos - 1] == expression_size;
|
||||
}
|
||||
if (c == '*') {
|
||||
pos = expression_size - vector_pos[0] + 1;
|
||||
|
||||
for (int j = 1; j < pos; j++) {
|
||||
vector_pos[j] = vector_pos[j - 1] + 1;
|
||||
}
|
||||
} else {
|
||||
int tmp_pos = 0;
|
||||
|
||||
for (int j = 0; j < pos; j++) {
|
||||
auto c_pos = vector_pos[j];
|
||||
|
||||
if (c_pos < expression_size && expression[c_pos] == c) {
|
||||
vector_pos[tmp_pos] = c_pos + 1;
|
||||
++tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
if (tmp_pos == 0) return false;
|
||||
pos = tmp_pos;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
user_agent::user_agent(const std::string &agent) {
|
||||
this->agent = agent;
|
||||
crawl_delay = 0;
|
||||
}
|
||||
|
||||
bool user_agent::allowed(const std::string &path) {
|
||||
for (const auto &allow : allow_list) {
|
||||
if (match(allow, path)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto &disallow : disallow_list) {
|
||||
if (match(disallow, path)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
bool user_agent::allowed(const http::url &url) {
|
||||
if (!url.path) return false;
|
||||
std::string path = *url.path;
|
||||
return allowed(path);
|
||||
}
|
||||
|
||||
robots_txt::robots_txt(const std::string &text) {
|
||||
this->text = text;
|
||||
agents.emplace_back("");
|
||||
}
|
||||
void robots_txt::parse() {
|
||||
auto splited = str::split(text, "\n");
|
||||
|
||||
for (const auto &pair : splited) {
|
||||
auto splited_pair = str::split(pair, ":");
|
||||
auto splited_pair_size = splited_pair.size();
|
||||
|
||||
if (splited_pair_size != 2) continue;
|
||||
|
||||
auto key = str::to_lower(splited_pair[0]);
|
||||
auto value = str::to_lower(splited_pair[1]);
|
||||
|
||||
key = str::trim(key);
|
||||
value = str::trim_start(value);
|
||||
|
||||
if (!value.empty()) value = str::trim_end(value);
|
||||
auto comment_index = value.find('#');
|
||||
|
||||
if (comment_index != -1) {
|
||||
value = value.substr(0, comment_index);
|
||||
value = str::trim_end(value);
|
||||
}
|
||||
|
||||
if (key.empty()) continue;
|
||||
if (key != "disallow" && value.empty()) continue;
|
||||
|
||||
auto ¤t_agent = agents.back();
|
||||
|
||||
if (key == "user-agent") {
|
||||
agents.emplace_back(value);
|
||||
}
|
||||
else if (key == "allow") {
|
||||
current_agent.allow_list.push_back(value);
|
||||
}
|
||||
else if (key == "disallow") {
|
||||
if (value.empty()) current_agent.allow_list.emplace_back("/");
|
||||
else current_agent.disallow_list.push_back(value);
|
||||
}
|
||||
else if (key == "crawl-delay") {
|
||||
try {
|
||||
current_agent.crawl_delay = std::stof(value);
|
||||
} catch (const std::exception &e) {
|
||||
//current_agent.crawl_delay = 0; (def)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool robots_txt::allowed(const std::string &path, const std::string &agent) {
|
||||
auto found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == agent; });
|
||||
if (found == agents.end()) found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == "*"; });
|
||||
if (found == agents.end()) found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == ""; });
|
||||
|
||||
return found->allowed(path);
|
||||
}
|
||||
bool robots_txt::allowed(const http::url &url, const std::string &user_agent) {
|
||||
if (!url.path) return false;
|
||||
std::string path = *url.path;
|
||||
|
||||
return allowed(path, user_agent);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user