Replace the third-party robots.txt parser with our own robots.txt parser

This commit is contained in:
liameno 2022-05-27 19:40:36 +03:00
parent 077740ac6c
commit 60eadcce58
22 changed files with 185 additions and 3047 deletions

View File

@ -21,7 +21,7 @@
"proxy": "socks5://127.0.0.1:9050",
"load_page_timeout_s": 10,
"update_time_site_info_s_after": 864000, //10 days
"delay_time_s": 3,
"delay_time_s": 1,
"max_pages_site": 5,
"max_page_symbols": 50000000, //50mb
"max_robots_txt_symbols": 3000,

View File

@ -9,12 +9,5 @@ set(CMAKE_CXX_STANDARD 17)
find_package(CURL)
find_package(Threads)
set(tp_rep_cpp third_party/rep-cpp/agent.cpp third_party/rep-cpp/agent.h
third_party/rep-cpp/directive.cpp third_party/rep-cpp/directive.h third_party/rep-cpp/robots.cpp
third_party/rep-cpp/robots.h) #robots.txt / https://github.com/seomoz/rep-cpp
set(tp_url_cpp third_party/url-cpp/psl.cpp third_party/url-cpp/psl.h
third_party/url-cpp/punycode.cpp third_party/url-cpp/punycode.h third_party/url-cpp/url.cpp
third_party/url-cpp/url.h third_party/url-cpp/utf8.cpp third_party/url-cpp/utf8.h) #https://github.com/seomoz/url-cpp
add_executable(${PROJECT_NAME} main.cpp src/worker.cpp include/worker.h src/json_generator.cpp include/json_generator.h src/html_helper.cpp include/html_helper.h ${tp_rep_cpp} ${tp_url_cpp})
target_link_libraries(${PROJECT_NAME} PRIVATE /usr/lib/liblexbor.so curl Threads::Threads /usr/local/lib/liblibrengine.so)

View File

@ -41,7 +41,7 @@ public:
librengine::http::request::result_s site(const librengine::http::url &url);
std::optional<std::string> get_robots_txt(const librengine::http::url &url);
bool is_allowed_in_robots(const std::string &body, const std::string &url);
bool is_allowed_in_robots(const std::string &body, const http::url &url);
bool normalize_url(librengine::http::url &url, const std::optional<std::string> &owner_host = std::nullopt) const;
public:
explicit worker(const librengine::config::all &config);

View File

@ -2,8 +2,6 @@
#include "include/worker.h"
using namespace librengine;
int main(int argc, char **argv) {
using namespace librengine;

View File

@ -10,10 +10,10 @@
#include <librengine/logger.h>
#include <librengine/helper.h>
#include <librengine/cache.h>
#include <librengine/robots_txt.h>
#include "../include/json_generator.h"
#include "../include/html_helper.h"
#include "../third_party/rep-cpp/robots.h"
#define DEBUG true //TODO: FALSE
@ -82,8 +82,10 @@ std::optional<std::string> worker::get_robots_txt(const http::url &url) {
return request.result.response;
}
bool worker::is_allowed_in_robots(const std::string &body, const std::string &url) {
Rep::Robots robots = Rep::Robots(body);
// Parses the given robots.txt body and asks the parser whether `url` may be
// fetched by the user agent configured in config.crawler_.user_agent.
bool worker::is_allowed_in_robots(const std::string &body, const http::url &url) {
    robots_txt parser(body);
    parser.parse();

    const bool permitted = parser.allowed(url, config.crawler_.user_agent);
    return permitted;
}
bool worker::normalize_url(http::url &url, const std::optional<std::string> &owner_host) const {
@ -260,7 +262,8 @@ worker::result worker::work(url &url_) {
}
}
if (is_checked && !is_allowed_in_robots(robots_txt_body, url.text)) {
if (is_checked && !is_allowed_in_robots(robots_txt_body, url)) {
if_debug_print(logger::type::error, "disallowed robots.txt", url.text);
return result::disallowed_robots;
}
}

View File

@ -1,138 +0,0 @@
#include <algorithm>
#include <iomanip>
#include <sstream>
#include "../url-cpp/url.h"
#include "agent.h"
#include "directive.h"
namespace
{
// Returns the result of calling defrag(), escape() and fullpath() on `url`,
// in that order. The URL is taken by non-const reference.
// NOTE(review): presumably defrag()/escape() modify `url` in place and
// fullpath() yields the path plus query; confirm against url-cpp.
std::string escape_url(Url::Url& url)
{
return url.defrag().escape().fullpath();
}
// Returns a copy of `str` with every leading occurrence of `chr` removed.
// A string consisting only of `chr` (or an empty string) yields "".
std::string trim_front(const std::string& str, const char chr)
{
    const std::string::size_type first_kept = str.find_first_not_of(chr);
    if (first_kept == std::string::npos)
    {
        return std::string();
    }
    return str.substr(first_kept);
}
}
namespace Rep
{
/**
 * Add an "Allow" directive for this agent.
 *
 * Directives that point at an external host are ignored. When the query
 * starts with '*', a second directive with the leading stars removed is
 * registered as well.
 *
 * @param query the value of the Allow line (may be empty).
 * @return *this, to allow chaining.
 */
Agent& Agent::allow(const std::string& query)
{
    Url::Url url(query);
    // ignore directives for external URLs
    if (is_external(url))
    {
        return *this;
    }
    // leading wildcard? front() on an empty string is undefined behaviour,
    // so an empty "Allow:" value must be checked for first.
    if (!query.empty() && query.front() == '*')
    {
        Url::Url trimmed(trim_front(query, '*'));
        directives_.push_back(Directive(escape_url(trimmed), true));
    }
    directives_.push_back(Directive(escape_url(url), true));
    // New directive appended: the priority order has to be rebuilt lazily.
    sorted_ = false;
    return *this;
}
/**
 * Add a "Disallow" directive for this agent.
 *
 * An empty query is recorded as an allow-everything directive. Directives for
 * external hosts are ignored. A query starting with '*' is additionally
 * registered with its leading stars removed.
 */
Agent& Agent::disallow(const std::string& query)
{
    // Special case: "Disallow:" means "Allow: /"
    if (query.empty())
    {
        directives_.push_back(Directive(query, true));
        sorted_ = false;
        return *this;
    }

    Url::Url parsed(query);
    if (is_external(parsed))
    {
        // Rules for other hosts do not apply to this agent.
        return *this;
    }

    const bool starts_with_star = (query.front() == '*');
    if (starts_with_star)
    {
        Url::Url without_stars(trim_front(query, '*'));
        directives_.push_back(Directive(escape_url(without_stars), false));
    }
    directives_.push_back(Directive(escape_url(parsed), false));

    sorted_ = false;
    return *this;
}
/**
 * Return the directives ordered by descending priority.
 * The sort is performed lazily, only when directives were added since the
 * previous call.
 */
const std::vector<Directive>& Agent::directives() const
{
    if (sorted_)
    {
        return directives_;
    }

    const auto higher_priority_first = [](const Directive& lhs, const Directive& rhs) {
        return lhs.priority() > rhs.priority();
    };
    std::sort(directives_.begin(), directives_.end(), higher_priority_first);
    sorted_ = true;
    return directives_;
}
bool Agent::allowed(const std::string& query) const
{
Url::Url url(query);
if (is_external(url))
{
return false;
}
std::string path(escape_url(url));
if (path.compare("/robots.txt") == 0)
{
return true;
}
for (const auto& directive : directives())
{
if (directive.match(path))
{
return directive.allowed();
}
}
return true;
}
std::string Agent::str() const
{
std::stringstream out;
if (delay_ > 0)
{
out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' ';
}
out << '[';
const auto& d = directives();
auto begin = d.begin();
auto end = d.end();
if (begin != end)
{
out << "Directive(" << begin->str() << ')';
++begin;
}
for (; begin != end; ++begin)
{
out << ", Directive(" << begin->str() << ')';
}
out << ']';
return out.str();
}
/**
 * A URL is external when both this agent and the URL carry a host and the two
 * hosts differ. A missing host on either side means "not external".
 */
bool Agent::is_external(const Url::Url& url) const
{
    if (host_.empty() || url.host().empty())
    {
        return false;
    }
    return url.host() != host_;
}
}

View File

@ -1,93 +0,0 @@
#ifndef AGENT_CPP_H
#define AGENT_CPP_H
#include <vector>
#include "directive.h"
// forward declaration
namespace Url
{
struct Url;
}
namespace Rep
{
/**
 * The set of Allow/Disallow directives (and the crawl delay) that apply to a
 * single user agent of a robots.txt file.
 */
class Agent
{
public:
    /** The type for the delay. */
    using delay_t = float;

    /** Default constructor: an agent that is not bound to any host. */
    Agent() : Agent("") {}

    /**
     * Construct an agent for the given host. The delay starts at -1.0 and the
     * (empty) directive list counts as sorted.
     */
    explicit Agent(const std::string& host) :
        directives_(), delay_(-1.0), sorted_(true), host_(host) {}

    /** Default copy constructor. */
    Agent(const Agent& rhs) = default;

    /** Default move constructor. */
    Agent(Agent&& rhs) = default;

    /** Add an allowed directive. */
    Agent& allow(const std::string& query);

    /** Add a disallowed directive. */
    Agent& disallow(const std::string& query);

    /** Set the delay for this agent. */
    Agent& delay(delay_t value) {
        delay_ = value;
        return *this;
    }

    /** Return the delay for this agent. */
    delay_t delay() const { return delay_; }

    /** A vector of the directives, in priority-sorted order. */
    const std::vector<Directive>& directives() const;

    /** Return true if the URL (either a full URL or a path) is allowed. */
    bool allowed(const std::string& path) const;

    /** Human-readable rendering of the delay and the directives. */
    std::string str() const;

    /** Default copy assignment operator. */
    Agent& operator=(const Agent& rhs) = default;

    /**
     * Default move assignment operator. Declaring the copy assignment above
     * suppresses the implicit move assignment, so without this declaration a
     * move-assignment would silently copy the whole directive vector.
     */
    Agent& operator=(Agent&& rhs) = default;

private:
    bool is_external(const Url::Url& url) const;

    // mutable: directives() sorts lazily from a const member function.
    mutable std::vector<Directive> directives_;
    delay_t delay_;
    mutable bool sorted_;
    std::string host_;
};
}
#endif

View File

@ -1,130 +0,0 @@
#include <algorithm>
#include <locale>
#include <sstream>
#include <string>
#include "../url-cpp/url.h"
#include "directive.h"
namespace Rep
{
/**
 * Build a directive from a rule line.
 *
 * Without any '*' the line is stored verbatim and its length is the priority.
 * Otherwise runs of '*' are collapsed into one, trailing '*' are dropped, and
 * the priority becomes the length of the normalised expression.
 */
Directive::Directive(const std::string& line, bool allowed)
    : expression_()
    , priority_(line.size())
    , allowed_(allowed)
{
    const bool has_wildcard = line.find('*') != std::string::npos;
    if (!has_wildcard)
    {
        expression_ = line;
        return;
    }

    // Copy the line, skipping any '*' that directly follows another '*'.
    expression_.reserve(line.size());
    for (const char c : line)
    {
        const bool repeated_star =
            c == '*' && !expression_.empty() && expression_.back() == '*';
        if (!repeated_star)
        {
            expression_.push_back(c);
        }
    }

    // Drop trailing '*'; npos + 1 wraps to 0, which clears an all-star rule.
    expression_.erase(expression_.find_last_not_of('*') + 1);

    // Priority is the length of the expression
    priority_ = expression_.size();
}
/**
 * Return true if the path [p_begin, p_end) matches the expression
 * [e_begin, e_end).
 *
 * The expression is matched as a prefix of the path. '*' matches any run of
 * characters, including the empty one, and '$' anchors the end of the path.
 *
 * Fix: a '*' is now also tried against the empty remainder of the path.
 * Previously a rule such as "/foo*$" could never match, because the suffix
 * loop stopped before p_end and a '*' left over after the path was consumed
 * was treated as a mismatch.
 */
bool Directive::match(const std::string::const_iterator& e_begin,
                      const std::string::const_iterator& e_end,
                      const std::string::const_iterator& p_begin,
                      const std::string::const_iterator& p_end) const
{
    std::string::const_iterator expression_it = e_begin;
    std::string::const_iterator path_it = p_begin;
    while (expression_it != e_end && path_it != p_end)
    {
        if (*expression_it == '*')
        {
            // Advance and recurse: try the rest of the expression against
            // every suffix of the path, the empty suffix included.
            ++expression_it;
            for (;; ++path_it)
            {
                if (match(expression_it, e_end, path_it, p_end))
                {
                    return true;
                }
                if (path_it == p_end)
                {
                    return false;
                }
            }
        }
        if (*expression_it == '$')
        {
            // '$' expects the path to be fully consumed, but being inside the
            // loop means there is path left.
            return false;
        }
        if (*expression_it != *path_it)
        {
            // These characters must match
            return false;
        }
        // Advance both by one
        ++path_it;
        ++expression_it;
    }

    // A '*' left in the expression matches the empty string.
    while (expression_it != e_end && *expression_it == '*')
    {
        ++expression_it;
    }

    // Return true only if we've consumed all of the expression...
    if (expression_it == e_end)
    {
        return true;
    }
    // ...or what is left is the end anchor and the path is exhausted.
    return *expression_it == '$' && path_it == p_end;
}
/**
 * Render the directive as "Allow: <expression>" or "Disallow: <expression>".
 */
std::string Directive::str() const
{
    const char* const label = allowed_ ? "Allow: " : "Disallow: ";
    return std::string(label) + expression_;
}
bool Directive::match(const std::string& path) const
{
return match(expression_.begin(), expression_.end(), path.begin(), path.end());
}
}

View File

@ -1,82 +0,0 @@
#ifndef DIRECTIVE_CPP_H
#define DIRECTIVE_CPP_H
namespace Rep
{
/**
 * A single Allow/Disallow rule of a robots.txt file together with its
 * matching priority.
 */
class Directive
{
public:
    /** The type of our priority value. */
    using priority_t = std::size_t;

    /** Default constructor disallowed. */
    Directive() = delete;

    /**
     * The input to this constructor must be stripped of comments
     * and trailing whitespace.
     */
    Directive(const std::string& line, bool allowed);

    /** Default copy constructor. */
    Directive(const Directive& rhs) = default;

    /** Default move constructor. */
    Directive(Directive&& rhs) = default;

    /** The priority of the rule. */
    priority_t priority() const
    {
        return priority_;
    }

    /**
     * Whether or not the provided path matches. The path is
     * expected to be properly escaped.
     */
    bool match(const std::string& path) const;

    /** Whether this rule is for an allow or a disallow. */
    bool allowed() const
    {
        return allowed_;
    }

    /** Human-readable rendering of the rule. */
    std::string str() const;

    /** Default copy assignment operator. */
    Directive& operator=(const Directive& rhs) = default;

    /**
     * Default move assignment operator. The user-declared copy assignment
     * suppresses the implicit one, so without it algorithms that move-assign
     * elements (for example std::sort) would copy the expression string.
     */
    Directive& operator=(Directive&& rhs) = default;

private:
    std::string expression_;
    priority_t priority_;
    bool allowed_;

    /**
     * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
     */
    bool match(const std::string::const_iterator& e_begin,
               const std::string::const_iterator& e_end,
               const std::string::const_iterator& p_begin,
               const std::string::const_iterator& p_end) const;
};
}
#endif

View File

@ -1,196 +0,0 @@
#include <algorithm>
#include <functional>
#include <cctype>
#include <locale>
#include <sstream>
#include <iostream>
#include <unordered_map>
#include "../url-cpp/url.h"
#include "robots.h"
namespace Rep
{
/**
 * Remove leading and trailing whitespace from `string` in place.
 *
 * Uses a lambda instead of std::not1/std::ptr_fun, which were removed in
 * C++17, and passes each byte to std::isspace as unsigned char, because a
 * negative char value is undefined behaviour for the <cctype> functions.
 */
void Robots::strip(std::string& string)
{
    const auto not_space = [](unsigned char c) { return std::isspace(c) == 0; };
    // Leading whitespace.
    string.erase(string.begin(),
                 std::find_if(string.begin(), string.end(), not_space));
    // Trailing whitespace: base() of the reverse iterator points one past the
    // last non-space character.
    string.erase(std::find_if(string.rbegin(), string.rend(), not_space).base(),
                 string.end());
}
/**
 * Read the next "key: value" line from `stream`.
 *
 * Comments (everything from '#') are dropped and lines without a ':' are
 * skipped. On success `key` holds the stripped, lower-cased key and `value`
 * the stripped value.
 *
 * @return true when a pair was read, false when the stream is exhausted.
 */
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
{
    while (std::getline(stream, key))
    {
        size_t index = key.find('#');
        if (index != std::string::npos)
        {
            key.resize(index);
        }

        // Find the colon and divide it into key and value, skipping malformed lines
        index = key.find(':');
        if (index == std::string::npos)
        {
            continue;
        }
        value.assign(key.begin() + index + 1, key.end());
        key.resize(index);

        // Strip whitespace off of each
        strip(key);
        strip(value);

        // Lowercase the key. The byte goes through unsigned char because
        // tolower() is undefined for negative values (bytes >= 0x80).
        std::transform(key.begin(), key.end(), key.begin(),
            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
        return true;
    }
    return false;
}
/**
 * Parse a robots.txt body that is not tied to a base URL; delegates to the
 * two-argument constructor with an empty base URL.
 */
Robots::Robots(const std::string& content)
    : Robots(content, std::string())
{
}
/**
 * Parse a robots.txt body, using the host of `base_url` for the agents.
 *
 * Directives are attached to the agent named by the most recent "User-agent"
 * line. Further consecutive "User-agent" lines are collected in a group and
 * each receives a copy of that agent when the next separate "User-agent"
 * block starts, or at end of input. A default "*" agent always exists.
 *
 * Fixes over the previous version:
 *  - the agent to copy for a group is bound by reference before emplacing;
 *    emplace() may rehash, which invalidates the iterator `current` but not
 *    references to elements;
 *  - the user-agent is lower-cased through unsigned char (tolower() is
 *    undefined for negative char values);
 *  - group names are no longer copied while iterating.
 */
Robots::Robots(const std::string& content, const std::string& base_url) :
    host_(Url::Url(base_url).host()),
    agents_(),
    sitemaps_(),
    default_(agents_.emplace("*", Agent(host_)).first->second)
{
    std::string agent_name("*");
    std::istringstream input(content);
    // Skip a UTF-8 byte order mark, if present.
    if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
    {
        input.ignore(3);
    }

    std::string key, value;
    std::vector<std::string> group;
    bool last_agent = false;
    agent_map_t::iterator current = agents_.find("*");
    while (Robots::getpair(input, key, value))
    {
        if (key.compare("user-agent") == 0)
        {
            // Store the user agent string as lowercased
            std::transform(value.begin(), value.end(), value.begin(),
                [](unsigned char c) { return static_cast<char>(::tolower(c)); });
            if (last_agent)
            {
                // Another name for the block that is currently being opened.
                group.push_back(value);
            }
            else
            {
                if (!agent_name.empty())
                {
                    // Element references survive a rehash, iterators do not.
                    const Agent& finished = current->second;
                    for (const auto& other : group)
                    {
                        agents_.emplace(other, finished);
                    }
                    group.clear();
                }
                agent_name = value;
                current = agents_.emplace(agent_name, Agent(host_)).first;
            }
            last_agent = true;
            continue;
        }
        else
        {
            last_agent = false;
        }

        if (key.compare("sitemap") == 0)
        {
            sitemaps_.push_back(value);
        }
        else if (key.compare("disallow") == 0)
        {
            current->second.disallow(value);
        }
        else if (key.compare("allow") == 0)
        {
            current->second.allow(value);
        }
        else if (key.compare("crawl-delay") == 0)
        {
            try
            {
                current->second.delay(std::stof(value));
            }
            catch (const std::exception&)
            {
                std::cerr << "Could not parse " << value << " as float." << std::endl;
            }
        }
    }

    // Flush the group that was still open at end of input.
    if (!agent_name.empty())
    {
        const Agent& finished = current->second;
        for (const auto& other : group)
        {
            agents_.emplace(other, finished);
        }
    }
}
/**
 * Look up the agent registered under `name` (case-insensitively).
 *
 * @return the matching agent, or the default "*" agent when there is none.
 */
const Agent& Robots::agent(const std::string& name) const
{
    // Lowercase the agent. The byte goes through unsigned char because
    // tolower() is undefined for negative values (bytes >= 0x80).
    std::string lowered(name);
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
        [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    const auto it = agents_.find(lowered);
    if (it == agents_.end())
    {
        return default_;
    }
    return it->second;
}
bool Robots::allowed(const std::string& path, const std::string& name) const
{
return agent(name).allowed(path);
}
/**
 * Render all agents as `{"name": <agent>, "name": <agent>, ...}` in the
 * iteration order of the agent map.
 */
std::string Robots::str() const
{
    std::stringstream out;
    // TODO: include sitepath info
    out << '{';
    bool first = true;
    for (const auto& entry : agents_)
    {
        if (!first)
        {
            out << ", ";
        }
        out << '"' << entry.first << '"' << ": " << entry.second.str();
        first = false;
    }
    out << '}';
    return out.str();
}
std::string Robots::robotsUrl(const std::string& url)
{
return Url::Url(url)
.setUserinfo("")
.setPath("robots.txt")
.setParams("")
.setQuery("")
.setFragment("")
.remove_default_port()
.str();
}
}

View File

@ -1,66 +0,0 @@
#ifndef ROBOTS_CPP_H
#define ROBOTS_CPP_H
#include <sstream>
#include <unordered_map>
#include <vector>
#include "agent.h"
namespace Rep
{
class Robots
{
public:
typedef std::unordered_map<std::string, Agent> agent_map_t;
typedef std::vector<std::string> sitemaps_t;
/**
* Create a robots.txt from a utf-8-encoded string.
*/
explicit Robots(const std::string& content);
/**
* Create a robots.txt from a utf-8-encoded string assuming
* the given base_url.
*/
Robots(const std::string& content, const std::string& base_url);
/**
* Get the sitemaps in this robots.txt
*/
const sitemaps_t& sitemaps() const { return sitemaps_; }
/**
* Get the agent with the corresponding name.
*/
const Agent& agent(const std::string& name) const;
/**
* Return true if agent is allowed to fetch the URL (either a
* full URL or a path).
*/
bool allowed(const std::string& path, const std::string& name) const;
std::string str() const;
/**
* Return the robots.txt URL corresponding to the provided URL.
*/
static std::string robotsUrl(const std::string& url);
private:
static void strip(std::string& string);
static bool getpair(
std::istringstream& stream, std::string& key, std::string& value);
std::string host_;
agent_map_t agents_;
sitemaps_t sitemaps_;
Agent& default_;
};
}
#endif

View File

@ -1,183 +0,0 @@
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include "psl.h"
#include "punycode.h"
namespace Url
{
const std::string PSL::not_found = "";
/**
 * Read public-suffix rules from `stream`, one per line.
 *
 * Only the text up to the first whitespace of a line is used. Blank lines and
 * lines starting with "//" are skipped. "*.<host>" is a wildcard rule and
 * "!<host>" an exception rule.
 *
 * @throws std::invalid_argument for a malformed wildcard or exception rule.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. Rules may contain UTF-8
        // bytes, and isspace() is undefined for negative char values, so each
        // byte is passed as unsigned char.
        auto it = std::find_if(line.begin(), line.end(),
            [](unsigned char c) { return ::isspace(c) != 0; });
        line.resize(it - line.begin());

        // Skip blank lines
        if (line.empty())
        {
            continue;
        }

        // Skip comments
        if (line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }
            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }
            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
/**
 * Build a PSL from the rules stored in the file at `path`.
 *
 * @throws std::invalid_argument when the file cannot be opened.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream rules(path);
    if (rules.good())
    {
        return PSL(rules);
    }
    throw std::invalid_argument("Path '" + path + "' inaccessible.");
}
/**
 * Build a PSL from rules held in memory, one rule per line of `str`.
 */
PSL PSL::fromString(const std::string& str)
{
    std::stringstream rules(str);
    PSL parsed(rules);
    return parsed;
}
/**
 * Return the TLD of `hostname`: its last N segments, where N is the TLD
 * length determined by getTLDLength().
 */
std::string PSL::getTLD(const std::string& hostname) const
{
    const size_t tld_segments = getTLDLength(hostname);
    return getLastSegments(hostname, tld_segments);
}
/**
 * Return the PLD of `hostname`: the TLD plus one more segment in front of it.
 */
std::string PSL::getPLD(const std::string& hostname) const
{
    const size_t pld_segments = getTLDLength(hostname) + 1;
    return getLastSegments(hostname, pld_segments);
}
/**
 * Return the (TLD, PLD) pair of `hostname`, computing the TLD length once.
 */
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
{
    const size_t tld_segments = getTLDLength(hostname);
    std::string tld = getLastSegments(hostname, tld_segments);
    std::string pld = getLastSegments(hostname, tld_segments + 1);
    return std::make_pair(std::move(tld), std::move(pld));
}
/**
 * Return the number of segments that make up the TLD of `hostname`.
 *
 * The hostname is reversed and lower-cased, then looked up in `levels`,
 * cutting the candidate at its last '.' after every miss. When nothing
 * matches the TLD is assumed to be one segment long.
 */
size_t PSL::getTLDLength(const std::string& hostname) const
{
    // Reversed copy of hostname
    std::string tld(hostname.rbegin(), hostname.rend());
    // Hostnames may hold UTF-8 bytes; tolower() is undefined for negative
    // char values, so each byte is passed as unsigned char.
    std::transform(tld.begin(), tld.end(), tld.begin(),
        [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    while (tld.size())
    {
        auto it = levels.find(tld);
        if (it != levels.end())
        {
            return it->second;
        }

        // Cut the candidate at its last '.'; stop when none is left.
        size_t position = tld.rfind('.');
        if (position == std::string::npos || position == 0)
        {
            tld.resize(0);
        }
        else
        {
            tld.resize(position);
        }
    }
    return 1;
}
/**
 * Return the last `segments` dot-separated segments of `hostname`,
 * lower-cased.
 *
 * @return the segments, or `not_found` when the hostname has fewer segments.
 * @throws std::invalid_argument when the result starts with '.', i.e. the
 *         hostname contains an empty segment.
 */
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
{
    size_t position = hostname.size();
    size_t remaining = segments;
    // Walk backwards over one '.' per requested segment.
    while (remaining != 0 && position && position != std::string::npos)
    {
        position = hostname.rfind('.', position - 1);
        remaining -= 1;
    }

    if (remaining >= 1)
    {
        return not_found;
    }

    // Return the whole string if position == std:string::npos
    size_t start = (position == std::string::npos) ? 0 : position + 1;
    std::string result(hostname, start);
    // tolower() is undefined for negative char values (UTF-8 bytes), so each
    // byte is passed as unsigned char.
    std::transform(result.begin(), result.end(), result.begin(),
        [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    // Leading .'s indicate that the query had an empty segment
    if (result.size() && result[0] == '.')
    {
        std::stringstream message;
        message << "Empty segment in " << result;
        throw std::invalid_argument(message.str());
    }
    return result;
}
/**
 * Return the number of dot-separated segments in `hostname`, which is the
 * number of '.' characters plus one.
 */
size_t PSL::countSegments(const std::string& hostname) const
{
    const auto dots = std::count(hostname.begin(), hostname.end(), '.');
    return static_cast<size_t>(dots) + 1;
}
void PSL::add(std::string& rule, int level_adjust, size_t trim)
{
// First unpunycoded
std::string copy(rule.rbegin(), rule.rend() - trim);
size_t length = countSegments(copy) + level_adjust;
levels[copy] = length;
// And now punycoded
rule = Punycode::encodeHostname(rule);
copy.assign(rule.rbegin(), rule.rend() - trim);
levels[copy] = length;
}
};

View File

@ -1,102 +0,0 @@
#ifndef PSL_CPP_H
#define PSL_CPP_H
#include <istream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
namespace Url
{
/**
* Find TLDs and PLDs of a hostname according to a PSL.
*/
/**
 * Find TLDs and PLDs of a hostname according to a PSL.
 *
 * Copy and move operations are the implicitly generated ones. The former
 * hand-written copy constructor and copy assignment duplicated the defaults
 * and suppressed the implicit move operations.
 */
struct PSL
{
    /**
     * Indicates that there is no TLD / PLD
     */
    static const std::string not_found;

    /**
     * Read a PSL from an istream.
     */
    PSL(std::istream& stream);

    /**
     * Create a PSL without any rules.
     */
    PSL() = default;

    /**
     * Read the provided path holding a set of PSL rules.
     */
    static PSL fromPath(const std::string& path);

    /**
     * Create a PSL object from a string.
     */
    static PSL fromString(const std::string& str);

    /**
     * Get just the TLD of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::string getTLD(const std::string& hostname) const;

    /**
     * Get just the PLD of the hostname.
     *
     * The same punycoded / unpunycoded rules as for getTLD() apply.
     */
    std::string getPLD(const std::string& hostname) const;

    /**
     * Get the (TLD, PLD) of the hostname.
     *
     * The same punycoded / unpunycoded rules as for getTLD() apply.
     */
    std::pair<std::string, std::string> getBoth(const std::string& hostname) const;

private:
    // Mapping of a string rule to its level
    std::unordered_map<std::string, size_t> levels;

    // Return the number of segments in a hostname
    size_t countSegments(const std::string& hostname) const;

    // Return the number of segments in the TLD of the provided hostname
    size_t getTLDLength(const std::string& hostname) const;

    // Return the last `segments` segments of a hostname
    std::string getLastSegments(const std::string& hostname, size_t segments) const;

    /**
     * Add the provided host with the provided priority, trimming characters off
     * the front, and adjusting the level by the provided number.
     */
    void add(std::string& host, int level_adjust, size_t trim);
};
}
#endif

View File

@ -1,408 +0,0 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "punycode.h"
#include "utf8.h"
namespace Url
{
// Replaces the utf-8-encoded `str` with its punycode form, in place, and
// returns a reference to `str`.
//
// The input is read codepoint by codepoint with Utf8::readCodepoint; values
// below 0x80 are copied to the output first, followed by a '-' delimiter when
// there is at least one of them. The remaining codepoints are then emitted as
// variable-length base-36 integers, following the pseudocode quoted inline.
//
// Throws std::invalid_argument when the delta computation would overflow.
std::string& Punycode::encode(std::string& str)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
// let n = initial_n
// let delta = 0
// let bias = initial_bias
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;
// Accumulate the non-basic codepoints
// (every codepoint is stored here; basic ones are also copied to output)
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
}
codepoints.push_back(value);
}
// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;
// copy a delimiter if b > 0
if (b > 0)
{
output.append(1, '-');
}
// while h < length(input) do begin
while (h < codepoints.size())
{
// let m = the minimum {non-basic} code point >= n in the input
punycode_uint m = MAX_PUNYCODE_UINT;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
if ((*it >= n) && (*it < m))
{
m = *it;
}
}
// let delta = delta + (m - n) * (h + 1), fail on overflow
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
{
throw std::invalid_argument("Overflow delta update.");
}
delta += (m - n) * (h + 1);
// let n = m
n = m;
// for each code point c in the input (in order) do begin
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
// if c < n {or c is basic} then increment delta, fail on overflow
if (*it < n)
{
if (delta == MAX_PUNYCODE_UINT)
{
throw std::invalid_argument("Overflow delta increment.");
}
++delta;
}
// if c == n then begin
if (*it == n)
{
// let q = delta
punycode_uint q = delta;
// for k = base to infinity in steps of base do begin
for (punycode_uint k = BASE; ; k += BASE)
{
// let t = tmin if k <= bias {+ tmin}, or
// tmax if k >= bias + tmax, or k - bias otherwise
punycode_uint t = k <= bias ? TMIN :
k >= bias + TMAX ? TMAX : k - bias;
// if q < t then break
if (q < t)
{
break;
}
// output the code point for digit t + ((q - t) mod (base - t))
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
// let q = (q - t) div (base - t)
q = (q - t) / (BASE - t);
}
// output the code point for digit q
output.append(1, DIGIT_TO_BASIC[q]);
// let bias = adapt(delta, h + 1, test h equals b?)
bias = adapt(delta, h + 1, h == b);
// let delta = 0
delta = 0;
// increment h
++h;
}
}
// increment delta and n
++delta;
++n;
}
// Hand the result back through the in/out argument.
str.assign(output);
return str;
}
// Returns the punycode form of the utf-8-encoded `str`, leaving the argument
// untouched: the work is done by the in-place overload on a private copy.
std::string Punycode::encode(const std::string& str)
{
    std::string encoded_copy = str;
    encode(encoded_copy);
    return encoded_copy;
}
// Encodes a hostname label by label: every dot-separated label containing a
// byte with the high bit set is replaced by "xn--" plus its punycode form,
// all other labels are copied unchanged. A hostname without such bytes is
// returned as is.
std::string Punycode::encodeHostname(const std::string& hostname)
{
    // Avoid any punycoding at all if none is needed
    if (!needsPunycoding(hostname))
    {
        return hostname;
    }

    std::string encoded;
    size_t label_begin = 0;
    for (;;)
    {
        const size_t dot = hostname.find('.', label_begin);
        // With dot == npos the count wraps around and substr() takes the rest.
        std::string label = hostname.substr(label_begin, dot - label_begin);
        if (needsPunycoding(label))
        {
            encoded.append("xn--");
            encoded.append(Punycode::encode(label));
        }
        else
        {
            encoded.append(label);
        }

        if (dot == std::string::npos)
        {
            return encoded;
        }
        encoded.append(1, '.');
        label_begin = dot + 1;
    }
}
// Replaces the punycoded `str` with its utf-8-encoded form, in place, and
// returns a reference to `str`.
//
// Throws std::invalid_argument on non-basic input before the last delimiter,
// on premature end of input, on a character that is not a base-36 digit and
// on arithmetic overflow.
//
// Fix: the digit table is indexed through unsigned char. The previous
// static_cast<size_t>(*it) sign-extended bytes >= 0x80 (where char is signed)
// into a huge index and read far outside BASIC_TO_DIGIT.
std::string& Punycode::decode(std::string& str)
{
    // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
    //
    // let n = initial_n
    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    punycode_uint n = INITIAL_N;
    punycode_uint i = 0;
    punycode_uint bias = INITIAL_BIAS;
    std::vector<punycode_uint> codepoints;

    size_t index = str.rfind('-');
    if (index == std::string::npos)
    {
        index = 0;
    }

    // consume all code points before the last delimiter (if there is one)
    // and copy them to output, fail on any non-basic code point
    for (auto it = str.begin(); it != (str.begin() + index); ++it)
    {
        if (static_cast<unsigned char>(*it) > 127U)
        {
            throw std::invalid_argument("Argument has non-basic code points.");
        }
        codepoints.push_back(*it);
    }

    // if more than zero code points were consumed then consume one more
    // (which will be the last delimiter)
    if (index > 0)
    {
        index += 1;
    }

    // while the input is not exhausted do begin
    for (auto it = (str.begin() + index); it != str.end(); ++it)
    {
        // let oldi = i
        // let w = 1
        punycode_uint oldi = i;
        punycode_uint w = 1;
        // for k = base to infinity in steps of base do begin
        for (punycode_uint k = BASE; ; k += BASE, ++it)
        {
            // consume a code point, or fail if there was none to consume
            if (it == str.end())
            {
                throw std::invalid_argument("Premature termination");
            }

            // let digit = the code point's digit-value, fail if it has none
            // (index via unsigned char so bytes >= 0x80 stay inside the table)
            int lookup = BASIC_TO_DIGIT[static_cast<unsigned char>(*it)];
            if (lookup == -1)
            {
                throw std::invalid_argument("Invalid base 36 character.");
            }
            unsigned char digit = static_cast<unsigned char>(lookup);

            // let i = i + digit * w, fail on overflow
            if (digit > ((MAX_PUNYCODE_UINT - i) / w))
            {
                throw std::invalid_argument("Overflow on i.");
            }
            i += digit * w;

            // let t = tmin if k <= bias {+ tmin}, or
            // tmax if k >= bias + tmax, or k - bias otherwise
            punycode_uint t = k <= bias ? TMIN :
                k >= bias + TMAX ? TMAX : k - bias;

            // if digit < t then break
            if (digit < t)
            {
                break;
            }

            // let w = w * (base - t), fail on overflow
            if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
            {
                // I believe this line is unreachable without first overflowing i.
                // Since 'i' is updated above as i += digit * w, and w is updated as
                // w = w * (BASE - t), we should like to keep (BASE - t) > digit to
                // give 'w' a chance to overflow first. To keep t minimized, we must
                // have 'bias' maximized. `bias` is driven by the 'adapt' function
                // below.
                //
                // The value returned by 'adapt' increases with the input delta, and
                // decreases with the input size. The delta is a function of the input
                // size as well, on the order of (delta_n * input size), and
                // legitimate delta_n values are limited to 0x10FFFF (the maximum
                // unicode codepoint). Even setting that aside, the maximum value that
                // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
                //
                // Using this bias, we could use the input (HERE) to get iterations:
                //
                // digit = b = 1, i = 2, k = 36, t = 1, w = 35
                // digit = b = 1, i = 37, k = 72, t = 1, w = 1225
                // digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
                // digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
                // digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
                //
                // At this point, t now becomes TMAX (26) because k exceeds the bias
                // (since the maximum bias is 204). As such, the minimum continuation
                // value is 26:
                //
                // digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
                //
                // However, the next iteration now overflows i before we can get to
                // the w update.
                throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
            }
            w *= (BASE - t);
        }

        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);

        // let n = n + i div (length(output) + 1), fail on overflow
        if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
        {
            throw std::invalid_argument("Overflow on n.");
        }
        n += i / (codepoints.size() + 1);

        // let i = i mod (length(output) + 1)
        i %= (codepoints.size() + 1);

        // insert n into output at position i
        codepoints.insert(codepoints.begin() + i, n);

        // increment i
        ++i;
    }

    // Serialise the decoded codepoints back to utf-8.
    std::string output;
    for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
    {
        Utf8::writeCodepoint(output, *it);
    }
    str.assign(output);
    return str;
}
// Returns the utf-8 form of the punycoded `str`, leaving the argument
// untouched: the work is done by the in-place overload on a private copy.
std::string Punycode::decode(const std::string& str)
{
    std::string decoded_copy = str;
    decode(decoded_copy);
    return decoded_copy;
}
// Decodes a hostname label by label: every dot-separated label that starts
// with "xn--" is replaced by the decoded text after that prefix, all other
// labels are copied unchanged.
std::string Punycode::decodeHostname(const std::string& hostname)
{
    std::string unencoded;
    size_t label_begin = 0;
    for (;;)
    {
        const size_t dot = hostname.find('.', label_begin);
        // With dot == npos the count wraps around and substr() takes the rest.
        std::string label = hostname.substr(label_begin, dot - label_begin);
        if (label.compare(0, 4, "xn--") == 0)
        {
            label = label.substr(4);
            unencoded.append(Punycode::decode(label));
        }
        else
        {
            unencoded.append(label);
        }

        if (dot == std::string::npos)
        {
            return unencoded;
        }
        unencoded.append(1, '.');
        label_begin = dot + 1;
    }
}
// Returns true when `str` contains at least one byte with the high bit set,
// i.e. a byte outside the 7-bit range.
bool Punycode::needsPunycoding(const std::string& str)
{
    for (const char byte : str)
    {
        if (static_cast<unsigned char>(byte) & 0x80)
        {
            return true;
        }
    }
    return false;
}
// Bias adaptation, following the pseudocode in
// https://tools.ietf.org/html/rfc3492#section-6.1
//
// `delta` and `numpoints` are taken by value, so the caller's variables are
// not affected by the scaling done here.
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // if firsttime then let delta = delta div damp
    // else let delta = delta div 2
    if (firsttime)
    {
        delta /= DAMP;
    }
    else
    {
        delta >>= 1;
    }

    // let delta = delta + (delta div numpoints)
    delta += (delta / numpoints);

    // while delta > ((base - tmin) * tmax) div 2 do begin
    //   let delta = delta div (base - tmin)
    //   let k = k + base
    const unsigned int threshold = ((BASE - TMIN) * TMAX) / 2;
    punycode_uint k = 0;
    while (delta > threshold)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }

    // return k + (((base - tmin + 1) * delta) div (delta + skew))
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
};

View File

@ -1,106 +0,0 @@
#ifndef PUNYCODE_CPP_H
#define PUNYCODE_CPP_H
#include <stdexcept>
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include <limits>
#include "utf8.h"
namespace Url
{
/**
 * Punycode encoding and decoding of strings and hostnames, plus the
 * constants and lookup tables the implementation uses.
 */
namespace Punycode
{
// Integer type used for codepoints and for the algorithm's arithmetic.
typedef Utf8::codepoint_t punycode_uint;
// Algorithm parameters. The values correspond to the Punycode parameters
// listed in RFC 3492 section 5 (base, tmin, tmax, skew, damp,
// initial_bias, initial_n).
const unsigned int BASE = 36;
const unsigned int TMIN = 1;
const unsigned int TMAX = 26;
const unsigned int SKEW = 38;
const unsigned int DAMP = 700;
const unsigned int INITIAL_BIAS = 72;
const unsigned int INITIAL_N = 128;
// Byte value -> base-36 digit value, 256 entries.
// 'A'-'Z' and 'a'-'z' map to 0-25, '0'-'9' map to 26-35, every other byte
// maps to -1 (not a digit).
const std::vector<int8_t> BASIC_TO_DIGIT = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
// Digit value (0-35) -> the lowercase character that represents it.
const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";
// The largest value representable in punycode_uint. Note that this is the
// numeric limit of the type, not the highest Unicode codepoint
// (Utf8::MAX_CODEPOINT).
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
// Alternatives that were considered for the value above:
//   Utf8::MAX_CODEPOINT
//   std::numeric_limits<punycode_uint>::max()
/**
* Replace utf-8-encoded str into punycode.
*/
std::string& encode(std::string& str);
/**
* Create a new punycoded string from utf-8-encoded input.
*/
std::string encode(const std::string& str);
/**
* Encode a hostname.
*/
std::string encodeHostname(const std::string& hostname);
/**
* Replace punycoded str into utf-8-encoded.
*/
std::string& decode(std::string& str);
/**
* Create a new utf-8-encoded string from punycoded input.
*/
std::string decode(const std::string& str);
/**
* Decode a hostname.
*/
std::string decodeHostname(const std::string& hostname);
/**
* Determine if a string needs punycoding.
*/
bool needsPunycoding(const std::string& str);
/**
* Internal function for calculating bias.
*/
punycode_uint adapt(
punycode_uint delta, punycode_uint numpoints, bool firsttime);
};
}
#endif

View File

@ -1,962 +0,0 @@
#include <algorithm>
#include <string>
#include <iterator>
#include <unordered_map>
#include <unordered_set>
#include <iostream>
#include <iterator>
#include <sstream>
#include "url.h"
#include "punycode.h"
namespace Url
{
/* Character classes */
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
const CharacterClass Url::DIGIT("0123456789");
const CharacterClass Url::ALPHA(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
const CharacterClass Url::UNRESERVED(
Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
const CharacterClass Url::RESERVED(
Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
const CharacterClass Url::PCHAR(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
const CharacterClass Url::PATH(
Url::PCHAR.chars() + "/");
const CharacterClass Url::QUERY(
Url::PCHAR.chars() + "/?");
const CharacterClass Url::FRAGMENT(
Url::PCHAR.chars() + "/?");
const CharacterClass Url::USERINFO(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
const CharacterClass Url::HEX("0123456789ABCDEF");
const CharacterClass Url::SCHEME(
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
const std::vector<signed char> Url::HEX_TO_DEC = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
const std::unordered_map<std::string, int> Url::PORTS = {
{"http", 80},
{"https", 443}
};
const std::unordered_set<std::string> Url::USES_RELATIVE = {
"",
"file",
"ftp",
"gopher",
"http",
"https",
"imap",
"mms",
"nntp",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"svn",
"svn+ssh",
"wais"
};
const std::unordered_set<std::string> Url::USES_NETLOC = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"snews",
"svn",
"svn+ssh",
"telnet",
"wais"
};
const std::unordered_set<std::string> Url::USES_PARAMS = {
"",
"ftp",
"hdl",
"http",
"https",
"imap",
"mms",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"tel"
};
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"hdl",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"sms",
"snews",
"svn",
"svn+ssh",
"tel",
"telnet",
"wais"
};
/**
 * Parse `url` into scheme, userinfo, host, port, path, params, query and
 * fragment.
 *
 * The scheme and host are lowercased; no other normalization is applied.
 * A port of 0 means "no port given".
 *
 * @param url the text to parse.
 * @throws UrlParseException if a port is present but is not a number in the
 *         range [0, 65535].
 */
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    // ::tolower has undefined behavior for negative arguments, and hosts may
    // contain raw UTF-8 bytes, so every byte goes through unsigned char.
    const auto lowercase = [](std::string& text)
    {
        std::transform(
            text.begin(), text.end(), text.begin(),
            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    };

    size_t position = 0;
    size_t index = url.find(':');
    // All the characters in our would-be scheme must be in SCHEME
    if (index != std::string::npos
        && std::all_of(
            url.begin(),
            url.begin() + index,
            [](char c) { return SCHEME(c); }))
    {
        scheme_.assign(url, 0, index);
        lowercase(scheme_);

        // "name:1234" is ambiguous: it may be scheme:path or host:port. When
        // only digits follow the ':', accept the prefix as a scheme only if
        // it is a known protocol.
        const bool only_digits_follow = (index + 1) < url.length()
            && std::all_of(
                url.begin() + index + 1,
                url.end(),
                [](char c) { return DIGIT(c); });
        if (!only_digits_follow
            || KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
        {
            position = index + 1;
        }
        else
        {
            scheme_.clear();
        }
    }

    // Search for the netloc, which is introduced by "//". compare() checks
    // both characters without reading past the end of the string.
    if (url.compare(position, 2, "//") == 0)
    {
        // Skip the '//'
        position += 2;
        index = url.find_first_of("/?#", position);
        host_.assign(url, position, index - position);
        position = index;

        // Extract any userinfo if there is any
        index = host_.find('@');
        if (index != std::string::npos)
        {
            userinfo_.assign(host_, 0, index);
            host_.assign(host_, index + 1, std::string::npos);
        }

        // Lowercase the hostname
        lowercase(host_);

        // Try to find a port
        index = host_.find(':');
        if (index != std::string::npos)
        {
            const std::string portText(host_, index + 1, std::string::npos);
            host_.resize(index);
            if (portText.empty())
            {
                port_ = 0;
            }
            else
            {
                try
                {
                    size_t consumed = 0;
                    port_ = std::stoi(portText, &consumed);
                    if (consumed != portText.length())
                    {
                        // Malformed port
                        throw UrlParseException("Port not a number: " + portText);
                    }
                    if (port_ > 65535)
                    {
                        throw UrlParseException("Port too high: " + portText);
                    }
                    else if (port_ < 0)
                    {
                        throw UrlParseException("Port negative: " + portText);
                    }
                }
                catch (const std::invalid_argument&)
                {
                    // Malformed port
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }
            }
        }
    }

    // position is npos when the netloc ran to the end of the input, in which
    // case there is no path, query or fragment to extract.
    if (position != std::string::npos)
    {
        path_.assign(url, position, std::string::npos);

        // Peel the components off from the right: fragment, query, params.
        index = path_.find('#');
        if (index != std::string::npos)
        {
            fragment_.assign(path_, index + 1, std::string::npos);
            path_.resize(index);
        }
        index = path_.find('?');
        if (index != std::string::npos)
        {
            query_.assign(path_, index + 1, std::string::npos);
            has_query_ = true;
            path_.resize(index);
        }
        // Only some schemes give ';' a meaning in the path.
        if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
        {
            index = path_.find(';');
            if (index != std::string::npos)
            {
                params_.assign(path_, index + 1, std::string::npos);
                has_params_ = true;
                path_.resize(index);
            }
        }
    }
}
/**
 * Copy every component of `other` into this URL and return this URL.
 */
Url& Url::assign(const Url& other)
{
    *this = other;
    return *this;
}
/**
 * Exact, component-wise equality. No normalization is applied; see equiv()
 * for a comparison of normalized forms.
 */
bool Url::operator==(const Url& other) const
{
    if (scheme_ != other.scheme_) { return false; }
    if (userinfo_ != other.userinfo_) { return false; }
    if (host_ != other.host_) { return false; }
    if (port_ != other.port_) { return false; }
    if (path_ != other.path_) { return false; }
    if (params_ != other.params_) { return false; }
    if (query_ != other.query_) { return false; }
    if (fragment_ != other.fragment_) { return false; }
    if (has_params_ != other.has_params_) { return false; }
    return has_query_ == other.has_query_;
}
/**
 * Negation of operator==.
 */
bool Url::operator!=(const Url& other) const
{
    return !(*this == other);
}
bool Url::equiv(const Url& other)
{
Url self_(*this);
Url other_(other);
self_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
other_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
return self_ == other_;
}
/**
 * Compact `str` in place: drop leading occurrences of `chr`, collapse every
 * run of `chr` into one, and drop a single trailing `chr` if one remains.
 */
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    size_t write = 0;
    // Starting as "previous was chr" makes leading occurrences get dropped.
    bool previous_was_chr = true;
    for (const char current : str)
    {
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            // write never runs ahead of the read position, so this cannot
            // clobber a character that has not been read yet.
            str[write++] = current;
        }
        previous_was_chr = is_chr;
    }

    // Remove the last character if it happens to be chr
    if (write > 0 && str[write - 1] == chr)
    {
        --write;
    }
    str.resize(write);
    return str;
}
/**
 * Build "/path;params?query#fragment" from the stored components.
 *
 * The result always starts with '/'. Params and query are emitted when their
 * has_* flag is set (even if empty); the fragment only when non-empty.
 */
std::string Url::fullpath() const
{
    std::string out;

    const bool rooted = !path_.empty() && path_[0] == '/';
    if (!rooted)
    {
        out += '/';
    }
    out += path_;

    if (has_params_)
    {
        out += ';';
        out += params_;
    }
    if (has_query_)
    {
        out += '?';
        out += query_;
    }
    if (!fragment_.empty())
    {
        out += '#';
        out += fragment_;
    }
    return out;
}
/**
 * Serialize the URL back into a single string.
 *
 * Schemes listed in USES_NETLOC are followed by "://", other schemes by ":".
 * A URL with a host but no scheme is written scheme-relative ("//host").
 * A port of 0 is treated as "no port" and omitted.
 */
std::string Url::str() const
{
    std::string out;

    if (!scheme_.empty())
    {
        out += scheme_;
        out += (USES_NETLOC.count(scheme_) != 0) ? "://" : ":";
    }
    else if (!host_.empty())
    {
        out += "//";
    }

    if (!userinfo_.empty())
    {
        out += userinfo_;
        out += '@';
    }

    // Appending an empty host is a no-op, so no emptiness check is needed.
    out += host_;

    if (port_ != 0)
    {
        out += ':';
        out += std::to_string(port_);
    }

    if (path_.empty())
    {
        // An empty path is written as "/" unless nothing precedes it.
        if (!out.empty())
        {
            out += '/';
        }
    }
    else
    {
        // Keep host and a relative path apart with a separator.
        if (!host_.empty() && path_[0] != '/')
        {
            out += '/';
        }
        out += path_;
    }

    if (has_params_)
    {
        out += ';';
        out += params_;
    }
    if (has_query_)
    {
        out += '?';
        out += query_;
    }
    if (!fragment_.empty())
    {
        out += '#';
        out += fragment_;
    }
    return out;
}
/**
 * Remove meaningless separators: leading '?' characters from the query,
 * then leading, trailing and repeated '&' in the query and ';' in the
 * params.
 */
Url& Url::strip()
{
    const size_t first_kept = query_.find_first_not_of('?');
    if (first_kept == std::string::npos)
    {
        // Empty, or made of '?' only.
        query_.clear();
    }
    else
    {
        query_.erase(0, first_kept);
    }

    // The setters also refresh has_query_ / has_params_.
    setQuery(remove_repeats(query_, '&'));
    setParams(remove_repeats(params_, ';'));
    return *this;
}
/**
 * Rewrite path_ with "." and ".." segments evaluated and empty segments
 * (repeated slashes) removed.
 *
 * The new path is built in `copy`. `segment_starts` records the offset in
 * `copy` at which each emitted segment begins, so that ".." can truncate
 * `copy` back to the start of the most recent segment.
 */
Url& Url::abspath()
{
std::string copy;
std::vector<size_t> segment_starts;
// A leading '/' is kept and recorded as a segment start at offset 0.
if (path_.size() >= 1 && path_[0] == '/')
{
copy.append(1, '/');
segment_starts.push_back(0);
}
// Whether the segment seen last denotes a directory (".", ".." or a
// trailing '/'); decides below if the trailing '/' of `copy` is kept.
bool directory = false;
size_t previous = 0;
size_t index = 0;
// Each iteration handles the segment [previous, index), which ends at a '/'.
for (index = path_.find('/')
; index != std::string::npos
; previous = index + 1, index = path_.find('/', index + 1))
{
// Skip empty segments
if (index - previous == 0)
{
continue;
}
if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
// "..": drop the most recently emitted segment, if there is one.
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
segment_starts.pop_back();
}
directory = true;
}
else if ((index - previous == 1) && path_[previous] == '.')
{
// ".": emits nothing.
directory = true;
}
else
{
// Ordinary segment: remember where it starts, then emit it with a '/'.
segment_starts.push_back(copy.length());
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
}
// Handle the last segment
index = path_.length();
if (previous == path_.length())
{
// The path is empty or ended with '/'.
directory = true;
}
else if ((index - previous == 1) && path_[previous] == '.')
{
directory = true;
}
else if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
}
directory = true;
}
else
{
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
// A non-directory result loses the '/' appended after its last segment;
// an empty directory result becomes "/".
if (!directory && copy.size() >= 1)
{
copy.resize(copy.size() - 1);
}
else if (directory && copy.empty())
{
copy.append(1, '/');
}
path_.assign(copy);
return *this;
}
/**
 * Resolve this URL against the base URL `other`, in place.
 *
 * Components this URL lacks are inherited from `other`; a relative path is
 * appended to the directory part of `other`'s path. "." and ".." segments
 * are not evaluated here (see abspath()).
 */
Url& Url::relative_to(const Url& other)
{
    // Schemes that do not support relative references are left untouched.
    if (USES_RELATIVE.count(scheme_) == 0)
    {
        return *this;
    }

    // Scheme-relative URLs inherit the base scheme.
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // A URL that carries its own host is already absolute.
    if (!host_.empty())
    {
        return *this;
    }

    // Otherwise the authority comes from the base.
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    // An absolute path needs no further resolution.
    if (!path_.empty() && path_.front() == '/')
    {
        return *this;
    }

    const size_t last_slash = other.path_.rfind('/');

    if (!path_.empty())
    {
        // Relative path: resolve against the directory of the base path.
        if (last_slash != std::string::npos)
        {
            path_ = other.path_.substr(0, last_slash + 1) + path_;
        }
        else if (!host_.empty())
        {
            path_ = "/" + path_;
        }
        return *this;
    }

    // No path of our own.
    if (!params_.empty())
    {
        // Own params: keep them, take only the base directory. When the base
        // path has no '/', last_slash + 1 wraps to 0 and the path stays empty.
        path_.assign(other.path_, 0, last_slash + 1);
    }
    else
    {
        // Inherit path and params, and the query too if we have none.
        path_ = other.path_;
        params_ = other.params_;
        has_params_ = other.has_params_;
        if (query_.empty())
        {
            query_ = other.query_;
            has_query_ = other.has_query_;
        }
    }

    if (fragment_.empty())
    {
        fragment_ = other.fragment_;
    }
    return *this;
}
/**
 * Escape the path, query, params and userinfo in place, each against its
 * set of safe characters. Note that the params are escaped with the QUERY
 * character class. `strict` is forwarded unchanged to the per-string
 * overload.
 */
Url& Url::escape(bool strict)
{
escape(path_, PATH, strict);
escape(query_, QUERY, strict);
escape(params_, QUERY, strict);
escape(userinfo_, USERINFO, strict);
return *this;
}
/**
 * Percent-encode `str` in place so that only characters in `safe` appear
 * unescaped.
 *
 * Existing valid "%XX" sequences are decoded first. In non-strict mode every
 * decoded byte is then re-examined like any other byte (kept if safe,
 * re-encoded otherwise). In strict mode a sequence is decoded only when the
 * byte is in `safe` and not in RESERVED; otherwise the sequence is kept,
 * with its hex digits uppercased. A '%' that does not start a valid sequence
 * is treated as an ordinary character.
 *
 * @param str    the string to rewrite.
 * @param safe   the characters that may appear unescaped.
 * @param strict see above.
 * @return `str`.
 */
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
{
    // HEX_TO_DEC has one entry per byte value, so it must be indexed with the
    // unsigned value of the byte: a plain (possibly signed) char >= 0x80
    // would turn into a huge index and read out of bounds.
    const auto hex_value = [](char c) -> int
    {
        return HEX_TO_DEC[static_cast<unsigned char>(c)];
    };
    const std::string& hex_digits = HEX.chars();

    std::string copy(str);
    size_t dest = 0;
    // Allocate space pessimistically -- if every entity is expanded, it will take 3x
    // the space.
    str.resize(str.length() * 3);
    for (size_t src = 0; src < copy.length(); ++src)
    {
        // An escape sequence needs two more characters after the '%'.
        if (copy[src] == '%' && src + 2 < copy.length())
        {
            const int high = hex_value(copy[src + 1]);
            const int low = hex_value(copy[src + 2]);
            if (high != -1 && low != -1)
            {
                const char decoded = static_cast<char>(high * 16 + low);
                // In strict mode, we can only unescape parameters if they are both
                // safe and not reserved
                if (!strict || (safe(decoded) && !RESERVED(decoded)))
                {
                    // Put the decoded byte where the sequence ends and fall
                    // through so it is handled like any other byte.
                    src += 2;
                    copy[src] = decoded;
                }
                else
                {
                    // Keep the sequence, normalizing the hex digits to uppercase.
                    str[dest++] = '%';
                    str[dest++] = static_cast<char>(
                        ::toupper(static_cast<unsigned char>(copy[src + 1])));
                    str[dest++] = static_cast<char>(
                        ::toupper(static_cast<unsigned char>(copy[src + 2])));
                    src += 2;
                    continue;
                }
            }
        }

        if (!safe(copy[src]))
        {
            // Not safe -- replace with %XX
            const unsigned char byte = static_cast<unsigned char>(copy[src]);
            str[dest++] = '%';
            str[dest++] = hex_digits[byte >> 4];
            str[dest++] = hex_digits[byte & 0xF];
        }
        else
        {
            str[dest++] = copy[src];
        }
    }
    str.resize(dest);
    return str;
}
/**
 * Decode percent-escapes in the path, query, params and userinfo, in place.
 */
Url& Url::unescape()
{
unescape(path_);
unescape(query_);
unescape(params_);
unescape(userinfo_);
return *this;
}
/**
 * Decode every valid "%XX" sequence of `str` in place.
 *
 * A '%' that is not followed by two hex digits is copied through unchanged.
 *
 * @param str the string to rewrite.
 * @return `str`.
 */
std::string& Url::unescape(std::string& str)
{
    // HEX_TO_DEC has one entry per byte value, so it must be indexed with the
    // unsigned value of the byte: a plain (possibly signed) char >= 0x80
    // would turn into a huge index and read out of bounds.
    const auto hex_value = [](char c) -> int
    {
        return HEX_TO_DEC[static_cast<unsigned char>(c)];
    };

    const std::string copy(str);
    size_t dest = 0;
    for (size_t src = 0; src < copy.length(); ++src, ++dest)
    {
        // An escape sequence needs two more characters after the '%'.
        if (copy[src] == '%' && src + 2 < copy.length())
        {
            const int high = hex_value(copy[src + 1]);
            const int low = hex_value(copy[src + 2]);
            if (high != -1 && low != -1)
            {
                // Emit the decoded byte and skip the two hex digits.
                str[dest] = static_cast<char>(high * 16 + low);
                src += 2;
                continue;
            }
        }
        // Either not a % or an incomplete entity
        str[dest] = copy[src];
    }
    // The decoded text is never longer than the input.
    str.resize(dest);
    return str;
}
/**
 * Remove every query and params entry whose name, lowercased, appears in
 * `blacklist`.
 *
 * @param blacklist lowercase parameter names to remove.
 * @return this URL.
 */
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // The predicate is only used during this call, so the blacklist is
    // captured by reference instead of being copied, and the std::function
    // wrapper is built once for both passes.
    const deparam_predicate predicate =
        [&blacklist](std::string& name, std::string&)
    {
        // ::tolower has undefined behavior for negative arguments, so bytes
        // go through unsigned char.
        std::transform(
            name.begin(), name.end(), name.begin(),
            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
        return blacklist.find(name) != blacklist.end();
    };
    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
/**
 * Remove every query ('&'-separated) and params (';'-separated) entry for
 * which `predicate` returns true. The setters also refresh has_query_ and
 * has_params_.
 */
Url& Url::deparam(const deparam_predicate& predicate)
{
setQuery(remove_params(query_, predicate, '&'));
setParams(remove_params(params_, predicate, ';'));
return *this;
}
/**
 * Filter the `sep`-separated entries of `str` in place.
 *
 * Each entry is split at its first '=' into name and value (the value is
 * empty when there is no '='). Entries for which `predicate(name, value)`
 * returns true are dropped; the others are kept verbatim, in order.
 */
std::string& Url::remove_params(std::string& str,
                                const deparam_predicate& predicate,
                                char sep)
{
    std::string kept;
    std::string piece;
    std::string name;
    std::string value;

    // Evaluate the entry str[begin, begin + count) and keep it if allowed.
    const auto consider = [&](size_t begin, size_t count)
    {
        piece.assign(str, begin, count);
        const size_t equals = piece.find('=');
        name.assign(piece, 0, equals);
        value.clear();
        if (equals != std::string::npos)
        {
            value.assign(piece, equals + 1, std::string::npos);
        }
        if (predicate(name, value))
        {
            return;
        }
        if (!kept.empty())
        {
            kept.append(1, sep);
        }
        kept.append(piece);
    };

    size_t begin = 0;
    for (;;)
    {
        const size_t end = str.find(sep, begin);
        if (end == std::string::npos)
        {
            break;
        }
        consider(begin, end - begin);
        begin = end + 1;
    }
    // Whatever follows the last separator, unless nothing does.
    if (begin < str.length())
    {
        consider(begin, std::string::npos);
    }

    str.assign(kept);
    return str;
}
/**
 * Sort the '&'-separated entries of the query and the ';'-separated entries
 * of the params, each in place.
 */
Url& Url::sort_query()
{
split_sort_join(query_, '&');
split_sort_join(params_, ';');
return *this;
}
/**
 * Split `str` at `glue`, sort the pieces lexicographically and join them
 * again with `glue`, in place.
 *
 * An empty string, or one that yields a single piece, is returned unchanged.
 */
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    if (str.empty())
    {
        return str;
    }

    // Split
    std::vector<std::string> pieces;
    {
        std::stringstream input(str);
        std::string piece;
        while (getline(input, piece, glue))
        {
            pieces.push_back(piece);
        }
    }

    // Nothing to reorder when there is just one piece
    if (pieces.size() == 1)
    {
        return str;
    }

    // Sort
    std::sort(pieces.begin(), pieces.end());

    // Join (a non-empty input always yields at least one piece)
    std::string joined(pieces.front());
    for (size_t i = 1; i < pieces.size(); ++i)
    {
        joined += glue;
        joined += pieces[i];
    }
    str.assign(joined);
    return str;
}
/**
 * Reset the port to 0 ("no port") when it equals the default port that
 * PORTS lists for the current scheme.
 */
Url& Url::remove_default_port()
{
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }
    return *this;
}
/**
 * Clear the userinfo component.
 */
Url& Url::deuserinfo()
{
userinfo_.clear();
return *this;
}
/**
 * Clear the fragment component.
 */
Url& Url::defrag()
{
fragment_.clear();
return *this;
}
/**
 * Replace the host with its punycode-encoded form.
 *
 * The host is validated with check_hostname() both before and after
 * encoding; check_hostname() throws std::invalid_argument for empty or
 * over-long labels and may strip a trailing '.'.
 */
Url& Url::punycode()
{
check_hostname(host_);
std::string encoded(Punycode::encodeHostname(host_));
// Encoding can lengthen labels, so the result is validated again.
check_hostname(encoded);
host_ = encoded;
return *this;
}
/**
 * Replace the host with the result of Punycode::decodeHostname().
 */
Url& Url::unpunycode()
{
host_ = Punycode::decodeHostname(host_);
return *this;
}
/**
 * Reverse the order of the host's dot-separated labels (a.b.c.d => d.c.b.a).
 *
 * Done in place: the whole host is reversed character by character, then
 * each label is reversed back so its own text reads forwards again.
 */
Url& Url::host_reversed()
{
    std::reverse(host_.begin(), host_.end());

    auto label_begin = host_.begin();
    while (label_begin != host_.end())
    {
        const auto label_end = std::find(label_begin, host_.end(), '.');
        std::reverse(label_begin, label_end);
        if (label_end == host_.end())
        {
            break;
        }
        // Continue just past the dot.
        label_begin = label_end + 1;
    }
    return *this;
}
/**
 * Validate the dot-separated labels of `host`.
 *
 * An empty host is accepted. A single trailing '.' is removed from `host`.
 *
 * @throws std::invalid_argument if a label is longer than 63 characters, or
 *         if a label terminated by '.' is empty.
 */
void Url::check_hostname(std::string& host)
{
    // Skip empty hostnames -- they are valid
    if (host.empty())
    {
        return;
    }

    size_t label_start = 0;
    for (size_t dot = host.find('.');
         dot != std::string::npos;
         dot = host.find('.', label_start))
    {
        const size_t label_length = dot - label_start;
        if (label_length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (label_length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        label_start = dot + 1;
    }

    // For the final segment
    const size_t tail_length = host.size() - label_start;
    if (tail_length > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (tail_length == 0 && label_start > 1)
    {
        // Remove a trailing empty segment
        host.resize(label_start - 1);
    }
}
};

View File

@ -1,323 +0,0 @@
#ifndef URL_CPP_H
#define URL_CPP_H
#include <stdexcept>
#include <functional>
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
namespace Url
{
/**
 * Exception type for URL parsing failures; the Url constructor throws it
 * when a port is malformed or out of range.
 */
struct UrlParseException : public std::logic_error
{
UrlParseException(const std::string& message) : std::logic_error(message) {}
};
/**
 * A fixed set of byte values with constant-time membership tests.
 *
 * Built once from a string listing its members; neither default-
 * constructible nor copyable.
 */
struct CharacterClass
{
    /**
     * @param chars every character that belongs to the class.
     */
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (const char c : chars_)
        {
            // Index by the unsigned byte value: a plain (possibly signed)
            // char >= 0x80 would turn into a huge index and write out of
            // bounds. This matches the lookup in operator().
            map_[static_cast<unsigned char>(c)] = true;
        }
    }

    /**
     * @return true if `c` is a member of the class.
     */
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /**
     * @return the members, exactly as passed to the constructor.
     */
    const std::string& chars() const
    {
        return chars_;
    }

private:
    // Deleted to prevent use
    CharacterClass() = delete;
    CharacterClass(const CharacterClass& other) = delete;

    std::string chars_;
    // One flag per possible byte value.
    std::vector<bool> map_;
};
struct Url
{
/* Character classes */
const static CharacterClass GEN_DELIMS;
const static CharacterClass SUB_DELIMS;
const static CharacterClass ALPHA;
const static CharacterClass DIGIT;
const static CharacterClass UNRESERVED;
const static CharacterClass RESERVED;
const static CharacterClass PCHAR;
const static CharacterClass PATH;
const static CharacterClass QUERY;
const static CharacterClass FRAGMENT;
const static CharacterClass USERINFO;
const static CharacterClass HEX;
const static CharacterClass SCHEME;
const static std::vector<signed char> HEX_TO_DEC;
const static std::unordered_map<std::string, int> PORTS;
const static std::unordered_set<std::string> USES_RELATIVE;
const static std::unordered_set<std::string> USES_NETLOC;
const static std::unordered_set<std::string> USES_PARAMS;
const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
// The type of the predicate used for removing parameters
typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
explicit Url(const std::string& url);
Url(const Url& other)
: scheme_(other.scheme_)
, host_(other.host_)
, port_(other.port_)
, path_(other.path_)
, params_(other.params_)
, query_(other.query_)
, fragment_(other.fragment_)
, userinfo_(other.userinfo_)
, has_params_(other.has_params_)
, has_query_(other.has_query_) { }
/**
* Take on the value of the other URL.
*/
Url& assign(const Url& other);
/**
* To be considered equal, all fields must be equal.
*/
bool operator==(const Url& other) const;
bool operator!=(const Url& other) const;
/**
* Two URLs are considered equivalent if they have the same meaning.
*/
bool equiv(const Url& other);
/**************************************
* Component-wise access and setting. *
**************************************/
const std::string& scheme() const { return scheme_; }
Url& setScheme(const std::string& s)
{
scheme_ = s;
return *this;
}
const std::string& host() const { return host_; }
Url& setHost(const std::string& s)
{
host_ = s;
return *this;
}
const int port() const { return port_; }
Url& setPort(int i)
{
port_ = i;
return *this;
}
const std::string& path() const { return path_; }
Url& setPath(const std::string& s)
{
path_ = s;
return *this;
}
const std::string& params() const { return params_; }
Url& setParams(const std::string& s)
{
params_ = s;
has_params_ = !s.empty();
return *this;
}
const std::string& query() const { return query_; }
Url& setQuery(const std::string& s)
{
query_ = s;
has_query_ = !s.empty();
return *this;
}
const std::string& fragment() const { return fragment_; }
Url& setFragment(const std::string& s)
{
fragment_ = s;
return *this;
}
const std::string& userinfo() const { return userinfo_; }
Url& setUserinfo(const std::string& s)
{
userinfo_ = s;
return *this;
}
/**
* Get a representation of all components of the path, params, query, fragment.
*
* Always includes a leading /.
*/
std::string fullpath() const;
/**
* Get a new string representation of the URL.
**/
std::string str() const;
/*********************
* Chainable methods *
*********************/
/**
* Strip semantically meaningless excess '?', '&', and ';' characters from query
* and params.
*/
Url& strip();
/**
* Make the path absolute.
*
* Evaluate '.', '..', and excessive slashes.
*/
Url& abspath();
/**
* Evaluate this URL relative to `other`, placing the result in this object.
*/
Url& relative_to(const std::string& other)
{
return relative_to(Url(other));
}
/**
* Evaluate this URL relative to `other`, placing the result in this object.
*/
Url& relative_to(const Url& other);
/**
* Ensure that the path, params, query, and userinfo are properly escaped.
*
* In 'strict' mode, only entities that are both safe and not reserved characters
* are unescaped. In non-strict mode, entities that are safe are unescaped.
*/
Url& escape(bool strict=false);
/**
* Unescape all entities in the path, params, query, and userinfo.
*/
Url& unescape();
/**
* Remove any params or queries that appear in the blacklist.
*
* The blacklist should contain only lowercased strings, and the comparison is
* done in a case-insensitive way.
*/
Url& deparam(const std::unordered_set<std::string>& blacklist);
/**
* Filter params subject to a predicate for whether it should be filtered.
*
* The predicate must accept two string refs -- the key and value (which may be
* empty). Return `true` if the parameter should be removed, and `false`
* otherwise.
*/
Url& deparam(const deparam_predicate& predicate);
/**
* Put queries and params in sorted order.
*
* To ensure consistent comparisons, escape should be called beforehand.
*/
Url& sort_query();
/**
* Remove the port if it's the default for the scheme.
*/
Url& remove_default_port();
/**
* Remove the userinfo portion.
*/
Url& deuserinfo();
/**
* Remove the fragment.
*/
Url& defrag();
/**
* Punycode the hostname.
*/
Url& punycode();
/**
* Unpunycode the hostname.
*/
Url& unpunycode();
/**
* Reverse the hostname (a.b.c.d => d.c.b.a)
*/
Url& host_reversed();
private:
// Private, unimplemented to prevent use.
Url();
/**
* Remove repeated, leading, and trailing instances of chr from the string.
*/
std::string& remove_repeats(std::string& str, const char chr);
/**
* Ensure all the provided characters are escaped if necessary
*/
std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
/**
* Unescape entities in the provided string
*/
std::string& unescape(std::string& str);
/**
* Remove any params that match entries in the blacklist.
*/
std::string& remove_params(
std::string& str, const deparam_predicate& pred, char sep);
/**
* Split the provided string by char, sort, join by char.
*/
std::string& split_sort_join(std::string& str, const char glue);
/**
* Check that the hostname is valid, removing an optional trailing '.'.
*/
void check_hostname(std::string& host);
std::string scheme_;
std::string host_;
int port_;
std::string path_;
std::string params_;
std::string query_;
std::string fragment_;
std::string userinfo_;
bool has_params_;
bool has_query_;
};
}
#endif

View File

@ -1,150 +0,0 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "utf8.h"
namespace Url
{
/**
 * Decode one UTF-8 sequence starting at `it` and advance `it` past it.
 *
 * Only the structure of the sequence is validated (start byte range and
 * "10xxxxxx" continuation bytes); overlong encodings and the numeric range
 * of the decoded value are not checked.
 *
 * @param it  position to read from; advanced past the bytes consumed.
 * @param end end of the input.
 * @return the decoded codepoint.
 * @throws std::invalid_argument if the range is empty, the start byte is
 *         not a valid start byte, the input ends inside a sequence, or a
 *         continuation byte is malformed.
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Dereferencing `end` is undefined behavior, so an empty range (for
    // example from toCodepoint("")) is reported instead.
    if (it == end)
    {
        throw std::invalid_argument("Cannot read a codepoint from an empty range.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);
    if ((current & 0x80) == 0)
    {
        // Single-byte (ASCII) codepoint.
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;
    // The accumulated value
    Utf8::codepoint_t result = 0;
    if (current < 0xC0)
    {
        // A continuation byte cannot start a sequence
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // One additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
    }
    else if (current < 0xF0)
    {
        // Two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
    }
    else if (current < 0xF8)
    {
        // Three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }
        current = static_cast<Utf8::char_t>(*it++);
        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }
        // Each continuation byte contributes its low six bits.
        result = (result << 6) | (current & 0x3F);
    }
    return result;
}
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
{
if (value > MAX_CODEPOINT)
{
throw std::invalid_argument("Code point too high.");
}
else if (value <= 0x007F)
{
// Just append the character itself
str.append(1, static_cast<char>(value));
return str;
}
unsigned int bytes = 0;
if (value > 0xFFFF)
{
/**
* 11110xxx + 3 bytes for 21 bits total
*
* We need to take bits 20-18, which 0x1C0000 masks out. These form the least
* significant bits of this byte (so we shift them back down by 18). The 5
* most significant bits of this byte are 11110, so we OR this result with
* 0xF0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 18) = 14.
*/
str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0));
bytes = 3;
value <<= 14;
}
else if (value > 0x07FF)
{
/**
* 1110xxxx + 2 bytes for 16 bits total
*
* We need to take bits 15-12, which 0xF000 masks out. These form the least
* significant bits of this byte (so we shift them back down by 12). The 4
* most significant bits of this byte are 1110, so we OR this result with
* 0xE0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 12) = 20.
*/
str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0));
bytes = 2;
value <<= 20;
}
else
{
/**
* 110xxxxx + 1 byte for 11 bits total
*
* We need to take bits 10-6, which 0x7C0 masks out. These form the least
* significant bits of this byte (so we shift them back down by 6). The 3
* most significant bits of this byte are 110, so we OR this result with
* 0xC0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 6) = 26.
*/
str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0));
bytes = 1;
value <<= 26;
}
/**
* The remaining bits are to be consumed 6 at a time from the most-significant
* end. The mask 0xFC000000 grabs these six bits, which then must be shifted down
* by 26, and OR'd with 0x80 to produce the continuation byte.
*/
for (; bytes > 0; --bytes, value <<= 6)
{
str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80));
}
return str;
}
};

View File

@ -1,91 +0,0 @@
#ifndef UTF8_CPP_H
#define UTF8_CPP_H
#include <stdexcept>
#include <string>
#include <vector>
namespace Url
{
/**
* Work between unicode code points and their UTF-8-encoded representation.
*/
/**
 * Conversions between Unicode codepoints and UTF-8 encoded strings.
 */
struct Utf8
{
    /** Integer type holding one Unicode codepoint. */
    using codepoint_t = uint32_t;

    /** Integer type used for the numeric value of a single byte. */
    using char_t = unsigned char;

    /** Largest codepoint that writeCodepoint accepts. */
    static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

    /**
     * Decode the sequence starting at `it`, advancing `it` past it.
     */
    static codepoint_t readCodepoint(
        std::string::const_iterator& it, const std::string::const_iterator& end);

    /**
     * Append the encoding of `value` to `str` and return `str`.
     */
    static std::string& writeCodepoint(std::string& str, codepoint_t value);

    /**
     * Decode only the first codepoint of `str`.
     */
    static codepoint_t toCodepoint(const std::string& str)
    {
        std::string::const_iterator cursor = str.begin();
        return readCodepoint(cursor, str.end());
    }

    /**
     * Encode a single codepoint as a new string.
     */
    static std::string fromCodepoint(codepoint_t value)
    {
        std::string encoded;
        writeCodepoint(encoded, value);
        return encoded;
    }

    /**
     * Decode every codepoint of `str`, in order.
     */
    static std::vector<codepoint_t> toCodepoints(const std::string& str)
    {
        std::vector<codepoint_t> decoded;
        std::string::const_iterator cursor = str.begin();
        // readCodepoint advances the cursor itself.
        while (cursor != str.end())
        {
            decoded.push_back(readCodepoint(cursor, str.end()));
        }
        return decoded;
    }

    /**
     * Encode a sequence of codepoints as one string.
     */
    static std::string fromCodepoints(const std::vector<codepoint_t>& points)
    {
        std::string encoded;
        for (const codepoint_t point : points)
        {
            writeCodepoint(encoded, point);
        }
        return encoded;
    }
};
}
#endif

View File

@ -3,8 +3,8 @@ project(librengine LANGUAGES CXX)
# Request the C++17 standard.
set(CMAKE_CXX_STANDARD 17)
# Header and source lists without the robots.txt parser files.
set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp include/structs.h include/search.h include/helper.h include/cache.h)
set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp src/search.cpp src/helper.cpp)
# Same lists with include/robots_txt.h and src/robots_txt.cpp appended.
# These assignments come later, so their values are the ones that take effect.
set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp include/structs.h include/search.h include/helper.h include/cache.h include/robots_txt.h)
set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp src/search.cpp src/helper.cpp src/robots_txt.cpp)
# Aggregate lists, initialised from the header and source lists above.
set(include_all ${include})
set(src_all ${src})

36
lib/include/robots_txt.h Normal file
View File

@ -0,0 +1,36 @@
#ifndef ROBOTS_TXT_H
#define ROBOTS_TXT_H
#include "http.h"
namespace librengine {
// Rule set collected from robots.txt for one user agent name.
class user_agent {
public:
// Agent name this rule set belongs to; set by the constructor.
std::string agent;
// Path patterns taken from "Allow" directives.
std::vector<std::string> allow_list;
// Path patterns taken from "Disallow" directives.
std::vector<std::string> disallow_list;
// Value of the "Crawl-delay" directive; the constructor initialises it to 0.
// NOTE(review): the unit is not stated in this file - presumably seconds, confirm against the caller.
float crawl_delay;
public:
// Tests `expression` against a robots.txt path `pattern`, starting at the first character of `expression`.
// '*' matches any run of characters, a '$' that is the last pattern character requires the match
// to end exactly at the end of `expression`; every other character must match literally.
static bool match(const std::string &pattern, const std::string &expression);
public:
// Creates a rule set for `agent` with empty allow/disallow lists.
explicit user_agent(const std::string &agent);
// Decides from allow_list and disallow_list whether `path` may be fetched.
// Returns true when no pattern of either list matches.
bool allowed(const std::string &path);
// Same decision for the path component of `url`; returns false when `url.path` is not set.
bool allowed(const http::url &url);
};
// Holds the text of a robots.txt file and the per-agent rule sets parsed from it.
class robots_txt {
private:
// Raw robots.txt content handed to the constructor; read by parse().
std::string text;
public:
// Rule sets, one entry per agent. The constructor adds an entry with the
// empty name "" so that there is always at least one element.
std::vector<user_agent> agents;
public:
// Stores `text`; nothing is parsed until parse() is called.
explicit robots_txt(const std::string &text);
// Reads the stored text line by line and fills `agents` with the directives found.
void parse();
// Looks up the rule set named `agent`, falling back to "*" and then to the
// unnamed "" entry, and returns that rule set's decision for `path`.
bool allowed(const std::string &path, const std::string &agent);
// Same as above for the path component of `url`; returns false when `url.path` is not set.
bool allowed(const http::url &url, const std::string &user_agent);
};
}
#endif

138
lib/src/robots_txt.cpp Normal file
View File

@ -0,0 +1,138 @@
#include "robots_txt.h"
#include "str.h"
#include <algorithm>
namespace librengine {
/**
 * Tests a path against a robots.txt path pattern.
 *
 * Pattern syntax:
 *  - '*' matches any sequence of characters, including the empty one;
 *  - '$' as the LAST character of the pattern requires the match to end
 *    exactly at the end of `expression`; elsewhere it is a literal character;
 *  - any other character matches itself.
 * Matching always starts at the first character of `expression`, and without a
 * trailing '$' the pattern only has to match a prefix of it. An empty pattern
 * therefore matches everything.
 *
 * @param pattern    value of an Allow/Disallow directive
 * @param expression path to test
 * @return true when `expression` matches `pattern`
 */
bool user_agent::match(const std::string &pattern, const std::string &expression) {
    const size_t pattern_size = pattern.length();
    const size_t expression_size = expression.length();

    // ends[0 .. count) is an ascending list of offsets into `expression` at
    // which the part of the pattern processed so far can end. It starts as
    // {0} (the vector is zero-initialised): nothing has been consumed yet.
    std::vector<size_t> ends(expression_size + 1);
    size_t count = 1;

    for (size_t i = 0; i < pattern_size; ++i) {
        const char c = pattern[i];

        if (c == '$' && i + 1 == pattern_size) {
            // The list is ascending, so the last entry is the furthest reachable offset.
            return ends[count - 1] == expression_size;
        }

        if (c == '*') {
            // A wildcard can stop anywhere from the smallest reachable offset up to the end.
            count = expression_size - ends[0] + 1;
            for (size_t j = 1; j < count; ++j) {
                ends[j] = ends[j - 1] + 1;
            }
        } else {
            // Keep only the offsets whose next character equals `c`, advanced by one.
            // Writing at `kept` is safe because kept <= j at all times.
            size_t kept = 0;
            for (size_t j = 0; j < count; ++j) {
                const size_t offset = ends[j];
                if (offset < expression_size && expression[offset] == c) {
                    ends[kept] = offset + 1;
                    ++kept;
                }
            }
            if (kept == 0) return false;
            count = kept;
        }
    }

    return true;
}
// Creates a rule set for `agent`: both pattern lists start empty and the crawl delay is 0.
user_agent::user_agent(const std::string &agent)
    : agent(agent), crawl_delay(0) {
}
bool user_agent::allowed(const std::string &path) {
for (const auto &allow : allow_list) {
if (match(allow, path)) {
return true;
}
}
for (const auto &disallow : disallow_list) {
if (match(disallow, path)) {
return false;
}
}
return true;
}
// URL overload: applies the path check to `url.path`.
// A URL whose path is not set is reported as not allowed.
bool user_agent::allowed(const http::url &url) {
    if (url.path) {
        const std::string url_path = *url.path;
        return allowed(url_path);
    }
    return false;
}
// Stores the robots.txt content and registers the unnamed agent "",
// so that `agents` is never empty when parse() runs.
robots_txt::robots_txt(const std::string &text)
    : text(text) {
    agents.emplace_back("");
}
void robots_txt::parse() {
auto splited = str::split(text, "\n");
for (const auto &pair : splited) {
auto splited_pair = str::split(pair, ":");
auto splited_pair_size = splited_pair.size();
if (splited_pair_size != 2) continue;
auto key = str::to_lower(splited_pair[0]);
auto value = str::to_lower(splited_pair[1]);
key = str::trim(key);
value = str::trim_start(value);
if (!value.empty()) value = str::trim_end(value);
auto comment_index = value.find('#');
if (comment_index != -1) {
value = value.substr(0, comment_index);
value = str::trim_end(value);
}
if (key.empty()) continue;
if (key != "disallow" && value.empty()) continue;
auto &current_agent = agents.back();
if (key == "user-agent") {
agents.emplace_back(value);
}
else if (key == "allow") {
current_agent.allow_list.push_back(value);
}
else if (key == "disallow") {
if (value.empty()) current_agent.allow_list.emplace_back("/");
else current_agent.disallow_list.push_back(value);
}
else if (key == "crawl-delay") {
try {
current_agent.crawl_delay = std::stof(value);
} catch (const std::exception &e) {
//current_agent.crawl_delay = 0; (def)
}
}
}
}
bool robots_txt::allowed(const std::string &path, const std::string &agent) {
auto found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == agent; });
if (found == agents.end()) found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == "*"; });
if (found == agents.end()) found = std::find_if(agents.begin(), agents.end(), [&](user_agent &u){ return u.agent == ""; });
return found->allowed(path);
}
// URL overload: checks `url.path` for the given agent.
// A URL whose path is not set is reported as not allowed.
bool robots_txt::allowed(const http::url &url, const std::string &user_agent) {
    if (url.path) {
        const std::string url_path = *url.path;
        return allowed(url_path, user_agent);
    }
    return false;
}
}