Replacing a third-party robots.txt parser with our own robots.txt parser

2024-11-24 07:53:17 +03:00 · 2022-05-27 19:40:36 +03:00 · 2022-05-27 19:40:36 +03:00 · 60eadcce58
commit 60eadcce58
parent 077740ac6c
22 changed files with 185 additions and 3047 deletions
--- a/config.json
+++ b/config.json
@ -21,7 +21,7 @@
    "proxy": "socks5://127.0.0.1:9050",
    "load_page_timeout_s": 10,
    "update_time_site_info_s_after": 864000, //10 days
-    "delay_time_s": 3, 
+    "delay_time_s": 1, 
    "max_pages_site": 5,
    "max_page_symbols": 50000000, //50mb
    "max_robots_txt_symbols": 3000,
--- a/crawler/CMakeLists.txt
+++ b/crawler/CMakeLists.txt
@ -9,12 +9,5 @@ set(CMAKE_CXX_STANDARD 17)
 find_package(CURL)
 find_package(Threads)

-set(tp_rep_cpp third_party/rep-cpp/agent.cpp third_party/rep-cpp/agent.h
-        third_party/rep-cpp/directive.cpp third_party/rep-cpp/directive.h third_party/rep-cpp/robots.cpp
-        third_party/rep-cpp/robots.h) #robots.txt / https://github.com/seomoz/rep-cpp
-set(tp_url_cpp third_party/url-cpp/psl.cpp third_party/url-cpp/psl.h
-        third_party/url-cpp/punycode.cpp third_party/url-cpp/punycode.h third_party/url-cpp/url.cpp
-        third_party/url-cpp/url.h third_party/url-cpp/utf8.cpp third_party/url-cpp/utf8.h) #https://github.com/seomoz/url-cpp
-
 add_executable(${PROJECT_NAME} main.cpp src/worker.cpp include/worker.h src/json_generator.cpp include/json_generator.h src/html_helper.cpp include/html_helper.h ${tp_rep_cpp} ${tp_url_cpp})
 target_link_libraries(${PROJECT_NAME} PRIVATE /usr/lib/liblexbor.so curl Threads::Threads /usr/local/lib/liblibrengine.so)
--- a/crawler/include/worker.h
+++ b/crawler/include/worker.h
@ -41,7 +41,7 @@ public:
    librengine::http::request::result_s site(const librengine::http::url &url);
    std::optional<std::string> get_robots_txt(const librengine::http::url &url);

-    bool is_allowed_in_robots(const std::string &body, const std::string &url);
+    bool is_allowed_in_robots(const std::string &body, const http::url &url);
    bool normalize_url(librengine::http::url &url, const std::optional<std::string> &owner_host = std::nullopt) const;
 public:
    explicit worker(const librengine::config::all &config);
--- a/crawler/main.cpp
+++ b/crawler/main.cpp
@ -2,8 +2,6 @@

 #include "include/worker.h"

-using namespace librengine;
-
 int main(int argc, char **argv) {
    using namespace librengine;

--- a/crawler/src/worker.cpp
+++ b/crawler/src/worker.cpp
@ -10,10 +10,10 @@
 #include <librengine/logger.h>
 #include <librengine/helper.h>
 #include <librengine/cache.h>
+#include <librengine/robots_txt.h>

 #include "../include/json_generator.h"
 #include "../include/html_helper.h"
-#include "../third_party/rep-cpp/robots.h"

 #define DEBUG true //TODO: FALSE

@ -82,8 +82,10 @@ std::optional<std::string> worker::get_robots_txt(const http::url &url) {
    return request.result.response;
 }

-bool worker::is_allowed_in_robots(const std::string &body, const std::string &url) {
-    Rep::Robots robots = Rep::Robots(body);
+bool worker::is_allowed_in_robots(const std::string &body, const http::url &url) {
+    robots_txt robots(body);
+    robots.parse();
+
    return robots.allowed(url, config.crawler_.user_agent);
 }
 bool worker::normalize_url(http::url &url, const std::optional<std::string> &owner_host) const {
@ -260,7 +262,8 @@ worker::result worker::work(url &url_) {
            }
        }

-        if (is_checked && !is_allowed_in_robots(robots_txt_body, url.text)) {
+        if (is_checked && !is_allowed_in_robots(robots_txt_body, url)) {
+            if_debug_print(logger::type::error, "disallowed robots.txt", url.text);
            return result::disallowed_robots;
        }
    }
--- a/crawler/third_party/rep-cpp/agent.cpp
+++ b/crawler/third_party/rep-cpp/agent.cpp
@ -1,138 +0,0 @@
-#include <algorithm>
-#include <iomanip>
-#include <sstream>
-
-#include "../url-cpp/url.h"
-
-#include "agent.h"
-#include "directive.h"
-
-namespace
-{
-    std::string escape_url(Url::Url& url)
-    {
-        return url.defrag().escape().fullpath();
-    }
-
-    std::string trim_front(const std::string& str, const char chr)
-    {
-        auto itr = std::find_if(str.begin(), str.end(),
-                       [chr](const char c) {return c != chr;});
-        return std::string(itr, str.end());
-    }
-}
-
-namespace Rep
-{
-    Agent& Agent::allow(const std::string& query)
-    {
-        Url::Url url(query);
-        // ignore directives for external URLs
-        if (is_external(url))
-        {
-            return *this;
-        }
-        // leading wildcard?
-        if (query.front() == '*')
-        {
-            Url::Url trimmed(trim_front(query, '*'));
-            directives_.push_back(Directive(escape_url(trimmed), true));
-        }
-        directives_.push_back(Directive(escape_url(url), true));
-        sorted_ = false;
-        return *this;
-    }
-
-    Agent& Agent::disallow(const std::string& query)
-    {
-        if (query.empty())
-        {
-            // Special case: "Disallow:" means "Allow: /"
-            directives_.push_back(Directive(query, true));
-        }
-        else
-        {
-            Url::Url url(query);
-            // ignore directives for external URLs
-            if (is_external(url))
-            {
-                return *this;
-            }
-            // leading wildcard?
-            if (query.front() == '*')
-            {
-                Url::Url trimmed(trim_front(query, '*'));
-                directives_.push_back(Directive(escape_url(trimmed), false));
-            }
-            directives_.push_back(Directive(escape_url(url), false));
-        }
-        sorted_ = false;
-        return *this;
-    }
-
-    const std::vector<Directive>& Agent::directives() const
-    {
-        if (!sorted_)
-        {
-            std::sort(directives_.begin(), directives_.end(),
-                [](const Directive& a, const Directive& b) {
-                    return b.priority() < a.priority();
-                });
-            sorted_ = true;
-        }
-        return directives_;
-    }
-
-    bool Agent::allowed(const std::string& query) const
-    {
-        Url::Url url(query);
-        if (is_external(url))
-        {
-            return false;
-        }
-        std::string path(escape_url(url));
-
-        if (path.compare("/robots.txt") == 0)
-        {
-            return true;
-        }
-
-        for (const auto& directive : directives())
-        {
-            if (directive.match(path))
-            {
-                return directive.allowed();
-            }
-        }
-        return true;
-    }
-
-    std::string Agent::str() const
-    {
-        std::stringstream out;
-        if (delay_ > 0)
-        {
-            out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' ';
-        }
-        out << '[';
-        const auto& d = directives();
-        auto begin = d.begin();
-        auto end = d.end();
-        if (begin != end)
-        {
-            out << "Directive(" << begin->str() << ')';
-            ++begin;
-        }
-        for (; begin != end; ++begin)
-        {
-            out << ", Directive(" << begin->str() << ')';
-        }
-        out << ']';
-        return out.str();
-    }
-
-    bool Agent::is_external(const Url::Url& url) const
-    {
-        return !host_.empty() && !url.host().empty() && url.host() != host_;
-    }
-}
--- a/crawler/third_party/rep-cpp/agent.h
+++ b/crawler/third_party/rep-cpp/agent.h
@ -1,93 +0,0 @@
-#ifndef AGENT_CPP_H
-#define AGENT_CPP_H
-
-#include <vector>
-
-#include "directive.h"
-
-// forward declaration
-namespace Url
-{
-    struct Url;
-}
-
-namespace Rep
-{
-    class Agent
-    {
-    public:
-        /* The type for the delay. */
-        typedef float delay_t;
-
-        /**
-         * Default constructor
-         */
-        Agent() : Agent("") {}
-
-        /**
-         * Construct an agent.
-         */
-        explicit Agent(const std::string& host) :
-            directives_(), delay_(-1.0), sorted_(true), host_(host) {}
-
-        /**
-         * Default copy constructor.
-         */
-        Agent(const Agent& rhs) = default;
-
-        /**
-         * Default move constructor.
-         */
-        Agent(Agent&& rhs) = default;
-
-        /**
-         * Add an allowed directive.
-         */
-        Agent& allow(const std::string& query);
-
-        /**
-         * Add a disallowed directive.
-         */
-        Agent& disallow(const std::string& query);
-
-        /**
-         * Set the delay for this agent.
-         */
-        Agent& delay(delay_t value) {
-            delay_ = value;
-            return *this;
-        }
-
-        /**
-         * Return the delay for this agent.
-         */
-        delay_t delay() const { return delay_; }
-
-        /**
-         * A vector of the directives, in priority-sorted order.
-         */
-        const std::vector<Directive>& directives() const;
-
-        /**
-         * Return true if the URL (either a full URL or a path) is allowed.
-         */
-        bool allowed(const std::string& path) const;
-
-        std::string str() const;
-
-        /**
-         * Default copy assignment operator.
-         */
-        Agent& operator=(const Agent& rhs) = default;
-
-    private:
-        bool is_external(const Url::Url& url) const;
-
-        mutable std::vector<Directive> directives_;
-        delay_t delay_;
-        mutable bool sorted_;
-        std::string host_;
-    };
-}
-
-#endif
--- a/crawler/third_party/rep-cpp/directive.cpp
+++ b/crawler/third_party/rep-cpp/directive.cpp
@ -1,130 +0,0 @@
-#include <algorithm>
-#include <locale>
-#include <sstream>
-#include <string>
-
-#include "../url-cpp/url.h"
-
-#include "directive.h"
-
-namespace Rep
-{
-    Directive::Directive(const std::string& line, bool allowed)
-        : expression_()
-        , priority_(line.size())
-        , allowed_(allowed)
-    {
-        if (line.find('*') == std::string::npos)
-        {
-            expression_.assign(line);
-            return;
-        }
-
-        // Remove consecutive '*'s
-        expression_.reserve(line.size());
-        bool star = false;
-        for (auto character : line)
-        {
-            if (character == '*')
-            {
-                if (!star)
-                {
-                    expression_.append(1, character);
-                }
-                star = true;
-            }
-            else
-            {
-                expression_.append(1, character);
-                star = false;
-            }
-        }
-
-        // Remove trailing '*'s
-        std::string::reverse_iterator last =
-            std::find_if(expression_.rbegin(), expression_.rend(),
-                [](const char c) {
-                    return c != '*';
-                });
-        expression_.erase(last.base(), expression_.end());
-
-        // Priority is the length of the expression
-        priority_ = expression_.size();
-    }
-
-    bool Directive::match(const std::string::const_iterator& e_begin,
-                          const std::string::const_iterator& e_end,
-                          const std::string::const_iterator& p_begin,
-                          const std::string::const_iterator& p_end) const
-    {
-        std::string::const_iterator expression_it = e_begin;
-        std::string::const_iterator path_it = p_begin;
-        while (expression_it != e_end && path_it != p_end)
-        {
-            if (*expression_it == '*')
-            {
-                // Advance and recurse
-                ++expression_it;
-                for (; path_it != p_end; ++path_it)
-                {
-                    if (match(expression_it, e_end, path_it, p_end))
-                    {
-                        return true;
-                    }
-                }
-                return false;
-            }
-            else if (*expression_it == '$')
-            {
-                // This check expects path to be fully consumed. But since one of the
-                // criteria of being in this while loop is that we've not fully consumed
-                // path, return false.
-                return false;
-            }
-            else if (*expression_it != *path_it)
-            {
-                // These characters must match
-                return false;
-            }
-            else
-            {
-                // Advance both by one
-                ++path_it;
-                ++expression_it;
-            }
-        }
-
-        // Return true only if we've consumed all of the expression
-        if (expression_it == e_end)
-        {
-            return true;
-        }
-        else if (*expression_it == '$')
-        {
-            return path_it == p_end;
-        }
-        else
-        {
-            return false;
-        }
-    }
-
-    std::string Directive::str() const
-    {
-        std::stringstream out;
-        if (allowed_)
-        {
-            out << "Allow: " << expression_;
-        }
-        else {
-            out << "Disallow: " << expression_;
-        }
-        return out.str();
-    }
-
-    bool Directive::match(const std::string& path) const
-    {
-        return match(expression_.begin(), expression_.end(), path.begin(), path.end());
-    }
-
-}
--- a/crawler/third_party/rep-cpp/directive.h
+++ b/crawler/third_party/rep-cpp/directive.h
@ -1,82 +0,0 @@
-#ifndef DIRECTIVE_CPP_H
-#define DIRECTIVE_CPP_H
-
-
-namespace Rep
-{
-
-    class Directive
-    {
-    public:
-        /**
-         * The type of our priority value.
-         */
-        typedef size_t priority_t;
-
-        /**
-         * Default constructor disallowed.
-         */
-        Directive() = delete;
-
-        /**
-         * The input to this constructor must be stripped of comments
-         * and trailing whitespace.
-         */
-        Directive(const std::string& line, bool allowed);
-
-        /**
-         * Default copy constructor.
-         */
-        Directive(const Directive& rhs) = default;
-
-        /**
-         * Default move constructor.
-         */
-        Directive(Directive&& rhs) = default;
-
-        /**
-         * The priority of the rule.
-         */
-        priority_t priority() const
-        {
-            return priority_;
-        }
-
-        /**
-         * Whether or not the provided path matches. The path is
-         * expected to be properly escaped.
-         */
-        bool match(const std::string& path) const;
-
-        /**
-         * Whether this rule is for an allow or a disallow.
-         */
-        bool allowed() const
-        {
-            return allowed_;
-        }
-
-        std::string str() const;
-
-        /**
-         * Default copy assignment operator.
-         */
-        Directive& operator=(const Directive& rhs) = default;
-
-    private:
-        std::string expression_;
-        priority_t priority_;
-        bool allowed_;
-
-        /**
-         * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
-         */
-        bool match(const std::string::const_iterator& e_begin,
-                   const std::string::const_iterator& e_end,
-                   const std::string::const_iterator& p_begin,
-                   const std::string::const_iterator& p_end) const;
-    };
-
-}
-
-#endif
--- a/crawler/third_party/rep-cpp/robots.cpp
+++ b/crawler/third_party/rep-cpp/robots.cpp
@ -1,196 +0,0 @@
-#include <algorithm>
-#include <functional>
-#include <cctype>
-#include <locale>
-#include <sstream>
-#include <iostream>
-#include <unordered_map>
-
-#include "../url-cpp/url.h"
-
-#include "robots.h"
-
-namespace Rep
-{
-
-    void Robots::strip(std::string& string)
-    {
-        string.erase(string.begin(), std::find_if(string.begin(), string.end(),
-            std::not1(std::ptr_fun<int, int>(std::isspace))));
-        string.erase(std::find_if(string.rbegin(), string.rend(),
-            std::not1(std::ptr_fun<int, int>(std::isspace))).base(), string.end());
-    }
-
-    bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
-    {
-        while (getline(stream, key))
-        {
-            size_t index = key.find('#');
-            if (index != std::string::npos)
-            {
-                key.resize(index);
-            }
-
-            // Find the colon and divide it into key and value, skipping malformed lines
-            index = key.find(':');
-            if (index == std::string::npos)
-            {
-                continue;
-            }
-
-            value.assign(key.begin() + index + 1, key.end());
-            key.resize(index);
-
-            // Strip whitespace off of each
-            strip(key);
-            strip(value);
-
-            // Lowercase the key
-            std::transform(key.begin(), key.end(), key.begin(), ::tolower);
-
-            return true;
-        }
-        return false;
-    }
-
-    Robots::Robots(const std::string& content) :
-        Robots(content, "")
-    {
-    }
-
-    Robots::Robots(const std::string& content, const std::string& base_url) :
-        host_(Url::Url(base_url).host()),
-        agents_(),
-        sitemaps_(),
-        default_(agents_.emplace("*", Agent(host_)).first->second)
-    {
-        std::string agent_name("*");
-        std::istringstream input(content);
-        if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
-        {
-            input.ignore(3);
-        }
-        std::string key, value;
-        std::vector<std::string> group;
-        bool last_agent = false;
-        agent_map_t::iterator current = agents_.find("*");
-        while (Robots::getpair(input, key, value))
-        {
-            if (key.compare("user-agent") == 0)
-            {
-                // Store the user agent string as lowercased
-                std::transform(value.begin(), value.end(), value.begin(), ::tolower);
-
-                if (last_agent)
-                {
-                    group.push_back(value);
-                }
-                else
-                {
-                    if (!agent_name.empty())
-                    {
-                        for (auto other : group)
-                        {
-                            agents_.emplace(other, current->second);
-                        }
-                        group.clear();
-                    }
-                    agent_name = value;
-                    current = agents_.emplace(agent_name, Agent(host_)).first;
-                }
-                last_agent = true;
-                continue;
-            }
-            else
-            {
-                last_agent = false;
-            }
-
-            if (key.compare("sitemap") == 0)
-            {
-                sitemaps_.push_back(value);
-            }
-            else if (key.compare("disallow") == 0)
-            {
-                current->second.disallow(value);
-            }
-            else if (key.compare("allow") == 0)
-            {
-                current->second.allow(value);
-            }
-            else if (key.compare("crawl-delay") == 0)
-            {
-                try
-                {
-                    current->second.delay(std::stof(value));
-                }
-                catch (const std::exception&)
-                {
-                    std::cerr << "Could not parse " << value << " as float." << std::endl;
-                }
-            }
-        }
-
-        if (!agent_name.empty())
-        {
-            for (auto other : group)
-            {
-                agents_.emplace(other, current->second);
-            }
-        }
-    }
-
-    const Agent& Robots::agent(const std::string& name) const
-    {
-        // Lowercase the agent
-        std::string lowered(name);
-        std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower);
-
-        auto it = agents_.find(lowered);
-        if (it == agents_.end())
-        {
-            return default_;
-        }
-        else
-        {
-            return it->second;
-        }
-    }
-
-    bool Robots::allowed(const std::string& path, const std::string& name) const
-    {
-        return agent(name).allowed(path);
-    }
-
-    std::string Robots::str() const
-    {
-        std::stringstream out;
-        // TODO: include sitepath info
-        out << '{';
-        auto begin = agents_.begin();
-        auto end = agents_.end();
-        if (begin != end)
-        {
-            out << '"' << begin->first << '"' << ": " << begin->second.str();
-            ++begin;
-        }
-        for (; begin != end; ++begin)
-        {
-            out << ", \"" << begin->first << '"' << ": " << begin->second.str();
-        }
-        out << '}';
-        return out.str();
-    }
-
-    std::string Robots::robotsUrl(const std::string& url)
-    {
-        return Url::Url(url)
-            .setUserinfo("")
-            .setPath("robots.txt")
-            .setParams("")
-            .setQuery("")
-            .setFragment("")
-            .remove_default_port()
-            .str();
-    }
-}
--- a/crawler/third_party/rep-cpp/robots.h
+++ b/crawler/third_party/rep-cpp/robots.h
@ -1,66 +0,0 @@
-#ifndef ROBOTS_CPP_H
-#define ROBOTS_CPP_H
-
-#include <sstream>
-#include <unordered_map>
-#include <vector>
-
-#include "agent.h"
-
-namespace Rep
-{
-
-    class Robots
-    {
-    public:
-        typedef std::unordered_map<std::string, Agent> agent_map_t;
-        typedef std::vector<std::string> sitemaps_t;
-
-        /**
-         * Create a robots.txt from a utf-8-encoded string.
-         */
-        explicit Robots(const std::string& content);
-
-        /**
-         * Create a robots.txt from a utf-8-encoded string assuming
-         * the given base_url.
-         */
-        Robots(const std::string& content, const std::string& base_url);
-
-        /**
-         * Get the sitemaps in this robots.txt
-         */
-        const sitemaps_t& sitemaps() const { return sitemaps_; }
-
-        /**
-         * Get the agent with the corresponding name.
-         */
-        const Agent& agent(const std::string& name) const;
-
-        /**
-         * Return true if agent is allowed to fetch the URL (either a
-         * full URL or a path).
-         */
-        bool allowed(const std::string& path, const std::string& name) const;
-
-        std::string str() const;
-
-        /**
-         * Return the robots.txt URL corresponding to the provided URL.
-         */
-        static std::string robotsUrl(const std::string& url);
-
-    private:
-        static void strip(std::string& string);
-
-        static bool getpair(
-            std::istringstream& stream, std::string& key, std::string& value);
-
-        std::string host_;
-        agent_map_t agents_;
-        sitemaps_t sitemaps_;
-        Agent& default_;
-    };
-}
-
-#endif
--- a/crawler/third_party/url-cpp/psl.cpp
+++ b/crawler/third_party/url-cpp/psl.cpp
@ -1,183 +0,0 @@
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <string>
-
-#include "psl.h"
-#include "punycode.h"
-
-namespace Url
-{
-    const std::string PSL::not_found = "";
-
-    PSL::PSL(std::istream& stream)
-    {
-        std::string line;
-        while (std::getline(stream, line))
-        {
-            // Only take up to the first whitespace.
-            auto it = std::find_if(line.begin(), line.end(), ::isspace);
-            line.resize(it - line.begin());
-
-            // Skip blank lines
-            if (line.empty())
-            {
-                continue;
-            }
-
-            // Skip comments
-            if (line.compare(0, 2, "//") == 0)
-            {
-                continue;
-            }
-
-            // We know the line has at least a single character at this point
-            if (line[0] == '*')
-            {
-                // Line is a wildcard rule
-                if (line.size() <= 2 || line[1] != '.')
-                {
-                    throw std::invalid_argument("Wildcard rule must be of form *.<host>");
-                }
-
-                add(line, 1, 2);
-            }
-            else if (line[0] == '!')
-            {
-                // Line is an exception, take all but the !
-                if (line.size() <= 1)
-                {
-                    throw std::invalid_argument("Exception rule has no hostname.");
-                }
-
-                add(line, -1, 1);
-            }
-            else
-            {
-                add(line, 0, 0);
-            }
-        }
-    }
-
-    PSL PSL::fromPath(const std::string& path)
-    {
-        std::ifstream stream(path);
-        if (!stream.good())
-        {
-            std::stringstream message;
-            message << "Path '" << path << "' inaccessible.";
-            throw std::invalid_argument(message.str());
-        }
-        return PSL(stream);
-    }
-
-    PSL PSL::fromString(const std::string& str)
-    {
-        std::stringstream stream(str);
-        return PSL(stream);
-    }
-
-    std::string PSL::getTLD(const std::string& hostname) const
-    {
-        return getLastSegments(hostname, getTLDLength(hostname));
-    }
-
-    std::string PSL::getPLD(const std::string& hostname) const
-    {
-        return getLastSegments(hostname, getTLDLength(hostname) + 1);
-    }
-
-    std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
-    {
-        size_t length = getTLDLength(hostname);
-        return std::make_pair(
-            getLastSegments(hostname, length),
-            getLastSegments(hostname, length + 1));
-    }
-
-    size_t PSL::getTLDLength(const std::string& hostname) const
-    {
-        // Reversed copy of hostname
-        std::string tld(hostname.rbegin(), hostname.rend());
-        std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower);
-
-        while (tld.size())
-        {
-            auto it = levels.find(tld);
-            if (it != levels.end())
-            {
-                return it->second;
-            }
-
-            size_t position = tld.rfind('.');
-            if (position == std::string::npos || position == 0)
-            {
-                tld.resize(0);
-            }
-            else
-            {
-                tld.resize(position);
-            }
-        }
-
-        return 1;
-    }
-
-    std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
-    {
-        size_t position = hostname.size();
-        size_t remaining = segments;
-        while (remaining != 0 && position && position != std::string::npos)
-        {
-            position = hostname.rfind('.', position - 1);
-            remaining -= 1;
-        }
-
-        if (remaining >= 1)
-        {
-            return not_found;
-        }
-
-        // Return the whole string if position == std:string::npos
-        size_t start = (position == std::string::npos) ? 0 : position + 1;
-
-        std::string result(hostname, start);
-        std::transform(result.begin(), result.end(), result.begin(), ::tolower);
-
-        // Leading .'s indicate that the query had an empty segment
-        if (result.size() && result[0] == '.')
-        {
-            std::stringstream message;
-            message << "Empty segment in " << result;
-            throw std::invalid_argument(message.str());
-        }
-
-        return result;
-    }
-
-    size_t PSL::countSegments(const std::string& hostname) const
-    {
-        size_t count = 1;
-        size_t position = hostname.find('.');
-        while (position != std::string::npos)
-        {
-            count += 1;
-            position = hostname.find('.', position + 1);
-        }
-        return count;
-    }
-
-    void PSL::add(std::string& rule, int level_adjust, size_t trim)
-    {
-        // First unpunycoded
-        std::string copy(rule.rbegin(), rule.rend() - trim);
-        size_t length = countSegments(copy) + level_adjust;
-        levels[copy] = length;
-
-        // And now punycoded
-        rule = Punycode::encodeHostname(rule);
-        copy.assign(rule.rbegin(), rule.rend() - trim);
-        levels[copy] = length;
-    }
-
-};
--- a/crawler/third_party/url-cpp/psl.h
+++ b/crawler/third_party/url-cpp/psl.h
@ -1,102 +0,0 @@
-#ifndef PSL_CPP_H
-#define PSL_CPP_H
-
-#include <istream>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <utility>
-
-namespace Url
-{
-
-    /**
-     * Find TLDs and PLDs of a hostname according to a PSL.
-     */
-    struct PSL
-    {
-        /**
-         * Indicates the there is no TLD / PLD
-         */
-        static const std::string not_found;
-
-        /**
-         * Read a PSL from an istream.
-         */
-        PSL(std::istream& stream);
-
-        PSL(): levels() { };
-
-        PSL(const PSL& other): levels(other.levels) { }
-
-        PSL& operator=(const PSL& other)
-        {
-            levels = other.levels;
-            return *this;
-        }
-
-        /**
-         * Read the provided path holding a set of PSL rules.
-         */
-        static PSL fromPath(const std::string& path);
-
-        /**
-         * Create a PSL object from a string.
-         */
-        static PSL fromString(const std::string& str);
-
-        /**
-         * Get just the TLD of the hostname.
-         *
-         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
-         * some segments have been appropriately punycoded and others not, it may return
-         * a wrong answer. If a punycoded host is provided, a punycoded response is
-         * returned. If an unpunycoded host is provided, an unpunycoded response is
-         * returned.
-         */
-        std::string getTLD(const std::string& hostname) const;
-
-        /**
-         * Get just the PLD of the hostname.
-         *
-         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
-         * some segments have been appropriately punycoded and others not, it may return
-         * a wrong answer. If a punycoded host is provided, a punycoded response is
-         * returned. If an unpunycoded host is provided, an unpunycoded response is
-         * returned.
-         */
-        std::string getPLD(const std::string& hostname) const;
-
-        /**
-         * Get the (TLD, PLD) of the hostname.
-         *
-         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
-         * some segments have been appropriately punycoded and others not, it may return
-         * a wrong answer. If a punycoded host is provided, a punycoded response is
-         * returned. If an unpunycoded host is provided, an unpunycoded response is
-         * returned.
-         */
-        std::pair<std::string, std::string> getBoth(const std::string& hostname) const;
-    private:
-        // Mapping of a string rule to its level
-        std::unordered_map<std::string, size_t> levels;
-
-        // Return the number of segments in a hostname
-        size_t countSegments(const std::string& hostname) const;
-
-        // Return the number of segments in the TLD of the provided hostname
-        size_t getTLDLength(const std::string& hostname) const;
-
-        // Return the last `segments` segments of a hostname
-        std::string getLastSegments(const std::string& hostname, size_t segments) const;
-
-        /**
-         * Add the provided host with the provided priority, trimming characters off
-         * the front, and adjusting the level by the provided number.
-         */
-        void add(std::string& host, int level_adjust, size_t trim);
-    };
-
-}
-
-#endif
--- a/crawler/third_party/url-cpp/punycode.cpp
+++ b/crawler/third_party/url-cpp/punycode.cpp
@ -1,408 +0,0 @@
-#include <algorithm>
-#include <string>
-#include <iostream>
-
-#include "punycode.h"
-#include "utf8.h"
-
-namespace Url
-{
-
-    std::string& Punycode::encode(std::string& str)
-    {
-        // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
-        //
-        // let n = initial_n
-        // let delta = 0
-        // let bias = initial_bias
-        punycode_uint n = INITIAL_N;
-        punycode_uint delta = 0;
-        punycode_uint bias = INITIAL_BIAS;
-        std::string output;
-
-        // Accumulate the non-basic codepoints
-        std::vector<punycode_uint> codepoints;
-        for (auto it = str.cbegin(); it != str.cend(); )
-        {
-            Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
-            if (value < 0x80)
-            {
-                // copy them to the output in order
-                output.append(1, static_cast<char>(value));
-            }
-            codepoints.push_back(value);
-        }
-
-        // let h = b = the number of basic code points in the input
-        size_t h = output.size();
-        size_t b = h;
-
-        // copy a delimiter if b > 0
-        if (b > 0)
-        {
-            output.append(1, '-');
-        }
-
-        // while h < length(input) do begin
-        while (h < codepoints.size())
-        {
-            // let m = the minimum {non-basic} code point >= n in the input
-            punycode_uint m = MAX_PUNYCODE_UINT;
-            for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
-            {
-                if ((*it >= n) && (*it < m))
-                {
-                    m = *it;
-                }
-            }
-
-            // let delta = delta + (m - n) * (h + 1), fail on overflow
-            if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
-            {
-                throw std::invalid_argument("Overflow delta update.");
-            }
-            delta += (m - n) * (h + 1);
-
-            // let n = m
-            n = m;
-
-            // for each code point c in the input (in order) do begin
-            for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
-            {
-                // if c < n {or c is basic} then increment delta, fail on overflow
-                if (*it < n)
-                {
-                    if (delta == MAX_PUNYCODE_UINT)
-                    {
-                        throw std::invalid_argument("Overflow delta increment.");
-                    }
-                    ++delta;
-                }
-
-                // if c == n then begin
-                if (*it == n)
-                {
-                    // let q = delta
-                    punycode_uint q = delta;
-
-                    // for k = base to infinity in steps of base do begin
-                    for (punycode_uint k = BASE; ; k += BASE)
-                    {
-                        // let t = tmin if k <= bias {+ tmin}, or
-                        //         tmax if k >= bias + tmax, or k - bias otherwise
-                        punycode_uint t = k <= bias ? TMIN :
-                                          k >= bias + TMAX ? TMAX : k - bias;
-
-                        // if q < t then break
-                        if (q < t)
-                        {
-                            break;
-                        }
-
-                        // output the code point for digit t + ((q - t) mod (base - t))
-                        output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
-
-                        // let q = (q - t) div (base - t)
-                        q = (q - t) / (BASE - t);
-                    }
-
-                    // output the code point for digit q
-                    output.append(1, DIGIT_TO_BASIC[q]);
-
-                    // let bias = adapt(delta, h + 1, test h equals b?)
-                    bias = adapt(delta, h + 1, h == b);
-
-                    // let delta = 0
-                    delta = 0;
-
-                    // increment h
-                    ++h;
-                }
-            }
-
-            // increment delta and n
-            ++delta;
-            ++n;
-        }
-
-        str.assign(output);
-        return str;
-    }
-
-    std::string Punycode::encode(const std::string& str)
-    {
-        std::string result(str);
-        encode(result);
-        return result;
-    }
-
-    std::string Punycode::encodeHostname(const std::string& hostname)
-    {
-        // Avoid any punycoding at all if none is needed
-        if (!needsPunycoding(hostname))
-        {
-            return hostname;
-        }
-
-        std::string encoded;
-
-        size_t start = 0;
-        size_t end = hostname.find('.');
-        while(true)
-        {
-            std::string segment = hostname.substr(start, end - start);
-            if (needsPunycoding(segment))
-            {
-                encoded.append("xn--");
-                encoded.append(Punycode::encode(segment));
-            }
-            else
-            {
-                encoded.append(segment);
-            }
-
-            if (end == std::string::npos)
-            {
-                break;
-            }
-            else
-            {
-                encoded.append(1, '.');
-                start = end + 1;
-                end = hostname.find('.', start);
-            }
-        }
-
-        return encoded;
-    }
-
-    std::string& Punycode::decode(std::string& str)
-    {
-        // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
-        //
-        // let n = initial_n
-        // let i = 0
-        // let bias = initial_bias
-        // let output = an empty string indexed from 0
-        punycode_uint n = INITIAL_N;
-        punycode_uint i = 0;
-        punycode_uint bias = INITIAL_BIAS;
-        std::vector<punycode_uint> codepoints;
-
-        size_t index = str.rfind('-');
-        if (index == std::string::npos)
-        {
-            index = 0;
-        }
-
-        // consume all code points before the last delimiter (if there is one)
-        // and copy them to output, fail on any non-basic code point
-        for (auto it = str.begin(); it != (str.begin() + index); ++it)
-        {
-            if (static_cast<unsigned char>(*it) > 127U)
-            {
-                throw std::invalid_argument("Argument has non-basic code points.");
-            }
-            codepoints.push_back(*it);
-        }
-
-        // if more than zero code points were consumed then consume one more
-        //   (which will be the last delimiter)
-        if (index > 0)
-        {
-            index += 1;
-        }
-
-        // while the input is not exhausted do begin
-        for (auto it = (str.begin() + index); it != str.end(); ++it)
-        {
-            // let oldi = i
-            // let w = 1
-            punycode_uint oldi = i;
-            punycode_uint w = 1;
-
-            // for k = base to infinity in steps of base do begin
-            for (punycode_uint k = BASE; ; k += BASE, ++it)
-            {
-                // consume a code point, or fail if there was none to consume
-                if (it == str.end())
-                {
-                    throw std::invalid_argument("Premature termination");
-                }
-
-                // let digit = the code point's digit-value, fail if it has none
-                int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)];
-                if (lookup == -1)
-                {
-                    throw std::invalid_argument("Invalid base 36 character.");
-                }
-                unsigned char digit = static_cast<unsigned char>(lookup);
-
-                // let i = i + digit * w, fail on overflow
-                if (digit > ((MAX_PUNYCODE_UINT - i) / w))
-                {
-                    throw std::invalid_argument("Overflow on i.");
-                }
-                i += digit * w;
-
-                // let t = tmin if k <= bias {+ tmin}, or
-                //         tmax if k >= bias + tmax, or k - bias otherwise
-                punycode_uint t = k <= bias ? TMIN :
-                                  k >= bias + TMAX ? TMAX : k - bias;
-
-                // if digit < t then break
-                if (digit < t)
-                {
-                    break;
-                }
-
-                // let w = w * (base - t), fail on overflow
-                if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
-                {
-                    // I believe this line is unreachable without first overflowing i.
-                    // Since 'i' is updated above as i += digit * w, and w is updated as
-                    // w = w * (BASE - t), we should like to keep (BASE - t) > digit to
-                    // give 'w' a chance to overflow first. To keep t minimized, we must
-                    // have 'bias' maximized. `bias` is driven by the 'adapt' function
-                    // below.
-                    //
-                    // The value returned by 'adapt' increases with the input delta, and
-                    // decreases with the input size. The delta is a function of the input
-                    // size as well, on the order of (delta_n * input size), and
-                    // legitimate delta_n values are limited to 0x10FFFF (the maximum
-                    // unicode codepoint). Even setting that aside, the maximum value that
-                    // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
-                    //
-                    // Using this bias, we could use the input (HERE) to get iterations:
-                    //
-                    //     digit = b = 1, i = 2, k = 36, t = 1, w = 35
-                    //     digit = b = 1, i = 37, k = 72, t = 1, w = 1225
-                    //     digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
-                    //     digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
-                    //     digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
-                    //
-                    // At this point, t now becomes TMAX (26) because k exceeds the bias
-                    // (since the maximum bias is 204). As such, the minimum continuation
-                    // value is 26:
-                    //
-                    //     digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
-                    //
-                    // However, the next iteration now overflows i before we can get to
-                    // the w update.
-                    throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
-                }
-                w *= (BASE - t);
-            }
-
-            // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
-            bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);
-
-            // let n = n + i div (length(output) + 1), fail on overflow
-            if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
-            {
-                throw std::invalid_argument("Overflow on n.");
-            }
-            n += i / (codepoints.size() + 1);
-
-            // let i = i mod (length(output) + 1)
-            i %= (codepoints.size() + 1);
-
-            // insert n into output at position i
-            codepoints.insert(codepoints.begin() + i, n);
-
-            // increment i
-            ++i;
-        }
-
-        std::string output;
-        for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
-        {
-            Utf8::writeCodepoint(output, *it);
-        }
-        str.assign(output);
-
-        return str;
-    }
-
-    std::string Punycode::decode(const std::string& str)
-    {
-        std::string result(str);
-        decode(result);
-        return result;
-    }
-
-    std::string Punycode::decodeHostname(const std::string& hostname)
-    {
-        std::string unencoded;
-
-        size_t start = 0;
-        size_t end = hostname.find('.');
-        while(true)
-        {
-            std::string segment = hostname.substr(start, end - start);
-            if (segment.substr(0, 4).compare("xn--") == 0)
-            {
-                segment = segment.substr(4);
-                unencoded.append(Punycode::decode(segment));
-            }
-            else
-            {
-                unencoded.append(segment);
-            }
-
-            if (end == std::string::npos)
-            {
-                break;
-            }
-            else
-            {
-                unencoded.append(1, '.');
-                start = end + 1;
-                end = hostname.find('.', start);
-            }
-        }
-
-        return unencoded;
-    }
-
-    bool Punycode::needsPunycoding(const std::string& str)
-    {
-        return std::any_of(
-            str.begin(),
-            str.end(),
-            [](char i){ return static_cast<unsigned char>(i) & 0x80; });
-    }
-
-    Punycode::punycode_uint Punycode::adapt(
-        punycode_uint delta, punycode_uint numpoints, bool firsttime)
-    {
-        // Psuedocode from https://tools.ietf.org/html/rfc3492#section-6.1
-        //
-        // It does not matter whether the modifications to delta and k inside
-        // adapt() affect variables of the same name inside the
-        // encoding/decoding procedures, because after calling adapt() the
-        // caller does not read those variables before overwriting them.
-        //
-        // if firsttime then let delta = delta div damp
-        // else let delta = delta div 2
-        delta = firsttime ? delta / DAMP : delta >> 1;
-
-        // let delta = delta + (delta div numpoints)
-        delta += (delta / numpoints);
-
-        // let k = 0
-        punycode_uint k = 0;
-
-        // while delta > ((base - tmin) * tmax) div 2 do begin
-        for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE)
-        {
-            // let delta = delta div (base - tmin)
-            // let k = k + base
-            delta /= (BASE - TMIN);
-        }
-
-        // return k + (((base - tmin + 1) * delta) div (delta + skew))
-        return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
-    }
-
-};
--- a/crawler/third_party/url-cpp/punycode.h
+++ b/crawler/third_party/url-cpp/punycode.h
@ -1,106 +0,0 @@
-#ifndef PUNYCODE_CPP_H
-#define PUNYCODE_CPP_H
-
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include <unordered_set>
-#include <limits>
-
-#include "utf8.h"
-
-namespace Url
-{
-
-    namespace Punycode
-    {
-        typedef Utf8::codepoint_t punycode_uint;
-
-        const unsigned int BASE          = 36;
-        const unsigned int TMIN          = 1;
-        const unsigned int TMAX          = 26;
-        const unsigned int SKEW          = 38;
-        const unsigned int DAMP          = 700;
-        const unsigned int INITIAL_BIAS  = 72;
-        const unsigned int INITIAL_N     = 128;
-
-        // Codepoints to their base-36 value
-        const std::vector<int8_t> BASIC_TO_DIGIT = {
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-
-            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-        };
-        const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";
-
-        // The highest codepoint in unicode
-        const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
-        //Utf8::MAX_CODEPOINT;
-        //std::numeric_limits<punycode_uint>::max();
-
-        /**
-         * Replace utf-8-encoded str into punycode.
-         */
-        std::string& encode(std::string& str);
-
-        /**
-         * Create a new punycoded string from utf-8-encoded input.
-         */
-        std::string encode(const std::string& str);
-
-        /**
-         * Encode a hostname.
-         */
-        std::string encodeHostname(const std::string& hostname);
-
-        /**
-         * Replace punycoded str into utf-8-encoded.
-         */
-        std::string& decode(std::string& str);
-
-        /**
-         * Create a new utf-8-encoded string from punycoded input.
-         */
-        std::string decode(const std::string& str);
-
-        /**
-         * Decode a hostname.
-         */
-        std::string decodeHostname(const std::string& hostname);
-
-        /**
-         * Determine if a string needs punycoding.
-         */
-        bool needsPunycoding(const std::string& str);
-
-        /**
-         * Internal function for calculating bias.
-         */
-        punycode_uint adapt(
-            punycode_uint delta, punycode_uint numpoints, bool firsttime);
-
-    };
-
-}
-
-#endif
--- a/crawler/third_party/url-cpp/url.cpp
+++ b/crawler/third_party/url-cpp/url.cpp
@ -1,962 +0,0 @@
-#include <algorithm>
-#include <string>
-#include <iterator>
-#include <unordered_map>
-#include <unordered_set>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-
-#include "url.h"
-#include "punycode.h"
-
-namespace Url
-{
-
-    /* Character classes */
-    const CharacterClass Url::GEN_DELIMS(":/?#[]@");
-    const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
-    const CharacterClass Url::DIGIT("0123456789");
-    const CharacterClass Url::ALPHA(
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
-    const CharacterClass Url::UNRESERVED(
-        Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
-    const CharacterClass Url::RESERVED(
-        Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
-    const CharacterClass Url::PCHAR(
-        Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
-    const CharacterClass Url::PATH(
-        Url::PCHAR.chars() + "/");
-    const CharacterClass Url::QUERY(
-        Url::PCHAR.chars() + "/?");
-    const CharacterClass Url::FRAGMENT(
-        Url::PCHAR.chars() + "/?");
-    const CharacterClass Url::USERINFO(
-        Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
-    const CharacterClass Url::HEX("0123456789ABCDEF");
-    const CharacterClass Url::SCHEME(
-        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
-    const std::vector<signed char> Url::HEX_TO_DEC = {
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
-
-        -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-    };
-    const std::unordered_map<std::string, int> Url::PORTS = {
-        {"http", 80},
-        {"https", 443}
-    };
-    const std::unordered_set<std::string> Url::USES_RELATIVE = {
-        "",
-        "file",
-        "ftp",
-        "gopher",
-        "http",
-        "https",
-        "imap",
-        "mms",
-        "nntp",
-        "prospero",
-        "rtsp",
-        "rtspu",
-        "sftp",
-        "shttp",
-        "svn",
-        "svn+ssh",
-        "wais"
-    };
-    const std::unordered_set<std::string> Url::USES_NETLOC = {
-        "",
-        "file",
-        "ftp",
-        "git",
-        "git+ssh",
-        "gopher",
-        "http",
-        "https",
-        "imap",
-        "mms",
-        "nfs",
-        "nntp",
-        "prospero",
-        "rsync",
-        "rtsp",
-        "rtspu",
-        "sftp",
-        "shttp",
-        "snews",
-        "svn",
-        "svn+ssh",
-        "telnet",
-        "wais"
-    };
-    const std::unordered_set<std::string> Url::USES_PARAMS = {
-        "",
-        "ftp",
-        "hdl",
-        "http",
-        "https",
-        "imap",
-        "mms",
-        "prospero",
-        "rtsp",
-        "rtspu",
-        "sftp",
-        "shttp",
-        "sip",
-        "sips",
-        "tel"
-    };
-    const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
-        "",
-        "file",
-        "ftp",
-        "git",
-        "git+ssh",
-        "gopher",
-        "hdl",
-        "http",
-        "https",
-        "imap",
-        "mms",
-        "nfs",
-        "nntp",
-        "prospero",
-        "rsync",
-        "rtsp",
-        "rtspu",
-        "sftp",
-        "shttp",
-        "sip",
-        "sips",
-        "sms",
-        "snews",
-        "svn",
-        "svn+ssh",
-        "tel",
-        "telnet",
-        "wais"
-    };
-
-    Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
-    {
-        size_t position = 0;
-        size_t index = url.find(':');
-        if (index != std::string::npos)
-        {
-            // All the characters in our would-be scheme must be in SCHEME
-            if (std::all_of(
-                    url.begin(),
-                    url.begin() + index,
-                    [](char c) { return SCHEME(c); } ))
-            {
-                // If there is nothing after the : or there are any non-digits, this is
-                // the scheme
-                if ((index + 1) >= url.length()
-                    || std::any_of(
-                        url.begin() + index + 1,
-                        url.end(),
-                        [](char c) { return !DIGIT(c); }))
-                {
-                    scheme_.assign(url, 0, index);
-                    std::transform(
-                        scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower);
-                    position = index + 1;
-                }
-                else
-                {
-                    scheme_.assign(url, 0, index);
-                    std::transform(
-                        scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower);
-                    if (KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
-                    {
-                        position = index + 1;
-                    }
-                    else
-                    {
-                        scheme_.clear();
-                    }
-                }
-            }
-        }
-
-        // Search for the netloc
-        if ((url.length() - position) >= 1
-            && url[position] == '/'
-            && url[position + 1] == '/')
-        {
-            // Skip the '//'
-            position += 2;
-            index = url.find_first_of("/?#", position);
-            host_.assign(url, position, index - position);
-            position = index;
-
-            // Extract any userinfo if there is any
-            index = host_.find('@');
-            if (index != std::string::npos)
-            {
-                userinfo_.assign(host_, 0, index);
-                host_.assign(host_, index + 1, std::string::npos);
-            }
-
-            // Lowercase the hostname
-            std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower);
-
-            // Try to find a port
-            index = host_.find(':');
-            if (index != std::string::npos)
-            {
-                std::string portText(host_, index + 1, std::string::npos);
-                host_.resize(index);
-
-                if (portText.empty())
-                {
-                    port_ = 0;
-                }
-                else
-                {
-                    try
-                    {
-                        port_ = std::stoi(portText, &index);
-
-                        if (index != portText.length())
-                        {
-                            // Malformed port
-                            throw UrlParseException("Port not a number: " + portText);
-                        }
-
-                        if (port_ > 65535)
-                        {
-                            throw UrlParseException("Port too high: " + portText);
-                        }
-                        else if (port_ < 0)
-                        {
-                            throw UrlParseException("Port negative: " + portText);
-                        }
-                    }
-                    catch (const std::invalid_argument&)
-                    {
-                        // Malformed port
-                        throw UrlParseException("Port not a number: " + portText);
-                    }
-                    catch (const std::out_of_range&)
-                    {
-                        throw UrlParseException("Port out of integer range: " + portText);
-                    }
-                }
-            }
-        }
-
-        if (position != std::string::npos)
-        {
-            path_.assign(url, position, std::string::npos);
-
-            index = path_.find('#');
-            if (index != std::string::npos)
-            {
-                fragment_.assign(path_, index + 1, std::string::npos);
-                path_.resize(index);
-            }
-
-            index = path_.find('?');
-            if (index != std::string::npos)
-            {
-                query_.assign(path_, index + 1, std::string::npos);
-                has_query_ = true;
-                path_.resize(index);
-            }
-
-            if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
-            {
-                index = path_.find(';');
-                if (index != std::string::npos)
-                {
-                    params_.assign(path_, index + 1, std::string::npos);
-                    has_params_ = true;
-                    path_.resize(index);
-                }
-            }
-        }
-    }
-
-    Url& Url::assign(const Url& other)
-    {
-        return (*this) = other;
-    }
-
-    bool Url::operator==(const Url& other) const
-    {
-        return (
-            (scheme_     == other.scheme_    ) &&
-            (userinfo_   == other.userinfo_  ) &&
-            (host_       == other.host_      ) &&
-            (port_       == other.port_      ) &&
-            (path_       == other.path_      ) &&
-            (params_     == other.params_    ) &&
-            (query_      == other.query_     ) &&
-            (fragment_   == other.fragment_  ) &&
-            (has_params_ == other.has_params_) &&
-            (has_query_  == other.has_query_ )
-        );
-    }
-
-    bool Url::operator!=(const Url& other) const
-    {
-        return !operator==(other);
-    }
-
-    bool Url::equiv(const Url& other)
-    {
-        Url self_(*this);
-        Url other_(other);
-
-        self_.strip()
-             .sort_query()
-             .defrag()
-             .deuserinfo()
-             .abspath()
-             .escape()
-             .punycode()
-             .remove_default_port();
-        other_.strip()
-              .sort_query()
-              .defrag()
-              .deuserinfo()
-              .abspath()
-              .escape()
-              .punycode()
-              .remove_default_port();
-        return self_ == other_;
-    }
-
-    std::string& Url::remove_repeats(std::string& str, const char chr)
-    {
-        size_t dest = 0;
-        // By initializing this to true, it also strips of leading instances of chr
-        bool seen = true;
-        for (size_t src = 0; src < str.length(); ++src)
-        {
-            if (!seen || (str[src] != chr))
-            {
-                str[dest++] = str[src];
-            }
-            seen = str[src] == chr;
-        }
-        // Remove the last character if it happens to be chr
-        size_t length = ((dest > 0) && (str[dest - 1] == chr)) ? dest - 1 : dest;
-        str.resize(length);
-        return str;
-    }
-
-    std::string Url::fullpath() const
-    {
-        std::string result;
-        if (path_.empty() || path_[0] != '/')
-        {
-            result.append(1, '/');
-        }
-        result.append(path_);
-
-        if (has_params_)
-        {
-            result.append(";");
-            result.append(params_);
-        }
-
-        if (has_query_)
-        {
-            result.append("?");
-            result.append(query_);
-        }
-
-        if (!fragment_.empty())
-        {
-            result.append("#");
-            result.append(fragment_);
-        }
-        return result;
-    }
-
-    std::string Url::str() const
-    {
-        std::string result;
-
-        if (!scheme_.empty())
-        {
-            result.append(scheme_);
-            if (USES_NETLOC.find(scheme_) == USES_NETLOC.end())
-            {
-                result.append(":");
-            }
-            else
-            {
-                result.append("://");
-            }
-        }
-        else if (!host_.empty())
-        {
-            result.append("//");
-        }
-
-        if (!userinfo_.empty())
-        {
-            result.append(userinfo_);
-            result.append("@");
-        }
-
-        if (!host_.empty())
-        {
-            result.append(host_);
-        }
-
-        if (port_)
-        {
-            result.append(":");
-            result.append(std::to_string(port_));
-        }
-
-        if (path_.empty())
-        {
-            if (!result.empty())
-            {
-                result.append("/");
-            }
-        }
-        else
-        {
-            if (!host_.empty() && path_[0] != '/')
-            {
-                result.append(1, '/');
-            }
-            result.append(path_);
-        }
-
-        if (has_params_)
-        {
-            result.append(";");
-            result.append(params_);
-        }
-
-        if (has_query_)
-        {
-            result.append("?");
-            result.append(query_);
-        }
-
-        if (!fragment_.empty())
-        {
-            result.append("#");
-            result.append(fragment_);
-        }
-
-        return result;
-    }
-
-    Url& Url::strip()
-    {
-        size_t start = query_.find_first_not_of('?');
-        if (start != std::string::npos)
-        {
-            query_.assign(query_, start, std::string::npos);
-        }
-        else
-        {
-            query_.assign("");
-        }
-        setQuery(remove_repeats(query_, '&'));
-        setParams(remove_repeats(params_, ';'));
-        return *this;
-    }
-
-    Url& Url::abspath()
-    {
-        std::string copy;
-        std::vector<size_t> segment_starts;
-
-        if (path_.size() >= 1 && path_[0] == '/')
-        {
-            copy.append(1, '/');
-            segment_starts.push_back(0);
-        }
-
-        bool directory = false;
-        size_t previous = 0;
-        size_t index = 0;
-        for (index = path_.find('/')
-            ; index != std::string::npos
-            ; previous = index + 1, index = path_.find('/', index + 1))
-        {
-            // Skip empty segments
-            if (index - previous == 0)
-            {
-                continue;
-            }
-
-            if ((index - previous == 2)
-                && path_[previous] == '.'
-                && path_[previous + 1] == '.')
-            {
-                if (!segment_starts.empty())
-                {
-                    copy.resize(segment_starts.back());
-                    segment_starts.pop_back();
-                }
-                directory = true;
-            }
-            else if ((index - previous == 1) && path_[previous] == '.')
-            {
-                directory = true;
-            }
-            else
-            {
-                segment_starts.push_back(copy.length());
-                copy.append(path_, previous, index - previous);
-                copy.append(1, '/');
-                directory = false;
-            }
-        }
-
-        // Handle the last segment
-        index = path_.length();
-        if (previous == path_.length())
-        {
-            directory = true;
-        }
-        else if ((index - previous == 1) && path_[previous] == '.')
-        {
-            directory = true;
-        }
-        else if ((index - previous == 2)
-                && path_[previous] == '.'
-                && path_[previous + 1] == '.')
-        {
-            if (!segment_starts.empty())
-            {
-                copy.resize(segment_starts.back());
-            }
-            directory = true;
-        }
-        else
-        {
-            copy.append(path_, previous, index - previous);
-            copy.append(1, '/');
-            directory = false;
-        }
-
-        if (!directory && copy.size() >= 1)
-        {
-            copy.resize(copy.size() - 1);
-        }
-        else if (directory && copy.empty())
-        {
-            copy.append(1, '/');
-        }
-        path_.assign(copy);
-
-        return *this;
-    }
-
-    Url& Url::relative_to(const Url& other)
-    {
-        // If this scheme does not use relative, return it unchanged
-        if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end())
-        {
-            return *this;
-        }
-
-        // Support scheme-relative URLs
-        if (scheme_.empty())
-        {
-            scheme_ = other.scheme_;
-        }
-
-        // If this is an absolute URL (or scheme-relative), return early
-        if (!host_.empty()) {
-            return *this;
-        }
-
-        // If it's not an absolute URL, we need to copy the other host and port
-        host_ = other.host_;
-        port_ = other.port_;
-        userinfo_ = other.userinfo_;
-
-        // If the path portion is absolute, then bail out early.
-        if (!path_.empty() && path_.front() == '/')
-        {
-            return *this;
-        }
-
-        // Otherwise, this is a path that need to be evaluated relative to the other. If
-        // there is no '/', then we just keep our current path if it's not empty.
-        if (path_.empty())
-        {
-            if (params_.empty())
-            {
-                path_ = other.path_;
-                params_ = other.params_;
-                has_params_ = other.has_params_;
-                if (query_.empty())
-                {
-                    query_ = other.query_;
-                    has_query_ = other.has_query_;
-                }
-            }
-            else
-            {
-                path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
-            }
-
-            if (fragment_.empty())
-            {
-                fragment_ = other.fragment_;
-            }
-        }
-        else
-        {
-            size_t index = other.path_.rfind('/');
-            if (index != std::string::npos)
-            {
-                path_ = other.path_.substr(0, index + 1) + path_;
-            }
-            else if (!host_.empty())
-            {
-                path_ = "/" + path_;
-            }
-        }
-
-        return *this;
-    }
-
-    Url& Url::escape(bool strict)
-    {
-        escape(path_, PATH, strict);
-        escape(query_, QUERY, strict);
-        escape(params_, QUERY, strict);
-        escape(userinfo_, USERINFO, strict);
-        return *this;
-    }
-
-    std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
-    {
-        std::string copy(str);
-        size_t dest = 0;
-        // Allocate space pessimistically -- if every entity is expanded, it will take 3x
-        // the space.
-        str.resize(str.length() * 3);
-        for (size_t src = 0; src < copy.length(); ++src)
-        {
-            if (copy[src] == '%' && (copy.length() - src) >= 2)
-            {
-                // Read ahead to see if there's a valid escape sequence. If not, treat
-                // this like a normal character.
-                if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
-                {
-                    int value = (
-                        HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
-
-                    // In strict mode, we can only unescape parameters if they are both
-                    // safe and not reserved
-                    if (!strict || (strict && safe(value) && !RESERVED(value)))
-                    {
-                        // Replace src + 2 with that byte, advance src to consume it and
-                        // continue.
-                        src += 2;
-                        copy[src] = value;
-                    }
-                    else
-                    {
-                        str[dest++] = copy[src++];
-                        str[dest++] = ::toupper(copy[src++]);
-                        str[dest++] = ::toupper(copy[src]);
-                        continue;
-                    }
-                }
-            }
-
-            if (!safe(copy[src]))
-            {
-                // Not safe -- replace with %XX
-                str[dest++] = '%';
-                str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF];
-                str[dest++] = HEX.chars()[copy[src] & 0xF];
-            }
-            else
-            {
-                str[dest++] = copy[src];
-            }
-        }
-        str.resize(dest);
-        return str;
-    }
-
-    Url& Url::unescape()
-    {
-        unescape(path_);
-        unescape(query_);
-        unescape(params_);
-        unescape(userinfo_);
-        return *this;
-    }
-
-    std::string& Url::unescape(std::string& str)
-    {
-        std::string copy(str);
-        size_t dest = 0;
-        for (size_t src = 0; src < copy.length(); ++src, ++dest)
-        {
-            if (copy[src] == '%' && (copy.length() - src) >= 2)
-            {
-                // Read ahead to see if there's a valid escape sequence. If not, treat
-                // this like a normal character.
-                if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
-                {
-                    int value = (
-                        HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
-
-                    // Replace src + 2 with that byte, advance src to consume it and
-                    // continue.
-                    src += 2;
-                    str[dest] = value;
-                    continue;
-                }
-            }
-
-            // Either not a % or an incomplete entity
-            str[dest] = copy[src];
-        }
-        str.resize(dest);
-        return str;
-    }
-
-    Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
-    {
-        // Predicate is if it's present in the blacklist.
-        auto predicate = [blacklist](std::string& name, const std::string& value)
-        {
-            std::transform(name.begin(), name.end(), name.begin(), ::tolower);
-            return blacklist.find(name) != blacklist.end();
-        };
-
-        setQuery(remove_params(query_, predicate, '&'));
-        setParams(remove_params(params_, predicate, ';'));
-        return *this;
-    }
-
-    Url& Url::deparam(const deparam_predicate& predicate)
-    {
-        setQuery(remove_params(query_, predicate, '&'));
-        setParams(remove_params(params_, predicate, ';'));
-        return *this;
-    }
-
-    std::string& Url::remove_params(std::string& str,
-                            const deparam_predicate& predicate,
-                            char sep)
-    {
-        std::string copy;
-        std::string piece;
-        std::string name;
-        std::string value;
-        size_t previous = 0;
-        for (size_t index = str.find(sep)
-            ; index != std::string::npos
-            ; previous = index + 1, index = str.find(sep, previous))
-        {
-            piece.assign(str, previous, index - previous);
-            size_t position = piece.find('=');
-            name.assign(piece, 0, position);
-            value.clear();
-            if (position != std::string::npos)
-            {
-                value.assign(piece, position + 1, std::string::npos);
-            }
-
-            if (!predicate(name, value))
-            {
-                copy.append(copy.empty() ? 0 : 1, sep);
-                copy.append(piece);
-            }
-        }
-
-        if (previous < str.length())
-        {
-            piece.assign(str, previous, std::string::npos);
-            size_t position = piece.find('=');
-            name.assign(piece, 0, position);
-            value.clear();
-            if (position != std::string::npos)
-            {
-                value.assign(piece, position + 1, std::string::npos);
-            }
-
-            if (!predicate(name, value))
-            {
-                copy.append(copy.empty() ? 0 : 1, sep);
-                copy.append(piece);
-            }
-        }
-
-        str.assign(copy);
-        return str;
-    }
-
-    Url& Url::sort_query()
-    {
-        split_sort_join(query_, '&');
-        split_sort_join(params_, ';');
-        return *this;
-    }
-
-    std::string& Url::split_sort_join(std::string& str, const char glue)
-    {
-        // Return early if empty
-        if (str.empty())
-        {
-            return str;
-        }
-
-        // Split
-        std::vector<std::string> pieces;
-        std::stringstream stream(str);
-        std::string item;
-        while (getline(stream, item, glue))
-        {
-            pieces.push_back(item);
-        }
-
-        // Return early if it's just a single element
-        if (pieces.size() == 1)
-        {
-            return str;
-        }
-
-        // Sort
-        std::sort(pieces.begin(), pieces.end());
-
-        // Join (at this point we know that there's at least one element)
-        std::stringstream output;
-        for (auto it = pieces.begin(); it != (pieces.end() - 1); ++it)
-        {
-            output << *it << glue;
-        }
-        output << pieces.back();
-        str.assign(output.str());
-        return str;
-    }
-
-    Url& Url::remove_default_port()
-    {
-        if (port_ && !scheme_.empty())
-        {
-            auto it = PORTS.find(scheme_);
-            if (it != PORTS.end() && port_ == it->second)
-            {
-                port_ = 0;
-            }
-        }
-        return *this;
-    }
-
-    Url& Url::deuserinfo()
-    {
-        userinfo_.clear();
-        return *this;
-    }
-
-    Url& Url::defrag()
-    {
-        fragment_.clear();
-        return *this;
-    }
-
-    Url& Url::punycode()
-    {
-        check_hostname(host_);
-        std::string encoded(Punycode::encodeHostname(host_));
-        check_hostname(encoded);
-        host_ = encoded;
-        return *this;
-    }
-
-    Url& Url::unpunycode()
-    {
-        host_ = Punycode::decodeHostname(host_);
-        return *this;
-    }
-
-    Url& Url::host_reversed()
-    {
-        std::reverse(host_.begin(), host_.end());
-        for (size_t index = 0, position = 0; index < host_.size(); index = position + 1)
-        {
-            position = host_.find('.', index);
-            if (position == std::string::npos)
-            {
-                std::reverse(host_.begin() + index, host_.end());
-                break;
-            }
-            else
-            {
-                std::reverse(host_.begin() + index, host_.begin() + position);
-            }
-        }
-        return *this;
-    }
-
-    void Url::check_hostname(std::string& host)
-    {
-        // Skip empty hostnames -- they are valid
-        if (host.empty())
-        {
-            return;
-        }
-
-        size_t start = 0;
-        size_t end = host.find('.');
-        while (end != std::string::npos)
-        {
-            if ((end - start) > 63)
-            {
-                throw std::invalid_argument("Label too long.");
-            }
-            else if (end == start)
-            {
-                throw std::invalid_argument("Empty label.");
-            }
-
-            start = end + 1;
-            end = host.find('.', start);
-        }
-
-        // For the final segment
-        if ((host.size() - start) > 63)
-        {
-            throw std::invalid_argument("Label too long.");
-        }
-        else if (host.size() == start && start > 1)
-        {
-            // Remove a trailing empty segment
-            host.resize(start - 1);
-        }
-    }
-
-};
--- a/crawler/third_party/url-cpp/url.h
+++ b/crawler/third_party/url-cpp/url.h
@ -1,323 +0,0 @@
-#ifndef URL_CPP_H
-#define URL_CPP_H
-
-#include <stdexcept>
-#include <functional>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include <unordered_set>
-
-namespace Url
-{
-
-    struct UrlParseException : public std::logic_error
-    {
-        UrlParseException(const std::string& message) : std::logic_error(message) {}
-    };
-
-    struct CharacterClass
-    {
-        CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
-        {
-            for (auto it = chars_.begin(); it != chars_.end(); ++it)
-            {
-                map_[static_cast<size_t>(*it)] = true;
-            }
-        }
-
-        bool operator()(char c) const
-        {
-            return map_[static_cast<unsigned char>(c)];
-        }
-
-        const std::string& chars() const
-        {
-            return chars_;
-        }
-
-    private:
-        // Private, unimplemented to prevent use
-        CharacterClass();
-        CharacterClass(const CharacterClass& other);
-
-        std::string chars_;
-        std::vector<bool> map_;
-    };
-
-    struct Url
-    {
-        /* Character classes */
-        const static CharacterClass GEN_DELIMS;
-        const static CharacterClass SUB_DELIMS;
-        const static CharacterClass ALPHA;
-        const static CharacterClass DIGIT;
-        const static CharacterClass UNRESERVED;
-        const static CharacterClass RESERVED;
-        const static CharacterClass PCHAR;
-        const static CharacterClass PATH;
-        const static CharacterClass QUERY;
-        const static CharacterClass FRAGMENT;
-        const static CharacterClass USERINFO;
-        const static CharacterClass HEX;
-        const static CharacterClass SCHEME;
-        const static std::vector<signed char> HEX_TO_DEC;
-        const static std::unordered_map<std::string, int> PORTS;
-        const static std::unordered_set<std::string> USES_RELATIVE;
-        const static std::unordered_set<std::string> USES_NETLOC;
-        const static std::unordered_set<std::string> USES_PARAMS;
-        const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
-
-        // The type of the predicate used for removing parameters
-        typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
-
-        explicit Url(const std::string& url);
-
-        Url(const Url& other)
-            : scheme_(other.scheme_)
-            , host_(other.host_)
-            , port_(other.port_)
-            , path_(other.path_)
-            , params_(other.params_)
-            , query_(other.query_)
-            , fragment_(other.fragment_)
-            , userinfo_(other.userinfo_)
-            , has_params_(other.has_params_)
-            , has_query_(other.has_query_) { }
-
-        /**
-         * Take on the value of the other URL.
-         */
-        Url& assign(const Url& other);
-
-        /**
-         * To be considered equal, all fields must be equal.
-         */
-        bool operator==(const Url& other) const;
-        bool operator!=(const Url& other) const;
-
-        /**
-         * Two URLs are considered equivalent if they have the same meaning.
-         */
-        bool equiv(const Url& other);
-
-        /**************************************
-         * Component-wise access and setting. *
-         **************************************/
-        const std::string& scheme() const { return scheme_; }
-        Url& setScheme(const std::string& s)
-        {
-            scheme_ = s;
-            return *this;
-        }
-
-        const std::string& host() const { return host_; }
-        Url& setHost(const std::string& s)
-        {
-            host_ = s;
-            return *this;
-        }
-
-        const int port() const { return port_; }
-        Url& setPort(int i)
-        {
-            port_ = i;
-            return *this;
-        }
-
-        const std::string& path() const { return path_; }
-        Url& setPath(const std::string& s)
-        {
-            path_ = s;
-            return *this;
-        }
-
-        const std::string& params() const { return params_; }
-        Url& setParams(const std::string& s)
-        {
-            params_ = s;
-            has_params_ = !s.empty();
-            return *this;
-        }
-
-        const std::string& query() const { return query_; }
-        Url& setQuery(const std::string& s)
-        {
-            query_ = s;
-            has_query_ = !s.empty();
-            return *this;
-        }
-
-        const std::string& fragment() const { return fragment_; }
-        Url& setFragment(const std::string& s)
-        {
-            fragment_ = s;
-            return *this;
-        }
-
-        const std::string& userinfo() const { return userinfo_; }
-        Url& setUserinfo(const std::string& s)
-        {
-            userinfo_ = s;
-            return *this;
-        }
-
-        /**
-         * Get a representation of all components of the path, params, query, fragment.
-         *
-         * Always includes a leading /.
-         */
-        std::string fullpath() const;
-
-        /**
-         * Get a new string representation of the URL.
-         **/
-        std::string str() const;
-
-        /*********************
-         * Chainable methods *
-         *********************/
-
-        /**
-         * Strip semantically meaningless excess '?', '&', and ';' characters from query
-         * and params.
-         */
-        Url& strip();
-
-        /**
-         * Make the path absolute.
-         *
-         * Evaluate '.', '..', and excessive slashes.
-         */
-        Url& abspath();
-
-        /**
-         * Evaluate this URL relative fo `other`, placing the result in this object.
-         */
-        Url& relative_to(const std::string& other)
-        {
-            return relative_to(Url(other));
-        }
-
-        /**
-         * Evaluate this URL relative fo `other`, placing the result in this object.
-         */
-        Url& relative_to(const Url& other);
-
-        /**
-         * Ensure that the path, params, query, and userinfo are properly escaped.
-         *
-         * In 'strict' mode, only entities that are both safe and not reserved characters
-         * are unescaped. In non-strict mode, entities that are safe are unescaped.
-         */
-        Url& escape(bool strict=false);
-
-        /**
-         * Unescape all entities in the path, params, query, and userinfo.
-         */
-        Url& unescape();
-
-        /**
-         * Remove any params or queries that appear in the blacklist.
-         *
-         * The blacklist should contain only lowercased strings, and the comparison is
-         * done in a case-insensitive way.
-         */
-        Url& deparam(const std::unordered_set<std::string>& blacklist);
-
-        /**
-         * Filter params subject to a predicate for whether it should be filtered.
-         *
-         * The predicate must accept two string refs -- the key and value (which may be
-         * empty). Return `true` if the parameter should be removed, and `false`
-         * otherwise.
-         */
-        Url& deparam(const deparam_predicate& predicate);
-
-        /**
-         * Put queries and params in sorted order.
-         *
-         * To ensure consistent comparisons, escape should be called beforehand.
-         */
-        Url& sort_query();
-
-        /**
-         * Remove the port if it's the default for the scheme.
-         */
-        Url& remove_default_port();
-
-        /**
-         * Remove the userinfo portion.
-         */
-        Url& deuserinfo();
-
-        /**
-         * Remove the fragment.
-         */
-        Url& defrag();
-
-        /**
-         * Punycode the hostname.
-         */
-        Url& punycode();
-
-        /**
-         * Unpunycode the hostname.
-         */
-        Url& unpunycode();
-
-        /**
-         * Reverse the hostname (a.b.c.d => d.c.b.a)
-         */
-        Url& host_reversed();
-
-    private:
-        // Private, unimplemented to prevent use.
-        Url();
-
-        /**
-         * Remove repeated, leading, and trailing instances of chr from the string.
-         */
-        std::string& remove_repeats(std::string& str, const char chr);
-
-        /**
-         * Ensure all the provided characters are escaped if necessary
-         */
-        std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
-
-        /**
-         * Unescape entities in the provided string
-         */
-        std::string& unescape(std::string& str);
-
-        /**
-         * Remove any params that match entries in the blacklist.
-         */
-        std::string& remove_params(
-            std::string& str, const deparam_predicate& pred, char sep);
-
-        /**
-         * Split the provided string by char, sort, join by char.
-         */
-        std::string& split_sort_join(std::string& str, const char glue);
-
-        /**
-         * Check that the hostname is valid, removing an optional trailing '.'.
-         */
-        void check_hostname(std::string& host);
-
-        std::string scheme_;
-        std::string host_;
-        int port_;
-        std::string path_;
-        std::string params_;
-        std::string query_;
-        std::string fragment_;
-        std::string userinfo_;
-        bool has_params_;
-        bool has_query_;
-    };
-
-}
-
-#endif
--- a/crawler/third_party/url-cpp/utf8.cpp
+++ b/crawler/third_party/url-cpp/utf8.cpp
@ -1,150 +0,0 @@
-#include <algorithm>
-#include <string>
-#include <iostream>
-
-#include "utf8.h"
-
-namespace Url
-{
-
-    Utf8::codepoint_t Utf8::readCodepoint(
-        std::string::const_iterator& it, const std::string::const_iterator& end)
-    {
-        Utf8::char_t current = static_cast<Utf8::char_t>(*it++);
-        if (current & 0x80)
-        {
-            // Number of additional bytes needed
-            unsigned int bytes = 0;
-            // The accumulated value
-            Utf8::codepoint_t result = 0;
-            if (current < 0xC0)
-            {
-                // Invalid sequence
-                throw std::invalid_argument("Low UTF-8 start byte");
-            }
-            else if (current < 0xE0)
-            {
-                // One additional byte, two bytes total, use 5 bits
-                bytes = 1;
-                result = current & 0x1F;
-            }
-            else if (current < 0xF0)
-            {
-                // Two additional bytes, three bytes total, use 4 bits
-                bytes = 2;
-                result = current & 0x0F;
-            }
-            else if (current < 0xF8)
-            {
-                // Three additional bytes, four bytes total, use 3 bits
-                bytes = 3;
-                result = current & 0x07;
-            }
-            else
-            {
-                throw std::invalid_argument("High UTF-8 start byte");
-            }
-
-            for (; bytes > 0; --bytes) {
-                if (it == end)
-                {
-                    throw std::invalid_argument("UTF-8 sequence terminated early.");
-                }
-
-                current = static_cast<unsigned char>(*it++);
-                // Ensure the first two bits are 10
-                if ((current & 0xC0) != 0x80)
-                {
-                    throw std::invalid_argument("Invalid continuation byte");
-                }
-                result = (result << 6) | (current & 0x3F);
-            }
-
-            return result;
-        }
-        else
-        {
-            return current;
-        }
-    }
-
-    std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
-    {
-        if (value > MAX_CODEPOINT)
-        {
-            throw std::invalid_argument("Code point too high.");
-        }
-        else if (value <= 0x007F)
-        {
-            // Just append the character itself
-            str.append(1, static_cast<char>(value));
-            return str;
-        }
-
-        unsigned int bytes = 0;
-        if (value > 0xFFFF)
-        {
-            /**
-             * 11110xxx + 3 bytes for 21 bits total
-             *
-             * We need to take bits 20-18, which 0x1C0000 masks out. These form the least
-             * significant bits of this byte (so we shift them back down by 18). The 5
-             * most significant bits of this byte are 11110, so we OR this result with
-             * 0xF0 to get this first byte.
-             *
-             * The remaining bits will be consumed from the most-significant end and so
-             * they must be shifted up by (32 - 18) = 14.
-             */
-            str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0));
-            bytes = 3;
-            value <<= 14;
-        }
-        else if (value > 0x07FF)
-        {
-            /**
-             * 1110xxxx + 2 bytes for 16 bits total
-             *
-             * We need to take bits 15-12, which 0xF000 masks out. These form the least
-             * significant bits of this byte (so we shift them back down by 12). The 4
-             * most significant bits of this byte are 1110, so we OR this result with
-             * 0xE0 to get this first byte.
-             *
-             * The remaining bits will be consumed from the most-significant end and so
-             * they must be shifted up by (32 - 12) = 20.
-             */
-            str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0));
-            bytes = 2;
-            value <<= 20;
-        }
-        else
-        {
-            /**
-             * 110xxxxx + 1 byte for 11 bits total
-             *
-             * We need to take bits 10-6, which 0x7C0 masks out. These form the least
-             * significant bits of this byte (so we shift them back down by 6). The 3
-             * most significant bits of this byte are 110, so we OR this result with
-             * 0xC0 to get this first byte.
-             *
-             * The remaining bits will be consumed from the most-significant end and so
-             * they must be shifted up by (32 - 6) = 26.
-             */
-            str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0));
-            bytes = 1;
-            value <<= 26;
-        }
-
-        /**
-         * The remaining bits are to be consumed 6 at a time from the most-significant
-         * end. The mask 0xFC000000 grabs these six bits, which then must be shifted down
-         * by 26, and OR'd with 0x80 to produce the continuation byte.
-         */
-        for (; bytes > 0; --bytes, value <<= 6)
-        {
-            str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80));
-        }
-
-        return str;
-    }
-
-};
--- a/crawler/third_party/url-cpp/utf8.h
+++ b/crawler/third_party/url-cpp/utf8.h
@ -1,91 +0,0 @@
-#ifndef UTF8_CPP_H
-#define UTF8_CPP_H
-
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-namespace Url
-{
-
-    /**
-     * Work between unicode code points and their UTF-8-encoded representation.
-     */
-    struct Utf8
-    {
-        /**
-         * The type we use to represent Unicode codepoints.
-         */
-        typedef uint32_t codepoint_t;
-
-        /**
-         * The type we use when talking about the integral value of bytes.
-         */
-        typedef unsigned char char_t;
-
-        /**
-         * The highest allowed codepoint.
-         */
-        static const codepoint_t MAX_CODEPOINT = 0x10FFFF;
-
-        /**
-         * Consume up to the last byte of the sequence, returning the codepoint.
-         */
-        static codepoint_t readCodepoint(
-            std::string::const_iterator& it, const std::string::const_iterator& end);
-
-        /**
-         * Write a codepoint to the provided string.
-         */
-        static std::string& writeCodepoint(std::string& str, codepoint_t value);
-
-        /**
-         * Return the first codepoint stored in the provided string.
-         */
-        static codepoint_t toCodepoint(const std::string& str)
-        {
-            auto it = str.begin();
-            return readCodepoint(it, str.end());
-        }
-
-        /**
-         * Get a string with the provided codepoint.
-         */
-        static std::string fromCodepoint(codepoint_t value)
-        {
-            std::string str;
-            writeCodepoint(str, value);
-            return str;
-        }
-
-        /**
-         * Return all the codepoints in the string.
-         */
-        static std::vector<codepoint_t> toCodepoints(const std::string& str)
-        {
-            std::vector<codepoint_t> result;
-            for (auto it = str.begin(); it != str.end(); )
-            {
-                result.push_back(readCodepoint(it, str.end()));
-            }
-            return result;
-        }
-
-        /**
-         * Create a string from a vector of codepoints.
-         */
-        static std::string fromCodepoints(const std::vector<codepoint_t>& points)
-        {
-            std::string result;
-            for (auto it = points.begin(); it != points.end(); ++it)
-            {
-                writeCodepoint(result, *it);
-            }
-            return result;
-        }
-
-    };
-
-}
-
-#endif
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@ -3,8 +3,8 @@ project(librengine LANGUAGES CXX)

 set(CMAKE_CXX_STANDARD 17)

-set(include include/encryption.h include/typesense.h include/http.h include/str.h include/str_impl.h include/config.h include/logger.h include/json.hpp include/structs.h include/search.h include/helper.h include/cache.h)
-set(src src/encryption.cpp src/typesense.cpp src/http.cpp src/str.cpp src/logger.cpp src/config.cpp src/search.cpp src/helper.cpp)
+# Public headers of librengine, one entry per line.
+set(include
+        include/encryption.h
+        include/typesense.h
+        include/http.h
+        include/str.h
+        include/str_impl.h
+        include/config.h
+        include/logger.h
+        include/json.hpp
+        include/structs.h
+        include/search.h
+        include/helper.h
+        include/cache.h
+        include/robots_txt.h)
+# Implementation files of librengine, one entry per line.
+set(src
+        src/encryption.cpp
+        src/typesense.cpp
+        src/http.cpp
+        src/str.cpp
+        src/logger.cpp
+        src/config.cpp
+        src/search.cpp
+        src/helper.cpp
+        src/robots_txt.cpp)

 set(include_all ${include})
 set(src_all ${src})
--- a/lib/include/robots_txt.h
+++ b/lib/include/robots_txt.h
@ -0,0 +1,36 @@
+#ifndef ROBOTS_TXT_H
+#define ROBOTS_TXT_H
+
+#include "http.h"
+
+namespace librengine {
+    /// One robots.txt group: the rules recorded for a single User-agent name.
+    class user_agent {
+    public:
+        std::string agent; ///< User-agent name this group belongs to
+        std::vector<std::string> allow_list; ///< patterns that permit a path
+        std::vector<std::string> disallow_list; ///< patterns that forbid a path
+        float crawl_delay; ///< Crawl-delay value; the constructor sets it to 0
+    public:
+        /// Returns true when `expression` matches the rule `pattern`.
+        /// The pattern is matched from the start of `expression`; '*' stands for
+        /// any run of characters and a final '$' requires the match to reach the
+        /// end of `expression`.
+        static bool match(const std::string &pattern, const std::string &expression);
+    public:
+        /// Creates a group for `agent` with empty rule lists.
+        explicit user_agent(const std::string &agent);
+
+        /// Returns whether `path` may be crawled according to allow_list and disallow_list.
+        bool allowed(const std::string &path);
+        /// Same check for the path of `url`; returns false when the URL has no path.
+        bool allowed(const http::url &url);
+    };
+
+    /// Holds the text of a robots.txt file and the per-agent groups parsed from it.
+    class robots_txt {
+    private:
+        std::string text; ///< raw robots.txt content passed to the constructor
+    public:
+        /// Parsed groups; the constructor adds one group with an empty name up front.
+        std::vector<user_agent> agents;
+    public:
+        /// Stores `text`; nothing is parsed until parse() is called.
+        explicit robots_txt(const std::string &text);
+        /// Reads the stored text line by line and fills `agents`.
+        void parse();
+
+        /// Checks `path` against the group named `agent`, falling back to the
+        /// "*" group and then to the group with the empty name.
+        bool allowed(const std::string &path, const std::string &agent);
+        /// Same check for the path of `url`; returns false when the URL has no path.
+        bool allowed(const http::url &url, const std::string &user_agent);
+    };
+}
+
+#endif
--- a/lib/src/robots_txt.cpp
+++ b/lib/src/robots_txt.cpp
@ -0,0 +1,138 @@
+#include "robots_txt.h"
+
+#include "str.h"
+
+#include <algorithm>
+
+namespace librengine {
+    // Tells whether `expression` (a URL path) matches the robots.txt rule `pattern`.
+    // The rule is anchored at the start of the path. '*' matches any run of
+    // characters (including none) and a '$' in the last position demands that
+    // the match ends exactly at the end of the path. An empty rule matches
+    // every path.
+    bool user_agent::match(const std::string &pattern, const std::string &expression) {
+        const size_t rule_len = pattern.length();
+        const size_t path_len = expression.length();
+
+        // candidates[0 .. live) lists, in ascending order, every offset in
+        // `expression` at which the part of the rule consumed so far may end.
+        std::vector<size_t> candidates(path_len + 1);
+        size_t live = 1;
+
+        for (size_t idx = 0; idx < rule_len; ++idx) {
+            const char symbol = pattern[idx];
+
+            // Trailing '$': the largest candidate has to be the end of the path.
+            if (symbol == '$' && idx + 1 == rule_len) {
+                return candidates[live - 1] == path_len;
+            }
+
+            if (symbol != '*') {
+                // Literal character (also a '$' that is not last): keep only the
+                // candidates that can consume it and advance them by one.
+                size_t kept = 0;
+
+                for (size_t k = 0; k < live; ++k) {
+                    const size_t offset = candidates[k];
+
+                    if (offset < path_len && expression[offset] == symbol) {
+                        candidates[kept] = offset + 1;
+                        ++kept;
+                    }
+                }
+
+                if (kept == 0) return false;
+                live = kept;
+                continue;
+            }
+
+            // Wildcard: every offset from the smallest candidate up to the end
+            // of the path becomes a candidate.
+            live = path_len - candidates[0] + 1;
+
+            for (size_t k = 1; k < live; ++k) {
+                candidates[k] = candidates[k - 1] + 1;
+            }
+        }
+
+        return true;
+    }
+
+    // Builds a group for `agent` with no rules and a crawl delay of 0.
+    user_agent::user_agent(const std::string &agent) : agent(agent), crawl_delay(0) {}
+
+    // Decides whether `path` may be crawled under the rules of this group.
+    //
+    // Precedence follows RFC 9309: among all matching rules the one with the
+    // longest pattern wins, and an Allow rule beats a Disallow rule of the same
+    // length. A path that matches no rule at all is allowed.
+    //
+    // Returns true when the path is allowed, false otherwise.
+    bool user_agent::allowed(const std::string &path) {
+        // Scores are "pattern length + 1" so that 0 can stand for "no rule matched".
+        size_t allow_score = 0;
+        size_t disallow_score = 0;
+
+        for (const auto &allow : allow_list) {
+            if (match(allow, path)) {
+                allow_score = std::max(allow_score, allow.length() + 1);
+            }
+        }
+
+        for (const auto &disallow : disallow_list) {
+            if (match(disallow, path)) {
+                disallow_score = std::max(disallow_score, disallow.length() + 1);
+            }
+        }
+
+        // ">=" implements both "Allow wins ties" and "allowed when nothing matched".
+        return allow_score >= disallow_score;
+    }
+    // Overload taking a parsed URL: checks its path component.
+    // A URL that carries no path is reported as not allowed.
+    bool user_agent::allowed(const http::url &url) {
+        if (url.path) {
+            const std::string path = *url.path;
+            return allowed(path);
+        }
+
+        return false;
+    }
+
+    // Keeps a copy of the robots.txt text and creates the unnamed default group,
+    // so `agents` is never empty after construction.
+    robots_txt::robots_txt(const std::string &text) : text(text) {
+        agents.emplace_back("");
+    }
+    // Parses the stored robots.txt text and fills `agents`.
+    //
+    // Each line is handled as "key: value":
+    //  - everything from '#' on is a comment and is removed before the line is split;
+    //  - the line is split at the FIRST ':' only, so values that contain ':' survive;
+    //  - keys are case-insensitive; User-agent names are stored lower-cased;
+    //  - Allow/Disallow paths keep their case because URL paths are case-sensitive;
+    //  - consecutive User-agent lines form one group that shares the rules after them;
+    //  - an empty "Disallow:" places no restriction and is therefore ignored;
+    //  - a Crawl-delay that is not a number is ignored;
+    //  - unknown keys and lines without ':' are ignored.
+    // Rules that appear before any User-agent line go to the last group already
+    // present in `agents` (the unnamed default group right after construction).
+    void robots_txt::parse() {
+        // Trims both ends; the str helpers are never handed an empty string.
+        const auto strip = [](std::string s) {
+            if (!s.empty()) s = str::trim_start(s);
+            if (!s.empty()) s = str::trim_end(s);
+            return s;
+        };
+
+        // `agents` is public, so make sure there is a group to attach early rules to.
+        if (agents.empty()) agents.emplace_back("");
+
+        // Indices (not references: emplace_back may reallocate) of the groups
+        // that receive the rules currently being read.
+        std::vector<size_t> group(1, agents.size() - 1);
+        // True while the previous meaningful line was a User-agent line.
+        bool reading_agents = false;
+
+        const auto lines = str::split(text, "\n");
+
+        for (const auto &raw_line : lines) {
+            std::string line = raw_line;
+
+            // Remove the comment first so a ':' inside it is not taken as the separator.
+            const auto comment_index = line.find('#');
+            if (comment_index != std::string::npos) line.erase(comment_index);
+
+            // Files with CRLF line endings leave a '\r' behind after splitting on '\n'.
+            if (!line.empty() && line.back() == '\r') line.pop_back();
+
+            const auto colon_index = line.find(':');
+            if (colon_index == std::string::npos) continue;
+
+            std::string key = strip(line.substr(0, colon_index));
+            if (key.empty()) continue;
+            key = str::to_lower(key);
+
+            std::string value = strip(line.substr(colon_index + 1));
+
+            if (key == "user-agent") {
+                if (value.empty()) continue;
+
+                // The first User-agent line after a rule starts a new group.
+                if (!reading_agents) group.clear();
+                reading_agents = true;
+
+                value = str::to_lower(value);
+                agents.emplace_back(value);
+                group.push_back(agents.size() - 1);
+                continue;
+            }
+
+            const bool is_allow = key == "allow";
+            const bool is_disallow = key == "disallow";
+            const bool is_delay = key == "crawl-delay";
+
+            if (!is_allow && !is_disallow && !is_delay) continue;
+
+            // Any rule line closes the run of User-agent lines.
+            reading_agents = false;
+
+            // Empty Disallow means "no restriction"; empty Allow/Crawl-delay carry nothing.
+            if (value.empty()) continue;
+
+            if (is_delay) {
+                float delay = 0;
+
+                try {
+                    delay = std::stof(value);
+                } catch (const std::exception &) {
+                    // Not a number: keep the current crawl_delay (0 by default).
+                    continue;
+                }
+
+                for (const auto index : group) agents[index].crawl_delay = delay;
+                continue;
+            }
+
+            for (const auto index : group) {
+                if (is_allow) agents[index].allow_list.push_back(value);
+                else agents[index].disallow_list.push_back(value);
+            }
+        }
+    }
+
+    // Checks whether `path` may be crawled by the crawler named `agent`.
+    //
+    // The group is chosen in this order: the group whose name equals `agent`
+    // (compared case-insensitively, because parse() stores names lower-cased),
+    // then the "*" group, then the unnamed default group. If none of them
+    // exists there are no rules to apply and the path is allowed.
+    bool robots_txt::allowed(const std::string &path, const std::string &agent) {
+        std::string wanted = agent;
+        if (!wanted.empty()) wanted = str::to_lower(wanted);
+
+        const auto find_group = [this](const std::string &name) {
+            return std::find_if(agents.begin(), agents.end(),
+                                [&name](const user_agent &u) { return u.agent == name; });
+        };
+
+        auto found = find_group(wanted);
+        if (found == agents.end()) found = find_group("*");
+        if (found == agents.end()) found = find_group("");
+        // `agents` is public and may have been emptied; never dereference end().
+        if (found == agents.end()) return true;
+
+        return found->allowed(path);
+    }
+    // Overload taking a parsed URL: checks its path component for `user_agent`.
+    // A URL that carries no path is reported as not allowed.
+    bool robots_txt::allowed(const http::url &url, const std::string &user_agent) {
+        if (url.path) {
+            const std::string path = *url.path;
+            return allowed(path, user_agent);
+        }
+
+        return false;
+    }
+}