├── scripts
│   ├── travis
│   │   ├── script.sh
│   │   └── before_install.sh
│   ├── vagrant
│   │   └── provision.sh
│   └── check-coverage.sh
├── test
│   ├── test-all.cpp
│   ├── test-agent.cpp
│   ├── test-directive.cpp
│   └── test-robots.cpp
├── .gitignore
├── .gitmodules
├── .travis.yml
├── Vagrantfile
├── LICENSE
├── include
│   ├── robots.h
│   ├── directive.h
│   └── agent.h
├── Makefile
├── bench.cpp
├── src
│   ├── agent.cpp
│   ├── directive.cpp
│   └── robots.cpp
└── README.md

/scripts/travis/script.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | make test
6 | 
--------------------------------------------------------------------------------
/scripts/travis/before_install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | pip install --user gcovr==3.2
6 | 
--------------------------------------------------------------------------------
/test/test-all.cpp:
--------------------------------------------------------------------------------
1 | #include <gtest/gtest.h>
2 | 
3 | int main(int argc, char **argv) {
4 |     testing::InitGoogleTest(&argc, argv);
5 |     return RUN_ALL_TESTS();
6 | }
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries and build artifacts
2 | *.o
3 | *.a
4 | bench
5 | test-all
6 | 
7 | # Coverage
8 | *.gcda
9 | *.gcno
10 | *.gcov
11 | coverage.out
12 | 
13 | # Vagrant
14 | .vagrant/
15 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/url-cpp"]
2 | 	path = deps/url-cpp
3 | 	url = https://github.com/seomoz/url-cpp
4 | [submodule "deps/googletest"]
5 | 	path = deps/googletest
6 | 	url = https://github.com/google/googletest/
7 | 	ignore = untracked
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | dist: bionic
3 | git:
4 |   submodules: true
5 | compiler:
6 |   - gcc
7 |   - clang
8 | language: cpp
9 | python:
10 |   - '3.5'
11 | before_install: scripts/travis/before_install.sh
12 | script: scripts/travis/script.sh
13 | addons:
14 |   apt:
15 |     packages:
16 |       - cmake
17 | 
--------------------------------------------------------------------------------
/scripts/vagrant/provision.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | sudo apt-get update
6 | sudo apt-get install -y g++ libgtest-dev cmake python-pip
7 | 
8 | sudo pip install gcovr==3.2
9 | 
10 | pushd /tmp
11 | mkdir -p gtest-build
12 | pushd gtest-build
13 | cmake -DCMAKE_BUILD_TYPE=RELEASE /usr/src/gtest/
14 | make
15 | find . -name 'libg*.a' | xargs sudo cp -f --target-directory=/usr/lib/
16 | popd
17 | popd
18 | 
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # Encoding: utf-8
2 | # -*- mode: ruby -*-
3 | # vi: set ft=ruby :
4 | 
5 | ENV['VAGRANT_DEFAULT_PROVIDER'] = 'virtualbox'
6 | 
7 | # http://docs.vagrantup.com/v2/
8 | Vagrant.configure('2') do |config|
9 |   config.vm.box = 'ubuntu/trusty64'
10 |   config.vm.hostname = 'rep-cpp'
11 |   config.ssh.forward_agent = true
12 | 
13 |   config.vm.provider :virtualbox do |vb|
14 |     vb.customize ["modifyvm", :id, "--memory", "1024"]
15 |     vb.customize ["modifyvm", :id, "--cpus", "2"]
16 |   end
17 | 
18 |   config.vm.provision :shell, path: 'scripts/vagrant/provision.sh', privileged: false
19 | end
20 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 SEOmoz, Inc.
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/scripts/check-coverage.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | if [ "$#" -ne 1 ]; then
6 |     echo "Usage:"
7 |     echo "    check-coverage.sh <root>"
8 |     exit 1
9 | fi
10 | 
11 | root="${1}"
12 | 
13 | results=`gcovr \
14 |     --root=${root} \
15 |     --exclude-unreachable-branches \
16 |     --output=coverage.out \
17 |     --print-summary \
18 |     --object-directory=${root} \
19 |     --exclude test \
20 |     --exclude deps/ \
21 |     --exclude src/psl.cpp \
22 |     --exclude src/punycode.cpp \
23 |     --exclude src/url.cpp \
24 |     --exclude src/utf8.cpp \
25 |     --exclude include/psl.h \
26 |     --exclude include/punycode.h \
27 |     --exclude include/url.h \
28 |     --exclude include/utf8.h`
29 | 
30 | lines=`echo ${results} | sed -E 's#^.*lines: ([0-9]+)(\.[0-9]+)?%.+$#\1#'`
31 | branches=`echo ${results} | sed -E 's#^.*branches: ([0-9]+)(\.[0-9]+)?%.+$#\1#'`
32 | 
33 | if [ "${lines}" -eq "0" ]; then
34 |     echo "Coverage disabled."
35 |     echo "${results}"
36 | elif [ "${lines}" -ne "100" ]; then
37 |     echo "Incomplete line coverage (${lines})"
38 |     echo "${results}"
39 |     exit 2
40 | else
41 |     echo "Coverage looks good!"
42 | echo "${results}" 43 | fi 44 | -------------------------------------------------------------------------------- /include/robots.h: -------------------------------------------------------------------------------- 1 | #ifndef ROBOTS_CPP_H 2 | #define ROBOTS_CPP_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "agent.h" 9 | 10 | namespace Rep 11 | { 12 | 13 | class Robots 14 | { 15 | public: 16 | typedef std::unordered_map agent_map_t; 17 | typedef std::vector sitemaps_t; 18 | 19 | /** 20 | * Create a robots.txt from a utf-8-encoded string. 21 | */ 22 | explicit Robots(const std::string& content); 23 | 24 | /** 25 | * Create a robots.txt from a utf-8-encoded string assuming 26 | * the given base_url. 27 | */ 28 | Robots(const std::string& content, const std::string& base_url); 29 | 30 | /** 31 | * Get the sitemaps in this robots.txt 32 | */ 33 | const sitemaps_t& sitemaps() const { return sitemaps_; } 34 | 35 | /** 36 | * Get the agent with the corresponding name. 37 | */ 38 | const Agent& agent(const std::string& name) const; 39 | 40 | /** 41 | * Return true if agent is allowed to fetch the URL (either a 42 | * full URL or a path). 43 | */ 44 | bool allowed(const std::string& path, const std::string& name) const; 45 | 46 | std::string str() const; 47 | 48 | /** 49 | * Return the robots.txt URL corresponding to the provided URL. 50 | */ 51 | static std::string robotsUrl(const std::string& url); 52 | 53 | private: 54 | static void strip(std::string& string); 55 | 56 | static bool getpair( 57 | std::istringstream& stream, std::string& key, std::string& value); 58 | 59 | std::string host_; 60 | agent_map_t agents_; 61 | sitemaps_t sitemaps_; 62 | Agent& default_; 63 | }; 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GTEST_DIR ?= deps/googletest/googletest 2 | 3 | CXX ?= g++ 4 | CXXOPTS ?= -Wall -Werror -std=c++11 -Iinclude/ -Ideps/url-cpp/include -I$(GTEST_DIR)/include 5 | DEBUG_OPTS ?= -g -fprofile-arcs -ftest-coverage -O0 -fPIC 6 | RELEASE_OPTS ?= -O3 7 | BINARIES = 8 | 9 | all: test release/librep.o $(BINARIES) 10 | 11 | $(GTEST_DIR)/libgtest.a: 12 | g++ -std=c++11 -isystem $(GTEST_DIR)/include -I$(GTEST_DIR) -pthread -c $(GTEST_DIR)/src/gtest-all.cc -o $(GTEST_DIR)/libgtest.a 13 | 14 | # Release libraries 15 | release: 16 | mkdir -p release 17 | 18 | release/bin: release 19 | mkdir -p release/bin 20 | 21 | deps/url-cpp/release/liburl.o: deps/url-cpp/* deps/url-cpp/include/* deps/url-cpp/src/* 22 | make -C deps/url-cpp release/liburl.o 23 | 24 | release/librep.o: release/directive.o release/agent.o release/robots.o deps/url-cpp/release/liburl.o 25 | ld -r -o $@ $^ 26 | 27 | release/%.o: src/%.cpp include/%.h release 28 | $(CXX) $(CXXOPTS) $(RELEASE_OPTS) -o $@ -c $< 29 | 30 | # Debug libraries 31 | debug: 32 | mkdir -p debug 33 | 34 | debug/bin: debug 35 | mkdir -p debug/bin 36 | 37 | deps/url-cpp/debug/liburl.o: deps/url-cpp/* deps/url-cpp/include/* deps/url-cpp/src/* 38 | make -C deps/url-cpp debug/liburl.o 39 | 40 | debug/librep.o: debug/directive.o debug/agent.o debug/robots.o deps/url-cpp/debug/liburl.o 41 | ld -r -o $@ $^ 42 | 43 | debug/%.o: src/%.cpp include/%.h debug 44 | $(CXX) $(CXXOPTS) $(DEBUG_OPTS) -o $@ -c $< 45 | 46 | test/%.o: test/%.cpp 47 | $(CXX) $(CXXOPTS) $(DEBUG_OPTS) -o $@ -c $< 48 | 49 | # Tests 50 | test-all: test/test-all.o test/test-agent.o test/test-directive.o test/test-robots.o 
debug/librep.o $(GTEST_DIR)/libgtest.a 51 | $(CXX) $(CXXOPTS) -L$(GTEST_DIR) $(DEBUG_OPTS) -o $@ $^ -lpthread 52 | 53 | # Bench 54 | bench: bench.cpp release/librep.o 55 | $(CXX) $(CXXOPTS) $(RELEASE_OPTS) -o $@ $< release/librep.o 56 | 57 | .PHONY: test 58 | test: test-all 59 | ./test-all 60 | ./scripts/check-coverage.sh $(PWD) 61 | 62 | clean: 63 | rm -rf debug release test-all bench test/*.o test/*.gcda test/*.gcno deps/url-cpp/debug deps/url-cpp/release 64 | -------------------------------------------------------------------------------- /include/directive.h: -------------------------------------------------------------------------------- 1 | #ifndef DIRECTIVE_CPP_H 2 | #define DIRECTIVE_CPP_H 3 | 4 | 5 | namespace Rep 6 | { 7 | 8 | class Directive 9 | { 10 | public: 11 | /** 12 | * The type of our priority value. 13 | */ 14 | typedef size_t priority_t; 15 | 16 | /** 17 | * Default constructor disallowed. 18 | */ 19 | Directive() = delete; 20 | 21 | /** 22 | * The input to this constructor must be stripped of comments 23 | * and trailing whitespace. 24 | */ 25 | Directive(const std::string& line, bool allowed); 26 | 27 | /** 28 | * Default copy constructor. 29 | */ 30 | Directive(const Directive& rhs) = default; 31 | 32 | /** 33 | * Default move constructor. 34 | */ 35 | Directive(Directive&& rhs) = default; 36 | 37 | /** 38 | * The priority of the rule. 39 | */ 40 | priority_t priority() const 41 | { 42 | return priority_; 43 | } 44 | 45 | /** 46 | * Whether or not the provided path matches. The path is 47 | * expected to be properly escaped. 48 | */ 49 | bool match(const std::string& path) const; 50 | 51 | /** 52 | * Whether this rule is for an allow or a disallow. 53 | */ 54 | bool allowed() const 55 | { 56 | return allowed_; 57 | } 58 | 59 | std::string str() const; 60 | 61 | /** 62 | * Default copy assignment operator. 63 | */ 64 | Directive& operator=(const Directive& rhs) = default; 65 | 66 | private: 67 | std::string expression_; 68 | priority_t priority_; 69 | bool allowed_; 70 | 71 | /** 72 | * Return true if p_begin -> p_end matches the expression e_begin -> e_end. 73 | */ 74 | bool match(const std::string::const_iterator& e_begin, 75 | const std::string::const_iterator& e_end, 76 | const std::string::const_iterator& p_begin, 77 | const std::string::const_iterator& p_end) const; 78 | }; 79 | 80 | } 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /include/agent.h: -------------------------------------------------------------------------------- 1 | #ifndef AGENT_CPP_H 2 | #define AGENT_CPP_H 3 | 4 | #include 5 | 6 | #include "directive.h" 7 | 8 | // forward declaration 9 | namespace Url 10 | { 11 | struct Url; 12 | } 13 | 14 | namespace Rep 15 | { 16 | class Agent 17 | { 18 | public: 19 | /* The type for the delay. */ 20 | typedef float delay_t; 21 | 22 | /** 23 | * Default constructor 24 | */ 25 | Agent() : Agent("") {} 26 | 27 | /** 28 | * Construct an agent. 29 | */ 30 | explicit Agent(const std::string& host) : 31 | directives_(), delay_(-1.0), sorted_(true), host_(host) {} 32 | 33 | /** 34 | * Default copy constructor. 35 | */ 36 | Agent(const Agent& rhs) = default; 37 | 38 | /** 39 | * Default move constructor. 40 | */ 41 | Agent(Agent&& rhs) = default; 42 | 43 | /** 44 | * Add an allowed directive. 45 | */ 46 | Agent& allow(const std::string& query); 47 | 48 | /** 49 | * Add a disallowed directive. 50 | */ 51 | Agent& disallow(const std::string& query); 52 | 53 | /** 54 | * Set the delay for this agent. 
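 * A negative value (the constructor default of -1) means no crawl-delay was specified.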
55 | */ 56 | Agent& delay(delay_t value) { 57 | delay_ = value; 58 | return *this; 59 | } 60 | 61 | /** 62 | * Return the delay for this agent. 63 | */ 64 | delay_t delay() const { return delay_; } 65 | 66 | /** 67 | * A vector of the directives, in priority-sorted order. 68 | */ 69 | const std::vector& directives() const; 70 | 71 | /** 72 | * Return true if the URL (either a full URL or a path) is allowed. 73 | */ 74 | bool allowed(const std::string& path) const; 75 | 76 | std::string str() const; 77 | 78 | /** 79 | * Default copy assignment operator. 80 | */ 81 | Agent& operator=(const Agent& rhs) = default; 82 | 83 | private: 84 | bool is_external(const Url::Url& url) const; 85 | 86 | mutable std::vector directives_; 87 | delay_t delay_; 88 | mutable bool sorted_; 89 | std::string host_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "directive.h" 6 | #include "robots.h" 7 | 8 | /** 9 | * Run func() `count` times in each of `runs` experiments, where `name` provides a 10 | * meaningful description of the task being benchmarked. Prints out the time for each 11 | * run, the average time, and rate. 12 | */ 13 | template 14 | void bench(const std::string& name, size_t count, size_t runs, Functor func) 15 | { 16 | std::cout << "Benchmarking " << name << " with " << count << " per run:" << std::endl; 17 | double total(0); 18 | for (size_t run = 0; run < runs; ++run) 19 | { 20 | auto start = std::chrono::high_resolution_clock::now(); 21 | for (size_t it = 0; it < count; ++it) 22 | { 23 | func(); 24 | } 25 | auto end = std::chrono::high_resolution_clock::now(); 26 | double duration = std::chrono::duration(end - start).count(); 27 | total += duration; 28 | std::cout << " Run " << run << ": " << duration << " ms" << std::endl; 29 | } 30 | std::cout << " Average: " << (total / runs) << " ms" << std::endl; 31 | std::cout << " Rate: " << ((count * runs) / total) << " k-iter / s" << std::endl; 32 | } 33 | 34 | int main(int argc, char* argv[]) { 35 | 36 | size_t count = 1000000; 37 | size_t runs = 5; 38 | 39 | bench("directive basic parse", count, runs, []() { 40 | Rep::Directive("/basic/path", true); 41 | }); 42 | 43 | bench("directive wildcard parse", count, runs, []() { 44 | Rep::Directive("/path/*/with/**/wildcards/*", true); 45 | }); 46 | 47 | Rep::Directive directive("/basic/path", true); 48 | bench("directive basic check", count, runs, [directive]() { 49 | directive.match("/basic/path/other"); 50 | }); 51 | 52 | directive = Rep::Directive("/path/*/with/**/wildcards/*", true); 53 | bench("directive wildcard check", count, runs, [directive]() { 54 | directive.match("/path/is/with/a/few/wildcards/"); 55 | }); 56 | 57 | std::string content = 58 | "# /robots.txt for http://www.fict.org/\n" 59 | "# comments to webmaster@fict.org\n" 60 | "\n" 61 | "User-agent: unhipbot\n" 62 | "Disallow: /\n" 63 | "\n" 64 | "User-agent: webcrawler\n" 65 | "User-agent: excite\n" 66 | "Disallow:\n" 67 | "\n" 68 | "User-agent: *\n" 69 | "Disallow: /org/plans.html\n" 70 | "Allow: /org/\n" 71 | "Allow: /serv\n" 72 | "Allow: /~mak\n" 73 | "Disallow: /\n"; 74 | bench("parse RFC", count / 10, runs, [content]() { 75 | Rep::Robots robot(content); 76 | }); 77 | } 78 | -------------------------------------------------------------------------------- /src/agent.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "url.h" 6 | 7 | #include "agent.h" 8 | #include "directive.h" 9 | 10 | namespace 11 | { 12 | std::string escape_url(Url::Url& url) 13 | { 14 | return url.defrag().escape().fullpath(); 15 | } 16 | 17 | std::string trim_front(const std::string& str, const char chr) 18 | { 19 | auto itr = std::find_if(str.begin(), str.end(), 20 | [chr](const char c) {return c != chr;}); 21 | return std::string(itr, str.end()); 22 | } 23 | } 24 | 25 | namespace Rep 26 | { 27 | Agent& Agent::allow(const std::string& query) 28 | { 29 | Url::Url url(query); 30 | // ignore directives for external URLs 31 | if (is_external(url)) 32 | { 33 | return *this; 34 | } 35 | // leading wildcard? 36 | if (query.front() == '*') 37 | { 38 | Url::Url trimmed(trim_front(query, '*')); 39 | directives_.push_back(Directive(escape_url(trimmed), true)); 40 | } 41 | directives_.push_back(Directive(escape_url(url), true)); 42 | sorted_ = false; 43 | return *this; 44 | } 45 | 46 | Agent& Agent::disallow(const std::string& query) 47 | { 48 | if (query.empty()) 49 | { 50 | // Special case: "Disallow:" means "Allow: /" 51 | directives_.push_back(Directive(query, true)); 52 | } 53 | else 54 | { 55 | Url::Url url(query); 56 | // ignore directives for external URLs 57 | if (is_external(url)) 58 | { 59 | return *this; 60 | } 61 | // leading wildcard? 62 | if (query.front() == '*') 63 | { 64 | Url::Url trimmed(trim_front(query, '*')); 65 | directives_.push_back(Directive(escape_url(trimmed), false)); 66 | } 67 | directives_.push_back(Directive(escape_url(url), false)); 68 | } 69 | sorted_ = false; 70 | return *this; 71 | } 72 | 73 | const std::vector& Agent::directives() const 74 | { 75 | if (!sorted_) 76 | { 77 | std::sort(directives_.begin(), directives_.end(), 78 | [](const Directive& a, const Directive& b) { 79 | return b.priority() < a.priority(); 80 | }); 81 | sorted_ = true; 82 | } 83 | return directives_; 84 | } 85 | 86 | bool Agent::allowed(const std::string& query) const 87 | { 88 | Url::Url url(query); 89 | if (is_external(url)) 90 | { 91 | return false; 92 | } 93 | std::string path(escape_url(url)); 94 | 95 | if (path.compare("/robots.txt") == 0) 96 | { 97 | return true; 98 | } 99 | 100 | for (const auto& directive : directives()) 101 | { 102 | if (directive.match(path)) 103 | { 104 | return directive.allowed(); 105 | } 106 | } 107 | return true; 108 | } 109 | 110 | std::string Agent::str() const 111 | { 112 | std::stringstream out; 113 | if (delay_ > 0) 114 | { 115 | out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' '; 116 | } 117 | out << '['; 118 | const auto& d = directives(); 119 | auto begin = d.begin(); 120 | auto end = d.end(); 121 | if (begin != end) 122 | { 123 | out << "Directive(" << begin->str() << ')'; 124 | ++begin; 125 | } 126 | for (; begin != end; ++begin) 127 | { 128 | out << ", Directive(" << begin->str() << ')'; 129 | } 130 | out << ']'; 131 | return out.str(); 132 | } 133 | 134 | bool Agent::is_external(const Url::Url& url) const 135 | { 136 | return !host_.empty() && !url.host().empty() && url.host() != host_; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/directive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "url.h" 7 | 8 | #include "directive.h" 9 | 10 | namespace Rep 11 | { 12 | 
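    // Construction note (see the constructor below): runs of consecutive '*' are
    // collapsed to a single '*' and trailing '*'s are dropped, since neither changes
    // what the expression matches; e.g. "/fish***" is stored as "/fish". The rule's
    // priority is simply the length of the normalized expression.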
Directive::Directive(const std::string& line, bool allowed) 13 | : expression_() 14 | , priority_(line.size()) 15 | , allowed_(allowed) 16 | { 17 | if (line.find('*') == std::string::npos) 18 | { 19 | expression_.assign(line); 20 | return; 21 | } 22 | 23 | // Remove consecutive '*'s 24 | expression_.reserve(line.size()); 25 | bool star = false; 26 | for (auto character : line) 27 | { 28 | if (character == '*') 29 | { 30 | if (!star) 31 | { 32 | expression_.append(1, character); 33 | } 34 | star = true; 35 | } 36 | else 37 | { 38 | expression_.append(1, character); 39 | star = false; 40 | } 41 | } 42 | 43 | // Remove trailing '*'s 44 | std::string::reverse_iterator last = 45 | std::find_if(expression_.rbegin(), expression_.rend(), 46 | [](const char c) { 47 | return c != '*'; 48 | }); 49 | expression_.erase(last.base(), expression_.end()); 50 | 51 | // Priority is the length of the expression 52 | priority_ = expression_.size(); 53 | } 54 | 55 | bool Directive::match(const std::string::const_iterator& e_begin, 56 | const std::string::const_iterator& e_end, 57 | const std::string::const_iterator& p_begin, 58 | const std::string::const_iterator& p_end) const 59 | { 60 | std::string::const_iterator expression_it = e_begin; 61 | std::string::const_iterator path_it = p_begin; 62 | while (expression_it != e_end && path_it != p_end) 63 | { 64 | if (*expression_it == '*') 65 | { 66 | // Advance and recurse 67 | ++expression_it; 68 | for (; path_it != p_end; ++path_it) 69 | { 70 | if (match(expression_it, e_end, path_it, p_end)) 71 | { 72 | return true; 73 | } 74 | } 75 | return false; 76 | } 77 | else if (*expression_it == '$') 78 | { 79 | // This check expects path to be fully consumed. But since one of the 80 | // criteria of being in this while loop is that we've not fully consumed 81 | // path, return false. 
82 | return false; 83 | } 84 | else if (*expression_it != *path_it) 85 | { 86 | // These characters must match 87 | return false; 88 | } 89 | else 90 | { 91 | // Advance both by one 92 | ++path_it; 93 | ++expression_it; 94 | } 95 | } 96 | 97 | // Return true only if we've consumed all of the expression 98 | if (expression_it == e_end) 99 | { 100 | return true; 101 | } 102 | else if (*expression_it == '$') 103 | { 104 | return path_it == p_end; 105 | } 106 | else 107 | { 108 | return false; 109 | } 110 | } 111 | 112 | std::string Directive::str() const 113 | { 114 | std::stringstream out; 115 | if (allowed_) 116 | { 117 | out << "Allow: " << expression_; 118 | } 119 | else { 120 | out << "Disallow: " << expression_; 121 | } 122 | return out.str(); 123 | } 124 | 125 | bool Directive::match(const std::string& path) const 126 | { 127 | return match(expression_.begin(), expression_.end(), path.begin(), path.end()); 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /test/test-agent.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "agent.h" 4 | 5 | TEST(AgentTest, Basic) 6 | { 7 | Rep::Agent agent = Rep::Agent("a.com").allow("/").disallow("/foo"); 8 | EXPECT_EQ(agent.directives().size(), 2ul); 9 | } 10 | 11 | TEST(AgentTest, ChecksAllowed) 12 | { 13 | Rep::Agent agent = Rep::Agent("a.com").allow("/path"); 14 | EXPECT_TRUE(agent.allowed("/path")); 15 | EXPECT_TRUE(agent.allowed("/elsewhere")); 16 | } 17 | 18 | TEST(AgentTest, IgnoresExternalDisallow) 19 | { 20 | Rep::Agent agent = Rep::Agent("a.com") 21 | .allow("/path") 22 | .disallow("http://b.com/external"); 23 | EXPECT_TRUE(agent.allowed("/path")); 24 | EXPECT_TRUE(agent.allowed("/external")); 25 | } 26 | 27 | TEST(AgentTest, IgnoresExternalAllow) 28 | { 29 | Rep::Agent agent = Rep::Agent("a.com") 30 | .disallow("/path") 31 | .allow("http://b.com/path/exception"); 32 | EXPECT_FALSE(agent.allowed("/path")); 33 | EXPECT_FALSE(agent.allowed("/path/exception")); 34 | } 35 | 36 | TEST(AgentTest, NeverExternalAllowed) 37 | { 38 | Rep::Agent agent = Rep::Agent("a.com"); 39 | EXPECT_FALSE(agent.allowed("http://b.com/")); 40 | } 41 | 42 | TEST(AgentTest, HonorsLongestFirstPriority) 43 | { 44 | Rep::Agent agent = Rep::Agent("a.com") 45 | .disallow("/path") 46 | .allow("/path/exception"); 47 | EXPECT_TRUE(agent.allowed("/path/exception")); 48 | EXPECT_FALSE(agent.allowed("/path")); 49 | } 50 | 51 | TEST(AgentTest, RobotsTextAllowed) 52 | { 53 | Rep::Agent agent = Rep::Agent("a.com").disallow("/robots.txt"); 54 | EXPECT_TRUE(agent.allowed("/robots.txt")); 55 | } 56 | 57 | TEST(AgentTest, DisallowNone) 58 | { 59 | Rep::Agent agent = Rep::Agent("a.com").disallow(""); 60 | EXPECT_TRUE(agent.allowed("/anything")); 61 | } 62 | 63 | TEST(AgentTest, MiddleWildcard) 64 | { 65 | Rep::Agent agent = Rep::Agent("a.com").disallow("/test*foo"); 66 | EXPECT_FALSE(agent.allowed("/testfoo")); 67 | EXPECT_FALSE(agent.allowed("/testafoo")); 68 | EXPECT_FALSE(agent.allowed("/testaasdffoo")); 69 | EXPECT_FALSE(agent.allowed("/test/foo")); 70 | EXPECT_TRUE(agent.allowed("/testfo")); 71 | EXPECT_TRUE(agent.allowed("/estfoo")); 72 | } 73 | 74 | TEST(AgentTest, EscapedRule) 75 | { 76 | Rep::Agent agent = Rep::Agent("a.com").disallow("/a%3cd.html"); 77 | EXPECT_FALSE(agent.allowed("/a 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "url.h" 10 | 11 | #include "robots.h" 12 | 13 | namespace Rep 14 | { 15 | 16 | 
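    // Parsing helpers: strip() trims leading and trailing whitespace in place, and
    // getpair() pulls successive "key: value" pairs out of the stream, dropping
    // '#' comments, skipping lines without a ':', and lowercasing the key.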
void Robots::strip(std::string& string) 17 | { 18 | string.erase(string.begin(), std::find_if(string.begin(), string.end(), 19 | std::not1(std::ptr_fun(std::isspace)))); 20 | string.erase(std::find_if(string.rbegin(), string.rend(), 21 | std::not1(std::ptr_fun(std::isspace))).base(), string.end()); 22 | } 23 | 24 | bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) 25 | { 26 | while (getline(stream, key)) 27 | { 28 | size_t index = key.find('#'); 29 | if (index != std::string::npos) 30 | { 31 | key.resize(index); 32 | } 33 | 34 | // Find the colon and divide it into key and value, skipping malformed lines 35 | index = key.find(':'); 36 | if (index == std::string::npos) 37 | { 38 | continue; 39 | } 40 | 41 | value.assign(key.begin() + index + 1, key.end()); 42 | key.resize(index); 43 | 44 | // Strip whitespace off of each 45 | strip(key); 46 | strip(value); 47 | 48 | // Lowercase the key 49 | std::transform(key.begin(), key.end(), key.begin(), ::tolower); 50 | 51 | return true; 52 | } 53 | return false; 54 | } 55 | 56 | Robots::Robots(const std::string& content) : 57 | Robots(content, "") 58 | { 59 | } 60 | 61 | Robots::Robots(const std::string& content, const std::string& base_url) : 62 | host_(Url::Url(base_url).host()), 63 | agents_(), 64 | sitemaps_(), 65 | default_(agents_.emplace("*", Agent(host_)).first->second) 66 | { 67 | std::string agent_name("*"); 68 | std::istringstream input(content); 69 | if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) 70 | { 71 | input.ignore(3); 72 | } 73 | std::string key, value; 74 | std::vector group; 75 | bool last_agent = false; 76 | agent_map_t::iterator current = agents_.find("*"); 77 | while (Robots::getpair(input, key, value)) 78 | { 79 | if (key.compare("user-agent") == 0) 80 | { 81 | // Store the user agent string as lowercased 82 | std::transform(value.begin(), value.end(), value.begin(), ::tolower); 83 | 84 | if (last_agent) 85 | { 86 | group.push_back(value); 87 | } 88 | else 89 | { 90 | if (!agent_name.empty()) 91 | { 92 | for (auto other : group) 93 | { 94 | agents_.emplace(other, current->second); 95 | } 96 | group.clear(); 97 | } 98 | agent_name = value; 99 | current = agents_.emplace(agent_name, Agent(host_)).first; 100 | } 101 | last_agent = true; 102 | continue; 103 | } 104 | else 105 | { 106 | last_agent = false; 107 | } 108 | 109 | if (key.compare("sitemap") == 0) 110 | { 111 | sitemaps_.push_back(value); 112 | } 113 | else if (key.compare("disallow") == 0) 114 | { 115 | current->second.disallow(value); 116 | } 117 | else if (key.compare("allow") == 0) 118 | { 119 | current->second.allow(value); 120 | } 121 | else if (key.compare("crawl-delay") == 0) 122 | { 123 | try 124 | { 125 | current->second.delay(std::stof(value)); 126 | } 127 | catch (const std::exception&) 128 | { 129 | std::cerr << "Could not parse " << value << " as float." 
<< std::endl; 130 | } 131 | } 132 | } 133 | 134 | if (!agent_name.empty()) 135 | { 136 | for (auto other : group) 137 | { 138 | agents_.emplace(other, current->second); 139 | } 140 | } 141 | } 142 | 143 | const Agent& Robots::agent(const std::string& name) const 144 | { 145 | // Lowercase the agent 146 | std::string lowered(name); 147 | std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower); 148 | 149 | auto it = agents_.find(lowered); 150 | if (it == agents_.end()) 151 | { 152 | return default_; 153 | } 154 | else 155 | { 156 | return it->second; 157 | } 158 | } 159 | 160 | bool Robots::allowed(const std::string& path, const std::string& name) const 161 | { 162 | return agent(name).allowed(path); 163 | } 164 | 165 | std::string Robots::str() const 166 | { 167 | std::stringstream out; 168 | // TODO: include sitepath info 169 | out << '{'; 170 | auto begin = agents_.begin(); 171 | auto end = agents_.end(); 172 | if (begin != end) 173 | { 174 | out << '"' << begin->first << '"' << ": " << begin->second.str(); 175 | ++begin; 176 | } 177 | for (; begin != end; ++begin) 178 | { 179 | out << ", \"" << begin->first << '"' << ": " << begin->second.str(); 180 | } 181 | out << '}'; 182 | return out.str(); 183 | } 184 | 185 | std::string Robots::robotsUrl(const std::string& url) 186 | { 187 | return Url::Url(url) 188 | .setUserinfo("") 189 | .setPath("robots.txt") 190 | .setParams("") 191 | .setQuery("") 192 | .setFragment("") 193 | .remove_default_port() 194 | .str(); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Robots Exclusion Protocol Parser for C++ 2 | ======================================== 3 | 4 | [![Build Status](https://travis-ci.org/seomoz/rep-cpp.svg?branch=master)](https://travis-ci.org/seomoz/rep-cpp) 5 | 6 | Supports the [1996 RFC](http://www.robotstxt.org/norobots-rfc.txt), as well as some 7 | modern conventions, including: 8 | 9 | - wildcard matching (`*` and `$`) 10 | - sitemap listing 11 | - crawl-delay 12 | 13 | __This library deals in UTF-8-encoded strings.__ 14 | 15 | Matching 16 | -------- 17 | A path may match multiple directives. For example, `/some/path/page.html` matches all 18 | of these rules: 19 | 20 | ``` 21 | Allow: /some/ 22 | Disallow: /some/path/ 23 | Allow: /*/page.html 24 | ``` 25 | 26 | Each directive is given a priority, and the highest-priority matching directive is used. 27 | We choose the length of the expression to be that priority. In the above example, the 28 | priorities are: 29 | 30 | ``` 31 | Allow: /some/ (priority = 6) 32 | Disallow: /some/path/ (priority = 11) 33 | Allow: /*/page.html (priority = 12) 34 | ``` 35 | 36 | Classes 37 | ------- 38 | A `Robots` object is the result of parsing a single `robots.txt` file. It has a mapping of 39 | agent names to `Agent` objects, as well as a vector of the `sitemaps` listed in the file. 40 | An `Agent` object holds the crawl-delay and `Directive`s associated with a particular 41 | user-agent. 42 | 43 | Parsing and Querying 44 | -------------------- 45 | Here's an example of parsing a robots.txt file: 46 | 47 | ```c++ 48 | #include "robots.h" 49 | 50 | std::string content = "..."; 51 | Rep::Robots robots = Rep::Robots(content); 52 | 53 | // Is this path allowed to the provided agent? 54 | robots.allowed("/some/path", "my-agent"); 55 | 56 | // Is this URL allowed to the provided agent? 
57 | robots.allowed("http://example.com/some/path", "my-agent");
58 | ```
59 | 
60 | If a client is interested only in the exclusion rules of a single agent, then:
61 | 
62 | ```c++
63 | Rep::Agent agent = Rep::Robots(content).agent("my-agent");
64 | 
65 | // Is this path allowed to this agent?
66 | agent.allowed("/some/path");
67 | 
68 | // Is this URL allowed to this agent?
69 | agent.allowed("http://example.com/some/path");
70 | ```
71 | 
72 | Building
73 | ========
74 | This library depends on `url-cpp`, which is included as a submodule. We provide two
75 | main targets, `{debug,release}/librep.o`:
76 | 
77 | ```
78 | git submodule update --init --recursive
79 | make release/librep.o
80 | ```
81 | 
82 | Development
83 | ===========
84 | 
85 | Environment
86 | -----------
87 | To launch the `vagrant` image, we only need to
88 | `vagrant up` (though you may have to provide a `--provider` flag):
89 | 
90 | ```bash
91 | vagrant up
92 | ```
93 | 
94 | With a running `vagrant` instance, you can log in and run tests:
95 | 
96 | ```bash
97 | vagrant ssh
98 | cd /vagrant
99 | 
100 | make test
101 | ```
102 | 
103 | Running Tests
104 | -------------
105 | Tests are run with the top-level `Makefile`:
106 | 
107 | ```bash
108 | make test
109 | ```
110 | 
111 | PRs
112 | ===
113 | These are not all hard-and-fast rules, but in general PRs have the following expectations:
114 | 
115 | - __pass Travis__ -- or more generally, whatever CI is used for the particular project
116 | - __be a complete unit__ -- whether a bug fix or feature, it should appear as a complete
117 |   unit before consideration.
118 | - __maintain code coverage__ -- some projects may include code coverage requirements as
119 |   part of the build as well
120 | - __maintain the established style__ -- this means the existing style of established
121 |   projects, the established conventions of the team for a given language on new
122 |   projects, and the guidelines of the community of the relevant languages and
123 |   frameworks.
124 | - __include failing tests__ -- in the case of bugs, failing tests demonstrating the bug
125 |   should be included as one commit, followed by a commit making the test succeed. This
126 |   allows us to jump to a world with a bug included, and prove that our test in fact
127 |   exercises the bug.
128 | - __be reviewed by one or more developers__ -- not all feedback has to be accepted, but
129 |   it should all be considered.
130 | - __avoid 'addressed PR feedback' commits__ -- in general, PR feedback should be rebased
131 |   back into the appropriate commits that introduced the change. In cases where this
132 |   is burdensome, PR feedback commits may be used but should still describe the changes
133 |   contained therein.
134 | 
135 | PR reviews consider the design, organization, and functionality of the submitted code.
136 | 
137 | Commits
138 | =======
139 | Certain types of changes should be made in their own commits to improve readability. When
140 | too many different types of changes happen simultaneously in a single commit, the purpose of
141 | each change is muddled. By giving each commit a single logical purpose, it is implicitly
142 | clear why changes in that commit took place.
143 | 
144 | - __updating / upgrading dependencies__ -- this is especially true for invocations like
145 |   `bundle update` or `berks update`.
146 | - __introducing a new dependency__ -- often preceded by a commit updating existing
147 |   dependencies, this should only include the changes for the new dependency.
148 | - __refactoring__ -- these commits should preserve all the existing functionality and 149 | merely update how it's done. 150 | - __utility components to be used by a new feature__ -- if introducing an auxiliary class 151 | in support of a subsequent commit, add this new class (and its tests) in its own 152 | commit. 153 | - __config changes__ -- when adjusting configuration in isolation 154 | - __formatting / whitespace commits__ -- when adjusting code only for stylistic purposes. 155 | 156 | New Features 157 | ------------ 158 | Small new features (where small refers to the size and complexity of the change, not the 159 | impact) are often introduced in a single commit. Larger features or components might be 160 | built up piecewise, with each commit containing a single part of it (and its corresponding 161 | tests). 162 | 163 | Bug Fixes 164 | --------- 165 | In general, bug fixes should come in two-commit pairs: a commit adding a failing test 166 | demonstrating the bug, and a commit making that failing test pass. 167 | 168 | Tagging and Versioning 169 | ====================== 170 | Whenever the version included in `setup.py` is changed (and it should be changed when 171 | appropriate using [http://semver.org/](http://semver.org/)), a corresponding tag should 172 | be created with the same version number (formatted `v`). 173 | 174 | ```bash 175 | git tag -a v0.1.0 -m 'Version 0.1.0 176 | 177 | This release contains an initial working version Rep::Robots.' 178 | 179 | git push origin 180 | ``` 181 | -------------------------------------------------------------------------------- /test/test-directive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "directive.h" 4 | 5 | TEST(DirectiveTest, BasicExact) 6 | { 7 | std::string directive("/tmp"); 8 | std::string example("/tmp"); 9 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 10 | } 11 | 12 | TEST(DirectiveTest, BasicFileExtension) 13 | { 14 | std::string directive("/tmp"); 15 | std::string example("/tmp.html"); 16 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 17 | } 18 | 19 | TEST(DirectiveTest, BasicDirectory) 20 | { 21 | std::string directive("/tmp"); 22 | std::string example("/tmp/a.html"); 23 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 24 | } 25 | 26 | TEST(DirectiveTest, BasicDirectoryAndFile) 27 | { 28 | std::string directive("/tmp/"); 29 | std::string example("/tmp"); 30 | EXPECT_FALSE(Rep::Directive(directive, true).match(example)); 31 | } 32 | 33 | TEST(DirectiveTest, BasicDirectoryAndDirectory) 34 | { 35 | std::string directive("/tmp/"); 36 | std::string example("/tmp/"); 37 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 38 | } 39 | 40 | TEST(DirectiveTest, BasicDirectoryAndDirectoryAndFile) 41 | { 42 | std::string directive("/tmp/"); 43 | std::string example("/tmp/a.html"); 44 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 45 | } 46 | 47 | TEST(DirectiveTest, WildcardTest) 48 | { 49 | std::vector examples = { 50 | "/hello/how/are/you", 51 | "/hello/how/are/you/today", 52 | "/hello/how/are/yo/are/you", 53 | }; 54 | std::vector antiexamples = { 55 | "/hello/", 56 | "/hi/how/are/you" 57 | }; 58 | std::string directive("/hello/*/are/you"); 59 | Rep::Directive parsed(directive, true); 60 | for (auto example : examples) 61 | { 62 | EXPECT_TRUE(parsed.match(example)) << 63 | example << " didn't match " << directive; 64 | } 65 | 66 | for (auto example : antiexamples) 67 | { 68 | 
EXPECT_FALSE(parsed.match(example)) << 69 | example << " matched " << directive; 70 | } 71 | } 72 | 73 | TEST(DirectiveTest, LeadingWildcard) 74 | { 75 | std::vector examples = { 76 | "/test", 77 | "/a/test", 78 | "/ab/test", 79 | "/abc/test", 80 | }; 81 | std::string directive("*/test"); 82 | Rep::Directive parsed(directive, true); 83 | for (auto example : examples) 84 | { 85 | EXPECT_TRUE(parsed.match(example)) << 86 | example << " didn't match " << directive; 87 | } 88 | 89 | std::vector antiexamples = { 90 | "/tes", 91 | "/est", 92 | }; 93 | for (auto example : antiexamples) 94 | { 95 | EXPECT_FALSE(parsed.match(example)) << 96 | example << " matched " << directive; 97 | } 98 | } 99 | 100 | TEST(DirectiveTest, MultipleWildcardTest) 101 | { 102 | std::vector examples = { 103 | "/this-test-is-a-simple-test", 104 | "/this-test-is-another-test-is-a-tricky-test" 105 | }; 106 | std::vector antiexamples = { 107 | "/this-test-is-a-mislead" 108 | }; 109 | std::string directive("/this-*-is-a-*-test"); 110 | Rep::Directive parsed(directive, true); 111 | for (auto example : examples) 112 | { 113 | EXPECT_TRUE(parsed.match(example)) << 114 | example << " didn't match " << directive; 115 | } 116 | 117 | for (auto example : antiexamples) 118 | { 119 | EXPECT_FALSE(parsed.match(example)) << 120 | example << " matched " << directive; 121 | } 122 | } 123 | 124 | TEST(DirectiveTest, Str) 125 | { 126 | EXPECT_EQ("Allow: /foo", Rep::Directive("/foo", true).str()); 127 | EXPECT_EQ("Disallow: /bar", Rep::Directive("/bar", false).str()); 128 | } 129 | 130 | TEST(GoogleTest, EmptyAndWildcard) 131 | { 132 | std::vector examples = { 133 | "/", 134 | "/fish", 135 | "/fish.html", 136 | "/fish/salmon.html", 137 | "/fishheads", 138 | "/fishheads/yummy.html", 139 | "/fish.php?id=anything" 140 | }; 141 | std::vector directives = { 142 | "/", 143 | "/*" 144 | }; 145 | for (auto directive : directives) 146 | { 147 | Rep::Directive parsed(directive, true); 148 | for (auto example : examples) 149 | { 150 | EXPECT_TRUE(parsed.match(example)) << 151 | example << " didn't match " << directive; 152 | } 153 | } 154 | } 155 | 156 | TEST(GoogleTest, Prefix) 157 | { 158 | std::vector examples = { 159 | "/fish", 160 | "/fish.html", 161 | "/fish/salmon.html", 162 | "/fishheads", 163 | "/fishheads/yummy.html", 164 | "/fish.php?id=anything" 165 | }; 166 | std::vector antiexamples = { 167 | "/Fish.asp", 168 | "/catfish", 169 | "/?id=fish" 170 | }; 171 | std::string directive("/fish"); 172 | Rep::Directive parsed(directive, true); 173 | for (auto example : examples) 174 | { 175 | EXPECT_TRUE(parsed.match(example)) << 176 | example << " didn't match " << directive; 177 | } 178 | 179 | for (auto example : antiexamples) 180 | { 181 | EXPECT_FALSE(parsed.match(example)) << 182 | example << " matched " << directive; 183 | } 184 | } 185 | 186 | TEST(GoogleTest, TrailingWildcard) 187 | { 188 | std::vector examples = { 189 | "/fish", 190 | "/fish.html", 191 | "/fish/salmon.html", 192 | "/fishheads", 193 | "/fishheads/yummy.html", 194 | "/fish.php?id=anything" 195 | }; 196 | std::vector antiexamples = { 197 | "/Fish.asp", 198 | "/catfish", 199 | "/?id=fish" 200 | }; 201 | std::string directive("/fish*"); 202 | Rep::Directive parsed(directive, true); 203 | for (auto example : examples) 204 | { 205 | EXPECT_TRUE(parsed.match(example)) << 206 | example << " didn't match " << directive; 207 | } 208 | 209 | for (auto example : antiexamples) 210 | { 211 | EXPECT_FALSE(parsed.match(example)) << 212 | example << " matched " << directive; 213 | } 214 | 
} 215 | 216 | TEST(GoogleTest, Directory) 217 | { 218 | std::vector examples = { 219 | "/fish/", 220 | "/fish/?id=anything", 221 | "/fish/salmon.htm" 222 | }; 223 | std::vector antiexamples = { 224 | "/fish", 225 | "/fish.html", 226 | "/Fish/Salmon.asp" 227 | }; 228 | std::string directive("/fish/"); 229 | Rep::Directive parsed(directive, true); 230 | for (auto example : examples) 231 | { 232 | EXPECT_TRUE(parsed.match(example)) << 233 | example << " didn't match " << directive; 234 | } 235 | 236 | for (auto example : antiexamples) 237 | { 238 | EXPECT_FALSE(parsed.match(example)) << 239 | example << " matched " << directive; 240 | } 241 | } 242 | 243 | TEST(GoogleTest, WildcardExtension) 244 | { 245 | std::vector examples = { 246 | "/filename.php", 247 | "/folder/filename.php", 248 | "/folder/filename.php?parameters", 249 | "/folder/any.php.file.html", 250 | "/filename.php/" 251 | }; 252 | std::vector antiexamples = { 253 | "/", 254 | "/windows.PHP" 255 | }; 256 | std::string directive("/*.php"); 257 | Rep::Directive parsed(directive, true); 258 | for (auto example : examples) 259 | { 260 | EXPECT_TRUE(parsed.match(example)) << 261 | example << " didn't match " << directive; 262 | } 263 | 264 | for (auto example : antiexamples) 265 | { 266 | EXPECT_FALSE(parsed.match(example)) << 267 | example << " matched " << directive; 268 | } 269 | } 270 | 271 | TEST(GoogleTest, WildcardExtensionEnd) 272 | { 273 | std::vector examples = { 274 | "/filename.php", 275 | "/folder/filename.php" 276 | }; 277 | std::vector antiexamples = { 278 | "/filename.php?parameters", 279 | "/filename.php/", 280 | "/filename.php5", 281 | "/windows.PHP" 282 | }; 283 | std::string directive("/*.php$"); 284 | Rep::Directive parsed(directive, true); 285 | for (auto example : examples) 286 | { 287 | EXPECT_TRUE(parsed.match(example)) << 288 | example << " didn't match " << directive; 289 | } 290 | 291 | for (auto example : antiexamples) 292 | { 293 | EXPECT_FALSE(parsed.match(example)) << 294 | example << " matched " << directive; 295 | } 296 | } 297 | 298 | TEST(GoogleTest, FishStarExtension) 299 | { 300 | std::vector examples = { 301 | "/fish.php", 302 | "/fishheads/catfish.php?parameters" 303 | }; 304 | std::vector antiexamples = { 305 | "/Fish.PHP" 306 | }; 307 | std::string directive("/fish*.php"); 308 | Rep::Directive parsed(directive, true); 309 | for (auto example : examples) 310 | { 311 | EXPECT_TRUE(parsed.match(example)) << 312 | example << " didn't match " << directive; 313 | } 314 | 315 | for (auto example : antiexamples) 316 | { 317 | EXPECT_FALSE(parsed.match(example)) << 318 | example << " matched " << directive; 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /test/test-robots.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "url.h" 4 | 5 | #include "robots.h" 6 | 7 | TEST(RobotsTest, NoLeadingUserAgent) 8 | { 9 | // Assumed to be the default user agent 10 | std::string content = 11 | "Disallow: /path\n" 12 | "Allow: /path/exception\n" 13 | "Crawl-delay: 5.2\n"; 14 | Rep::Robots robot(content); 15 | EXPECT_TRUE(robot.allowed("/path/exception", "agent")); 16 | EXPECT_FALSE(robot.allowed("/path", "agent")); 17 | EXPECT_NEAR(robot.agent("agent").delay(), 5.2, 0.000001); 18 | } 19 | 20 | TEST(RobotsTest, WellFormedCrawlDelay) 21 | { 22 | std::string content = 23 | "User-agent: *\n" 24 | "Crawl-delay: 5.2\n"; 25 | Rep::Robots robot(content); 26 | EXPECT_NEAR(robot.agent("any").delay(), 5.2, 
0.000001); 27 | } 28 | 29 | TEST(RobotsTest, MalformedCrawlDelay) 30 | { 31 | std::string content = 32 | "User-agent: *\n" 33 | "Crawl-delay: word\n"; 34 | Rep::Robots robot(content); 35 | EXPECT_EQ(robot.agent("any").delay(), -1.0); 36 | } 37 | 38 | TEST(RobotsTest, HonorsDefaultAgent) 39 | { 40 | std::string content = 41 | "User-agent: *\n" 42 | "Disallow: /tmp\n" 43 | "\n" 44 | "User-agent: other-agent\n" 45 | "Allow: /tmp\n"; 46 | Rep::Robots robot(content); 47 | EXPECT_FALSE(robot.allowed("/tmp", "agent")); 48 | EXPECT_TRUE(robot.allowed("/path", "agent")); 49 | } 50 | 51 | TEST(RobotsTest, HonorsSpecificAgent) 52 | { 53 | std::string content = 54 | "User-agent: *\n" 55 | "Disallow: /tmp\n" 56 | "\n" 57 | "User-agent: agent\n" 58 | "Allow: /tmp\n"; 59 | Rep::Robots robot(content); 60 | EXPECT_TRUE(robot.allowed("/tmp", "agent")); 61 | EXPECT_TRUE(robot.allowed("/path", "agent")); 62 | } 63 | 64 | TEST(RobotsTest, Grouping) 65 | { 66 | std::string content = 67 | "User-agent: one\n" 68 | "User-agent: two\n" 69 | "Disallow: /tmp\n"; 70 | Rep::Robots robot(content); 71 | EXPECT_FALSE(robot.allowed("/tmp", "one")); 72 | EXPECT_FALSE(robot.allowed("/tmp", "two")); 73 | } 74 | 75 | TEST(RobotsTest, GroupingUnknownKeys) 76 | { 77 | // When we encounter unknown keys, we should disregard any grouping that may have 78 | // happened between user agent rules. 79 | // 80 | // This is an example from the wild. Despite `Noindex` not being a valid directive, 81 | // we'll not consider the "*" and "ia_archiver" rules together. 82 | std::string content = 83 | "User-agent: *\n" 84 | "Disallow: /content/2/\n" 85 | "User-agent: *\n" 86 | "Noindex: /gb.html\n" 87 | "Noindex: /content/2/\n" 88 | "User-agent: ia_archiver\n" 89 | "Disallow: /\n"; 90 | Rep::Robots robot(content); 91 | EXPECT_TRUE(robot.allowed("/foo", "agent")); 92 | EXPECT_FALSE(robot.allowed("/bar", "ia_archiver")); 93 | } 94 | 95 | TEST(RobotsTest, SeparatesAgents) 96 | { 97 | std::string content = 98 | "User-agent: one\n" 99 | "Crawl-delay: 1\n" 100 | "\n" 101 | "User-agent: two\n" 102 | "Crawl-delay: 2\n"; 103 | Rep::Robots robot(content); 104 | EXPECT_NE(robot.agent("one").delay(), robot.agent("two").delay()); 105 | } 106 | 107 | TEST(RobotsTest, ExposesSitemaps) 108 | { 109 | std::string content = 110 | "Sitemap: http://a.com/sitemap.xml\n" 111 | "Sitemap: http://b.com/sitemap.xml\n"; 112 | Rep::Robots robot(content); 113 | std::vector expected = { 114 | "http://a.com/sitemap.xml", "http://b.com/sitemap.xml" 115 | }; 116 | EXPECT_EQ(robot.sitemaps(), expected); 117 | } 118 | 119 | TEST(RobotsTest, CaseInsensitivity) 120 | { 121 | std::string content = 122 | "User-agent: Agent\n" 123 | "Disallow: /path\n"; 124 | Rep::Robots robot(content); 125 | EXPECT_FALSE(robot.allowed("/path", "agent")); 126 | EXPECT_FALSE(robot.allowed("/path", "aGeNt")); 127 | } 128 | 129 | TEST(RobotsTest, Empty) 130 | { 131 | std::string content; 132 | Rep::Robots robot(content); 133 | EXPECT_TRUE(robot.sitemaps().empty()); 134 | EXPECT_TRUE(robot.allowed("/", "agent")); 135 | } 136 | 137 | TEST(RobotsTest, Comments) 138 | { 139 | std::string content = 140 | "User-Agent: * # comment saying it's the default agent\n" 141 | "Disallow: /\n"; 142 | Rep::Robots robot(content); 143 | EXPECT_FALSE(robot.allowed("/path", "agent")); 144 | } 145 | 146 | TEST(RobotsTest, AcceptsFullUrl) 147 | { 148 | std::string content = 149 | "User-Agent: agent\n" 150 | "Disallow: /path;params?query\n"; 151 | Rep::Robots robot(content); 152 | EXPECT_FALSE(robot.allowed( 153 | 
"http://userinfo@exmaple.com:10/path;params?query#fragment", "agent")); 154 | } 155 | 156 | TEST(RobotsTest, SkipMalformedLine) 157 | { 158 | std::string content = 159 | "User-Agent: agent\n" 160 | "Disallow /no/colon/in/this/line\n"; 161 | Rep::Robots robot(content); 162 | EXPECT_TRUE(robot.allowed("/no/colon/in/this/line", "agent")); 163 | } 164 | 165 | TEST(RobotsTest, RobotsUrlHttp) 166 | { 167 | std::string url("http://user@example.com:80/path;params?query#fragment"); 168 | std::string expected("http://example.com/robots.txt"); 169 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 170 | } 171 | 172 | TEST(RobotsTest, RobotsUrlHttps) 173 | { 174 | std::string url("https://user@example.com:443/path;params?query#fragment"); 175 | std::string expected("https://example.com/robots.txt"); 176 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 177 | } 178 | 179 | TEST(RobotsTest, RobotsUrlNonDefaultPort) 180 | { 181 | std::string url("http://user@example.com:8080/path;params?query#fragment"); 182 | std::string expected("http://example.com:8080/robots.txt"); 183 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 184 | } 185 | 186 | TEST(RobotsTest, RobotsUrlInvalidPort) 187 | { 188 | std::string url("http://:::cnn.com/"); 189 | ASSERT_THROW(Rep::Robots::robotsUrl(url), Url::UrlParseException); 190 | } 191 | 192 | TEST(RobotsTest, RfcExample) 193 | { 194 | std::string content = 195 | "# /robots.txt for http://www.fict.org/\n" 196 | "# comments to webmaster@fict.org\n" 197 | "\n" 198 | "User-agent: unhipbot\n" 199 | "Disallow: /\n" 200 | "\n" 201 | "User-agent: webcrawler\n" 202 | "User-agent: excite\n" 203 | "Disallow:\n" 204 | "\n" 205 | "User-agent: *\n" 206 | "Disallow: /org/plans.html\n" 207 | "Allow: /org/\n" 208 | "Allow: /serv\n" 209 | "Allow: /~mak\n" 210 | "Disallow: /\n"; 211 | Rep::Robots robot(content); 212 | 213 | // The unhip bot 214 | EXPECT_FALSE(robot.allowed("/", "unhipbot")); 215 | EXPECT_FALSE(robot.allowed("/index.html", "unhipbot")); 216 | EXPECT_TRUE(robot.allowed("/robots.txt", "unhipbot")); 217 | EXPECT_FALSE(robot.allowed("/server.html", "unhipbot")); 218 | EXPECT_FALSE(robot.allowed("/services/fast.html", "unhipbot")); 219 | EXPECT_FALSE(robot.allowed("/services/slow.html", "unhipbot")); 220 | EXPECT_FALSE(robot.allowed("/orgo.gif", "unhipbot")); 221 | EXPECT_FALSE(robot.allowed("/org/about.html", "unhipbot")); 222 | EXPECT_FALSE(robot.allowed("/org/plans.html", "unhipbot")); 223 | EXPECT_FALSE(robot.allowed("/%7Ejim/jim.html", "unhipbot")); 224 | EXPECT_FALSE(robot.allowed("/%7Emak/mak.html", "unhipbot")); 225 | 226 | // The webcrawler agent 227 | EXPECT_TRUE(robot.allowed("/", "webcrawler")); 228 | EXPECT_TRUE(robot.allowed("/index.html", "webcrawler")); 229 | EXPECT_TRUE(robot.allowed("/robots.txt", "webcrawler")); 230 | EXPECT_TRUE(robot.allowed("/server.html", "webcrawler")); 231 | EXPECT_TRUE(robot.allowed("/services/fast.html", "webcrawler")); 232 | EXPECT_TRUE(robot.allowed("/services/slow.html", "webcrawler")); 233 | EXPECT_TRUE(robot.allowed("/orgo.gif", "webcrawler")); 234 | EXPECT_TRUE(robot.allowed("/org/about.html", "webcrawler")); 235 | EXPECT_TRUE(robot.allowed("/org/plans.html", "webcrawler")); 236 | EXPECT_TRUE(robot.allowed("/%7Ejim/jim.html", "webcrawler")); 237 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "webcrawler")); 238 | 239 | // The excite agent 240 | EXPECT_TRUE(robot.allowed("/", "excite")); 241 | EXPECT_TRUE(robot.allowed("/index.html", "excite")); 242 | EXPECT_TRUE(robot.allowed("/robots.txt", "excite")); 243 | 
EXPECT_TRUE(robot.allowed("/server.html", "excite")); 244 | EXPECT_TRUE(robot.allowed("/services/fast.html", "excite")); 245 | EXPECT_TRUE(robot.allowed("/services/slow.html", "excite")); 246 | EXPECT_TRUE(robot.allowed("/orgo.gif", "excite")); 247 | EXPECT_TRUE(robot.allowed("/org/about.html", "excite")); 248 | EXPECT_TRUE(robot.allowed("/org/plans.html", "excite")); 249 | EXPECT_TRUE(robot.allowed("/%7Ejim/jim.html", "excite")); 250 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "excite")); 251 | 252 | // All others 253 | EXPECT_FALSE(robot.allowed("/", "anything")); 254 | EXPECT_FALSE(robot.allowed("/index.html", "anything")); 255 | EXPECT_TRUE(robot.allowed("/robots.txt", "anything")); 256 | EXPECT_TRUE(robot.allowed("/server.html", "anything")); 257 | EXPECT_TRUE(robot.allowed("/services/fast.html", "anything")); 258 | EXPECT_TRUE(robot.allowed("/services/slow.html", "anything")); 259 | EXPECT_FALSE(robot.allowed("/orgo.gif", "anything")); 260 | EXPECT_TRUE(robot.allowed("/org/about.html", "anything")); 261 | EXPECT_FALSE(robot.allowed("/org/plans.html", "anything")); 262 | EXPECT_FALSE(robot.allowed("/%7Ejim/jim.html", "anything")); 263 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "anything")); 264 | } 265 | 266 | TEST(RobotsTest, IgnoreBOM) 267 | { 268 | std::string content = 269 | "\xEF\xBB\xBFuser-agent: *\n" 270 | "disallow: /disallowed\n"; 271 | Rep::Robots robot(content); 272 | EXPECT_TRUE(robot.allowed("/", "bot")); 273 | EXPECT_FALSE(robot.allowed("/disallowed", "bot")); 274 | } 275 | 276 | TEST(RobotsTest, Str) 277 | { 278 | std::string content = 279 | "User-agent: one\n" 280 | "Disallow: /foo\n" 281 | "Allow: /bar\n" 282 | "User-agent: *\n" 283 | "Allow: /foo\n" 284 | "Disallow: /bar\n"; 285 | 286 | Rep::Robots robot(content); 287 | EXPECT_EQ( 288 | "{\"one\": [Directive(Disallow: /foo), Directive(Allow: /bar)]," 289 | " \"*\": [Directive(Allow: /foo), Directive(Disallow: /bar)]}", 290 | robot.str()); 291 | } 292 | 293 | TEST(RobotsTest, IgnoresExternalDisallow) 294 | { 295 | std::string content = 296 | "User-agent: one\n" 297 | "Allow: /path\n" 298 | "Disallow: http://b.com/external\n"; 299 | 300 | Rep::Robots robot(content, "http://a.com/robots.txt"); 301 | EXPECT_TRUE(robot.allowed("/path", "one")); 302 | EXPECT_TRUE(robot.allowed("/external", "one")); 303 | } 304 | 305 | TEST(RobotsTest, IgnoresExternalAllow) 306 | { 307 | std::string content = 308 | "User-agent: one\n" 309 | "Disallow: /path\n" 310 | "Allow: http://b.com/path/external\n"; 311 | 312 | Rep::Robots robot(content, "http://a.com/robots.txt"); 313 | EXPECT_FALSE(robot.allowed("/path", "one")); 314 | EXPECT_FALSE(robot.allowed("/path/external", "one")); 315 | } 316 | 317 | TEST(RobotsTest, NeverExternalAllowed) 318 | { 319 | Rep::Robots robot("", "http://a.com/robots.txt"); 320 | EXPECT_FALSE(robot.allowed("http://b.com/", "one")); 321 | } 322 | 323 | TEST(RobotsTest, LeadingWildcardAllow) 324 | { 325 | std::string content = 326 | "User-agent: meow\n" 327 | "Disallow: /\n" 328 | "Allow: ****/cats\n" 329 | "Allow: */kangaroos\n"; 330 | Rep::Robots robot(content); 331 | 332 | EXPECT_FALSE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow")); 333 | EXPECT_TRUE(robot.allowed("/cats.html", "meow")); 334 | EXPECT_TRUE(robot.allowed("/cats/page.html", "meow")); 335 | EXPECT_TRUE(robot.allowed("/get/more/cats/page.html", "meow")); 336 | EXPECT_TRUE(robot.allowed("/kangaroos/page.html", "meow")); 337 | EXPECT_TRUE(robot.allowed("/heaps/of/kangaroos/page.html", "meow")); 338 | 
EXPECT_TRUE(robot.allowed("/kangaroosandkoalas/page.html", "meow")); 339 | } 340 | 341 | TEST(RobotsTest, LeadingWildcardDisallow) 342 | { 343 | std::string content = 344 | "User-agent: meow\n" 345 | "Allow: /\n" 346 | "Disallow: ****/cats\n" 347 | "Disallow: */kangaroos\n"; 348 | Rep::Robots robot(content); 349 | 350 | EXPECT_TRUE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow")); 351 | EXPECT_FALSE(robot.allowed("/cats.html", "meow")); 352 | EXPECT_FALSE(robot.allowed("/cats/page.html", "meow")); 353 | EXPECT_FALSE(robot.allowed("/get/more/cats/page.html", "meow")); 354 | EXPECT_FALSE(robot.allowed("/kangaroos/page.html", "meow")); 355 | EXPECT_FALSE(robot.allowed("/heaps/of/kangaroos/page.html", "meow")); 356 | EXPECT_FALSE(robot.allowed("/kangaroosandkoalas/page.html", "meow")); 357 | } 358 | --------------------------------------------------------------------------------