├── scripts
│   ├── travis
│   │   ├── script.sh
│   │   └── before_install.sh
│   ├── vagrant
│   │   └── provision.sh
│   └── check-coverage.sh
├── test
│   ├── test-all.cpp
│   ├── test-agent.cpp
│   ├── test-directive.cpp
│   └── test-robots.cpp
├── .gitignore
├── .gitmodules
├── .travis.yml
├── Vagrantfile
├── LICENSE
├── include
│   ├── robots.h
│   ├── directive.h
│   └── agent.h
├── Makefile
├── bench.cpp
├── src
│   ├── agent.cpp
│   ├── directive.cpp
│   └── robots.cpp
└── README.md

/scripts/travis/script.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | make test
6 | 
--------------------------------------------------------------------------------
/scripts/travis/before_install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | pip install --user gcovr==3.2
6 | 
--------------------------------------------------------------------------------
/test/test-all.cpp:
--------------------------------------------------------------------------------
1 | #include <gtest/gtest.h>
2 | 
3 | int main(int argc, char **argv) {
4 |     testing::InitGoogleTest(&argc, argv);
5 |     return RUN_ALL_TESTS();
6 | }
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries and build artifacts
2 | *.o
3 | *.a
4 | bench
5 | test-all
6 | 
7 | # Coverage
8 | *.gcda
9 | *.gcno
10 | *.gcov
11 | coverage.out
12 | 
13 | # Vagrant
14 | .vagrant/
15 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/url-cpp"]
2 | 	path = deps/url-cpp
3 | 	url = https://github.com/seomoz/url-cpp
4 | [submodule "deps/googletest"]
5 | 	path = deps/googletest
6 | 	url = https://github.com/google/googletest/
7 | 	ignore = untracked
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | dist: bionic
3 | git:
4 |   submodules: true
5 | compiler:
6 |   - gcc
7 |   - clang
8 | language: cpp
9 | python:
10 |   - '3.5'
11 | before_install: scripts/travis/before_install.sh
12 | script: scripts/travis/script.sh
13 | addons:
14 |   apt:
15 |     packages:
16 |       - cmake
17 | 
--------------------------------------------------------------------------------
/scripts/vagrant/provision.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | sudo apt-get update
6 | sudo apt-get install -y g++ libgtest-dev cmake python-pip
7 | 
8 | sudo pip install gcovr==3.2
9 | 
10 | pushd /tmp
11 | mkdir -p gtest-build
12 | pushd gtest-build
13 | cmake -DCMAKE_BUILD_TYPE=RELEASE /usr/src/gtest/
14 | make
15 | find . -name 'libg*.a' | xargs sudo cp -f --target-directory=/usr/lib/
16 | popd
17 | popd
18 | 
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # Encoding: utf-8
2 | # -*- mode: ruby -*-
3 | # vi: set ft=ruby :
4 | 
5 | ENV['VAGRANT_DEFAULT_PROVIDER'] = 'virtualbox'
6 | 
7 | # http://docs.vagrantup.com/v2/
8 | Vagrant.configure('2') do |config|
9 |   config.vm.box = 'ubuntu/trusty64'
10 |   config.vm.hostname = 'rep-cpp'
11 |   config.ssh.forward_agent = true
12 | 
13 |   config.vm.provider :virtualbox do |vb|
14 |     vb.customize ["modifyvm", :id, "--memory", "1024"]
15 |     vb.customize ["modifyvm", :id, "--cpus", "2"]
16 |   end
17 | 
18 |   config.vm.provision :shell, path: 'scripts/vagrant/provision.sh', privileged: false
19 | end
20 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 SEOmoz, Inc.
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/scripts/check-coverage.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | if [ "$#" -ne 1 ]; then
6 |     echo "Usage:"
7 |     echo "    check-coverage.sh <root>"
8 |     exit 1
9 | fi
10 | 
11 | root="${1}"
12 | 
13 | results=`gcovr \
14 |     --root=${root} \
15 |     --exclude-unreachable-branches \
16 |     --output=coverage.out \
17 |     --print-summary \
18 |     --object-directory=${root} \
19 |     --exclude test \
20 |     --exclude deps/ \
21 |     --exclude src/psl.cpp \
22 |     --exclude src/punycode.cpp \
23 |     --exclude src/url.cpp \
24 |     --exclude src/utf8.cpp \
25 |     --exclude include/psl.h \
26 |     --exclude include/punycode.h \
27 |     --exclude include/url.h \
28 |     --exclude include/utf8.h`
29 | 
30 | lines=`echo ${results} | sed -E 's#^.*lines: ([0-9]+)(\.[0-9]+)?%.+$#\1#'`
31 | branches=`echo ${results} | sed -E 's#^.*branches: ([0-9]+)(\.[0-9]+)?%.+$#\1#'`
32 | 
33 | if [ "${lines}" -eq "0" ]; then
34 |     echo "Coverage disabled."
35 |     echo "${results}"
36 | elif [ "${lines}" -ne "100" ]; then
37 |     echo "Incomplete line coverage (${lines})"
38 |     echo "${results}"
39 |     exit 2
40 | else
41 |     echo "Coverage looks good!"
42 | echo "${results}" 43 | fi 44 | -------------------------------------------------------------------------------- /include/robots.h: -------------------------------------------------------------------------------- 1 | #ifndef ROBOTS_CPP_H 2 | #define ROBOTS_CPP_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "agent.h" 9 | 10 | namespace Rep 11 | { 12 | 13 | class Robots 14 | { 15 | public: 16 | typedef std::unordered_map agent_map_t; 17 | typedef std::vector sitemaps_t; 18 | 19 | /** 20 | * Create a robots.txt from a utf-8-encoded string. 21 | */ 22 | explicit Robots(const std::string& content); 23 | 24 | /** 25 | * Create a robots.txt from a utf-8-encoded string assuming 26 | * the given base_url. 27 | */ 28 | Robots(const std::string& content, const std::string& base_url); 29 | 30 | /** 31 | * Get the sitemaps in this robots.txt 32 | */ 33 | const sitemaps_t& sitemaps() const { return sitemaps_; } 34 | 35 | /** 36 | * Get the agent with the corresponding name. 37 | */ 38 | const Agent& agent(const std::string& name) const; 39 | 40 | /** 41 | * Return true if agent is allowed to fetch the URL (either a 42 | * full URL or a path). 43 | */ 44 | bool allowed(const std::string& path, const std::string& name) const; 45 | 46 | std::string str() const; 47 | 48 | /** 49 | * Return the robots.txt URL corresponding to the provided URL. 50 | */ 51 | static std::string robotsUrl(const std::string& url); 52 | 53 | private: 54 | static void strip(std::string& string); 55 | 56 | static bool getpair( 57 | std::istringstream& stream, std::string& key, std::string& value); 58 | 59 | std::string host_; 60 | agent_map_t agents_; 61 | sitemaps_t sitemaps_; 62 | Agent& default_; 63 | }; 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GTEST_DIR ?= deps/googletest/googletest 2 | 3 | CXX ?= g++ 4 | CXXOPTS ?= -Wall -Werror -std=c++11 -Iinclude/ -Ideps/url-cpp/include -I$(GTEST_DIR)/include 5 | DEBUG_OPTS ?= -g -fprofile-arcs -ftest-coverage -O0 -fPIC 6 | RELEASE_OPTS ?= -O3 7 | BINARIES = 8 | 9 | all: test release/librep.o $(BINARIES) 10 | 11 | $(GTEST_DIR)/libgtest.a: 12 | g++ -std=c++11 -isystem $(GTEST_DIR)/include -I$(GTEST_DIR) -pthread -c $(GTEST_DIR)/src/gtest-all.cc -o $(GTEST_DIR)/libgtest.a 13 | 14 | # Release libraries 15 | release: 16 | mkdir -p release 17 | 18 | release/bin: release 19 | mkdir -p release/bin 20 | 21 | deps/url-cpp/release/liburl.o: deps/url-cpp/* deps/url-cpp/include/* deps/url-cpp/src/* 22 | make -C deps/url-cpp release/liburl.o 23 | 24 | release/librep.o: release/directive.o release/agent.o release/robots.o deps/url-cpp/release/liburl.o 25 | ld -r -o $@ $^ 26 | 27 | release/%.o: src/%.cpp include/%.h release 28 | $(CXX) $(CXXOPTS) $(RELEASE_OPTS) -o $@ -c $< 29 | 30 | # Debug libraries 31 | debug: 32 | mkdir -p debug 33 | 34 | debug/bin: debug 35 | mkdir -p debug/bin 36 | 37 | deps/url-cpp/debug/liburl.o: deps/url-cpp/* deps/url-cpp/include/* deps/url-cpp/src/* 38 | make -C deps/url-cpp debug/liburl.o 39 | 40 | debug/librep.o: debug/directive.o debug/agent.o debug/robots.o deps/url-cpp/debug/liburl.o 41 | ld -r -o $@ $^ 42 | 43 | debug/%.o: src/%.cpp include/%.h debug 44 | $(CXX) $(CXXOPTS) $(DEBUG_OPTS) -o $@ -c $< 45 | 46 | test/%.o: test/%.cpp 47 | $(CXX) $(CXXOPTS) $(DEBUG_OPTS) -o $@ -c $< 48 | 49 | # Tests 50 | test-all: test/test-all.o test/test-agent.o test/test-directive.o test/test-robots.o 
debug/librep.o $(GTEST_DIR)/libgtest.a 51 | $(CXX) $(CXXOPTS) -L$(GTEST_DIR) $(DEBUG_OPTS) -o $@ $^ -lpthread 52 | 53 | # Bench 54 | bench: bench.cpp release/librep.o 55 | $(CXX) $(CXXOPTS) $(RELEASE_OPTS) -o $@ $< release/librep.o 56 | 57 | .PHONY: test 58 | test: test-all 59 | ./test-all 60 | ./scripts/check-coverage.sh $(PWD) 61 | 62 | clean: 63 | rm -rf debug release test-all bench test/*.o test/*.gcda test/*.gcno deps/url-cpp/debug deps/url-cpp/release 64 | -------------------------------------------------------------------------------- /include/directive.h: -------------------------------------------------------------------------------- 1 | #ifndef DIRECTIVE_CPP_H 2 | #define DIRECTIVE_CPP_H 3 | 4 | 5 | namespace Rep 6 | { 7 | 8 | class Directive 9 | { 10 | public: 11 | /** 12 | * The type of our priority value. 13 | */ 14 | typedef size_t priority_t; 15 | 16 | /** 17 | * Default constructor disallowed. 18 | */ 19 | Directive() = delete; 20 | 21 | /** 22 | * The input to this constructor must be stripped of comments 23 | * and trailing whitespace. 24 | */ 25 | Directive(const std::string& line, bool allowed); 26 | 27 | /** 28 | * Default copy constructor. 29 | */ 30 | Directive(const Directive& rhs) = default; 31 | 32 | /** 33 | * Default move constructor. 34 | */ 35 | Directive(Directive&& rhs) = default; 36 | 37 | /** 38 | * The priority of the rule. 39 | */ 40 | priority_t priority() const 41 | { 42 | return priority_; 43 | } 44 | 45 | /** 46 | * Whether or not the provided path matches. The path is 47 | * expected to be properly escaped. 48 | */ 49 | bool match(const std::string& path) const; 50 | 51 | /** 52 | * Whether this rule is for an allow or a disallow. 53 | */ 54 | bool allowed() const 55 | { 56 | return allowed_; 57 | } 58 | 59 | std::string str() const; 60 | 61 | /** 62 | * Default copy assignment operator. 63 | */ 64 | Directive& operator=(const Directive& rhs) = default; 65 | 66 | private: 67 | std::string expression_; 68 | priority_t priority_; 69 | bool allowed_; 70 | 71 | /** 72 | * Return true if p_begin -> p_end matches the expression e_begin -> e_end. 73 | */ 74 | bool match(const std::string::const_iterator& e_begin, 75 | const std::string::const_iterator& e_end, 76 | const std::string::const_iterator& p_begin, 77 | const std::string::const_iterator& p_end) const; 78 | }; 79 | 80 | } 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /include/agent.h: -------------------------------------------------------------------------------- 1 | #ifndef AGENT_CPP_H 2 | #define AGENT_CPP_H 3 | 4 | #include 5 | 6 | #include "directive.h" 7 | 8 | // forward declaration 9 | namespace Url 10 | { 11 | struct Url; 12 | } 13 | 14 | namespace Rep 15 | { 16 | class Agent 17 | { 18 | public: 19 | /* The type for the delay. */ 20 | typedef float delay_t; 21 | 22 | /** 23 | * Default constructor 24 | */ 25 | Agent() : Agent("") {} 26 | 27 | /** 28 | * Construct an agent. 29 | */ 30 | explicit Agent(const std::string& host) : 31 | directives_(), delay_(-1.0), sorted_(true), host_(host) {} 32 | 33 | /** 34 | * Default copy constructor. 35 | */ 36 | Agent(const Agent& rhs) = default; 37 | 38 | /** 39 | * Default move constructor. 40 | */ 41 | Agent(Agent&& rhs) = default; 42 | 43 | /** 44 | * Add an allowed directive. 45 | */ 46 | Agent& allow(const std::string& query); 47 | 48 | /** 49 | * Add a disallowed directive. 50 | */ 51 | Agent& disallow(const std::string& query); 52 | 53 | /** 54 | * Set the delay for this agent. 
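 * A negative value (the constructor default of -1) means no crawl-delay was specified.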
55 | */ 56 | Agent& delay(delay_t value) { 57 | delay_ = value; 58 | return *this; 59 | } 60 | 61 | /** 62 | * Return the delay for this agent. 63 | */ 64 | delay_t delay() const { return delay_; } 65 | 66 | /** 67 | * A vector of the directives, in priority-sorted order. 68 | */ 69 | const std::vector& directives() const; 70 | 71 | /** 72 | * Return true if the URL (either a full URL or a path) is allowed. 73 | */ 74 | bool allowed(const std::string& path) const; 75 | 76 | std::string str() const; 77 | 78 | /** 79 | * Default copy assignment operator. 80 | */ 81 | Agent& operator=(const Agent& rhs) = default; 82 | 83 | private: 84 | bool is_external(const Url::Url& url) const; 85 | 86 | mutable std::vector directives_; 87 | delay_t delay_; 88 | mutable bool sorted_; 89 | std::string host_; 90 | }; 91 | } 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "directive.h" 6 | #include "robots.h" 7 | 8 | /** 9 | * Run func() `count` times in each of `runs` experiments, where `name` provides a 10 | * meaningful description of the task being benchmarked. Prints out the time for each 11 | * run, the average time, and rate. 12 | */ 13 | template 14 | void bench(const std::string& name, size_t count, size_t runs, Functor func) 15 | { 16 | std::cout << "Benchmarking " << name << " with " << count << " per run:" << std::endl; 17 | double total(0); 18 | for (size_t run = 0; run < runs; ++run) 19 | { 20 | auto start = std::chrono::high_resolution_clock::now(); 21 | for (size_t it = 0; it < count; ++it) 22 | { 23 | func(); 24 | } 25 | auto end = std::chrono::high_resolution_clock::now(); 26 | double duration = std::chrono::duration(end - start).count(); 27 | total += duration; 28 | std::cout << " Run " << run << ": " << duration << " ms" << std::endl; 29 | } 30 | std::cout << " Average: " << (total / runs) << " ms" << std::endl; 31 | std::cout << " Rate: " << ((count * runs) / total) << " k-iter / s" << std::endl; 32 | } 33 | 34 | int main(int argc, char* argv[]) { 35 | 36 | size_t count = 1000000; 37 | size_t runs = 5; 38 | 39 | bench("directive basic parse", count, runs, []() { 40 | Rep::Directive("/basic/path", true); 41 | }); 42 | 43 | bench("directive wildcard parse", count, runs, []() { 44 | Rep::Directive("/path/*/with/**/wildcards/*", true); 45 | }); 46 | 47 | Rep::Directive directive("/basic/path", true); 48 | bench("directive basic check", count, runs, [directive]() { 49 | directive.match("/basic/path/other"); 50 | }); 51 | 52 | directive = Rep::Directive("/path/*/with/**/wildcards/*", true); 53 | bench("directive wildcard check", count, runs, [directive]() { 54 | directive.match("/path/is/with/a/few/wildcards/"); 55 | }); 56 | 57 | std::string content = 58 | "# /robots.txt for http://www.fict.org/\n" 59 | "# comments to webmaster@fict.org\n" 60 | "\n" 61 | "User-agent: unhipbot\n" 62 | "Disallow: /\n" 63 | "\n" 64 | "User-agent: webcrawler\n" 65 | "User-agent: excite\n" 66 | "Disallow:\n" 67 | "\n" 68 | "User-agent: *\n" 69 | "Disallow: /org/plans.html\n" 70 | "Allow: /org/\n" 71 | "Allow: /serv\n" 72 | "Allow: /~mak\n" 73 | "Disallow: /\n"; 74 | bench("parse RFC", count / 10, runs, [content]() { 75 | Rep::Robots robot(content); 76 | }); 77 | } 78 | -------------------------------------------------------------------------------- /src/agent.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "url.h" 6 | 7 | #include "agent.h" 8 | #include "directive.h" 9 | 10 | namespace 11 | { 12 | std::string escape_url(Url::Url& url) 13 | { 14 | return url.defrag().escape().fullpath(); 15 | } 16 | 17 | std::string trim_front(const std::string& str, const char chr) 18 | { 19 | auto itr = std::find_if(str.begin(), str.end(), 20 | [chr](const char c) {return c != chr;}); 21 | return std::string(itr, str.end()); 22 | } 23 | } 24 | 25 | namespace Rep 26 | { 27 | Agent& Agent::allow(const std::string& query) 28 | { 29 | Url::Url url(query); 30 | // ignore directives for external URLs 31 | if (is_external(url)) 32 | { 33 | return *this; 34 | } 35 | // leading wildcard? 36 | if (query.front() == '*') 37 | { 38 | Url::Url trimmed(trim_front(query, '*')); 39 | directives_.push_back(Directive(escape_url(trimmed), true)); 40 | } 41 | directives_.push_back(Directive(escape_url(url), true)); 42 | sorted_ = false; 43 | return *this; 44 | } 45 | 46 | Agent& Agent::disallow(const std::string& query) 47 | { 48 | if (query.empty()) 49 | { 50 | // Special case: "Disallow:" means "Allow: /" 51 | directives_.push_back(Directive(query, true)); 52 | } 53 | else 54 | { 55 | Url::Url url(query); 56 | // ignore directives for external URLs 57 | if (is_external(url)) 58 | { 59 | return *this; 60 | } 61 | // leading wildcard? 62 | if (query.front() == '*') 63 | { 64 | Url::Url trimmed(trim_front(query, '*')); 65 | directives_.push_back(Directive(escape_url(trimmed), false)); 66 | } 67 | directives_.push_back(Directive(escape_url(url), false)); 68 | } 69 | sorted_ = false; 70 | return *this; 71 | } 72 | 73 | const std::vector& Agent::directives() const 74 | { 75 | if (!sorted_) 76 | { 77 | std::sort(directives_.begin(), directives_.end(), 78 | [](const Directive& a, const Directive& b) { 79 | return b.priority() < a.priority(); 80 | }); 81 | sorted_ = true; 82 | } 83 | return directives_; 84 | } 85 | 86 | bool Agent::allowed(const std::string& query) const 87 | { 88 | Url::Url url(query); 89 | if (is_external(url)) 90 | { 91 | return false; 92 | } 93 | std::string path(escape_url(url)); 94 | 95 | if (path.compare("/robots.txt") == 0) 96 | { 97 | return true; 98 | } 99 | 100 | for (const auto& directive : directives()) 101 | { 102 | if (directive.match(path)) 103 | { 104 | return directive.allowed(); 105 | } 106 | } 107 | return true; 108 | } 109 | 110 | std::string Agent::str() const 111 | { 112 | std::stringstream out; 113 | if (delay_ > 0) 114 | { 115 | out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' '; 116 | } 117 | out << '['; 118 | const auto& d = directives(); 119 | auto begin = d.begin(); 120 | auto end = d.end(); 121 | if (begin != end) 122 | { 123 | out << "Directive(" << begin->str() << ')'; 124 | ++begin; 125 | } 126 | for (; begin != end; ++begin) 127 | { 128 | out << ", Directive(" << begin->str() << ')'; 129 | } 130 | out << ']'; 131 | return out.str(); 132 | } 133 | 134 | bool Agent::is_external(const Url::Url& url) const 135 | { 136 | return !host_.empty() && !url.host().empty() && url.host() != host_; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/directive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "url.h" 7 | 8 | #include "directive.h" 9 | 10 | namespace Rep 11 | { 12 | 
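    // Construction note (see the constructor below): runs of consecutive '*' are
    // collapsed to a single '*' and trailing '*'s are dropped, since neither changes
    // what the expression matches; e.g. "/fish***" is stored as "/fish". The rule's
    // priority is simply the length of the normalized expression.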
Directive::Directive(const std::string& line, bool allowed) 13 | : expression_() 14 | , priority_(line.size()) 15 | , allowed_(allowed) 16 | { 17 | if (line.find('*') == std::string::npos) 18 | { 19 | expression_.assign(line); 20 | return; 21 | } 22 | 23 | // Remove consecutive '*'s 24 | expression_.reserve(line.size()); 25 | bool star = false; 26 | for (auto character : line) 27 | { 28 | if (character == '*') 29 | { 30 | if (!star) 31 | { 32 | expression_.append(1, character); 33 | } 34 | star = true; 35 | } 36 | else 37 | { 38 | expression_.append(1, character); 39 | star = false; 40 | } 41 | } 42 | 43 | // Remove trailing '*'s 44 | std::string::reverse_iterator last = 45 | std::find_if(expression_.rbegin(), expression_.rend(), 46 | [](const char c) { 47 | return c != '*'; 48 | }); 49 | expression_.erase(last.base(), expression_.end()); 50 | 51 | // Priority is the length of the expression 52 | priority_ = expression_.size(); 53 | } 54 | 55 | bool Directive::match(const std::string::const_iterator& e_begin, 56 | const std::string::const_iterator& e_end, 57 | const std::string::const_iterator& p_begin, 58 | const std::string::const_iterator& p_end) const 59 | { 60 | std::string::const_iterator expression_it = e_begin; 61 | std::string::const_iterator path_it = p_begin; 62 | while (expression_it != e_end && path_it != p_end) 63 | { 64 | if (*expression_it == '*') 65 | { 66 | // Advance and recurse 67 | ++expression_it; 68 | for (; path_it != p_end; ++path_it) 69 | { 70 | if (match(expression_it, e_end, path_it, p_end)) 71 | { 72 | return true; 73 | } 74 | } 75 | return false; 76 | } 77 | else if (*expression_it == '$') 78 | { 79 | // This check expects path to be fully consumed. But since one of the 80 | // criteria of being in this while loop is that we've not fully consumed 81 | // path, return false. 
82 | return false; 83 | } 84 | else if (*expression_it != *path_it) 85 | { 86 | // These characters must match 87 | return false; 88 | } 89 | else 90 | { 91 | // Advance both by one 92 | ++path_it; 93 | ++expression_it; 94 | } 95 | } 96 | 97 | // Return true only if we've consumed all of the expression 98 | if (expression_it == e_end) 99 | { 100 | return true; 101 | } 102 | else if (*expression_it == '$') 103 | { 104 | return path_it == p_end; 105 | } 106 | else 107 | { 108 | return false; 109 | } 110 | } 111 | 112 | std::string Directive::str() const 113 | { 114 | std::stringstream out; 115 | if (allowed_) 116 | { 117 | out << "Allow: " << expression_; 118 | } 119 | else { 120 | out << "Disallow: " << expression_; 121 | } 122 | return out.str(); 123 | } 124 | 125 | bool Directive::match(const std::string& path) const 126 | { 127 | return match(expression_.begin(), expression_.end(), path.begin(), path.end()); 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /test/test-agent.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "agent.h" 4 | 5 | TEST(AgentTest, Basic) 6 | { 7 | Rep::Agent agent = Rep::Agent("a.com").allow("/").disallow("/foo"); 8 | EXPECT_EQ(agent.directives().size(), 2ul); 9 | } 10 | 11 | TEST(AgentTest, ChecksAllowed) 12 | { 13 | Rep::Agent agent = Rep::Agent("a.com").allow("/path"); 14 | EXPECT_TRUE(agent.allowed("/path")); 15 | EXPECT_TRUE(agent.allowed("/elsewhere")); 16 | } 17 | 18 | TEST(AgentTest, IgnoresExternalDisallow) 19 | { 20 | Rep::Agent agent = Rep::Agent("a.com") 21 | .allow("/path") 22 | .disallow("http://b.com/external"); 23 | EXPECT_TRUE(agent.allowed("/path")); 24 | EXPECT_TRUE(agent.allowed("/external")); 25 | } 26 | 27 | TEST(AgentTest, IgnoresExternalAllow) 28 | { 29 | Rep::Agent agent = Rep::Agent("a.com") 30 | .disallow("/path") 31 | .allow("http://b.com/path/exception"); 32 | EXPECT_FALSE(agent.allowed("/path")); 33 | EXPECT_FALSE(agent.allowed("/path/exception")); 34 | } 35 | 36 | TEST(AgentTest, NeverExternalAllowed) 37 | { 38 | Rep::Agent agent = Rep::Agent("a.com"); 39 | EXPECT_FALSE(agent.allowed("http://b.com/")); 40 | } 41 | 42 | TEST(AgentTest, HonorsLongestFirstPriority) 43 | { 44 | Rep::Agent agent = Rep::Agent("a.com") 45 | .disallow("/path") 46 | .allow("/path/exception"); 47 | EXPECT_TRUE(agent.allowed("/path/exception")); 48 | EXPECT_FALSE(agent.allowed("/path")); 49 | } 50 | 51 | TEST(AgentTest, RobotsTextAllowed) 52 | { 53 | Rep::Agent agent = Rep::Agent("a.com").disallow("/robots.txt"); 54 | EXPECT_TRUE(agent.allowed("/robots.txt")); 55 | } 56 | 57 | TEST(AgentTest, DisallowNone) 58 | { 59 | Rep::Agent agent = Rep::Agent("a.com").disallow(""); 60 | EXPECT_TRUE(agent.allowed("/anything")); 61 | } 62 | 63 | TEST(AgentTest, MiddleWildcard) 64 | { 65 | Rep::Agent agent = Rep::Agent("a.com").disallow("/test*foo"); 66 | EXPECT_FALSE(agent.allowed("/testfoo")); 67 | EXPECT_FALSE(agent.allowed("/testafoo")); 68 | EXPECT_FALSE(agent.allowed("/testaasdffoo")); 69 | EXPECT_FALSE(agent.allowed("/test/foo")); 70 | EXPECT_TRUE(agent.allowed("/testfo")); 71 | EXPECT_TRUE(agent.allowed("/estfoo")); 72 | } 73 | 74 | TEST(AgentTest, EscapedRule) 75 | { 76 | Rep::Agent agent = Rep::Agent("a.com").disallow("/a%3cd.html"); 77 | EXPECT_FALSE(agent.allowed("/a 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "url.h" 10 | 11 | #include "robots.h" 12 | 13 | namespace Rep 14 | { 15 | 16 | 
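    // Parsing helpers: strip() trims leading and trailing whitespace in place, and
    // getpair() pulls successive "key: value" pairs out of the stream, dropping
    // '#' comments, skipping lines without a ':', and lowercasing the key.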
void Robots::strip(std::string& string) 17 | { 18 | string.erase(string.begin(), std::find_if(string.begin(), string.end(), 19 | std::not1(std::ptr_fun(std::isspace)))); 20 | string.erase(std::find_if(string.rbegin(), string.rend(), 21 | std::not1(std::ptr_fun(std::isspace))).base(), string.end()); 22 | } 23 | 24 | bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) 25 | { 26 | while (getline(stream, key)) 27 | { 28 | size_t index = key.find('#'); 29 | if (index != std::string::npos) 30 | { 31 | key.resize(index); 32 | } 33 | 34 | // Find the colon and divide it into key and value, skipping malformed lines 35 | index = key.find(':'); 36 | if (index == std::string::npos) 37 | { 38 | continue; 39 | } 40 | 41 | value.assign(key.begin() + index + 1, key.end()); 42 | key.resize(index); 43 | 44 | // Strip whitespace off of each 45 | strip(key); 46 | strip(value); 47 | 48 | // Lowercase the key 49 | std::transform(key.begin(), key.end(), key.begin(), ::tolower); 50 | 51 | return true; 52 | } 53 | return false; 54 | } 55 | 56 | Robots::Robots(const std::string& content) : 57 | Robots(content, "") 58 | { 59 | } 60 | 61 | Robots::Robots(const std::string& content, const std::string& base_url) : 62 | host_(Url::Url(base_url).host()), 63 | agents_(), 64 | sitemaps_(), 65 | default_(agents_.emplace("*", Agent(host_)).first->second) 66 | { 67 | std::string agent_name("*"); 68 | std::istringstream input(content); 69 | if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) 70 | { 71 | input.ignore(3); 72 | } 73 | std::string key, value; 74 | std::vector group; 75 | bool last_agent = false; 76 | agent_map_t::iterator current = agents_.find("*"); 77 | while (Robots::getpair(input, key, value)) 78 | { 79 | if (key.compare("user-agent") == 0) 80 | { 81 | // Store the user agent string as lowercased 82 | std::transform(value.begin(), value.end(), value.begin(), ::tolower); 83 | 84 | if (last_agent) 85 | { 86 | group.push_back(value); 87 | } 88 | else 89 | { 90 | if (!agent_name.empty()) 91 | { 92 | for (auto other : group) 93 | { 94 | agents_.emplace(other, current->second); 95 | } 96 | group.clear(); 97 | } 98 | agent_name = value; 99 | current = agents_.emplace(agent_name, Agent(host_)).first; 100 | } 101 | last_agent = true; 102 | continue; 103 | } 104 | else 105 | { 106 | last_agent = false; 107 | } 108 | 109 | if (key.compare("sitemap") == 0) 110 | { 111 | sitemaps_.push_back(value); 112 | } 113 | else if (key.compare("disallow") == 0) 114 | { 115 | current->second.disallow(value); 116 | } 117 | else if (key.compare("allow") == 0) 118 | { 119 | current->second.allow(value); 120 | } 121 | else if (key.compare("crawl-delay") == 0) 122 | { 123 | try 124 | { 125 | current->second.delay(std::stof(value)); 126 | } 127 | catch (const std::exception&) 128 | { 129 | std::cerr << "Could not parse " << value << " as float." 
<< std::endl; 130 | } 131 | } 132 | } 133 | 134 | if (!agent_name.empty()) 135 | { 136 | for (auto other : group) 137 | { 138 | agents_.emplace(other, current->second); 139 | } 140 | } 141 | } 142 | 143 | const Agent& Robots::agent(const std::string& name) const 144 | { 145 | // Lowercase the agent 146 | std::string lowered(name); 147 | std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower); 148 | 149 | auto it = agents_.find(lowered); 150 | if (it == agents_.end()) 151 | { 152 | return default_; 153 | } 154 | else 155 | { 156 | return it->second; 157 | } 158 | } 159 | 160 | bool Robots::allowed(const std::string& path, const std::string& name) const 161 | { 162 | return agent(name).allowed(path); 163 | } 164 | 165 | std::string Robots::str() const 166 | { 167 | std::stringstream out; 168 | // TODO: include sitepath info 169 | out << '{'; 170 | auto begin = agents_.begin(); 171 | auto end = agents_.end(); 172 | if (begin != end) 173 | { 174 | out << '"' << begin->first << '"' << ": " << begin->second.str(); 175 | ++begin; 176 | } 177 | for (; begin != end; ++begin) 178 | { 179 | out << ", \"" << begin->first << '"' << ": " << begin->second.str(); 180 | } 181 | out << '}'; 182 | return out.str(); 183 | } 184 | 185 | std::string Robots::robotsUrl(const std::string& url) 186 | { 187 | return Url::Url(url) 188 | .setUserinfo("") 189 | .setPath("robots.txt") 190 | .setParams("") 191 | .setQuery("") 192 | .setFragment("") 193 | .remove_default_port() 194 | .str(); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Robots Exclusion Protocol Parser for C++ 2 | ======================================== 3 | 4 | [![Build Status](https://travis-ci.org/seomoz/rep-cpp.svg?branch=master)](https://travis-ci.org/seomoz/rep-cpp) 5 | 6 | Supports the [1996 RFC](http://www.robotstxt.org/norobots-rfc.txt), as well as some 7 | modern conventions, including: 8 | 9 | - wildcard matching (`*` and `$`) 10 | - sitemap listing 11 | - crawl-delay 12 | 13 | __This library deals in UTF-8-encoded strings.__ 14 | 15 | Matching 16 | -------- 17 | A path may match multiple directives. For example, `/some/path/page.html` matches all 18 | of these rules: 19 | 20 | ``` 21 | Allow: /some/ 22 | Disallow: /some/path/ 23 | Allow: /*/page.html 24 | ``` 25 | 26 | Each directive is given a priority, and the highest-priority matching directive is used. 27 | We choose the length of the expression to be that priority. In the above example, the 28 | priorities are: 29 | 30 | ``` 31 | Allow: /some/ (priority = 6) 32 | Disallow: /some/path/ (priority = 11) 33 | Allow: /*/page.html (priority = 12) 34 | ``` 35 | 36 | Classes 37 | ------- 38 | A `Robots` object is the result of parsing a single `robots.txt` file. It has a mapping of 39 | agent names to `Agent` objects, as well as a vector of the `sitemaps` listed in the file. 40 | An `Agent` object holds the crawl-delay and `Directive`s associated with a particular 41 | user-agent. 42 | 43 | Parsing and Querying 44 | -------------------- 45 | Here's an example of parsing a robots.txt file: 46 | 47 | ```c++ 48 | #include "robots.h" 49 | 50 | std::string content = "..."; 51 | Rep::Robots robots = Rep::Robots(content); 52 | 53 | // Is this path allowed to the provided agent? 54 | robots.allowed("/some/path", "my-agent"); 55 | 56 | // Is this URL allowed to the provided agent? 
57 | robots.allowed("http://example.com/some/path", "my-agent");
58 | ```
59 | 
60 | If a client is interested only in the exclusion rules of a single agent, then:
61 | 
62 | ```c++
63 | Rep::Agent agent = Rep::Robots(content).agent("my-agent");
64 | 
65 | // Is this path allowed to this agent?
66 | agent.allowed("/some/path");
67 | 
68 | // Is this URL allowed to this agent?
69 | agent.allowed("http://example.com/some/path");
70 | ```
71 | 
72 | Building
73 | ========
74 | This library depends on `url-cpp`, which is included as a submodule. We provide two
75 | main targets, `{debug,release}/librep.o`:
76 | 
77 | ```
78 | git submodule update --init --recursive
79 | make release/librep.o
80 | ```
81 | 
82 | Development
83 | ===========
84 | 
85 | Environment
86 | -----------
87 | To launch the `vagrant` image, we only need to
88 | `vagrant up` (though you may have to provide a `--provider` flag):
89 | 
90 | ```bash
91 | vagrant up
92 | ```
93 | 
94 | With a running `vagrant` instance, you can log in and run tests:
95 | 
96 | ```bash
97 | vagrant ssh
98 | cd /vagrant
99 | 
100 | make test
101 | ```
102 | 
103 | Running Tests
104 | -------------
105 | Tests are run with the top-level `Makefile`:
106 | 
107 | ```bash
108 | make test
109 | ```
110 | 
111 | PRs
112 | ===
113 | These are not all hard-and-fast rules, but in general PRs have the following expectations:
114 | 
115 | - __pass Travis__ -- or more generally, whatever CI is used for the particular project
116 | - __be a complete unit__ -- whether a bug fix or feature, it should appear as a complete
117 |   unit before consideration.
118 | - __maintain code coverage__ -- some projects may include code coverage requirements as
119 |   part of the build as well
120 | - __maintain the established style__ -- this means the existing style of established
121 |   projects, the established conventions of the team for a given language on new
122 |   projects, and the guidelines of the community of the relevant languages and
123 |   frameworks.
124 | - __include failing tests__ -- in the case of bugs, failing tests demonstrating the bug
125 |   should be included as one commit, followed by a commit making the test succeed. This
126 |   allows us to jump to a world with a bug included, and prove that our test in fact
127 |   exercises the bug.
128 | - __be reviewed by one or more developers__ -- not all feedback has to be accepted, but
129 |   it should all be considered.
130 | - __avoid 'addressed PR feedback' commits__ -- in general, PR feedback should be rebased
131 |   back into the appropriate commits that introduced the change. In cases where this
132 |   is burdensome, PR feedback commits may be used but should still describe the changes
133 |   contained therein.
134 | 
135 | PR reviews consider the design, organization, and functionality of the submitted code.
136 | 
137 | Commits
138 | =======
139 | Certain types of changes should be made in their own commits to improve readability. When
140 | too many different types of changes happen simultaneously in a single commit, the purpose of
141 | each change is muddled. By giving each commit a single logical purpose, it is implicitly
142 | clear why changes in that commit took place.
143 | 
144 | - __updating / upgrading dependencies__ -- this is especially true for invocations like
145 |   `bundle update` or `berks update`.
146 | - __introducing a new dependency__ -- often preceded by a commit updating existing
147 |   dependencies, this should only include the changes for the new dependency.
148 | - __refactoring__ -- these commits should preserve all the existing functionality and 149 | merely update how it's done. 150 | - __utility components to be used by a new feature__ -- if introducing an auxiliary class 151 | in support of a subsequent commit, add this new class (and its tests) in its own 152 | commit. 153 | - __config changes__ -- when adjusting configuration in isolation 154 | - __formatting / whitespace commits__ -- when adjusting code only for stylistic purposes. 155 | 156 | New Features 157 | ------------ 158 | Small new features (where small refers to the size and complexity of the change, not the 159 | impact) are often introduced in a single commit. Larger features or components might be 160 | built up piecewise, with each commit containing a single part of it (and its corresponding 161 | tests). 162 | 163 | Bug Fixes 164 | --------- 165 | In general, bug fixes should come in two-commit pairs: a commit adding a failing test 166 | demonstrating the bug, and a commit making that failing test pass. 167 | 168 | Tagging and Versioning 169 | ====================== 170 | Whenever the version included in `setup.py` is changed (and it should be changed when 171 | appropriate using [http://semver.org/](http://semver.org/)), a corresponding tag should 172 | be created with the same version number (formatted `v`). 173 | 174 | ```bash 175 | git tag -a v0.1.0 -m 'Version 0.1.0 176 | 177 | This release contains an initial working version Rep::Robots.' 178 | 179 | git push origin 180 | ``` 181 | -------------------------------------------------------------------------------- /test/test-directive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "directive.h" 4 | 5 | TEST(DirectiveTest, BasicExact) 6 | { 7 | std::string directive("/tmp"); 8 | std::string example("/tmp"); 9 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 10 | } 11 | 12 | TEST(DirectiveTest, BasicFileExtension) 13 | { 14 | std::string directive("/tmp"); 15 | std::string example("/tmp.html"); 16 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 17 | } 18 | 19 | TEST(DirectiveTest, BasicDirectory) 20 | { 21 | std::string directive("/tmp"); 22 | std::string example("/tmp/a.html"); 23 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 24 | } 25 | 26 | TEST(DirectiveTest, BasicDirectoryAndFile) 27 | { 28 | std::string directive("/tmp/"); 29 | std::string example("/tmp"); 30 | EXPECT_FALSE(Rep::Directive(directive, true).match(example)); 31 | } 32 | 33 | TEST(DirectiveTest, BasicDirectoryAndDirectory) 34 | { 35 | std::string directive("/tmp/"); 36 | std::string example("/tmp/"); 37 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 38 | } 39 | 40 | TEST(DirectiveTest, BasicDirectoryAndDirectoryAndFile) 41 | { 42 | std::string directive("/tmp/"); 43 | std::string example("/tmp/a.html"); 44 | EXPECT_TRUE(Rep::Directive(directive, true).match(example)); 45 | } 46 | 47 | TEST(DirectiveTest, WildcardTest) 48 | { 49 | std::vector examples = { 50 | "/hello/how/are/you", 51 | "/hello/how/are/you/today", 52 | "/hello/how/are/yo/are/you", 53 | }; 54 | std::vector antiexamples = { 55 | "/hello/", 56 | "/hi/how/are/you" 57 | }; 58 | std::string directive("/hello/*/are/you"); 59 | Rep::Directive parsed(directive, true); 60 | for (auto example : examples) 61 | { 62 | EXPECT_TRUE(parsed.match(example)) << 63 | example << " didn't match " << directive; 64 | } 65 | 66 | for (auto example : antiexamples) 67 | { 68 | 
EXPECT_FALSE(parsed.match(example)) << 69 | example << " matched " << directive; 70 | } 71 | } 72 | 73 | TEST(DirectiveTest, LeadingWildcard) 74 | { 75 | std::vector examples = { 76 | "/test", 77 | "/a/test", 78 | "/ab/test", 79 | "/abc/test", 80 | }; 81 | std::string directive("*/test"); 82 | Rep::Directive parsed(directive, true); 83 | for (auto example : examples) 84 | { 85 | EXPECT_TRUE(parsed.match(example)) << 86 | example << " didn't match " << directive; 87 | } 88 | 89 | std::vector antiexamples = { 90 | "/tes", 91 | "/est", 92 | }; 93 | for (auto example : antiexamples) 94 | { 95 | EXPECT_FALSE(parsed.match(example)) << 96 | example << " matched " << directive; 97 | } 98 | } 99 | 100 | TEST(DirectiveTest, MultipleWildcardTest) 101 | { 102 | std::vector examples = { 103 | "/this-test-is-a-simple-test", 104 | "/this-test-is-another-test-is-a-tricky-test" 105 | }; 106 | std::vector antiexamples = { 107 | "/this-test-is-a-mislead" 108 | }; 109 | std::string directive("/this-*-is-a-*-test"); 110 | Rep::Directive parsed(directive, true); 111 | for (auto example : examples) 112 | { 113 | EXPECT_TRUE(parsed.match(example)) << 114 | example << " didn't match " << directive; 115 | } 116 | 117 | for (auto example : antiexamples) 118 | { 119 | EXPECT_FALSE(parsed.match(example)) << 120 | example << " matched " << directive; 121 | } 122 | } 123 | 124 | TEST(DirectiveTest, Str) 125 | { 126 | EXPECT_EQ("Allow: /foo", Rep::Directive("/foo", true).str()); 127 | EXPECT_EQ("Disallow: /bar", Rep::Directive("/bar", false).str()); 128 | } 129 | 130 | TEST(GoogleTest, EmptyAndWildcard) 131 | { 132 | std::vector examples = { 133 | "/", 134 | "/fish", 135 | "/fish.html", 136 | "/fish/salmon.html", 137 | "/fishheads", 138 | "/fishheads/yummy.html", 139 | "/fish.php?id=anything" 140 | }; 141 | std::vector directives = { 142 | "/", 143 | "/*" 144 | }; 145 | for (auto directive : directives) 146 | { 147 | Rep::Directive parsed(directive, true); 148 | for (auto example : examples) 149 | { 150 | EXPECT_TRUE(parsed.match(example)) << 151 | example << " didn't match " << directive; 152 | } 153 | } 154 | } 155 | 156 | TEST(GoogleTest, Prefix) 157 | { 158 | std::vector examples = { 159 | "/fish", 160 | "/fish.html", 161 | "/fish/salmon.html", 162 | "/fishheads", 163 | "/fishheads/yummy.html", 164 | "/fish.php?id=anything" 165 | }; 166 | std::vector antiexamples = { 167 | "/Fish.asp", 168 | "/catfish", 169 | "/?id=fish" 170 | }; 171 | std::string directive("/fish"); 172 | Rep::Directive parsed(directive, true); 173 | for (auto example : examples) 174 | { 175 | EXPECT_TRUE(parsed.match(example)) << 176 | example << " didn't match " << directive; 177 | } 178 | 179 | for (auto example : antiexamples) 180 | { 181 | EXPECT_FALSE(parsed.match(example)) << 182 | example << " matched " << directive; 183 | } 184 | } 185 | 186 | TEST(GoogleTest, TrailingWildcard) 187 | { 188 | std::vector examples = { 189 | "/fish", 190 | "/fish.html", 191 | "/fish/salmon.html", 192 | "/fishheads", 193 | "/fishheads/yummy.html", 194 | "/fish.php?id=anything" 195 | }; 196 | std::vector antiexamples = { 197 | "/Fish.asp", 198 | "/catfish", 199 | "/?id=fish" 200 | }; 201 | std::string directive("/fish*"); 202 | Rep::Directive parsed(directive, true); 203 | for (auto example : examples) 204 | { 205 | EXPECT_TRUE(parsed.match(example)) << 206 | example << " didn't match " << directive; 207 | } 208 | 209 | for (auto example : antiexamples) 210 | { 211 | EXPECT_FALSE(parsed.match(example)) << 212 | example << " matched " << directive; 213 | } 214 | 
} 215 | 216 | TEST(GoogleTest, Directory) 217 | { 218 | std::vector examples = { 219 | "/fish/", 220 | "/fish/?id=anything", 221 | "/fish/salmon.htm" 222 | }; 223 | std::vector antiexamples = { 224 | "/fish", 225 | "/fish.html", 226 | "/Fish/Salmon.asp" 227 | }; 228 | std::string directive("/fish/"); 229 | Rep::Directive parsed(directive, true); 230 | for (auto example : examples) 231 | { 232 | EXPECT_TRUE(parsed.match(example)) << 233 | example << " didn't match " << directive; 234 | } 235 | 236 | for (auto example : antiexamples) 237 | { 238 | EXPECT_FALSE(parsed.match(example)) << 239 | example << " matched " << directive; 240 | } 241 | } 242 | 243 | TEST(GoogleTest, WildcardExtension) 244 | { 245 | std::vector examples = { 246 | "/filename.php", 247 | "/folder/filename.php", 248 | "/folder/filename.php?parameters", 249 | "/folder/any.php.file.html", 250 | "/filename.php/" 251 | }; 252 | std::vector antiexamples = { 253 | "/", 254 | "/windows.PHP" 255 | }; 256 | std::string directive("/*.php"); 257 | Rep::Directive parsed(directive, true); 258 | for (auto example : examples) 259 | { 260 | EXPECT_TRUE(parsed.match(example)) << 261 | example << " didn't match " << directive; 262 | } 263 | 264 | for (auto example : antiexamples) 265 | { 266 | EXPECT_FALSE(parsed.match(example)) << 267 | example << " matched " << directive; 268 | } 269 | } 270 | 271 | TEST(GoogleTest, WildcardExtensionEnd) 272 | { 273 | std::vector examples = { 274 | "/filename.php", 275 | "/folder/filename.php" 276 | }; 277 | std::vector antiexamples = { 278 | "/filename.php?parameters", 279 | "/filename.php/", 280 | "/filename.php5", 281 | "/windows.PHP" 282 | }; 283 | std::string directive("/*.php$"); 284 | Rep::Directive parsed(directive, true); 285 | for (auto example : examples) 286 | { 287 | EXPECT_TRUE(parsed.match(example)) << 288 | example << " didn't match " << directive; 289 | } 290 | 291 | for (auto example : antiexamples) 292 | { 293 | EXPECT_FALSE(parsed.match(example)) << 294 | example << " matched " << directive; 295 | } 296 | } 297 | 298 | TEST(GoogleTest, FishStarExtension) 299 | { 300 | std::vector examples = { 301 | "/fish.php", 302 | "/fishheads/catfish.php?parameters" 303 | }; 304 | std::vector antiexamples = { 305 | "/Fish.PHP" 306 | }; 307 | std::string directive("/fish*.php"); 308 | Rep::Directive parsed(directive, true); 309 | for (auto example : examples) 310 | { 311 | EXPECT_TRUE(parsed.match(example)) << 312 | example << " didn't match " << directive; 313 | } 314 | 315 | for (auto example : antiexamples) 316 | { 317 | EXPECT_FALSE(parsed.match(example)) << 318 | example << " matched " << directive; 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /test/test-robots.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "url.h" 4 | 5 | #include "robots.h" 6 | 7 | TEST(RobotsTest, NoLeadingUserAgent) 8 | { 9 | // Assumed to be the default user agent 10 | std::string content = 11 | "Disallow: /path\n" 12 | "Allow: /path/exception\n" 13 | "Crawl-delay: 5.2\n"; 14 | Rep::Robots robot(content); 15 | EXPECT_TRUE(robot.allowed("/path/exception", "agent")); 16 | EXPECT_FALSE(robot.allowed("/path", "agent")); 17 | EXPECT_NEAR(robot.agent("agent").delay(), 5.2, 0.000001); 18 | } 19 | 20 | TEST(RobotsTest, WellFormedCrawlDelay) 21 | { 22 | std::string content = 23 | "User-agent: *\n" 24 | "Crawl-delay: 5.2\n"; 25 | Rep::Robots robot(content); 26 | EXPECT_NEAR(robot.agent("any").delay(), 5.2, 
0.000001); 27 | } 28 | 29 | TEST(RobotsTest, MalformedCrawlDelay) 30 | { 31 | std::string content = 32 | "User-agent: *\n" 33 | "Crawl-delay: word\n"; 34 | Rep::Robots robot(content); 35 | EXPECT_EQ(robot.agent("any").delay(), -1.0); 36 | } 37 | 38 | TEST(RobotsTest, HonorsDefaultAgent) 39 | { 40 | std::string content = 41 | "User-agent: *\n" 42 | "Disallow: /tmp\n" 43 | "\n" 44 | "User-agent: other-agent\n" 45 | "Allow: /tmp\n"; 46 | Rep::Robots robot(content); 47 | EXPECT_FALSE(robot.allowed("/tmp", "agent")); 48 | EXPECT_TRUE(robot.allowed("/path", "agent")); 49 | } 50 | 51 | TEST(RobotsTest, HonorsSpecificAgent) 52 | { 53 | std::string content = 54 | "User-agent: *\n" 55 | "Disallow: /tmp\n" 56 | "\n" 57 | "User-agent: agent\n" 58 | "Allow: /tmp\n"; 59 | Rep::Robots robot(content); 60 | EXPECT_TRUE(robot.allowed("/tmp", "agent")); 61 | EXPECT_TRUE(robot.allowed("/path", "agent")); 62 | } 63 | 64 | TEST(RobotsTest, Grouping) 65 | { 66 | std::string content = 67 | "User-agent: one\n" 68 | "User-agent: two\n" 69 | "Disallow: /tmp\n"; 70 | Rep::Robots robot(content); 71 | EXPECT_FALSE(robot.allowed("/tmp", "one")); 72 | EXPECT_FALSE(robot.allowed("/tmp", "two")); 73 | } 74 | 75 | TEST(RobotsTest, GroupingUnknownKeys) 76 | { 77 | // When we encounter unknown keys, we should disregard any grouping that may have 78 | // happened between user agent rules. 79 | // 80 | // This is an example from the wild. Despite `Noindex` not being a valid directive, 81 | // we'll not consider the "*" and "ia_archiver" rules together. 82 | std::string content = 83 | "User-agent: *\n" 84 | "Disallow: /content/2/\n" 85 | "User-agent: *\n" 86 | "Noindex: /gb.html\n" 87 | "Noindex: /content/2/\n" 88 | "User-agent: ia_archiver\n" 89 | "Disallow: /\n"; 90 | Rep::Robots robot(content); 91 | EXPECT_TRUE(robot.allowed("/foo", "agent")); 92 | EXPECT_FALSE(robot.allowed("/bar", "ia_archiver")); 93 | } 94 | 95 | TEST(RobotsTest, SeparatesAgents) 96 | { 97 | std::string content = 98 | "User-agent: one\n" 99 | "Crawl-delay: 1\n" 100 | "\n" 101 | "User-agent: two\n" 102 | "Crawl-delay: 2\n"; 103 | Rep::Robots robot(content); 104 | EXPECT_NE(robot.agent("one").delay(), robot.agent("two").delay()); 105 | } 106 | 107 | TEST(RobotsTest, ExposesSitemaps) 108 | { 109 | std::string content = 110 | "Sitemap: http://a.com/sitemap.xml\n" 111 | "Sitemap: http://b.com/sitemap.xml\n"; 112 | Rep::Robots robot(content); 113 | std::vector expected = { 114 | "http://a.com/sitemap.xml", "http://b.com/sitemap.xml" 115 | }; 116 | EXPECT_EQ(robot.sitemaps(), expected); 117 | } 118 | 119 | TEST(RobotsTest, CaseInsensitivity) 120 | { 121 | std::string content = 122 | "User-agent: Agent\n" 123 | "Disallow: /path\n"; 124 | Rep::Robots robot(content); 125 | EXPECT_FALSE(robot.allowed("/path", "agent")); 126 | EXPECT_FALSE(robot.allowed("/path", "aGeNt")); 127 | } 128 | 129 | TEST(RobotsTest, Empty) 130 | { 131 | std::string content; 132 | Rep::Robots robot(content); 133 | EXPECT_TRUE(robot.sitemaps().empty()); 134 | EXPECT_TRUE(robot.allowed("/", "agent")); 135 | } 136 | 137 | TEST(RobotsTest, Comments) 138 | { 139 | std::string content = 140 | "User-Agent: * # comment saying it's the default agent\n" 141 | "Disallow: /\n"; 142 | Rep::Robots robot(content); 143 | EXPECT_FALSE(robot.allowed("/path", "agent")); 144 | } 145 | 146 | TEST(RobotsTest, AcceptsFullUrl) 147 | { 148 | std::string content = 149 | "User-Agent: agent\n" 150 | "Disallow: /path;params?query\n"; 151 | Rep::Robots robot(content); 152 | EXPECT_FALSE(robot.allowed( 153 | 
"http://userinfo@exmaple.com:10/path;params?query#fragment", "agent")); 154 | } 155 | 156 | TEST(RobotsTest, SkipMalformedLine) 157 | { 158 | std::string content = 159 | "User-Agent: agent\n" 160 | "Disallow /no/colon/in/this/line\n"; 161 | Rep::Robots robot(content); 162 | EXPECT_TRUE(robot.allowed("/no/colon/in/this/line", "agent")); 163 | } 164 | 165 | TEST(RobotsTest, RobotsUrlHttp) 166 | { 167 | std::string url("http://user@example.com:80/path;params?query#fragment"); 168 | std::string expected("http://example.com/robots.txt"); 169 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 170 | } 171 | 172 | TEST(RobotsTest, RobotsUrlHttps) 173 | { 174 | std::string url("https://user@example.com:443/path;params?query#fragment"); 175 | std::string expected("https://example.com/robots.txt"); 176 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 177 | } 178 | 179 | TEST(RobotsTest, RobotsUrlNonDefaultPort) 180 | { 181 | std::string url("http://user@example.com:8080/path;params?query#fragment"); 182 | std::string expected("http://example.com:8080/robots.txt"); 183 | EXPECT_EQ(expected, Rep::Robots::robotsUrl(url)); 184 | } 185 | 186 | TEST(RobotsTest, RobotsUrlInvalidPort) 187 | { 188 | std::string url("http://:::cnn.com/"); 189 | ASSERT_THROW(Rep::Robots::robotsUrl(url), Url::UrlParseException); 190 | } 191 | 192 | TEST(RobotsTest, RfcExample) 193 | { 194 | std::string content = 195 | "# /robots.txt for http://www.fict.org/\n" 196 | "# comments to webmaster@fict.org\n" 197 | "\n" 198 | "User-agent: unhipbot\n" 199 | "Disallow: /\n" 200 | "\n" 201 | "User-agent: webcrawler\n" 202 | "User-agent: excite\n" 203 | "Disallow:\n" 204 | "\n" 205 | "User-agent: *\n" 206 | "Disallow: /org/plans.html\n" 207 | "Allow: /org/\n" 208 | "Allow: /serv\n" 209 | "Allow: /~mak\n" 210 | "Disallow: /\n"; 211 | Rep::Robots robot(content); 212 | 213 | // The unhip bot 214 | EXPECT_FALSE(robot.allowed("/", "unhipbot")); 215 | EXPECT_FALSE(robot.allowed("/index.html", "unhipbot")); 216 | EXPECT_TRUE(robot.allowed("/robots.txt", "unhipbot")); 217 | EXPECT_FALSE(robot.allowed("/server.html", "unhipbot")); 218 | EXPECT_FALSE(robot.allowed("/services/fast.html", "unhipbot")); 219 | EXPECT_FALSE(robot.allowed("/services/slow.html", "unhipbot")); 220 | EXPECT_FALSE(robot.allowed("/orgo.gif", "unhipbot")); 221 | EXPECT_FALSE(robot.allowed("/org/about.html", "unhipbot")); 222 | EXPECT_FALSE(robot.allowed("/org/plans.html", "unhipbot")); 223 | EXPECT_FALSE(robot.allowed("/%7Ejim/jim.html", "unhipbot")); 224 | EXPECT_FALSE(robot.allowed("/%7Emak/mak.html", "unhipbot")); 225 | 226 | // The webcrawler agent 227 | EXPECT_TRUE(robot.allowed("/", "webcrawler")); 228 | EXPECT_TRUE(robot.allowed("/index.html", "webcrawler")); 229 | EXPECT_TRUE(robot.allowed("/robots.txt", "webcrawler")); 230 | EXPECT_TRUE(robot.allowed("/server.html", "webcrawler")); 231 | EXPECT_TRUE(robot.allowed("/services/fast.html", "webcrawler")); 232 | EXPECT_TRUE(robot.allowed("/services/slow.html", "webcrawler")); 233 | EXPECT_TRUE(robot.allowed("/orgo.gif", "webcrawler")); 234 | EXPECT_TRUE(robot.allowed("/org/about.html", "webcrawler")); 235 | EXPECT_TRUE(robot.allowed("/org/plans.html", "webcrawler")); 236 | EXPECT_TRUE(robot.allowed("/%7Ejim/jim.html", "webcrawler")); 237 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "webcrawler")); 238 | 239 | // The excite agent 240 | EXPECT_TRUE(robot.allowed("/", "excite")); 241 | EXPECT_TRUE(robot.allowed("/index.html", "excite")); 242 | EXPECT_TRUE(robot.allowed("/robots.txt", "excite")); 243 | 
EXPECT_TRUE(robot.allowed("/server.html", "excite")); 244 | EXPECT_TRUE(robot.allowed("/services/fast.html", "excite")); 245 | EXPECT_TRUE(robot.allowed("/services/slow.html", "excite")); 246 | EXPECT_TRUE(robot.allowed("/orgo.gif", "excite")); 247 | EXPECT_TRUE(robot.allowed("/org/about.html", "excite")); 248 | EXPECT_TRUE(robot.allowed("/org/plans.html", "excite")); 249 | EXPECT_TRUE(robot.allowed("/%7Ejim/jim.html", "excite")); 250 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "excite")); 251 | 252 | // All others 253 | EXPECT_FALSE(robot.allowed("/", "anything")); 254 | EXPECT_FALSE(robot.allowed("/index.html", "anything")); 255 | EXPECT_TRUE(robot.allowed("/robots.txt", "anything")); 256 | EXPECT_TRUE(robot.allowed("/server.html", "anything")); 257 | EXPECT_TRUE(robot.allowed("/services/fast.html", "anything")); 258 | EXPECT_TRUE(robot.allowed("/services/slow.html", "anything")); 259 | EXPECT_FALSE(robot.allowed("/orgo.gif", "anything")); 260 | EXPECT_TRUE(robot.allowed("/org/about.html", "anything")); 261 | EXPECT_FALSE(robot.allowed("/org/plans.html", "anything")); 262 | EXPECT_FALSE(robot.allowed("/%7Ejim/jim.html", "anything")); 263 | EXPECT_TRUE(robot.allowed("/%7Emak/mak.html", "anything")); 264 | } 265 | 266 | TEST(RobotsTest, IgnoreBOM) 267 | { 268 | std::string content = 269 | "\xEF\xBB\xBFuser-agent: *\n" 270 | "disallow: /disallowed\n"; 271 | Rep::Robots robot(content); 272 | EXPECT_TRUE(robot.allowed("/", "bot")); 273 | EXPECT_FALSE(robot.allowed("/disallowed", "bot")); 274 | } 275 | 276 | TEST(RobotsTest, Str) 277 | { 278 | std::string content = 279 | "User-agent: one\n" 280 | "Disallow: /foo\n" 281 | "Allow: /bar\n" 282 | "User-agent: *\n" 283 | "Allow: /foo\n" 284 | "Disallow: /bar\n"; 285 | 286 | Rep::Robots robot(content); 287 | EXPECT_EQ( 288 | "{\"one\": [Directive(Disallow: /foo), Directive(Allow: /bar)]," 289 | " \"*\": [Directive(Allow: /foo), Directive(Disallow: /bar)]}", 290 | robot.str()); 291 | } 292 | 293 | TEST(RobotsTest, IgnoresExternalDisallow) 294 | { 295 | std::string content = 296 | "User-agent: one\n" 297 | "Allow: /path\n" 298 | "Disallow: http://b.com/external\n"; 299 | 300 | Rep::Robots robot(content, "http://a.com/robots.txt"); 301 | EXPECT_TRUE(robot.allowed("/path", "one")); 302 | EXPECT_TRUE(robot.allowed("/external", "one")); 303 | } 304 | 305 | TEST(RobotsTest, IgnoresExternalAllow) 306 | { 307 | std::string content = 308 | "User-agent: one\n" 309 | "Disallow: /path\n" 310 | "Allow: http://b.com/path/external\n"; 311 | 312 | Rep::Robots robot(content, "http://a.com/robots.txt"); 313 | EXPECT_FALSE(robot.allowed("/path", "one")); 314 | EXPECT_FALSE(robot.allowed("/path/external", "one")); 315 | } 316 | 317 | TEST(RobotsTest, NeverExternalAllowed) 318 | { 319 | Rep::Robots robot("", "http://a.com/robots.txt"); 320 | EXPECT_FALSE(robot.allowed("http://b.com/", "one")); 321 | } 322 | 323 | TEST(RobotsTest, LeadingWildcardAllow) 324 | { 325 | std::string content = 326 | "User-agent: meow\n" 327 | "Disallow: /\n" 328 | "Allow: ****/cats\n" 329 | "Allow: */kangaroos\n"; 330 | Rep::Robots robot(content); 331 | 332 | EXPECT_FALSE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow")); 333 | EXPECT_TRUE(robot.allowed("/cats.html", "meow")); 334 | EXPECT_TRUE(robot.allowed("/cats/page.html", "meow")); 335 | EXPECT_TRUE(robot.allowed("/get/more/cats/page.html", "meow")); 336 | EXPECT_TRUE(robot.allowed("/kangaroos/page.html", "meow")); 337 | EXPECT_TRUE(robot.allowed("/heaps/of/kangaroos/page.html", "meow")); 338 | 
EXPECT_TRUE(robot.allowed("/kangaroosandkoalas/page.html", "meow")); 339 | } 340 | 341 | TEST(RobotsTest, LeadingWildcardDisallow) 342 | { 343 | std::string content = 344 | "User-agent: meow\n" 345 | "Allow: /\n" 346 | "Disallow: ****/cats\n" 347 | "Disallow: */kangaroos\n"; 348 | Rep::Robots robot(content); 349 | 350 | EXPECT_TRUE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow")); 351 | EXPECT_FALSE(robot.allowed("/cats.html", "meow")); 352 | EXPECT_FALSE(robot.allowed("/cats/page.html", "meow")); 353 | EXPECT_FALSE(robot.allowed("/get/more/cats/page.html", "meow")); 354 | EXPECT_FALSE(robot.allowed("/kangaroos/page.html", "meow")); 355 | EXPECT_FALSE(robot.allowed("/heaps/of/kangaroos/page.html", "meow")); 356 | EXPECT_FALSE(robot.allowed("/kangaroosandkoalas/page.html", "meow")); 357 | } 358 | --------------------------------------------------------------------------------