├── .gitignore ├── LICENSE ├── README.md ├── project.clj ├── src └── clj_robots │ ├── core.clj │ ├── strategy │ ├── bing.clj │ ├── extended.clj │ └── google.clj │ └── utils.clj └── test └── clj_robots ├── core_test.clj ├── empty.txt ├── robots-bad.txt ├── robots.txt ├── test └── utils.clj └── utils_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /autodoc 2 | /classes 3 | /lib 4 | /pom.xml 5 | /.lein-deps-sum 6 | /.lein-failures 7 | /*.jar 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2010 by Min Huang 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DESCRIPTION 2 | =========== 3 | A robots.txt parser. See http://www.robotstxt.org/. Support for different 4 | interpretations forthcoming. 5 | 6 | CHANGES 7 | ======= 8 | 9 | 0.6.0 10 | ----- 11 | * Removed deprecated API. 12 | * Removed dependency on [clj-httpc](https://github.com/retiman/clj-httpc) library. 13 | * Removed `get` API; use [clj-http](https://github.com/dakrone/clj-http) or a different library for your HTTP requests. 14 | 15 | USAGE 16 | ===== 17 | To use, include this in your Clojure program: 18 | 19 | (require '[clj-http.client :as client]) 20 | (require '[clj-robots.core :as robots]) 21 | 22 | Save robots.txt from a website: 23 | 24 | (def robots 25 | ((comp robots/parse 26 | #(get % :body) 27 | client/get) 28 | "http://www.google.com/robots.txt")) 29 | -> #'user/robots 30 | 31 | Now check if any paths are crawlable: 32 | 33 | (robots/crawlable? robots "/search") 34 | -> false 35 | 36 | (robots/crawlable? robots "/news" :user-agent "*") 37 | -> false 38 | 39 | (robots/crawlable? robots "/jsapi") 40 | -> true 41 | 42 | Examples of other usage: 43 | 44 | (:sitemap robots) 45 | -> ["http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml" "http://www.google.com/hostednews/sitemap_index.xml" "http://www.google.com/ventures/sitemap_ventures.xml" "http://www.google.com/sitemaps_webmasters.xml" "http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml" "http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml"] 46 | 47 | (:modified-time robots) 48 | -> 1297291259732 49 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clj-robots "0.6.0" 2 | :description "A robots.txt parser." 
3 | :min-lein-version "1.6.2" 4 | :warn-on-reflection true 5 | :dependencies 6 | [[org.clojure/clojure "1.3.0"] 7 | [clj-time "0.3.3"] 8 | [commons-io "2.0"]] 9 | :dev-dependencies 10 | [[backtype/autodoc "0.9.0-SNAPSHOT"] 11 | [lein-clojars "0.7.0"] 12 | [robert/hooke "1.1.2"] 13 | [clj-http "0.2.6"]]) 14 | -------------------------------------------------------------------------------- /src/clj_robots/core.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.core 2 | (:require 3 | [clojure.string :as s] 4 | [clojure.java.io :as io] 5 | [clj-robots.utils :as utils]) 6 | (:import 7 | [clojure.lang Sequential] 8 | [java.io InputStream] 9 | [java.net URL]) 10 | (:gen-class)) 11 | 12 | (def 13 | ^{:private true} 14 | directive-keys 15 | #{"user-agent" 16 | "allow" 17 | "disallow" 18 | "crawl-delay" 19 | "request-rate" 20 | "robot-version" 21 | "visit-time" 22 | "sitemap"}) 23 | 24 | (defn- trim-comment 25 | "Removes everything after the first # character in a String." 26 | [line] 27 | (s/replace line #"#.*$" "")) 28 | 29 | (defn- process-user-agent 30 | "Set the current user-agent and add it to the list of user-agents. 31 | Consecutive User-agent lines accumulate into one agent group; a 32 | non-consecutive User-agent line starts a fresh group." 33 | [directives user-agents last-key value] 34 | (dosync 35 | ;; Initialize :sitemap only once; unconditional assoc would clobber any 36 | ;; sitemaps already collected from Sitemap directives earlier in the file. 37 | (when-not (contains? @directives :sitemap) 38 | (alter directives assoc :sitemap [])) 39 | (if (= @last-key :user-agent) 40 | (alter user-agents conj value) 41 | (ref-set user-agents #{value})) 42 | (doseq [ua @user-agents] 43 | (alter directives assoc ua [])))) 44 | 45 | (defn- process-permission 46 | "Set an allow or disallow directive for the current user-agent." 47 | [directives user-agents key value] 48 | (dosync 49 | (doseq [ua @user-agents] 50 | (let [permissions (@directives ua)] 51 | (alter directives assoc ua (vec (conj permissions [key value]))))))) 52 | 53 | (defn- process-sitemap 54 | "Add a sitemap." 
50 | [directives value] 51 | (dosync 52 | (let [sitemap (get @directives :sitemap)] 53 | (alter directives assoc :sitemap (vec (conj sitemap value)))))) 54 | 55 | (defn- process-request-rate 56 | "Convert a Request-rate to a Ratio. The ratio represents the number of 57 | documents that should be fetched per second (default). If a time unit 58 | other than seconds is used, then it is converted to seconds." 59 | [s] 60 | (let [m (case (last s) \h 3600 \m 60 \s 1 1) 61 | t (first (s/split s #"[^0-9/]")) 62 | [p q] (if-not (nil? t) (map utils/parse-int (s/split t #"/" 2)))] 63 | (cond 64 | (nil? p) nil 65 | (nil? q) nil 66 | :default (/ p (* m q))))) 67 | 68 | (defn- process-visit-time 69 | "TODO: Implement me!" 70 | [s] 71 | nil) 72 | 73 | (defn- parse-key 74 | "Parse the key in a directive." 75 | [key] 76 | (let [k (s/lower-case (s/trim key))] 77 | (if (contains? directive-keys k) 78 | (keyword k)))) 79 | 80 | (defn- parse-value 81 | "Parse the value in a directive." 82 | [key value] 83 | (cond (nil? value) "" 84 | (= key :crawl-delay) (utils/parse-int (s/trim value)) 85 | (= key :request-rate) (process-request-rate (s/trim value)) 86 | (= key :visit-time) (process-visit-time (s/trim value)) 87 | :default (s/trim value))) 88 | 89 | (defn- parse-line 90 | "Parse a line from a robots.txt file." 91 | [line] 92 | (let [[left right] (s/split (trim-comment line) #":" 2) 93 | key (parse-key left) 94 | value (parse-value key right)] 95 | (if (not= "" value) [key value]))) 96 | 97 | (defn- parse-lines 98 | "Parse the lines of the robots.txt file." 99 | [lines] 100 | (dosync 101 | (let [last-key (ref nil) 102 | user-agents (ref #{"*"}) 103 | directives (ref {"*" []})] 104 | (doseq [line lines] 105 | (let [[key value] (parse-line line)] 106 | (cond 107 | (or (nil? key) (nil? value)) 108 | nil 109 | (= key :user-agent) 110 | (process-user-agent directives user-agents last-key value) 111 | (= key :sitemap) 112 | (process-sitemap directives value) 113 | (contains? 
#{:allow :disallow} key) 114 | (process-permission directives user-agents key value) 115 | :default 116 | (alter directives assoc key value)) 117 | (ref-set last-key key))) 118 | (alter directives assoc :modified-time (System/currentTimeMillis)) 119 | @directives))) 120 | 121 | (defmulti get-url 122 | "Returns the robots.txt URL for a particular host (given a URL)." 123 | class) 124 | 125 | (defmethod get-url URL [url] 126 | (let [protocol (.getProtocol url) 127 | domain (.getHost url)] 128 | (str protocol "://" domain "/robots.txt"))) 129 | 130 | (defmethod get-url String [url] 131 | (get-url (io/as-url url))) 132 | 133 | (defn crawlable? 134 | "Returns true if a list of directives allows the path to be crawled using 135 | this interpretation of robots.txt: 136 | 137 | http://www.robotstxt.org/ 138 | 139 | Note that allow directives are completely ignored and only the first 140 | disallow directive is consulted to determine if a path can be crawled." 141 | [directives ^String path & {:keys [user-agent] :or {user-agent "*"}}] 142 | (let [select-disallows #(= :disallow (first %)) 143 | permissions (filter select-disallows (get directives user-agent))] 144 | (and (nil? (some #(.startsWith path (last %)) permissions)) 145 | (if (not= "*" user-agent) 146 | (crawlable? directives path :user-agent "*") 147 | true)))) 148 | 149 | (defmulti parse 150 | "Parse robots.txt; returns a data structure to pass to crawlable?" 
151 | class) 152 | 153 | (defmethod parse 154 | Sequential [lines] 155 | (parse-lines lines)) 156 | 157 | (defmethod parse 158 | String [string] 159 | (parse (s/split-lines string))) 160 | 161 | (defmethod parse 162 | InputStream [stream] 163 | (parse (utils/stream-to-string stream))) 164 | 165 | (defmethod parse 166 | nil [arg] 167 | nil) 168 | -------------------------------------------------------------------------------- /src/clj_robots/strategy/bing.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.strategy.bing 2 | (:gen-class)) 3 | 4 | (defn crawlable? 5 | [directives path & {:keys [user-agent] :or {user-agent "*"}}] 6 | (throw (new UnsupportedOperationException "Method not implemented"))) 7 | -------------------------------------------------------------------------------- /src/clj_robots/strategy/extended.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.strategy.extended 2 | (:require [clj-robots.utils :as utils]) 3 | (:gen-class)) 4 | (defn crawlable? 5 | "Interprets robots.txt extended standard (wildcard Disallow patterns). 6 | 7 | See http://www.conman.org/people/spc/robots2.html (presumably the URL 8 | the original docstring linked; confirm). Falls back to the \"*\" agent 9 | group when a specific user-agent is given." 10 | [directives path & {:keys [user-agent] :or {user-agent "*"}}] 11 | (let [select-disallows #(= :disallow (first %)) 12 | permissions (filter select-disallows (get directives user-agent)) 13 | disallow-matches #(re-matches (utils/wildcard-to-regex (last %)) path)] 14 | (and (nil? (some disallow-matches permissions)) 15 | (if (not= "*" user-agent) 16 | (crawlable? directives path :user-agent "*") 17 | true)))) 18 | -------------------------------------------------------------------------------- /src/clj_robots/strategy/google.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.strategy.google 2 | (:gen-class)) 3 | 4 | (defn crawlable? 
5 | [directives path & {:keys [user-agent] :or {user-agent "*"}}] 6 | (throw (new UnsupportedOperationException "Method not implemented"))) 7 | -------------------------------------------------------------------------------- /src/clj_robots/utils.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.utils 2 | (:require 3 | [clojure.string :as s] 4 | [clojure.java.io :as io]) 5 | (:import 6 | [java.io StringWriter] 7 | [org.apache.commons.io IOUtils]) 8 | (:gen-class)) 9 | 10 | (defn load-resource 11 | "Return a resource located on path." 12 | [path] 13 | (let [t (Thread/currentThread) 14 | loader (.getContextClassLoader t)] 15 | (.getResourceAsStream loader path))) 16 | 17 | (defn stream-to-string 18 | "Convert an InputStream to a String." 19 | [stream] 20 | (let [writer (new StringWriter)] 21 | (do 22 | (IOUtils/copy stream writer) 23 | (.toString writer)))) 24 | 25 | (defn wildcard-to-regex 26 | "Convert a wildcard pattern to a Java Pattern for matching paths. 27 | Note that .* is added to the end of the pattern for this reason." 28 | [text] 29 | (let [sb (StringBuffer. (count text))] 30 | (doseq [c text] 31 | (cond 32 | (= c \*) 33 | (.append sb ".*") 34 | (= c \?) 35 | (.append sb ".") 36 | (contains? #{\( \) \[ \] \$ \^ \. \{ \} \| \\} c) 37 | (doto sb 38 | (.append \\) 39 | (.append c)) 40 | :default 41 | (.append sb c))) 42 | (.append sb ".*") 43 | (re-pattern (.toString sb)))) 44 | 45 | (defn get-lines 46 | "Load a resource, convert it to a string, and return a vector of lines." 47 | [resource] 48 | ((comp s/split-lines stream-to-string load-resource) resource)) 49 | 50 | (defn parse-int 51 | "Convert a String to an Integer." 
52 | [s] 53 | (try (Integer/parseInt s) (catch NumberFormatException e nil))) 54 | -------------------------------------------------------------------------------- /test/clj_robots/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.core-test 2 | (:use 3 | [clj-robots.core] 4 | [clj-robots.utils :only (get-lines)] 5 | [clj-robots.test.utils :only (refer-private)] 6 | [clojure.test]) 7 | (:gen-class)) 8 | 9 | (refer-private 'clj-robots.core) 10 | 11 | (deftest test-trim-comment 12 | (is (= "hello there " (trim-comment "hello there #this is a comment!")))) 13 | 14 | (deftest test-parse-line 15 | (is (= [:user-agent ":*:"] 16 | (parse-line "UsEr-AgEnt: :*:# This is a comment"))) 17 | (is (nil? (parse-line "user-agent*"))) 18 | (is (nil? (parse-line "")))) 19 | 20 | (deftest test-parse-lines 21 | (let [lines (get-lines "clj_robots/robots.txt") 22 | expected {:request-rate 1/300 23 | :crawl-delay 10 24 | :robot-version "Version 2.0" 25 | :sitemap ["http://www.lousycoder.com/sitemap1.xml" 26 | "http://www.lousycoder.com/sitemap2.xml"] 27 | "*" 28 | [[:allow "/images/foo.jpg"] 29 | [:disallow "/cgi-bin/"] 30 | [:disallow "/images/"] 31 | [:disallow "/tmp/"] 32 | [:disallow "/private/"]] 33 | "google" 34 | [[:allow "/bif/baz/boo/"] 35 | [:disallow "/moo/goo/too/"]] 36 | "foo" 37 | [[:disallow "/mif/tif/psd/"] 38 | [:allow "/gif/png/img/"]] 39 | "bar" 40 | [[:disallow "/mif/tif/psd/"] 41 | [:allow "/gif/png/img/"]] 42 | "baz" 43 | [[:disallow "/mif/tif/psd/"] 44 | [:allow "/gif/png/img/"]]} 45 | ds (parse-lines lines) 46 | result (dissoc ds :modified-time)] 47 | (is (contains? ds :modified-time)) 48 | (is (= expected result)))) 49 | 50 | (deftest test-parse-lines-bad 51 | (let [lines (get-lines "clj_robots/robots-bad.txt") 52 | ds (parse-lines lines) 53 | expected {"*" [[:allow "/foobar/"]]} 54 | result (dissoc ds :modified-time)] 55 | (is (= expected result)))) 56 | 57 | (deftest test-parse 58 | (is (nil? 
(parse nil))) 59 | (is (= {"*" []} (dissoc (parse "") :modified-time))) 60 | (is (= {"*" []} (dissoc (parse [""]) :modified-time))) 61 | (is (= {"*" []} (dissoc (parse (get-lines "clj_robots/empty.txt")) 62 | :modified-time))) 63 | (is (contains? (parse (get-lines "clj_robots/robots.txt")) "google"))) 64 | 65 | (deftest test-crawlable? 66 | (let [ds {"google" 67 | [[:disallow "/foo/"] 68 | [:disallow "/bar/"] 69 | [:allow "/bar/baz.html"] 70 | [:disallow "/bar/baz/"]] 71 | "*" 72 | [[:disallow "/bar/"] 73 | [:allow "/bif/"] 74 | [:disallow "bif/bof/"]]}] 75 | (is (not (crawlable? ds "/foo/" :user-agent "google"))) 76 | (is (not (crawlable? ds "/bar/" :user-agent "google"))) 77 | (is (not (crawlable? ds "/bar/" :user-agent "*"))) 78 | (is (not (crawlable? ds "/bar/bif.html" :user-agent "google"))) 79 | (is (not (crawlable? ds "/bar/baz.html" :user-agent "google"))) 80 | (is (crawlable? ds "/foo/bar.html")) 81 | (is (crawlable? ds "/foo/" :user-agent "*"))) 82 | (let [ds {"*" [[:disallow "/foo/"] [:disallow "/bar/"]]}] 83 | (is (crawlable? ds "/foo")) 84 | (is (crawlable? ds "/bif/")) 85 | (is (not (crawlable? ds "/bar/"))) 86 | (is (not (crawlable? 
ds "/bar/2.html"))))) 87 | 88 | (deftest test-process-request-rate 89 | (is (= (process-request-rate "1/10") 1/10)) 90 | (is (= (process-request-rate "1/10h") 1/36000)) 91 | (is (= (process-request-rate "1/10m") 1/600)) 92 | (is (= (process-request-rate "1/10s") 1/10))) 93 | -------------------------------------------------------------------------------- /test/clj_robots/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retiman/clj-robots/e0e0305293d7b7d21e16253af9ace8b3beed94d3/test/clj_robots/empty.txt -------------------------------------------------------------------------------- /test/clj_robots/robots-bad.txt: -------------------------------------------------------------------------------- 1 | Disallow: # This is a malformed directive 2 | 3 | Crawl-delay: Not a number! 4 | # Here's another comment! 5 | 6 | Allow: /foobar/ # This is a real directive 7 | Allow # Uh oh, this one is no good : /foobaz 8 | 9 | Request-rate: Not a number! 10 | -------------------------------------------------------------------------------- /test/clj_robots/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: /images/foo.jpg # This is a comment! 3 | Disallow: /cgi-bin/ 4 | Disallow: /images/ # This is another comment! 5 | Disallow: /tmp/ 6 | Disallow: /private/ 7 | 8 | User-agent: google 9 | Allow: /bif/baz/boo/ 10 | Disallow: /moo/goo/too/ 11 | 12 | User-agent: foo 13 | User-agent: bar 14 | User-agent: baz 15 | Disallow: /mif/tif/psd/ 16 | Allow: /gif/png/img/ 17 | 18 | Crawl-delay: 10 19 | Request-rate: 1/5m 20 | # Here's another comment! 
21 | 22 | Sitemap: http://www.lousycoder.com/sitemap1.xml 23 | 24 | Sitemap: http://www.lousycoder.com/sitemap2.xml 25 | 26 | Robot-version: Version 2.0 27 | -------------------------------------------------------------------------------- /test/clj_robots/test/utils.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.test.utils 2 | (:gen-class)) 3 | 4 | (defn refer-private 5 | "Access private symbols of a namespace." 6 | [ns] 7 | (doseq [[symbol var] (ns-interns ns)] 8 | (when (:private (meta var)) 9 | (intern *ns* symbol var)))) 10 | -------------------------------------------------------------------------------- /test/clj_robots/utils_test.clj: -------------------------------------------------------------------------------- 1 | (ns clj-robots.utils-test 2 | (:use 3 | [clj-robots.utils] 4 | [clj-robots.test.utils :only (refer-private)] 5 | [clojure.test]) 6 | (:gen-class)) 7 | 8 | (refer-private 'clj-robots.utils) 9 | 10 | (defn re-equals 11 | [a b] 12 | (= (.toString a) (.toString b))) 13 | 14 | (deftest test-wildcard-to-regex 15 | (is (re-equals #"hel.o.*" (wildcard-to-regex "hel?o"))) 16 | (is (re-equals #"hello.*.*" (wildcard-to-regex "hello*"))) 17 | (is (re-equals #"hello\(there\).*" (wildcard-to-regex "hello(there)")))) 18 | --------------------------------------------------------------------------------