├── .gitignore ├── .travis.yml ├── LICENSE.md ├── README.md ├── robotstxt.go └── robotstxt_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - stable 4 | before_install: 5 | - go get github.com/mattn/goveralls 6 | script: 7 | - $GOPATH/bin/goveralls -service=travis-ci 8 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Sam Clarke 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Robots Parser [![Build Status](https://travis-ci.org/samclarke/robotstxt.svg?branch=master)](https://travis-ci.org/samclarke/robotstxt) [![Coverage Status](https://coveralls.io/repos/github/samclarke/robotstxt/badge.svg?branch=master)](https://coveralls.io/github/samclarke/robotstxt?branch=master) [![Go Report Card](https://goreportcard.com/badge/github.com/samclarke/robotstxt)](https://goreportcard.com/report/github.com/samclarke/robotstxt) [![GoDoc](https://godoc.org/github.com/samclarke/robotstxt?status.svg)](http://godoc.org/github.com/samclarke/robotstxt) 2 | 3 | A robots.txt parser written in Go, based on the Node.js [robots-parser package](https://github.com/samclarke/robots-parser). 
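For example, wildcard (`*`) and end-of-line (`$`) patterns behave as in the sketch below. The rules and expected results are taken from this repository's own tests; the `MyBot/1.0` user agent is just a placeholder:

```go
package main

import (
    "fmt"

    "github.com/samclarke/robotstxt"
)

func main() {
    contents := `
User-agent: *
Disallow: /fish*.php
Disallow: /*.dext$
`
    robots, err := robotstxt.Parse(contents, "http://www.example.com/robots.txt")
    if err != nil {
        panic(err)
    }

    allowed, _ := robots.IsAllowed("MyBot/1.0", "http://www.example.com/fish.php")
    fmt.Println(allowed) // false: * matches any run of characters, including none

    allowed, _ = robots.IsAllowed("MyBot/1.0", "http://www.example.com/Fish.PHP")
    fmt.Println(allowed) // true: matching is case-sensitive

    allowed, _ = robots.IsAllowed("MyBot/1.0", "http://www.example.com/Fish.dext1")
    fmt.Println(allowed) // true: $ anchors the pattern to the end of the path
}
```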
4 | 
5 | It currently supports:
6 | 
7 | * User-agent:
8 | * Allow:
9 | * Disallow:
10 | * Sitemap:
11 | * Crawl-delay:
12 | * Host:
13 | * URL encoded & UTF-8 paths
14 | * Paths with wildcards (*) and EOL matching ($)
15 | 
16 | ## Installation
17 | 
18 | Install with `go get`:
19 | 
20 |     go get github.com/samclarke/robotstxt
21 | 
22 | ## Usage
23 | 
24 | ```go
25 | package main
26 | 
27 | import (
28 |     "fmt"
29 |     "log"
30 |     "strings"
31 | 
32 |     "github.com/samclarke/robotstxt"
33 | )
34 | 
35 | func main() {
36 |     url := "http://www.example.com/robots.txt"
37 |     contents := `
38 | User-agent: *
39 | Disallow: /dir/
40 | Disallow: /test.html
41 | Allow: /dir/test.html
42 | Allow: /test.html
43 | Crawl-delay: 1
44 | Sitemap: http://example.com/sitemap.xml
45 | Host: example.com
46 | `
47 | 
48 |     robots, err := robotstxt.Parse(contents, url)
49 |     if err != nil {
50 |         log.Fatalln(err.Error())
51 |     }
52 | 
53 |     allowed, _ := robots.IsAllowed("Sams-Bot/1.0", "http://www.example.com/test.html")
54 |     if !allowed {
55 |         fmt.Println("Not allowed to crawl: /test.html")
56 |     }
57 | 
58 |     allowed, _ = robots.IsAllowed("Sams-Bot/1.0", "http://www.example.com/dir/test.html")
59 |     if allowed {
60 |         fmt.Println("Allowed to crawl: /dir/test.html")
61 |     }
62 | 
63 |     // 1s
64 |     fmt.Println("Crawl delay:", robots.CrawlDelay("Sams-Bot/1.0"))
65 | 
66 |     // [http://example.com/sitemap.xml]
67 |     fmt.Println("Sitemaps:", strings.Join(robots.Sitemaps(), ","))
68 | 
69 |     // example.com
70 |     fmt.Println("Preferred host:", robots.Host())
71 | }
72 | ```
73 | 
74 | # License
75 | 
76 | The MIT License (MIT)
77 | 
78 | Copyright (c) 2017 Sam Clarke
79 | 
80 | Permission is hereby granted, free of charge, to any person obtaining a copy
81 | of this software and associated documentation files (the "Software"), to deal
82 | in the Software without restriction, including without limitation the rights
83 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
84 | copies of the Software, and to permit persons to whom the Software is
85 | furnished to do so, subject to the following conditions:
86 | 
87 | The above copyright notice and this permission notice shall be included in
88 | all copies or substantial portions of the Software.
89 | 
90 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
95 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
96 | THE SOFTWARE.
97 | 
--------------------------------------------------------------------------------
/robotstxt.go:
--------------------------------------------------------------------------------
1 | // Package robotstxt parses robots.txt files
2 | //
3 | // Aims to follow the Google robots.txt specification, see:
4 | // https://developers.google.com/search/reference/robots_txt
5 | // for more information.
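//
// A minimal usage sketch (contents holds the body of a robots.txt file; the
// URLs and user agent below are placeholders):
//
//	robots, err := Parse(contents, "http://www.example.com/robots.txt")
//	allowed, err := robots.IsAllowed("MyBot/1.0", "http://www.example.com/some/path")
//	delay := robots.CrawlDelay("MyBot/1.0")
//	sitemaps := robots.Sitemaps()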
6 | package robotstxt 7 | 8 | import ( 9 | "net/url" 10 | "regexp" 11 | "strconv" 12 | "strings" 13 | "time" 14 | 15 | "golang.org/x/net/idna" 16 | ) 17 | 18 | type rule struct { 19 | isAllowed bool 20 | path string 21 | pattern *regexp.Regexp 22 | } 23 | 24 | type group struct { 25 | rules []*rule 26 | crawlDelay time.Duration 27 | } 28 | 29 | // RobotsTxt represents a parsed robots.txt file 30 | type RobotsTxt struct { 31 | url *url.URL 32 | groups map[string]*group 33 | sitemaps []string 34 | host string 35 | } 36 | 37 | // InvalidHostError is the error when a URL is tested with IsAllowed that 38 | // is not valid for this robots.txt file 39 | type InvalidHostError struct{} 40 | 41 | func (e InvalidHostError) Error() string { 42 | return "URL is not valid for this robots.txt file" 43 | } 44 | 45 | func parseAndNormalizeURL(urlStr string) (u *url.URL, err error) { 46 | u, err = url.Parse(urlStr) 47 | if err == nil { 48 | u.Host, err = idna.ToASCII(u.Host) 49 | } 50 | 51 | return 52 | } 53 | 54 | func replaceSuffix(str, suffix, replacement string) string { 55 | if strings.HasSuffix(str, suffix) { 56 | return str[:len(str)-len(suffix)] + replacement 57 | } 58 | 59 | return str 60 | } 61 | 62 | func isPattern(path string) bool { 63 | return strings.IndexRune(path, '*') > -1 || strings.HasSuffix(path, "$") 64 | } 65 | 66 | func compilePattern(pattern string) (*regexp.Regexp, error) { 67 | pattern = regexp.QuoteMeta(pattern) 68 | pattern = strings.Replace(pattern, "\\*", "(?:.*)", -1) 69 | 70 | pattern = replaceSuffix(pattern, "\\$", "$") 71 | pattern = replaceSuffix(pattern, "%24", "\\$") 72 | pattern = replaceSuffix(pattern, "%2524", "%24") 73 | 74 | pattern = strings.Replace(pattern, "%2A", "\\*", -1) 75 | 76 | return regexp.Compile(pattern) 77 | } 78 | 79 | func normaliseUserAgent(userAgent string) string { 80 | index := strings.IndexRune(userAgent, '/') 81 | if index > -1 { 82 | userAgent = userAgent[:index] 83 | } 84 | 85 | return strings.ToLower(strings.TrimSpace(userAgent)) 86 | } 87 | 88 | func (r *group) isAllowed(userAgent string, path string) bool { 89 | var result = true 90 | var resultPathLength = 0 91 | 92 | for _, rule := range r.rules { 93 | if rule.pattern != nil { 94 | // The first matching pattern takes precedence 95 | if rule.pattern.MatchString(path) { 96 | return rule.isAllowed 97 | } 98 | } else { 99 | // The longest matching path takes precedence 100 | if resultPathLength > len(rule.path) { 101 | continue 102 | } 103 | 104 | if strings.HasPrefix(path, rule.path) { 105 | result = rule.isAllowed 106 | resultPathLength = len(rule.path) 107 | } 108 | } 109 | } 110 | 111 | return result 112 | } 113 | 114 | // Parse parses the contents or a robots.txt file and returns a 115 | // RobotsTxt struct that can be used to check if URLs can be crawled 116 | // or extract crawl delays, sitemaps or the preferred host name 117 | func Parse(contents string, urlStr string) (robotsTxt *RobotsTxt, err error) { 118 | u, err := parseAndNormalizeURL(urlStr) 119 | if err != nil { 120 | return 121 | } 122 | 123 | robotsTxt = &RobotsTxt{ 124 | url: u, 125 | groups: make(map[string]*group), 126 | } 127 | 128 | var userAgents []string 129 | isNoneUserAgentState := false 130 | 131 | lines := strings.Split(contents, "\n") 132 | for _, line := range lines { 133 | parts := strings.SplitN(line, ":", 2) 134 | if len(parts) > 1 { 135 | rule, val := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]) 136 | 137 | switch strings.ToLower(rule) { 138 | case "user-agent": 139 | if isNoneUserAgentState { 
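// A directive other than User-agent ended the previous group, so this
// User-agent line starts a new group: drop the agents collected so far.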
140 | userAgents = nil 141 | } 142 | userAgents = append(userAgents, normaliseUserAgent(val)) 143 | break 144 | case "allow": 145 | for _, ua := range userAgents { 146 | robotsTxt.addPathRule(ua, val, true) 147 | } 148 | break 149 | case "disallow": 150 | for _, ua := range userAgents { 151 | robotsTxt.addPathRule(ua, val, false) 152 | } 153 | break 154 | case "crawl-delay": 155 | for _, ua := range userAgents { 156 | robotsTxt.addCrawlDelay(ua, val) 157 | } 158 | break 159 | case "sitemap": 160 | if val != "" { 161 | robotsTxt.sitemaps = append(robotsTxt.sitemaps, val) 162 | } 163 | break 164 | case "host": 165 | if val != "" { 166 | robotsTxt.host = val 167 | } 168 | break 169 | } 170 | 171 | isNoneUserAgentState = strings.ToLower(rule) != "user-agent" 172 | } 173 | } 174 | 175 | return 176 | } 177 | 178 | func (r *RobotsTxt) addPathRule(userAgent string, path string, isAllowed bool) error { 179 | g, ok := r.groups[userAgent] 180 | if !ok { 181 | g = &group{} 182 | r.groups[userAgent] = g 183 | } 184 | 185 | isPattern := isPattern(path) 186 | if isPattern { 187 | path = replaceSuffix(path, "%24", "%2524") 188 | } 189 | 190 | // Keep * escaped 191 | path = strings.Replace(path, "%2A", "%252A", -1) 192 | if unescapedPath, err := url.PathUnescape(path); err == nil { 193 | path = unescapedPath 194 | } else { 195 | path = strings.Replace(path, "%252A", "%2A", -1) 196 | } 197 | 198 | if isPattern { 199 | regexPattern, err := compilePattern(path) 200 | if err != nil { 201 | return err 202 | } 203 | 204 | g.rules = append(g.rules, &rule{ 205 | pattern: regexPattern, 206 | isAllowed: isAllowed, 207 | }) 208 | } else { 209 | g.rules = append(g.rules, &rule{ 210 | path: path, 211 | isAllowed: isAllowed, 212 | }) 213 | } 214 | 215 | return nil 216 | } 217 | 218 | func (r *RobotsTxt) addCrawlDelay(userAgent string, crawlDelay string) (err error) { 219 | g, ok := r.groups[userAgent] 220 | if !ok { 221 | g = &group{} 222 | r.groups[userAgent] = g 223 | } 224 | 225 | if delay, err := strconv.ParseFloat(crawlDelay, 64); err == nil { 226 | g.crawlDelay = time.Duration(delay * float64(time.Second)) 227 | } 228 | 229 | return 230 | } 231 | 232 | // Host is the preferred hosts from the robots.txt file if there is one 233 | func (r *RobotsTxt) Host() string { 234 | return r.host 235 | } 236 | 237 | // CrawlDelay returns the crawl delay for the specified 238 | // user agent or 0 if there is none 239 | func (r *RobotsTxt) CrawlDelay(userAgent string) time.Duration { 240 | if group, ok := r.groups[normaliseUserAgent(userAgent)]; ok { 241 | return group.crawlDelay 242 | } 243 | 244 | if group, ok := r.groups["*"]; ok { 245 | return group.crawlDelay 246 | } 247 | 248 | return 0 249 | } 250 | 251 | // Sitemaps returns a list of sitemaps from the robots.txt file if any 252 | func (r *RobotsTxt) Sitemaps() []string { 253 | return r.sitemaps 254 | } 255 | 256 | // IsAllowed checks if the specified path is allowed by the robots.txt file 257 | func (r *RobotsTxt) IsAllowed(userAgent string, urlStr string) (result bool, err error) { 258 | u, err := parseAndNormalizeURL(urlStr) 259 | if err != nil { 260 | return 261 | } 262 | 263 | if u.Scheme != r.url.Scheme || u.Host != r.url.Host { 264 | err = &InvalidHostError{} 265 | return 266 | } 267 | 268 | result = true 269 | 270 | if group, ok := r.groups[normaliseUserAgent(userAgent)]; ok { 271 | result = group.isAllowed(userAgent, u.Path) 272 | } else if group, ok := r.groups["*"]; ok { 273 | result = group.isAllowed(userAgent, u.Path) 274 | } 275 | 276 | return 277 | } 278 | 
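Parse expects the robots.txt body as a string together with the URL it was fetched from; downloading the file is left to the caller. A minimal sketch of that step using only the standard library (the target site, path and user-agent string are placeholders):

```go
package main

import (
    "io"
    "log"
    "net/http"

    "github.com/samclarke/robotstxt"
)

func main() {
    const robotsURL = "http://www.example.com/robots.txt" // placeholder site

    resp, err := http.Get(robotsURL)
    if err != nil {
        log.Fatalln(err)
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        log.Fatalln(err)
    }

    robots, err := robotstxt.Parse(string(body), robotsURL)
    if err != nil {
        log.Fatalln(err)
    }

    allowed, err := robots.IsAllowed("MyBot/1.0", "http://www.example.com/some/page.html")
    if err != nil {
        log.Fatalln(err)
    }
    log.Println("allowed:", allowed)
}
```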
--------------------------------------------------------------------------------
/robotstxt_test.go:
--------------------------------------------------------------------------------
1 | package robotstxt
2 | 
3 | import (
4 |     "reflect"
5 |     "testing"
6 |     "time"
7 | )
8 | 
9 | func testRobots(t *testing.T, contents string, url string, allowed []string, disallowed []string) {
10 |     robots, _ := Parse(contents, url)
11 | 
12 |     for _, path := range allowed {
13 |         allowed, err := robots.IsAllowed("*", path)
14 |         if err != nil {
15 |             t.Errorf(err.Error())
16 |         } else if !allowed {
17 |             t.Errorf("The path " + path + " should be allowed")
18 |         }
19 |     }
20 | 
21 |     for _, path := range disallowed {
22 |         allowed, err := robots.IsAllowed("*", path)
23 |         if err != nil {
24 |             t.Errorf(err.Error())
25 |         } else if allowed {
26 |             t.Errorf("The path " + path + " should be disallowed")
27 |         }
28 |     }
29 | }
30 | 
31 | func TestRobotsTxt_parseTheDisallowDirective(t *testing.T) {
32 |     url := "http://www.example.com/robots.txt"
33 |     contents := `
34 | User-agent: *
35 | Disallow: /fish/
36 | Disallow: /test.html
37 | `
38 | 
39 |     allowed := []string{
40 |         "http://www.example.com/fish",
41 |         "http://www.example.com/Test.html",
42 |     }
43 | 
44 |     disallowed := []string{
45 |         "http://www.example.com/fish/index.php",
46 |         "http://www.example.com/fish/",
47 |         "http://www.example.com/test.html",
48 |     }
49 | 
50 |     testRobots(t, contents, url, allowed, disallowed)
51 | }
52 | 
53 | func TestRobotsTxt_parsePatterns(t *testing.T) {
54 |     url := "http://www.example.com/robots.txt"
55 |     contents := `
56 | User-agent: *
57 | Disallow: /fish*.php
58 | Disallow: /*.dext$
59 | `
60 | 
61 |     allowed := []string{
62 |         "http://www.example.com/Fish.PHP",
63 |         "http://www.example.com/Fish.dext1",
64 |     }
65 | 
66 |     disallowed := []string{
67 |         "http://www.example.com/fish.php",
68 |         "http://www.example.com/fishheads/catfish.php?parameters",
69 |         "http://www.example.com/AnYthInG.dext",
70 |         "http://www.example.com/Fish.dext.dext",
71 |     }
72 | 
73 |     testRobots(t, contents, url, allowed, disallowed)
74 | }
75 | 
76 | func TestRobotsTxt_correctOrderPrecedenceForAllowAndDisallow(t *testing.T) {
77 |     url := "http://www.example.com/robots.txt"
78 |     contents := `
79 | User-agent: *
80 | Disallow: /fish*.php
81 | Allow: /fish/index.php
82 | Disallow: /test
83 | Allow: /test/
84 | `
85 | 
86 |     allowed := []string{
87 |         "http://www.example.com/test/index.html",
88 |         "http://www.example.com/test/",
89 |     }
90 | 
91 |     disallowed := []string{
92 |         "http://www.example.com/fish.php",
93 |         "http://www.example.com/fishheads/catfish.php?parameters",
94 |         "http://www.example.com/fish/index.php",
95 |         "http://www.example.com/test",
96 |     }
97 | 
98 |     testRobots(t, contents, url, allowed, disallowed)
99 | }
100 | 
101 | func TestRobotsTxt_ignoreRulesThatAreNotInAGroup(t *testing.T) {
102 |     url := "http://www.example.com/robots.txt"
103 |     contents := `
104 | Disallow: /secret.html
105 | Disallow: /test
106 | `
107 | 
108 |     allowed := []string{
109 |         "http://www.example.com/secret.html",
110 |         "http://www.example.com/test/index.html",
111 |         "http://www.example.com/test/",
112 |     }
113 | 
114 |     disallowed := []string{}
115 | 
116 |     testRobots(t, contents, url, allowed, disallowed)
117 | }
118 | 
119 | func TestRobotsTxt_ignoreComments(t *testing.T) {
120 |     url := "http://www.example.com/robots.txt"
121 |     contents := `
122 | #
123 | # This is a comment
124 | #
125 | User-agent: *
126 | # This is a comment
127 | Disallow: /fish/
128 | # Disallow: fish
129 | Disallow: /test.html
130 | `
131 | 
132 |     allowed := []string{
133 |         "http://www.example.com/fish",
134 |         "http://www.example.com/Test.html",
135 |     }
136 | 
137 |     disallowed := []string{
138 |         "http://www.example.com/fish/index.php",
139 |         "http://www.example.com/fish/",
140 |         "http://www.example.com/test.html",
141 |     }
142 | 
143 |     testRobots(t, contents, url, allowed, disallowed)
144 | }
145 | 
146 | func TestRobotsTxt_ignoreInvalidLines(t *testing.T) {
147 |     url := "http://www.example.com/robots.txt"
148 |     contents := `
149 | invalid line
150 | User-agent: *
151 | Disallow: /fish/
152 | :::::another invalid line:::::
153 | Disallow: /test.html
154 | Unknown: tule
155 | `
156 | 
157 |     allowed := []string{
158 |         "http://www.example.com/fish",
159 |         "http://www.example.com/Test.html",
160 |     }
161 | 
162 |     disallowed := []string{
163 |         "http://www.example.com/fish/index.php",
164 |         "http://www.example.com/fish/",
165 |         "http://www.example.com/test.html",
166 |     }
167 | 
168 |     testRobots(t, contents, url, allowed, disallowed)
169 | }
170 | 
171 | func TestRobotsTxt_ignoreEmptyUserAgentLines(t *testing.T) {
172 |     url := "http://www.example.com/robots.txt"
173 |     contents := `
174 | User-agent:
175 | Disallow: /fish/
176 | Disallow: /test.html
177 | `
178 | 
179 |     allowed := []string{
180 |         "http://www.example.com/fish",
181 |         "http://www.example.com/Test.html",
182 |         "http://www.example.com/fish/index.php",
183 |         "http://www.example.com/fish/",
184 |         "http://www.example.com/test.html",
185 |     }
186 | 
187 |     disallowed := []string{}
188 | 
189 |     testRobots(t, contents, url, allowed, disallowed)
190 | }
191 | 
192 | func TestRobotsTxt_supportGroupsWithMultipleUserAgents(t *testing.T) {
193 |     url := "http://www.example.com/robots.txt"
194 |     contents := `
195 | User-agent: agEnTa
196 | User-agent: agentb
197 | Disallow: /fish
198 | `
199 | 
200 |     robots, _ := Parse(contents, url)
201 | 
202 |     allowed, _ := robots.IsAllowed("agenta", "http://www.example.com/test.html")
203 |     if !allowed {
204 |         t.Errorf("The path /test.html should be allowed")
205 |     }
206 | 
207 |     allowed, _ = robots.IsAllowed("agentb", "http://www.example.com/test.html")
208 |     if !allowed {
209 |         t.Errorf("The path /test.html should be allowed")
210 |     }
211 | 
212 |     allowed, _ = robots.IsAllowed("agenta", "http://www.example.com/fish/test.html")
213 |     if allowed {
214 |         t.Errorf("The path /fish/test.html should be disallowed")
215 |     }
216 | 
217 |     allowed, _ = robots.IsAllowed("agentb", "http://www.example.com/fish/test.html")
218 |     if allowed {
219 |         t.Errorf("The path /fish/test.html should be disallowed")
220 |     }
221 | }
222 | 
223 | func TestRobotsTxt_returnErrorForInvalidUrls(t *testing.T) {
224 |     url := "http://www.example.com/robots.txt"
225 |     contents := `
226 | User-agent: *
227 | Disallow: /secret.html
228 | Disallow: /test
229 | `
230 | 
231 |     invalidUrls := []string{
232 |         "http://example.com/secret.html",
233 |         "http://www.example.net/test/index.html",
234 |         "http://www.examsple.com/test/",
235 |         "h:||@@##'#']s;a[//test/",
236 |     }
237 | 
238 |     robots, _ := Parse(contents, url)
239 | 
240 |     for _, u := range invalidUrls {
241 |         _, err := robots.IsAllowed("*", u)
242 |         _, ok := err.(*InvalidHostError)
243 |         if !ok {
244 |             t.Errorf("The URL " + u + " should cause an error")
245 |         }
246 |     }
247 | }
248 | 
249 | func TestRobotsTxt_handleUrlsWithPunycode(t *testing.T) {
250 |     url := "http://www.münich.com/robots.txt"
251 |     contents := `
252 | User-agent: *
253 | Disallow: /secret.html
254 | Disallow: /test
255 | `
256 | 
257 |     allowed := []string{
258 |
"http://www.münich.com/index.html", 259 | "http://www.xn--mnich-kva.com/index.html", 260 | } 261 | 262 | disallowed := []string{ 263 | "http://www.münich.com/secret.html", 264 | "http://www.xn--mnich-kva.com/secret.html", 265 | } 266 | 267 | testRobots(t, contents, url, allowed, disallowed) 268 | } 269 | 270 | func TestRobotsTxt_allowAllIfEmptyRobotsTxt(t *testing.T) { 271 | url := "http://www.example.com/robots.txt" 272 | contents := `` 273 | 274 | allowed := []string{ 275 | "http://www.example.com/secret.html", 276 | "http://www.example.com/test/index.html", 277 | "http://www.example.com/test/", 278 | } 279 | 280 | disallowed := []string{} 281 | 282 | testRobots(t, contents, url, allowed, disallowed) 283 | } 284 | 285 | func TestRobotsTxt_parseTheCrawlDelayDirective(t *testing.T) { 286 | url := "http://www.example.com/robots.txt" 287 | contents := ` 288 | user-agent: a 289 | crawl-delay: 1 290 | 291 | user-agent: b 292 | disallow: /d 293 | 294 | user-agent: c 295 | user-agent: d 296 | crawl-delay: 10 297 | ` 298 | 299 | robots, _ := Parse(contents, url) 300 | 301 | if robots.CrawlDelay("a") != time.Second { 302 | t.Errorf("Expected crawl delay for a to be 1") 303 | } 304 | 305 | if robots.CrawlDelay("b") != 0 { 306 | t.Errorf("Expected crawl delay for b to be 0") 307 | } 308 | 309 | if robots.CrawlDelay("c") != 10*time.Second { 310 | t.Errorf("Expected crawl delay for c to be 10") 311 | } 312 | 313 | if robots.CrawlDelay("d") != 10*time.Second { 314 | t.Errorf("Expected crawl delay for d to be 10") 315 | } 316 | } 317 | 318 | func TestRobotsTxt_returnZeroIfEmpty(t *testing.T) { 319 | url := "http://www.example.com/robots.txt" 320 | contents := `` 321 | 322 | robots, _ := Parse(contents, url) 323 | 324 | if robots.CrawlDelay("a") != 0 { 325 | t.Errorf("Expected crawl delay for a to be 0") 326 | } 327 | } 328 | 329 | func TestRobotsTxt_ignoreInvalidCrawlDelayDirectives(t *testing.T) { 330 | url := "http://www.example.com/robots.txt" 331 | contents := ` 332 | user-agent: a 333 | crawl-delay: 1.2.1 334 | 335 | user-agent: b 336 | crawl-delay: 1.a0 337 | 338 | user-agent: c 339 | user-agent: d 340 | crawl-delay: 10a 341 | ` 342 | 343 | robots, _ := Parse(contents, url) 344 | 345 | if robots.CrawlDelay("a") != 0 { 346 | t.Errorf("Expected crawl delay for a to be 0") 347 | } 348 | 349 | if robots.CrawlDelay("b") != 0 { 350 | t.Errorf("Expected crawl delay for b to be 0") 351 | } 352 | 353 | if robots.CrawlDelay("c") != 0 { 354 | t.Errorf("Expected crawl delay for c to be 0") 355 | } 356 | 357 | if robots.CrawlDelay("d") != 0 { 358 | t.Errorf("Expected crawl delay for d to be 0") 359 | } 360 | } 361 | 362 | func TestRobotsTxt_parseTheSitemapDirective(t *testing.T) { 363 | url := "http://www.example.com/robots.txt" 364 | contents := ` 365 | user-agent: a 366 | crawl-delay: 1 367 | sitemap: http://example.com/test.xml 368 | 369 | user-agent: b 370 | disallow: /d 371 | 372 | sitemap: /sitemap.xml 373 | sitemap: http://example.com/test/sitemap.xml 374 | ` 375 | 376 | expected := []string{ 377 | "http://example.com/test.xml", 378 | "/sitemap.xml", 379 | "http://example.com/test/sitemap.xml", 380 | } 381 | 382 | robots, _ := Parse(contents, url) 383 | 384 | if !reflect.DeepEqual(robots.Sitemaps(), expected) { 385 | t.Errorf("Expected sitemaps to match") 386 | } 387 | } 388 | 389 | func TestRobotsTxt_parseTheHostDirective(t *testing.T) { 390 | url := "http://www.example.com/robots.txt" 391 | contents := ` 392 | user-agent: a 393 | crawl-delay: 1 394 | host: www.example.net 395 | 396 | user-agent: b 397 
| disallow: /d 398 | 399 | host: example.com 400 | ` 401 | 402 | robots, _ := Parse(contents, url) 403 | 404 | if robots.Host() != "example.com" { 405 | t.Errorf("Expected host to be example.com") 406 | } 407 | } 408 | 409 | func TestRobotsTxt_parseEmptyAndInvalidDirectives(t *testing.T) { 410 | url := "http://www.example.com/robots.txt" 411 | contents := ` 412 | user-agent: 413 | user-agent:::: a:: 414 | crawl-delay: 415 | crawl-delay:::: 0: 416 | host: 417 | host:: example.com 418 | sitemap: 419 | sitemap:: site:map.xml 420 | disallow: 421 | disallow::: /: 422 | allow: 423 | allow::: /: 424 | ` 425 | 426 | _, err := Parse(contents, url) 427 | if err != nil { 428 | t.Errorf("Expected to not fail on invalid directives") 429 | } 430 | } 431 | 432 | func TestRobotsTxt_treatOnlyTheLastHostDirectiveAsValid(t *testing.T) { 433 | url := "http://www.example.com/robots.txt" 434 | contents := ` 435 | user-agent: a 436 | crawl-delay: 1 437 | host: www.example.net 438 | 439 | user-agent: b 440 | disallow: /d 441 | 442 | host: example.net 443 | host: example.com 444 | ` 445 | 446 | robots, _ := Parse(contents, url) 447 | 448 | if robots.Host() != "example.com" { 449 | t.Errorf("Expected host to be example.com") 450 | } 451 | } 452 | 453 | func TestRobotsTxt_returnEmptyStringWhenThereIsNoHostDirective(t *testing.T) { 454 | url := "http://www.example.com/robots.txt" 455 | contents := ` 456 | user-agent: a 457 | crawl-delay: 1 458 | 459 | user-agent: b 460 | disallow: /d 461 | ` 462 | 463 | robots, _ := Parse(contents, url) 464 | 465 | if robots.Host() != "" { 466 | t.Errorf("Expected host to be empty") 467 | } 468 | } 469 | 470 | func TestRobotsTxt_fallbackToDefaultWhenUserAgentHasRulesOfItsOwn(t *testing.T) { 471 | url := "http://www.example.com/robots.txt" 472 | contents := ` 473 | user-agent: * 474 | disallow: /test/ 475 | crawl-delay: 1 476 | 477 | user-agent: b 478 | crawl-delay: 12 479 | 480 | user-agent: c 481 | user-agent: d 482 | crawl-delay: 10 483 | ` 484 | 485 | robots, _ := Parse(contents, url) 486 | 487 | if robots.CrawlDelay("should-fall-back") != 1*time.Second { 488 | t.Errorf("Expected crawl delay for should-fall-back to be 1") 489 | } 490 | 491 | if robots.CrawlDelay("d") != 10*time.Second { 492 | t.Errorf("Expected crawl delay for d to be 10") 493 | } 494 | 495 | if robots.CrawlDelay("dd") != 1*time.Second { 496 | t.Errorf("Expected crawl delay for dd to be 1") 497 | } 498 | 499 | allowed, _ := robots.IsAllowed("should-fall-back", "http://www.example.com/test/") 500 | if allowed { 501 | t.Errorf("Expected /test/ to be disallowed for should-fall-back to") 502 | } 503 | } 504 | 505 | func TestRobotsTxt_shouldNotFallbackToDefaultWhenUserAgentHasRules(t *testing.T) { 506 | url := "http://www.example.com/robots.txt" 507 | contents := ` 508 | user-agent: * 509 | disallow: /test/ 510 | crawl-delay: 1 511 | 512 | user-agent: b 513 | allow: 514 | ` 515 | 516 | robots, _ := Parse(contents, url) 517 | 518 | if robots.CrawlDelay("b") != 0*time.Second { 519 | t.Errorf("Expected crawl delay for b to be 0") 520 | } 521 | 522 | allowed, _ := robots.IsAllowed("b", "http://www.example.com/test/") 523 | if !allowed { 524 | t.Errorf("Expected /test/ to be allowed for b to") 525 | } 526 | } 527 | 528 | func TestRobotsTxt_ignoreVersionNumbersInTheUserAgentString(t *testing.T) { 529 | url := "http://www.example.com/robots.txt" 530 | contents := ` 531 | user-agent: * 532 | crawl-delay: 1 533 | 534 | user-agent: b 535 | crawl-delay: 12 536 | 537 | user-agent: c 538 | user-agent: d 539 | crawl-delay: 10 540 | 
` 541 | 542 | robots, _ := Parse(contents, url) 543 | 544 | if robots.CrawlDelay("should-fall-back/1.0.0") != time.Second { 545 | t.Errorf("Expected crawl delay for should-fall-back/1.0.0 to be 1") 546 | } 547 | 548 | if robots.CrawlDelay("d/12") != 10*time.Second { 549 | t.Errorf("Expected crawl delay for d/12 to be 10") 550 | } 551 | 552 | if robots.CrawlDelay("dd / 0-32-3") != 1*time.Second { 553 | t.Errorf("Expected crawl delay for dd / 0-32-3 to be 1") 554 | } 555 | 556 | if robots.CrawlDelay("b / 1.0") != 12*time.Second { 557 | t.Errorf("Expected crawl delay for b / 1.0 to be 12") 558 | } 559 | } 560 | 561 | func TestRobotsTxt_handleUrlEncodedAndUtf8PathsAndUrls(t *testing.T) { 562 | url := "http://www.example.com/robots.txt" 563 | contents := ` 564 | User-agent: * 565 | Disallow: /wiki:Article_wizard 566 | Disallow: /wiki%3AArticle_wizard 567 | Disallow: /اختبارات 568 | Disallow: /%E6%B5%8B%E8%AF%95 569 | Disallow: /考查 570 | Disallow: /%E0%A6%AA%E0%A6%B0%E0%A7%80%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE 571 | ` 572 | 573 | allowed := []string{ 574 | "http://www.example.com/fish", 575 | "http://www.example.com/اختبار", 576 | } 577 | 578 | disallowed := []string{ 579 | "http://www.example.com/wiki:Article_wizard", 580 | "http://www.example.com/wiki%3AArticle_wizard", 581 | "http://www.example.com/اختبارات/test", 582 | "http://www.example.com/测试", 583 | "http://www.example.com/%E8%80%83%E6%9F%A5/test", 584 | "http://www.example.com/%E0%A6%AA%E0%A6%B0%E0%A7%80%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE", 585 | } 586 | 587 | testRobots(t, contents, url, allowed, disallowed) 588 | } 589 | 590 | func TestRobotsTxt_invalidUrlEncodingsShouldBeTreatedAsUnencoded(t *testing.T) { 591 | url := "http://www.example.com/robots.txt" 592 | contents := ` 593 | User-agent: * 594 | Disallow: /%20%A/test 595 | Disallow: /%24%A/test$ 596 | Disallow: /%B/*test%24 597 | ` 598 | 599 | allowed := []string{ 600 | "http://www.example.com/ %25A/test/test", 601 | "http://www.example.com/+%25A/test", 602 | "http://www.example.com/%20%25A/test", 603 | "http://www.example.com/*%25A/testing", 604 | "http://www.example.com/%25B/test$", 605 | } 606 | 607 | disallowed := []string{ 608 | "http://www.example.com/%2520%25A/test", 609 | "http://www.example.com/%2524%25A/test", 610 | "http://www.example.com/%25B/test%2524", 611 | } 612 | 613 | testRobots(t, contents, url, allowed, disallowed) 614 | } 615 | 616 | func TestRobotsTxt_handleUrlEncodingsWithPatterns(t *testing.T) { 617 | url := "http://www.example.com/robots.txt" 618 | contents := ` 619 | User-agent: * 620 | Disallow: /%20A/*test$ 621 | Disallow: /%20B/*test%24 622 | Disallow: /%20C/test%24 623 | Disallow: /%20D/%2Atest$ 624 | ` 625 | 626 | allowed := []string{ 627 | "http://www.example.com/ A/la/testing", 628 | "http://www.example.com/ B/la/test", 629 | "http://www.example.com/ C/test", 630 | "http://www.example.com/ D/la/test", 631 | } 632 | 633 | disallowed := []string{ 634 | "http://www.example.com/ A/la/test", 635 | "http://www.example.com/ B/la/test$", 636 | "http://www.example.com/ B/la/test$test", 637 | "http://www.example.com/ C/test$", 638 | "http://www.example.com/ D/*test", 639 | "http://www.example.com/ D/%2Atest", 640 | } 641 | 642 | testRobots(t, contents, url, allowed, disallowed) 643 | } 644 | --------------------------------------------------------------------------------