├── .gitignore ├── .travis.yml ├── LICENSE.md ├── README.md ├── robotstxt.go └── robotstxt_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - stable 4 | before_install: 5 | - go get github.com/mattn/goveralls 6 | script: 7 | - $GOPATH/bin/goveralls -service=travis-ci 8 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Sam Clarke 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Robots Parser [![Build Status](https://travis-ci.org/samclarke/robotstxt.svg?branch=master)](https://travis-ci.org/samclarke/robotstxt) [![Coverage Status](https://coveralls.io/repos/github/samclarke/robotstxt/badge.svg?branch=master)](https://coveralls.io/github/samclarke/robotstxt?branch=master) [![Go Report Card](https://goreportcard.com/badge/github.com/samclarke/robotstxt)](https://goreportcard.com/report/github.com/samclarke/robotstxt) [![GoDoc](https://godoc.org/github.com/samclarke/robotstxt?status.svg)](http://godoc.org/github.com/samclarke/robotstxt) 2 | 3 | A robots.txt parser written in Go, based on the Node.js [robots-parser package](https://github.com/samclarke/robots-parser). 
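For example, wildcard (`*`) and end-of-line (`$`) patterns behave as in the sketch below. The rules and expected results are taken from this repository's own tests; the `MyBot/1.0` user agent is just a placeholder:

```go
package main

import (
    "fmt"

    "github.com/samclarke/robotstxt"
)

func main() {
    contents := `
User-agent: *
Disallow: /fish*.php
Disallow: /*.dext$
`
    robots, err := robotstxt.Parse(contents, "http://www.example.com/robots.txt")
    if err != nil {
        panic(err)
    }

    allowed, _ := robots.IsAllowed("MyBot/1.0", "http://www.example.com/fish.php")
    fmt.Println(allowed) // false: * matches any run of characters, including none

    allowed, _ = robots.IsAllowed("MyBot/1.0", "http://www.example.com/Fish.PHP")
    fmt.Println(allowed) // true: matching is case-sensitive

    allowed, _ = robots.IsAllowed("MyBot/1.0", "http://www.example.com/Fish.dext1")
    fmt.Println(allowed) // true: $ anchors the pattern to the end of the path
}
```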
4 | 
5 | It currently supports:
6 | 
7 | * User-agent:
8 | * Allow:
9 | * Disallow:
10 | * Sitemap:
11 | * Crawl-delay:
12 | * Host:
13 | * URL encoded & UTF-8 paths
14 | * Paths with wildcards (*) and EOL matching ($)
15 | 
16 | ## Installation
17 | 
18 | Install with `go get`:
19 | 
20 |     go get github.com/samclarke/robotstxt
21 | 
22 | ## Usage
23 | 
24 | ```go
25 | package main
26 | 
27 | import (
28 |     "fmt"
29 |     "log"
30 |     "strings"
31 | 
32 |     "github.com/samclarke/robotstxt"
33 | )
34 | 
35 | func main() {
36 |     url := "http://www.example.com/robots.txt"
37 |     contents := `
38 | User-agent: *
39 | Disallow: /dir/
40 | Disallow: /test.html
41 | Allow: /dir/test.html
42 | Allow: /test.html
43 | Crawl-delay: 1
44 | Sitemap: http://example.com/sitemap.xml
45 | Host: example.com
46 | `
47 | 
48 |     robots, err := robotstxt.Parse(contents, url)
49 |     if err != nil {
50 |         log.Fatalln(err.Error())
51 |     }
52 | 
53 |     allowed, _ := robots.IsAllowed("Sams-Bot/1.0", "http://www.example.com/test.html")
54 |     if !allowed {
55 |         fmt.Println("Not allowed to crawl: /test.html")
56 |     }
57 | 
58 |     allowed, _ = robots.IsAllowed("Sams-Bot/1.0", "http://www.example.com/dir/test.html")
59 |     if allowed {
60 |         fmt.Println("Allowed to crawl: /dir/test.html")
61 |     }
62 | 
63 |     // 1s
64 |     fmt.Println("Crawl delay:", robots.CrawlDelay("Sams-Bot/1.0"))
65 | 
66 |     // [http://example.com/sitemap.xml]
67 |     fmt.Println("Sitemaps:", strings.Join(robots.Sitemaps(), ","))
68 | 
69 |     // example.com
70 |     fmt.Println("Preferred host:", robots.Host())
71 | }
72 | ```
73 | 
74 | # License
75 | 
76 | The MIT License (MIT)
77 | 
78 | Copyright (c) 2017 Sam Clarke
79 | 
80 | Permission is hereby granted, free of charge, to any person obtaining a copy
81 | of this software and associated documentation files (the "Software"), to deal
82 | in the Software without restriction, including without limitation the rights
83 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
84 | copies of the Software, and to permit persons to whom the Software is
85 | furnished to do so, subject to the following conditions:
86 | 
87 | The above copyright notice and this permission notice shall be included in
88 | all copies or substantial portions of the Software.
89 | 
90 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
95 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
96 | THE SOFTWARE.
97 | 
--------------------------------------------------------------------------------
/robotstxt.go:
--------------------------------------------------------------------------------
1 | // Package robotstxt parses robots.txt files
2 | //
3 | // Aims to follow the Google robots.txt specification, see:
4 | // https://developers.google.com/search/reference/robots_txt
5 | // for more information.
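//
// A minimal usage sketch (contents holds the body of a robots.txt file; the
// URLs and user agent below are placeholders):
//
//	robots, err := Parse(contents, "http://www.example.com/robots.txt")
//	allowed, err := robots.IsAllowed("MyBot/1.0", "http://www.example.com/some/path")
//	delay := robots.CrawlDelay("MyBot/1.0")
//	sitemaps := robots.Sitemaps()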
6 | package robotstxt 7 | 8 | import ( 9 | "net/url" 10 | "regexp" 11 | "strconv" 12 | "strings" 13 | "time" 14 | 15 | "golang.org/x/net/idna" 16 | ) 17 | 18 | type rule struct { 19 | isAllowed bool 20 | path string 21 | pattern *regexp.Regexp 22 | } 23 | 24 | type group struct { 25 | rules []*rule 26 | crawlDelay time.Duration 27 | } 28 | 29 | // RobotsTxt represents a parsed robots.txt file 30 | type RobotsTxt struct { 31 | url *url.URL 32 | groups map[string]*group 33 | sitemaps []string 34 | host string 35 | } 36 | 37 | // InvalidHostError is the error when a URL is tested with IsAllowed that 38 | // is not valid for this robots.txt file 39 | type InvalidHostError struct{} 40 | 41 | func (e InvalidHostError) Error() string { 42 | return "URL is not valid for this robots.txt file" 43 | } 44 | 45 | func parseAndNormalizeURL(urlStr string) (u *url.URL, err error) { 46 | u, err = url.Parse(urlStr) 47 | if err == nil { 48 | u.Host, err = idna.ToASCII(u.Host) 49 | } 50 | 51 | return 52 | } 53 | 54 | func replaceSuffix(str, suffix, replacement string) string { 55 | if strings.HasSuffix(str, suffix) { 56 | return str[:len(str)-len(suffix)] + replacement 57 | } 58 | 59 | return str 60 | } 61 | 62 | func isPattern(path string) bool { 63 | return strings.IndexRune(path, '*') > -1 || strings.HasSuffix(path, "$") 64 | } 65 | 66 | func compilePattern(pattern string) (*regexp.Regexp, error) { 67 | pattern = regexp.QuoteMeta(pattern) 68 | pattern = strings.Replace(pattern, "\\*", "(?:.*)", -1) 69 | 70 | pattern = replaceSuffix(pattern, "\\$", "$") 71 | pattern = replaceSuffix(pattern, "%24", "\\$") 72 | pattern = replaceSuffix(pattern, "%2524", "%24") 73 | 74 | pattern = strings.Replace(pattern, "%2A", "\\*", -1) 75 | 76 | return regexp.Compile(pattern) 77 | } 78 | 79 | func normaliseUserAgent(userAgent string) string { 80 | index := strings.IndexRune(userAgent, '/') 81 | if index > -1 { 82 | userAgent = userAgent[:index] 83 | } 84 | 85 | return strings.ToLower(strings.TrimSpace(userAgent)) 86 | } 87 | 88 | func (r *group) isAllowed(userAgent string, path string) bool { 89 | var result = true 90 | var resultPathLength = 0 91 | 92 | for _, rule := range r.rules { 93 | if rule.pattern != nil { 94 | // The first matching pattern takes precedence 95 | if rule.pattern.MatchString(path) { 96 | return rule.isAllowed 97 | } 98 | } else { 99 | // The longest matching path takes precedence 100 | if resultPathLength > len(rule.path) { 101 | continue 102 | } 103 | 104 | if strings.HasPrefix(path, rule.path) { 105 | result = rule.isAllowed 106 | resultPathLength = len(rule.path) 107 | } 108 | } 109 | } 110 | 111 | return result 112 | } 113 | 114 | // Parse parses the contents or a robots.txt file and returns a 115 | // RobotsTxt struct that can be used to check if URLs can be crawled 116 | // or extract crawl delays, sitemaps or the preferred host name 117 | func Parse(contents string, urlStr string) (robotsTxt *RobotsTxt, err error) { 118 | u, err := parseAndNormalizeURL(urlStr) 119 | if err != nil { 120 | return 121 | } 122 | 123 | robotsTxt = &RobotsTxt{ 124 | url: u, 125 | groups: make(map[string]*group), 126 | } 127 | 128 | var userAgents []string 129 | isNoneUserAgentState := false 130 | 131 | lines := strings.Split(contents, "\n") 132 | for _, line := range lines { 133 | parts := strings.SplitN(line, ":", 2) 134 | if len(parts) > 1 { 135 | rule, val := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]) 136 | 137 | switch strings.ToLower(rule) { 138 | case "user-agent": 139 | if isNoneUserAgentState { 
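// A directive other than User-agent ended the previous group, so this
// User-agent line starts a new group: drop the agents collected so far.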
140 | userAgents = nil 141 | } 142 | userAgents = append(userAgents, normaliseUserAgent(val)) 143 | break 144 | case "allow": 145 | for _, ua := range userAgents { 146 | robotsTxt.addPathRule(ua, val, true) 147 | } 148 | break 149 | case "disallow": 150 | for _, ua := range userAgents { 151 | robotsTxt.addPathRule(ua, val, false) 152 | } 153 | break 154 | case "crawl-delay": 155 | for _, ua := range userAgents { 156 | robotsTxt.addCrawlDelay(ua, val) 157 | } 158 | break 159 | case "sitemap": 160 | if val != "" { 161 | robotsTxt.sitemaps = append(robotsTxt.sitemaps, val) 162 | } 163 | break 164 | case "host": 165 | if val != "" { 166 | robotsTxt.host = val 167 | } 168 | break 169 | } 170 | 171 | isNoneUserAgentState = strings.ToLower(rule) != "user-agent" 172 | } 173 | } 174 | 175 | return 176 | } 177 | 178 | func (r *RobotsTxt) addPathRule(userAgent string, path string, isAllowed bool) error { 179 | g, ok := r.groups[userAgent] 180 | if !ok { 181 | g = &group{} 182 | r.groups[userAgent] = g 183 | } 184 | 185 | isPattern := isPattern(path) 186 | if isPattern { 187 | path = replaceSuffix(path, "%24", "%2524") 188 | } 189 | 190 | // Keep * escaped 191 | path = strings.Replace(path, "%2A", "%252A", -1) 192 | if unescapedPath, err := url.PathUnescape(path); err == nil { 193 | path = unescapedPath 194 | } else { 195 | path = strings.Replace(path, "%252A", "%2A", -1) 196 | } 197 | 198 | if isPattern { 199 | regexPattern, err := compilePattern(path) 200 | if err != nil { 201 | return err 202 | } 203 | 204 | g.rules = append(g.rules, &rule{ 205 | pattern: regexPattern, 206 | isAllowed: isAllowed, 207 | }) 208 | } else { 209 | g.rules = append(g.rules, &rule{ 210 | path: path, 211 | isAllowed: isAllowed, 212 | }) 213 | } 214 | 215 | return nil 216 | } 217 | 218 | func (r *RobotsTxt) addCrawlDelay(userAgent string, crawlDelay string) (err error) { 219 | g, ok := r.groups[userAgent] 220 | if !ok { 221 | g = &group{} 222 | r.groups[userAgent] = g 223 | } 224 | 225 | if delay, err := strconv.ParseFloat(crawlDelay, 64); err == nil { 226 | g.crawlDelay = time.Duration(delay * float64(time.Second)) 227 | } 228 | 229 | return 230 | } 231 | 232 | // Host is the preferred hosts from the robots.txt file if there is one 233 | func (r *RobotsTxt) Host() string { 234 | return r.host 235 | } 236 | 237 | // CrawlDelay returns the crawl delay for the specified 238 | // user agent or 0 if there is none 239 | func (r *RobotsTxt) CrawlDelay(userAgent string) time.Duration { 240 | if group, ok := r.groups[normaliseUserAgent(userAgent)]; ok { 241 | return group.crawlDelay 242 | } 243 | 244 | if group, ok := r.groups["*"]; ok { 245 | return group.crawlDelay 246 | } 247 | 248 | return 0 249 | } 250 | 251 | // Sitemaps returns a list of sitemaps from the robots.txt file if any 252 | func (r *RobotsTxt) Sitemaps() []string { 253 | return r.sitemaps 254 | } 255 | 256 | // IsAllowed checks if the specified path is allowed by the robots.txt file 257 | func (r *RobotsTxt) IsAllowed(userAgent string, urlStr string) (result bool, err error) { 258 | u, err := parseAndNormalizeURL(urlStr) 259 | if err != nil { 260 | return 261 | } 262 | 263 | if u.Scheme != r.url.Scheme || u.Host != r.url.Host { 264 | err = &InvalidHostError{} 265 | return 266 | } 267 | 268 | result = true 269 | 270 | if group, ok := r.groups[normaliseUserAgent(userAgent)]; ok { 271 | result = group.isAllowed(userAgent, u.Path) 272 | } else if group, ok := r.groups["*"]; ok { 273 | result = group.isAllowed(userAgent, u.Path) 274 | } 275 | 276 | return 277 | } 278 | 
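Parse expects the robots.txt body as a string together with the URL it was fetched from; downloading the file is left to the caller. A minimal sketch of that step using only the standard library (the target site, path and user-agent string are placeholders):

```go
package main

import (
    "io"
    "log"
    "net/http"

    "github.com/samclarke/robotstxt"
)

func main() {
    const robotsURL = "http://www.example.com/robots.txt" // placeholder site

    resp, err := http.Get(robotsURL)
    if err != nil {
        log.Fatalln(err)
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        log.Fatalln(err)
    }

    robots, err := robotstxt.Parse(string(body), robotsURL)
    if err != nil {
        log.Fatalln(err)
    }

    allowed, err := robots.IsAllowed("MyBot/1.0", "http://www.example.com/some/page.html")
    if err != nil {
        log.Fatalln(err)
    }
    log.Println("allowed:", allowed)
}
```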
--------------------------------------------------------------------------------
/robotstxt_test.go:
--------------------------------------------------------------------------------
1 | package robotstxt
2 | 
3 | import (
4 |     "reflect"
5 |     "testing"
6 |     "time"
7 | )
8 | 
9 | func testRobots(t *testing.T, contents string, url string, allowed []string, disallowed []string) {
10 |     robots, _ := Parse(contents, url)
11 | 
12 |     for _, path := range allowed {
13 |         allowed, err := robots.IsAllowed("*", path)
14 |         if err != nil {
15 |             t.Errorf(err.Error())
16 |         } else if !allowed {
17 |             t.Errorf("The path " + path + " should be allowed")
18 |         }
19 |     }
20 | 
21 |     for _, path := range disallowed {
22 |         allowed, err := robots.IsAllowed("*", path)
23 |         if err != nil {
24 |             t.Errorf(err.Error())
25 |         } else if allowed {
26 |             t.Errorf("The path " + path + " should be disallowed")
27 |         }
28 |     }
29 | }
30 | 
31 | func TestRobotsTxt_parseTheDisallowDirective(t *testing.T) {
32 |     url := "http://www.example.com/robots.txt"
33 |     contents := `
34 | User-agent: *
35 | Disallow: /fish/
36 | Disallow: /test.html
37 | `
38 | 
39 |     allowed := []string{
40 |         "http://www.example.com/fish",
41 |         "http://www.example.com/Test.html",
42 |     }
43 | 
44 |     disallowed := []string{
45 |         "http://www.example.com/fish/index.php",
46 |         "http://www.example.com/fish/",
47 |         "http://www.example.com/test.html",
48 |     }
49 | 
50 |     testRobots(t, contents, url, allowed, disallowed)
51 | }
52 | 
53 | func TestRobotsTxt_parsePatterns(t *testing.T) {
54 |     url := "http://www.example.com/robots.txt"
55 |     contents := `
56 | User-agent: *
57 | Disallow: /fish*.php
58 | Disallow: /*.dext$
59 | `
60 | 
61 |     allowed := []string{
62 |         "http://www.example.com/Fish.PHP",
63 |         "http://www.example.com/Fish.dext1",
64 |     }
65 | 
66 |     disallowed := []string{
67 |         "http://www.example.com/fish.php",
68 |         "http://www.example.com/fishheads/catfish.php?parameters",
69 |         "http://www.example.com/AnYthInG.dext",
70 |         "http://www.example.com/Fish.dext.dext",
71 |     }
72 | 
73 |     testRobots(t, contents, url, allowed, disallowed)
74 | }
75 | 
76 | func TestRobotsTxt_correctOrderPrecedenceForAllowAndDisallow(t *testing.T) {
77 |     url := "http://www.example.com/robots.txt"
78 |     contents := `
79 | User-agent: *
80 | Disallow: /fish*.php
81 | Allow: /fish/index.php
82 | Disallow: /test
83 | Allow: /test/
84 | `
85 | 
86 |     allowed := []string{
87 |         "http://www.example.com/test/index.html",
88 |         "http://www.example.com/test/",
89 |     }
90 | 
91 |     disallowed := []string{
92 |         "http://www.example.com/fish.php",
93 |         "http://www.example.com/fishheads/catfish.php?parameters",
94 |         "http://www.example.com/fish/index.php",
95 |         "http://www.example.com/test",
96 |     }
97 | 
98 |     testRobots(t, contents, url, allowed, disallowed)
99 | }
100 | 
101 | func TestRobotsTxt_ignoreRulesThatAreNotInAGroup(t *testing.T) {
102 |     url := "http://www.example.com/robots.txt"
103 |     contents := `
104 | Disallow: /secret.html
105 | Disallow: /test
106 | `
107 | 
108 |     allowed := []string{
109 |         "http://www.example.com/secret.html",
110 |         "http://www.example.com/test/index.html",
111 |         "http://www.example.com/test/",
112 |     }
113 | 
114 |     disallowed := []string{}
115 | 
116 |     testRobots(t, contents, url, allowed, disallowed)
117 | }
118 | 
119 | func TestRobotsTxt_ignoreComments(t *testing.T) {
120 |     url := "http://www.example.com/robots.txt"
121 |     contents := `
122 | #
123 | # This is a comment
124 | #
125 | User-agent: *
126 | # This is a comment
127 | Disallow: /fish/
128 | # Disallow: fish
129 | Disallow: /test.html
130 | `
131 | 
132 |     allowed := []string{
133 |         "http://www.example.com/fish",
134 |         "http://www.example.com/Test.html",
135 |     }
136 | 
137 |     disallowed := []string{
138 |         "http://www.example.com/fish/index.php",
139 |         "http://www.example.com/fish/",
140 |         "http://www.example.com/test.html",
141 |     }
142 | 
143 |     testRobots(t, contents, url, allowed, disallowed)
144 | }
145 | 
146 | func TestRobotsTxt_ignoreInvalidLines(t *testing.T) {
147 |     url := "http://www.example.com/robots.txt"
148 |     contents := `
149 | invalid line
150 | User-agent: *
151 | Disallow: /fish/
152 | :::::another invalid line:::::
153 | Disallow: /test.html
154 | Unknown: tule
155 | `
156 | 
157 |     allowed := []string{
158 |         "http://www.example.com/fish",
159 |         "http://www.example.com/Test.html",
160 |     }
161 | 
162 |     disallowed := []string{
163 |         "http://www.example.com/fish/index.php",
164 |         "http://www.example.com/fish/",
165 |         "http://www.example.com/test.html",
166 |     }
167 | 
168 |     testRobots(t, contents, url, allowed, disallowed)
169 | }
170 | 
171 | func TestRobotsTxt_ignoreEmptyUserAgentLines(t *testing.T) {
172 |     url := "http://www.example.com/robots.txt"
173 |     contents := `
174 | User-agent:
175 | Disallow: /fish/
176 | Disallow: /test.html
177 | `
178 | 
179 |     allowed := []string{
180 |         "http://www.example.com/fish",
181 |         "http://www.example.com/Test.html",
182 |         "http://www.example.com/fish/index.php",
183 |         "http://www.example.com/fish/",
184 |         "http://www.example.com/test.html",
185 |     }
186 | 
187 |     disallowed := []string{}
188 | 
189 |     testRobots(t, contents, url, allowed, disallowed)
190 | }
191 | 
192 | func TestRobotsTxt_supportGroupsWithMultipleUserAgents(t *testing.T) {
193 |     url := "http://www.example.com/robots.txt"
194 |     contents := `
195 | User-agent: agEnTa
196 | User-agent: agentb
197 | Disallow: /fish
198 | `
199 | 
200 |     robots, _ := Parse(contents, url)
201 | 
202 |     allowed, _ := robots.IsAllowed("agenta", "http://www.example.com/test.html")
203 |     if !allowed {
204 |         t.Errorf("The path /test.html should be allowed")
205 |     }
206 | 
207 |     allowed, _ = robots.IsAllowed("agentb", "http://www.example.com/test.html")
208 |     if !allowed {
209 |         t.Errorf("The path /test.html should be allowed")
210 |     }
211 | 
212 |     allowed, _ = robots.IsAllowed("agenta", "http://www.example.com/fish/test.html")
213 |     if allowed {
214 |         t.Errorf("The path /fish/test.html should be disallowed")
215 |     }
216 | 
217 |     allowed, _ = robots.IsAllowed("agentb", "http://www.example.com/fish/test.html")
218 |     if allowed {
219 |         t.Errorf("The path /fish/test.html should be disallowed")
220 |     }
221 | }
222 | 
223 | func TestRobotsTxt_returnErrorForInvalidUrls(t *testing.T) {
224 |     url := "http://www.example.com/robots.txt"
225 |     contents := `
226 | User-agent: *
227 | Disallow: /secret.html
228 | Disallow: /test
229 | `
230 | 
231 |     invalidUrls := []string{
232 |         "http://example.com/secret.html",
233 |         "http://www.example.net/test/index.html",
234 |         "http://www.examsple.com/test/",
235 |         "h:||@@##'#']s;a[//test/",
236 |     }
237 | 
238 |     robots, _ := Parse(contents, url)
239 | 
240 |     for _, u := range invalidUrls {
241 |         _, err := robots.IsAllowed("*", u)
242 |         _, ok := err.(*InvalidHostError)
243 |         if !ok {
244 |             t.Errorf("The URL " + u + " should cause an error")
245 |         }
246 |     }
247 | }
248 | 
249 | func TestRobotsTxt_handleUrlsWithPunycode(t *testing.T) {
250 |     url := "http://www.münich.com/robots.txt"
251 |     contents := `
252 | User-agent: *
253 | Disallow: /secret.html
254 | Disallow: /test
255 | `
256 | 
257 |     allowed := []string{
258 |
"http://www.münich.com/index.html", 259 | "http://www.xn--mnich-kva.com/index.html", 260 | } 261 | 262 | disallowed := []string{ 263 | "http://www.münich.com/secret.html", 264 | "http://www.xn--mnich-kva.com/secret.html", 265 | } 266 | 267 | testRobots(t, contents, url, allowed, disallowed) 268 | } 269 | 270 | func TestRobotsTxt_allowAllIfEmptyRobotsTxt(t *testing.T) { 271 | url := "http://www.example.com/robots.txt" 272 | contents := `` 273 | 274 | allowed := []string{ 275 | "http://www.example.com/secret.html", 276 | "http://www.example.com/test/index.html", 277 | "http://www.example.com/test/", 278 | } 279 | 280 | disallowed := []string{} 281 | 282 | testRobots(t, contents, url, allowed, disallowed) 283 | } 284 | 285 | func TestRobotsTxt_parseTheCrawlDelayDirective(t *testing.T) { 286 | url := "http://www.example.com/robots.txt" 287 | contents := ` 288 | user-agent: a 289 | crawl-delay: 1 290 | 291 | user-agent: b 292 | disallow: /d 293 | 294 | user-agent: c 295 | user-agent: d 296 | crawl-delay: 10 297 | ` 298 | 299 | robots, _ := Parse(contents, url) 300 | 301 | if robots.CrawlDelay("a") != time.Second { 302 | t.Errorf("Expected crawl delay for a to be 1") 303 | } 304 | 305 | if robots.CrawlDelay("b") != 0 { 306 | t.Errorf("Expected crawl delay for b to be 0") 307 | } 308 | 309 | if robots.CrawlDelay("c") != 10*time.Second { 310 | t.Errorf("Expected crawl delay for c to be 10") 311 | } 312 | 313 | if robots.CrawlDelay("d") != 10*time.Second { 314 | t.Errorf("Expected crawl delay for d to be 10") 315 | } 316 | } 317 | 318 | func TestRobotsTxt_returnZeroIfEmpty(t *testing.T) { 319 | url := "http://www.example.com/robots.txt" 320 | contents := `` 321 | 322 | robots, _ := Parse(contents, url) 323 | 324 | if robots.CrawlDelay("a") != 0 { 325 | t.Errorf("Expected crawl delay for a to be 0") 326 | } 327 | } 328 | 329 | func TestRobotsTxt_ignoreInvalidCrawlDelayDirectives(t *testing.T) { 330 | url := "http://www.example.com/robots.txt" 331 | contents := ` 332 | user-agent: a 333 | crawl-delay: 1.2.1 334 | 335 | user-agent: b 336 | crawl-delay: 1.a0 337 | 338 | user-agent: c 339 | user-agent: d 340 | crawl-delay: 10a 341 | ` 342 | 343 | robots, _ := Parse(contents, url) 344 | 345 | if robots.CrawlDelay("a") != 0 { 346 | t.Errorf("Expected crawl delay for a to be 0") 347 | } 348 | 349 | if robots.CrawlDelay("b") != 0 { 350 | t.Errorf("Expected crawl delay for b to be 0") 351 | } 352 | 353 | if robots.CrawlDelay("c") != 0 { 354 | t.Errorf("Expected crawl delay for c to be 0") 355 | } 356 | 357 | if robots.CrawlDelay("d") != 0 { 358 | t.Errorf("Expected crawl delay for d to be 0") 359 | } 360 | } 361 | 362 | func TestRobotsTxt_parseTheSitemapDirective(t *testing.T) { 363 | url := "http://www.example.com/robots.txt" 364 | contents := ` 365 | user-agent: a 366 | crawl-delay: 1 367 | sitemap: http://example.com/test.xml 368 | 369 | user-agent: b 370 | disallow: /d 371 | 372 | sitemap: /sitemap.xml 373 | sitemap: http://example.com/test/sitemap.xml 374 | ` 375 | 376 | expected := []string{ 377 | "http://example.com/test.xml", 378 | "/sitemap.xml", 379 | "http://example.com/test/sitemap.xml", 380 | } 381 | 382 | robots, _ := Parse(contents, url) 383 | 384 | if !reflect.DeepEqual(robots.Sitemaps(), expected) { 385 | t.Errorf("Expected sitemaps to match") 386 | } 387 | } 388 | 389 | func TestRobotsTxt_parseTheHostDirective(t *testing.T) { 390 | url := "http://www.example.com/robots.txt" 391 | contents := ` 392 | user-agent: a 393 | crawl-delay: 1 394 | host: www.example.net 395 | 396 | user-agent: b 397 
| disallow: /d 398 | 399 | host: example.com 400 | ` 401 | 402 | robots, _ := Parse(contents, url) 403 | 404 | if robots.Host() != "example.com" { 405 | t.Errorf("Expected host to be example.com") 406 | } 407 | } 408 | 409 | func TestRobotsTxt_parseEmptyAndInvalidDirectives(t *testing.T) { 410 | url := "http://www.example.com/robots.txt" 411 | contents := ` 412 | user-agent: 413 | user-agent:::: a:: 414 | crawl-delay: 415 | crawl-delay:::: 0: 416 | host: 417 | host:: example.com 418 | sitemap: 419 | sitemap:: site:map.xml 420 | disallow: 421 | disallow::: /: 422 | allow: 423 | allow::: /: 424 | ` 425 | 426 | _, err := Parse(contents, url) 427 | if err != nil { 428 | t.Errorf("Expected to not fail on invalid directives") 429 | } 430 | } 431 | 432 | func TestRobotsTxt_treatOnlyTheLastHostDirectiveAsValid(t *testing.T) { 433 | url := "http://www.example.com/robots.txt" 434 | contents := ` 435 | user-agent: a 436 | crawl-delay: 1 437 | host: www.example.net 438 | 439 | user-agent: b 440 | disallow: /d 441 | 442 | host: example.net 443 | host: example.com 444 | ` 445 | 446 | robots, _ := Parse(contents, url) 447 | 448 | if robots.Host() != "example.com" { 449 | t.Errorf("Expected host to be example.com") 450 | } 451 | } 452 | 453 | func TestRobotsTxt_returnEmptyStringWhenThereIsNoHostDirective(t *testing.T) { 454 | url := "http://www.example.com/robots.txt" 455 | contents := ` 456 | user-agent: a 457 | crawl-delay: 1 458 | 459 | user-agent: b 460 | disallow: /d 461 | ` 462 | 463 | robots, _ := Parse(contents, url) 464 | 465 | if robots.Host() != "" { 466 | t.Errorf("Expected host to be empty") 467 | } 468 | } 469 | 470 | func TestRobotsTxt_fallbackToDefaultWhenUserAgentHasRulesOfItsOwn(t *testing.T) { 471 | url := "http://www.example.com/robots.txt" 472 | contents := ` 473 | user-agent: * 474 | disallow: /test/ 475 | crawl-delay: 1 476 | 477 | user-agent: b 478 | crawl-delay: 12 479 | 480 | user-agent: c 481 | user-agent: d 482 | crawl-delay: 10 483 | ` 484 | 485 | robots, _ := Parse(contents, url) 486 | 487 | if robots.CrawlDelay("should-fall-back") != 1*time.Second { 488 | t.Errorf("Expected crawl delay for should-fall-back to be 1") 489 | } 490 | 491 | if robots.CrawlDelay("d") != 10*time.Second { 492 | t.Errorf("Expected crawl delay for d to be 10") 493 | } 494 | 495 | if robots.CrawlDelay("dd") != 1*time.Second { 496 | t.Errorf("Expected crawl delay for dd to be 1") 497 | } 498 | 499 | allowed, _ := robots.IsAllowed("should-fall-back", "http://www.example.com/test/") 500 | if allowed { 501 | t.Errorf("Expected /test/ to be disallowed for should-fall-back to") 502 | } 503 | } 504 | 505 | func TestRobotsTxt_shouldNotFallbackToDefaultWhenUserAgentHasRules(t *testing.T) { 506 | url := "http://www.example.com/robots.txt" 507 | contents := ` 508 | user-agent: * 509 | disallow: /test/ 510 | crawl-delay: 1 511 | 512 | user-agent: b 513 | allow: 514 | ` 515 | 516 | robots, _ := Parse(contents, url) 517 | 518 | if robots.CrawlDelay("b") != 0*time.Second { 519 | t.Errorf("Expected crawl delay for b to be 0") 520 | } 521 | 522 | allowed, _ := robots.IsAllowed("b", "http://www.example.com/test/") 523 | if !allowed { 524 | t.Errorf("Expected /test/ to be allowed for b to") 525 | } 526 | } 527 | 528 | func TestRobotsTxt_ignoreVersionNumbersInTheUserAgentString(t *testing.T) { 529 | url := "http://www.example.com/robots.txt" 530 | contents := ` 531 | user-agent: * 532 | crawl-delay: 1 533 | 534 | user-agent: b 535 | crawl-delay: 12 536 | 537 | user-agent: c 538 | user-agent: d 539 | crawl-delay: 10 540 | 
` 541 | 542 | robots, _ := Parse(contents, url) 543 | 544 | if robots.CrawlDelay("should-fall-back/1.0.0") != time.Second { 545 | t.Errorf("Expected crawl delay for should-fall-back/1.0.0 to be 1") 546 | } 547 | 548 | if robots.CrawlDelay("d/12") != 10*time.Second { 549 | t.Errorf("Expected crawl delay for d/12 to be 10") 550 | } 551 | 552 | if robots.CrawlDelay("dd / 0-32-3") != 1*time.Second { 553 | t.Errorf("Expected crawl delay for dd / 0-32-3 to be 1") 554 | } 555 | 556 | if robots.CrawlDelay("b / 1.0") != 12*time.Second { 557 | t.Errorf("Expected crawl delay for b / 1.0 to be 12") 558 | } 559 | } 560 | 561 | func TestRobotsTxt_handleUrlEncodedAndUtf8PathsAndUrls(t *testing.T) { 562 | url := "http://www.example.com/robots.txt" 563 | contents := ` 564 | User-agent: * 565 | Disallow: /wiki:Article_wizard 566 | Disallow: /wiki%3AArticle_wizard 567 | Disallow: /اختبارات 568 | Disallow: /%E6%B5%8B%E8%AF%95 569 | Disallow: /考查 570 | Disallow: /%E0%A6%AA%E0%A6%B0%E0%A7%80%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE 571 | ` 572 | 573 | allowed := []string{ 574 | "http://www.example.com/fish", 575 | "http://www.example.com/اختبار", 576 | } 577 | 578 | disallowed := []string{ 579 | "http://www.example.com/wiki:Article_wizard", 580 | "http://www.example.com/wiki%3AArticle_wizard", 581 | "http://www.example.com/اختبارات/test", 582 | "http://www.example.com/测试", 583 | "http://www.example.com/%E8%80%83%E6%9F%A5/test", 584 | "http://www.example.com/%E0%A6%AA%E0%A6%B0%E0%A7%80%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE", 585 | } 586 | 587 | testRobots(t, contents, url, allowed, disallowed) 588 | } 589 | 590 | func TestRobotsTxt_invalidUrlEncodingsShouldBeTreatedAsUnencoded(t *testing.T) { 591 | url := "http://www.example.com/robots.txt" 592 | contents := ` 593 | User-agent: * 594 | Disallow: /%20%A/test 595 | Disallow: /%24%A/test$ 596 | Disallow: /%B/*test%24 597 | ` 598 | 599 | allowed := []string{ 600 | "http://www.example.com/ %25A/test/test", 601 | "http://www.example.com/+%25A/test", 602 | "http://www.example.com/%20%25A/test", 603 | "http://www.example.com/*%25A/testing", 604 | "http://www.example.com/%25B/test$", 605 | } 606 | 607 | disallowed := []string{ 608 | "http://www.example.com/%2520%25A/test", 609 | "http://www.example.com/%2524%25A/test", 610 | "http://www.example.com/%25B/test%2524", 611 | } 612 | 613 | testRobots(t, contents, url, allowed, disallowed) 614 | } 615 | 616 | func TestRobotsTxt_handleUrlEncodingsWithPatterns(t *testing.T) { 617 | url := "http://www.example.com/robots.txt" 618 | contents := ` 619 | User-agent: * 620 | Disallow: /%20A/*test$ 621 | Disallow: /%20B/*test%24 622 | Disallow: /%20C/test%24 623 | Disallow: /%20D/%2Atest$ 624 | ` 625 | 626 | allowed := []string{ 627 | "http://www.example.com/ A/la/testing", 628 | "http://www.example.com/ B/la/test", 629 | "http://www.example.com/ C/test", 630 | "http://www.example.com/ D/la/test", 631 | } 632 | 633 | disallowed := []string{ 634 | "http://www.example.com/ A/la/test", 635 | "http://www.example.com/ B/la/test$", 636 | "http://www.example.com/ B/la/test$test", 637 | "http://www.example.com/ C/test$", 638 | "http://www.example.com/ D/*test", 639 | "http://www.example.com/ D/%2Atest", 640 | } 641 | 642 | testRobots(t, contents, url, allowed, disallowed) 643 | } 644 | --------------------------------------------------------------------------------