├── inst ├── robotstxts │ ├── empty.txt │ ├── host.txt │ ├── robots_spiegel.txt │ ├── disallow_all_for_all.txt │ ├── disallow_all_for_BadBot.txt │ ├── crawl_delay.txt │ ├── allow_single_bot.txt │ ├── robots_commented_token.txt │ ├── disallow_two_at_once.txt │ ├── testing_comments.txt │ ├── disallow_some_for_all.txt │ ├── robots_pmeissner.txt │ ├── selfhtml_Example.txt │ ├── robots_bundestag.txt │ ├── rbloggers.txt │ ├── robots_cdc.txt │ ├── robots_cdc2.txt │ ├── robots_new_york_times.txt │ ├── robots_yahoo.txt │ ├── robots_amazon.txt │ ├── robots_facebook.txt │ └── robots_google.txt └── http_requests │ ├── http_404.rds │ ├── http_ok_1.rds │ ├── http_ok_2.rds │ ├── http_ok_3.rds │ ├── http_ok_4.rds │ ├── http_client_error.rds │ ├── http_domain_change.rds │ ├── http_html_content.rds │ ├── http_redirect_www.rds │ └── http_server_error.rds ├── .github ├── .gitignore ├── workflows │ ├── R-CMD-check.yaml │ ├── test-coverage.yaml │ └── rhub.yaml └── CONTRIBUTING.md ├── .Rprofile ├── R ├── rt_cache.R ├── pipe.R ├── null_to_default.R ├── sanitize_path.R ├── fix_url.R ├── rt_get_comments.R ├── remove_domain.R ├── guess_domain.R ├── as_list.R ├── http_was_redirected.R ├── rt_get_useragent.R ├── parse_robotstxt.R ├── is_suspect_robotstxt.R ├── parse_url.R ├── print_robotstxt_text.R ├── tools.R ├── get_robotstxt_http_get.R ├── paths_allowed_worker_spiderbar.R ├── print_robotstxt.R ├── is_valid_robotstxt.R ├── http_subdomain_changed.R ├── http_domain_changed.R ├── list_merge.R ├── rt_get_fields_worker.R ├── rt_request_handler_defaults.R ├── request_handler_handler.R ├── rt_get_fields.R ├── get_robotstxts.R ├── paths_allowed.R ├── get_robotstxt.R ├── robotstxt.R └── rt_request_handler.R ├── man ├── figures │ └── logo.jpeg ├── pipe.Rd ├── named_list.Rd ├── guess_domain.Rd ├── fix_url.Rd ├── rt_cache.Rd ├── remove_domain.Rd ├── sanitize_path.Rd ├── print.robotstxt.Rd ├── rt_get_comments.Rd ├── rt_list_rtxt.Rd ├── null_to_default.Rd ├── rt_get_useragent.Rd ├── parse_robotstxt.Rd ├── print.robotstxt_text.Rd ├── is_suspect_robotstxt.Rd ├── as.list.robotstxt_text.Rd ├── rt_get_fields.Rd ├── rt_get_rtxt.Rd ├── http_domain_changed.Rd ├── http_was_redirected.Rd ├── http_subdomain_changed.Rd ├── is_valid_robotstxt.Rd ├── rt_get_fields_worker.Rd ├── parse_url.Rd ├── request_handler_handler.Rd ├── get_robotstxt_http_get.Rd ├── paths_allowed_worker_spiderbar.Rd ├── list_merge.Rd ├── get_robotstxt.Rd ├── get_robotstxts.Rd ├── rt_request_handler.Rd ├── paths_allowed.Rd └── robotstxt.Rd ├── data-raw └── logo │ ├── robotstxt.png │ ├── robotstxt.jpeg │ └── robotstxt-logo.jpeg ├── _pkgdown.yml ├── CRAN-SUBMISSION ├── LICENSE ├── cran-comments.md ├── .Rbuildignore ├── .gitignore ├── robotstxt.Rproj ├── tests ├── testthat.R └── testthat │ ├── test_get_robotstxt.R │ ├── _snaps │ ├── paths_allowed.md │ └── http_event_handling.md │ ├── test_issue50.R │ ├── test_attribute_handling.R │ ├── test_tools.R │ ├── test_path_examples_from_rfc.R │ ├── test_robotstxt.R │ ├── test_http_event_handling.R │ └── test_parser.R ├── NAMESPACE ├── DESCRIPTION ├── benchmarks └── spiderbar_and_futures.r ├── vignettes ├── style.css └── using_robotstxt.Rmd ├── NEWS.md └── README.Rmd /inst/robotstxts/empty.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | 
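For orientation, a minimal usage sketch of the exported entry points listed under R/ above (robotstxt(), paths_allowed()); "example.com" is a placeholder domain and the calls assume the released robotstxt package with internet access, not code taken from this repository:

library(robotstxt)

# download, parse, and cache a site's robots.txt
rt <- robotstxt(domain = "example.com")
rt$permissions                          # parsed Allow/Disallow fields
rt$check(paths = "/images/", bot = "*") # per-path permission check

# or use the convenience wrapper directly
paths_allowed(
  paths  = c("/images/", "/admin/"),
  domain = "example.com",
  bot    = "*"
)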
-------------------------------------------------------------------------------- /.Rprofile: -------------------------------------------------------------------------------- 1 | Sys.setenv("rpkg_use_internet_for_testing" = TRUE) 2 | -------------------------------------------------------------------------------- /inst/robotstxts/host.txt: -------------------------------------------------------------------------------- 1 | # comment 2 | 3 | Host: www.whatever.com 4 | -------------------------------------------------------------------------------- /R/rt_cache.R: -------------------------------------------------------------------------------- 1 | #' Get the robotstxt cache 2 | rt_cache <- new.env( parent = emptyenv() ) 3 | -------------------------------------------------------------------------------- /man/figures/logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/man/figures/logo.jpeg -------------------------------------------------------------------------------- /data-raw/logo/robotstxt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/data-raw/logo/robotstxt.png -------------------------------------------------------------------------------- /data-raw/logo/robotstxt.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/data-raw/logo/robotstxt.jpeg -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://docs.ropensci.org/robotstxt 2 | template: 3 | bootstrap: 5 4 | light-switch: true 5 | -------------------------------------------------------------------------------- /inst/http_requests/http_404.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_404.rds -------------------------------------------------------------------------------- /inst/http_requests/http_ok_1.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_ok_1.rds -------------------------------------------------------------------------------- /inst/http_requests/http_ok_2.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_ok_2.rds -------------------------------------------------------------------------------- /inst/http_requests/http_ok_3.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_ok_3.rds -------------------------------------------------------------------------------- /inst/http_requests/http_ok_4.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_ok_4.rds -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.7.15 2 | Date: 2024-08-25 07:16:38 UTC 3 | SHA: 
3c96e9f6872735f123da4dea2404c8f8df94810f 4 | -------------------------------------------------------------------------------- /data-raw/logo/robotstxt-logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/data-raw/logo/robotstxt-logo.jpeg -------------------------------------------------------------------------------- /inst/robotstxts/robots_spiegel.txt: -------------------------------------------------------------------------------- 1 | User-agent: WebReaper 2 | Disallow: / 3 | 4 | User-agent: Slurp 5 | Crawl-delay: 18 6 | -------------------------------------------------------------------------------- /inst/http_requests/http_client_error.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_client_error.rds -------------------------------------------------------------------------------- /inst/http_requests/http_domain_change.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_domain_change.rds -------------------------------------------------------------------------------- /inst/http_requests/http_html_content.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_html_content.rds -------------------------------------------------------------------------------- /inst/http_requests/http_redirect_www.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_redirect_www.rds -------------------------------------------------------------------------------- /inst/http_requests/http_server_error.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/robotstxt/HEAD/inst/http_requests/http_server_error.rds -------------------------------------------------------------------------------- /inst/robotstxts/disallow_all_for_all.txt: -------------------------------------------------------------------------------- 1 | # source: http://www.robotstxt.org/robotstxt.html 2 | 3 | User-agent: * 4 | Disallow: / 5 | -------------------------------------------------------------------------------- /inst/robotstxts/disallow_all_for_BadBot.txt: -------------------------------------------------------------------------------- 1 | # source: http://www.robotstxt.org/robotstxt.html 2 | 3 | User-agent: BadBot 4 | Disallow: / -------------------------------------------------------------------------------- /R/pipe.R: -------------------------------------------------------------------------------- 1 | #' re-export magrittr pipe operator 2 | #' 3 | #' @importFrom magrittr %>% 4 | #' @name %>% 5 | #' @rdname pipe 6 | #' @export 7 | NULL 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018-2020 2 | COPYRIGHT HOLDER: Peter Meissner 3 | 4 | YEAR: 2018 5 | COPYRIGHT HOLDER: Kun Ren 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /inst/robotstxts/crawl_delay.txt: 
-------------------------------------------------------------------------------- 1 | # source: https://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions 2 | 3 | User-agent: * 4 | Crawl-delay: 10 5 | -------------------------------------------------------------------------------- /inst/robotstxts/allow_single_bot.txt: -------------------------------------------------------------------------------- 1 | # source: http://www.robotstxt.org/robotstxt.html 2 | 3 | User-agent: Google 4 | Disallow: 5 | 6 | User-agent: * 7 | Disallow: / -------------------------------------------------------------------------------- /inst/robotstxts/robots_commented_token.txt: -------------------------------------------------------------------------------- 1 | User-agent: bot_1 2 | Disallow: /path_1 3 | 4 | # User-agent: bot_2 5 | # Disallow: /path_2 6 | 7 | # Sitemap: /sitemap.php 8 | -------------------------------------------------------------------------------- /inst/robotstxts/disallow_two_at_once.txt: -------------------------------------------------------------------------------- 1 | # source: https://en.wikipedia.org/wiki/Robots_exclusion_standard 2 | 3 | User-agent: BadBot 4 | User-agent: Googlebot 5 | Disallow: /private/ -------------------------------------------------------------------------------- /inst/robotstxts/testing_comments.txt: -------------------------------------------------------------------------------- 1 | # Comments appear after the "#" symbol at the start of a line, or after a directive 2 | User-agent: * # match all bots 3 | Disallow: / # keep them out -------------------------------------------------------------------------------- /inst/robotstxts/disallow_some_for_all.txt: -------------------------------------------------------------------------------- 1 | # source: http://www.robotstxt.org/robotstxt.html 2 | 3 | User-agent: * 4 | Disallow: /cgi-bin/ 5 | Disallow: /tmp/ 6 | Disallow: /~joe/ 7 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 1 note 4 | 5 | * fixing "incoming feasibility" URL checks problems 6 | * changing maintainer to Pedro Baltazar ’ 7 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_pmeissner.txt: -------------------------------------------------------------------------------- 1 | # source: http://pmeissner.com/robots.txt 2 | 3 | User-agent: * 4 | Allow: / 5 | Disallow: /_layouts 6 | Disallow: /images 7 | Disallow: /javascripts 8 | Disallow: /stylesheets 9 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{re-export magrittr pipe operator} 6 | \description{ 7 | re-export magrittr pipe operator 8 | } 9 | -------------------------------------------------------------------------------- /inst/robotstxts/selfhtml_Example.txt: -------------------------------------------------------------------------------- 1 | # robots.txt zu http://www.example.org/ 2 | 3 | User-agent: UniversalRobot/1.0 4 | User-agent: mein-Robot 5 | Disallow: /quellen/dtd/ 6 | 7 | User-agent: * 8 | Disallow: /unsinn/ 9 | Disallow: /temp/ 10 | Disallow: /newsticker.shtml 11 | 
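The fixture files above cover the common robots.txt patterns the parser is exercised against. A short sketch of how such a fixture can be checked without any HTTP request; rt_get_rtxt() is the internal test helper defined in R/tools.R further below, and the commented expectations are read off the Disallow rules shown above rather than copied from test output:

library(robotstxt)

# load a bundled fixture (internal helper, hence the triple colon)
txt <- robotstxt:::rt_get_rtxt("disallow_some_for_all.txt")

# parse the raw text into user agents, permissions, sitemaps, ...
parse_robotstxt(paste(txt, collapse = "\n"))$permissions

# check paths against the in-memory rules, one robots.txt per path
paths_allowed(
  paths          = c("/tmp/", "/index.html"),
  domain         = "example.org",
  bot            = "*",
  robotstxt_list = list(txt, txt)
)
# expected: FALSE for "/tmp/" (Disallow: /tmp/), TRUE for "/index.html"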
-------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | README.html 4 | README.Rmd 5 | README.md 6 | logo/*.* 7 | benchmarks 8 | misc/* 9 | revdep/*.* 10 | ^revdep$ 11 | ^CRAN-RELEASE$ 12 | ^doc$ 13 | ^Meta$ 14 | ^\.github$ 15 | ^CRAN-SUBMISSION$ 16 | ^cran-comments\.md$ 17 | ^_pkgdown\.yml$ 18 | -------------------------------------------------------------------------------- /R/null_to_default.R: -------------------------------------------------------------------------------- 1 | #' Return default value if NULL 2 | #' 3 | #' @param x value to check and return 4 | #' @param d value to return in case x is NULL 5 | #' 6 | null_to_default <- 7 | function(x, d){ 8 | if ( is.null(x) ){ 9 | d 10 | }else{ 11 | x 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | ropenscience onboarding.md 6 | dev.r 7 | dev.R 8 | revdep/* 9 | vignettes/using_robotstxt.R 10 | vignettes/using_robotstxt.html 11 | vignettes/*.Rmd 12 | CRAN-RELEASE 13 | robotstxt_*.tar.gz 14 | revdepcheck/ 15 | 16 | doc 17 | Meta 18 | -------------------------------------------------------------------------------- /R/sanitize_path.R: -------------------------------------------------------------------------------- 1 | #' Make paths uniform 2 | #' @param path path to be sanitized 3 | #' @return sanitized path 4 | #' @keywords internal 5 | sanitize_path <- function(path){ 6 | path <- stringr::str_replace( path, "^ *", "") 7 | path <- ifelse( !grepl("^/", path), paste0("/", path), path) 8 | return(path) 9 | } 10 | -------------------------------------------------------------------------------- /R/fix_url.R: -------------------------------------------------------------------------------- 1 | #' Add http protocol if missing from URL 2 | #' 3 | #' 4 | #' @param url a character string containing a single URL 5 | #' 6 | fix_url <- 7 | function(url){ 8 | parsed_url <- httr::parse_url(url) 9 | if ( is.null(parsed_url$scheme) ){ 10 | url <- paste0("http://", url) 11 | } 12 | url 13 | } 14 | -------------------------------------------------------------------------------- /man/named_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tools.R 3 | \name{named_list} 4 | \alias{named_list} 5 | \title{Create a named list} 6 | \usage{ 7 | named_list(...) 
8 | } 9 | \arguments{ 10 | \item{...}{things to be put in list} 11 | } 12 | \description{ 13 | Create a named list 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/guess_domain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/guess_domain.R 3 | \name{guess_domain} 4 | \alias{guess_domain} 5 | \title{Guess a domain from path} 6 | \usage{ 7 | guess_domain(x) 8 | } 9 | \arguments{ 10 | \item{x}{path aka URL from which to infer domain} 11 | } 12 | \description{ 13 | Guess a domain from path 14 | } 15 | -------------------------------------------------------------------------------- /man/fix_url.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fix_url.R 3 | \name{fix_url} 4 | \alias{fix_url} 5 | \title{Add http protocol if missing from URL} 6 | \usage{ 7 | fix_url(url) 8 | } 9 | \arguments{ 10 | \item{url}{a character string containing a single URL} 11 | } 12 | \description{ 13 | Add http protocol if missing from URL 14 | } 15 | -------------------------------------------------------------------------------- /man/rt_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_cache.R 3 | \docType{data} 4 | \name{rt_cache} 5 | \alias{rt_cache} 6 | \title{Get the robotstxt cache} 7 | \format{ 8 | An object of class \code{environment} of length 0. 9 | } 10 | \usage{ 11 | rt_cache 12 | } 13 | \description{ 14 | Get the robotstxt cache 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /R/rt_get_comments.R: -------------------------------------------------------------------------------- 1 | #' Extract comments from robots.txt 2 | #' @param txt content of the robots.txt file 3 | #' @keywords internal 4 | rt_get_comments <- function(txt){ 5 | txt <- unlist(stringr::str_split(txt, "\n")) 6 | clines <- grep("#", txt) 7 | ccontent <- stringr::str_extract(txt[clines], "#.*") 8 | data.frame(line=clines, comment=ccontent, stringsAsFactors = FALSE) 9 | } 10 | -------------------------------------------------------------------------------- /man/remove_domain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/remove_domain.R 3 | \name{remove_domain} 4 | \alias{remove_domain} 5 | \title{Remove domain from path} 6 | \usage{ 7 | remove_domain(x) 8 | } 9 | \arguments{ 10 | \item{x}{path aka URL from which to first infer domain and then remove it} 11 | } 12 | \description{ 13 | Remove domain from path 14 | } 15 | -------------------------------------------------------------------------------- /R/remove_domain.R: -------------------------------------------------------------------------------- 1 | #' Remove domain from path 2 | #' @param x path aka URL from which to first infer domain and then remove it 3 | remove_domain <- function(x){ 4 | unlist(lapply( 5 | x, 6 | function(x){ 7 | if( is.na(x) ){ 8 | return(x) 9 | }else{ 10 | stringr::str_replace(x, paste0("^.*", "\\Q", guess_domain(x), "\\E"), "") 11 | } 12 | } 13 | )) 14 | } 15 | 
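A quick sketch of what the URL helpers documented above do with a sample URL; fix_url(), guess_domain(), and remove_domain() are internal (not exported), so the triple-colon access is for illustration only and the commented results follow from reading the code rather than from recorded output:

robotstxt:::fix_url("pmeissner.com/robots.txt")       # "http://pmeissner.com/robots.txt"
robotstxt:::guess_domain("pmeissner.com/robots.txt")  # "pmeissner.com"
robotstxt:::remove_domain("pmeissner.com/robots.txt") # "/robots.txt"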
-------------------------------------------------------------------------------- /man/sanitize_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sanitize_path.R 3 | \name{sanitize_path} 4 | \alias{sanitize_path} 5 | \title{Make paths uniform} 6 | \usage{ 7 | sanitize_path(path) 8 | } 9 | \arguments{ 10 | \item{path}{path to be sanitized} 11 | } 12 | \value{ 13 | sanitized path 14 | } 15 | \description{ 16 | Make paths uniform 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/print.robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print_robotstxt.R 3 | \name{print.robotstxt} 4 | \alias{print.robotstxt} 5 | \title{Print robotstxt} 6 | \usage{ 7 | \method{print}{robotstxt}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{robotstxt instance to be printed} 11 | 12 | \item{...}{goes down the sink} 13 | } 14 | \description{ 15 | Print robotstxt 16 | } 17 | -------------------------------------------------------------------------------- /man/rt_get_comments.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_get_comments.R 3 | \name{rt_get_comments} 4 | \alias{rt_get_comments} 5 | \title{Extract comments from robots.txt} 6 | \usage{ 7 | rt_get_comments(txt) 8 | } 9 | \arguments{ 10 | \item{txt}{content of the robots.txt file} 11 | } 12 | \description{ 13 | Extract comments from robots.txt 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/rt_list_rtxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tools.R 3 | \name{rt_list_rtxt} 4 | \alias{rt_list_rtxt} 5 | \title{List robots.txt files saved along with the package} 6 | \usage{ 7 | rt_list_rtxt() 8 | } 9 | \description{ 10 | list robots.txt files saved along with the package: 11 | these functions ar very handy for testing (not used otherwise) 12 | } 13 | \keyword{internal} 14 | -------------------------------------------------------------------------------- /man/null_to_default.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/null_to_default.R 3 | \name{null_to_default} 4 | \alias{null_to_default} 5 | \title{Return default value if NULL} 6 | \usage{ 7 | null_to_default(x, d) 8 | } 9 | \arguments{ 10 | \item{x}{value to check and return} 11 | 12 | \item{d}{value to return in case x is NULL} 13 | } 14 | \description{ 15 | Return default value if NULL 16 | } 17 | -------------------------------------------------------------------------------- /man/rt_get_useragent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_get_useragent.R 3 | \name{rt_get_useragent} 4 | \alias{rt_get_useragent} 5 | \title{Extract HTTP useragents from robots.txt} 6 | \usage{ 7 | rt_get_useragent(txt) 8 | } 9 | \arguments{ 10 | \item{txt}{content of the robots.txt file} 11 | } 12 | \description{ 13 | Extract 
HTTP useragents from robots.txt 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/parse_robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parse_robotstxt.R 3 | \name{parse_robotstxt} 4 | \alias{parse_robotstxt} 5 | \title{Parse a robots.txt file} 6 | \usage{ 7 | parse_robotstxt(txt) 8 | } 9 | \arguments{ 10 | \item{txt}{content of the robots.txt file} 11 | } 12 | \value{ 13 | a named list with useragents, comments, permissions, sitemap 14 | } 15 | \description{ 16 | Parse a robots.txt file 17 | } 18 | -------------------------------------------------------------------------------- /robotstxt.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageRoxygenize: rd,collate,namespace 21 | -------------------------------------------------------------------------------- /man/print.robotstxt_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print_robotstxt_text.R 3 | \name{print.robotstxt_text} 4 | \alias{print.robotstxt_text} 5 | \title{Print robotstxt's text} 6 | \usage{ 7 | \method{print}{robotstxt_text}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{character vector aka robotstxt$text to be printed} 11 | 12 | \item{...}{goes down the sink} 13 | } 14 | \description{ 15 | Print robotstxt's text 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview 7 | # * https://testthat.r-lib.org/articles/special-files.html 8 | 9 | library(testthat) 10 | library(robotstxt) 11 | if (curl::has_internet()) { 12 | test_check("robotstxt") 13 | } 14 | -------------------------------------------------------------------------------- /man/is_suspect_robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_suspect_robotstxt.R 3 | \name{is_suspect_robotstxt} 4 | \alias{is_suspect_robotstxt} 5 | \title{Check if file is valid / parsable robots.txt file} 6 | \usage{ 7 | is_suspect_robotstxt(text) 8 | } 9 | \arguments{ 10 | \item{text}{content of a robots.txt file provides as character vector} 11 | } 12 | \description{ 13 | Function that checks if file is valid / parsable robots.txt file 14 | } 15 | -------------------------------------------------------------------------------- /R/guess_domain.R: -------------------------------------------------------------------------------- 1 | #' Guess a domain from path 2 | #' @param x path aka URL from which to infer domain 3 | guess_domain <- function(x){ 4 | 5 | if(length(x)>1){ 6 | return( 7 | unlist( 8 | lapply( 9 | X = x, 10 | FUN = guess_domain 11 | ) 12 | ) 13 | ) 14 | 15 | } else { 16 | 17 | domain <- parse_url(url = x)$domain 18 | 19 | if( is.null(domain) ){ 20 | domain <- NA 21 | } 22 | 23 | return(domain) 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/as.list.robotstxt_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as_list.R 3 | \name{as.list.robotstxt_text} 4 | \alias{as.list.robotstxt_text} 5 | \title{Convert robotstxt_text to list} 6 | \usage{ 7 | \method{as.list}{robotstxt_text}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{class robotstxt_text object to be transformed into list} 11 | 12 | \item{...}{further arguments (inherited from \code{base::as.list()})} 13 | } 14 | \description{ 15 | Convert robotstxt_text to list 16 | } 17 | -------------------------------------------------------------------------------- /R/as_list.R: -------------------------------------------------------------------------------- 1 | #' Convert robotstxt_text to list 2 | #' 3 | #' @param x class robotstxt_text object to be transformed into list 4 | #' @param ... 
further arguments (inherited from \code{base::as.list()}) 5 | #' @export 6 | #' 7 | #' 8 | as.list.robotstxt_text <- 9 | function(x, ...){ 10 | res <- list() 11 | 12 | res$content <- httr::content(attr(x, "request"), encoding = "UTF-8") 13 | res$robotstxt <- as.character(x) 14 | res$problems <- attr(x, "problems") 15 | res$request <- attr(x, "request") 16 | 17 | res 18 | } 19 | -------------------------------------------------------------------------------- /man/rt_get_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_get_fields.R 3 | \name{rt_get_fields} 4 | \alias{rt_get_fields} 5 | \title{Extract permissions from robots.txt} 6 | \usage{ 7 | rt_get_fields(txt, regex = "", invert = FALSE) 8 | } 9 | \arguments{ 10 | \item{txt}{content of the robots.txt file} 11 | 12 | \item{regex}{regular expression specify field} 13 | 14 | \item{invert}{invert selection made via regex?} 15 | } 16 | \description{ 17 | Extract permissions from robots.txt 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/rt_get_rtxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tools.R 3 | \name{rt_get_rtxt} 4 | \alias{rt_get_rtxt} 5 | \title{Load robots.txt files saved along with the package} 6 | \usage{ 7 | rt_get_rtxt(name = sample(rt_list_rtxt(), 1)) 8 | } 9 | \arguments{ 10 | \item{name}{name of the robots.txt files, defaults to a random drawn file ;-)} 11 | } 12 | \description{ 13 | load robots.txt files saved along with the package: 14 | these functions are very handy for testing (not used otherwise) 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/http_domain_changed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/http_domain_changed.R 3 | \name{http_domain_changed} 4 | \alias{http_domain_changed} 5 | \title{Check if HTTP domain changed} 6 | \usage{ 7 | http_domain_changed(response) 8 | } 9 | \arguments{ 10 | \item{response}{an httr response object, e.g. from a call to httr::GET()} 11 | } 12 | \value{ 13 | logical of length 1 indicating whether or not any domain change 14 | happened during the HTTP request 15 | } 16 | \description{ 17 | Check if HTTP domain changed 18 | } 19 | -------------------------------------------------------------------------------- /man/http_was_redirected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/http_was_redirected.R 3 | \name{http_was_redirected} 4 | \alias{http_was_redirected} 5 | \title{Check if HTTP redirect occurred} 6 | \usage{ 7 | http_was_redirected(response) 8 | } 9 | \arguments{ 10 | \item{response}{an httr response object, e.g. 
from a call to httr::GET()} 11 | } 12 | \value{ 13 | logical of length 1 indicating whether or not any redirect happened 14 | during the HTTP request 15 | } 16 | \description{ 17 | Check if HTTP redirect occurred 18 | } 19 | -------------------------------------------------------------------------------- /man/http_subdomain_changed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/http_subdomain_changed.R 3 | \name{http_subdomain_changed} 4 | \alias{http_subdomain_changed} 5 | \title{Check if HTTP subdomain changed} 6 | \usage{ 7 | http_subdomain_changed(response) 8 | } 9 | \arguments{ 10 | \item{response}{an httr response object, e.g. from a call to httr::GET()} 11 | } 12 | \value{ 13 | logical of length 1 indicating whether or not any subdomain change 14 | happened during the HTTP request 15 | } 16 | \description{ 17 | Check if HTTP subdomain changed 18 | } 19 | -------------------------------------------------------------------------------- /R/http_was_redirected.R: -------------------------------------------------------------------------------- 1 | #' Check if HTTP redirect occurred 2 | #' 3 | #' @param response an httr response object, e.g. from a call to httr::GET() 4 | #' 5 | #' @return logical of length 1 indicating whether or not any redirect happened 6 | #' during the HTTP request 7 | #' 8 | #' 9 | http_was_redirected <- 10 | function(response){ 11 | # extract status 12 | status <- 13 | vapply( 14 | X = response$all_headers, 15 | FUN = `[[`, 16 | FUN.VALUE = integer(1), 17 | "status" 18 | ) 19 | 20 | # check status and return 21 | any(status >= 300 & status < 400) 22 | } 23 | -------------------------------------------------------------------------------- /man/is_valid_robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_valid_robotstxt.R 3 | \name{is_valid_robotstxt} 4 | \alias{is_valid_robotstxt} 5 | \title{Validate if a file is valid / parsable robots.txt file} 6 | \usage{ 7 | is_valid_robotstxt(text, check_strickt_ascii = FALSE) 8 | } 9 | \arguments{ 10 | \item{text}{content of a robots.txt file provided as character vector} 11 | 12 | \item{check_strickt_ascii}{whether or not to check if content does adhere to the specification of RFC to use plain text aka ASCII} 13 | } 14 | \description{ 15 | Validate if a file is valid / parsable robots.txt file 16 | } 17 | -------------------------------------------------------------------------------- /man/rt_get_fields_worker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_get_fields_worker.R 3 | \name{rt_get_fields_worker} 4 | \alias{rt_get_fields_worker} 5 | \title{Extract robotstxt fields} 6 | \usage{ 7 | rt_get_fields_worker(txt, type = "all", regex = NULL, invert = FALSE) 8 | } 9 | \arguments{ 10 | \item{txt}{content of the robots.txt file} 11 | 12 | \item{type}{name or names of the fields to be returned, defaults to all 13 | fields} 14 | 15 | \item{regex}{subsetting field names via regular expressions} 16 | 17 | \item{invert}{field selection} 18 | } 19 | \description{ 20 | Extract robotstxt fields 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/parse_url.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parse_url.R 3 | \name{parse_url} 4 | \alias{parse_url} 5 | \title{Parse a URL} 6 | \usage{ 7 | parse_url(url) 8 | } 9 | \arguments{ 10 | \item{url}{url to parse into its components} 11 | } 12 | \value{ 13 | data.frame with columns protocol, domain, path 14 | } 15 | \description{ 16 | Parse a URL 17 | } 18 | \examples{ 19 | 20 | \dontrun{ 21 | url <- 22 | c( 23 | "google.com", 24 | "google.com/", 25 | "www.google.com", 26 | "http://google.com", 27 | "https://google.com", 28 | "sub.domain.whatever.de" 29 | "s-u-b.dom-ain.what-ever.de" 30 | ) 31 | 32 | parse_url(url) 33 | } 34 | 35 | } 36 | \keyword{internal} 37 | -------------------------------------------------------------------------------- /R/rt_get_useragent.R: -------------------------------------------------------------------------------- 1 | #' Extract HTTP useragents from robots.txt 2 | #' @param txt content of the robots.txt file 3 | #' @keywords internal 4 | # rt_get_useragent <- function(txt){ 5 | # tmp <- stringr::str_extract_all(txt, "[uU]ser-agent:.*") 6 | # stringr::str_replace_all(unique(unlist(tmp)), "[uU].*:| |\n","") 7 | # } 8 | 9 | 10 | rt_get_useragent <- function(txt){ 11 | tmp <- 12 | stringr::str_extract_all( 13 | string = txt, 14 | pattern = stringr::regex("User-agent:.*", ignore_case = TRUE) 15 | ) 16 | 17 | stringr::str_replace_all( 18 | string = unique(unlist(tmp)), 19 | pattern = stringr::regex("U.*:| |\n", ignore_case = TRUE), 20 | replacement = "" 21 | ) 22 | } 23 | -------------------------------------------------------------------------------- /R/parse_robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Parse a robots.txt file 2 | #' @param txt content of the robots.txt file 3 | #' @return a named list with useragents, comments, permissions, sitemap 4 | #' @export 5 | parse_robotstxt <- function(txt){ 6 | res <- 7 | list( 8 | useragents = rt_get_useragent(txt), 9 | comments = rt_get_comments(txt), 10 | permissions = rt_get_fields(txt, "allow"), 11 | crawl_delay = rt_get_fields(txt, "crawl-delay"), 12 | sitemap = rt_get_fields(txt, "sitemap"), 13 | host = rt_get_fields(txt, "host"), 14 | other = 15 | rt_get_fields( 16 | txt, 17 | regex="sitemap|allow|user-agent|host|crawl-delay", 18 | invert=TRUE 19 | ) 20 | ) 21 | return(res) 22 | } 23 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_bundestag.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /includes/ 3 | Allow: /includes/images/mediathek/ 4 | Disallow: /WEB-INF/ 5 | Disallow: /cgibin/ 6 | Disallow: /include/ 7 | Disallow: /script/ 8 | Disallow: /layout/ 9 | Disallow: /track 10 | Disallow: /adwords/ 11 | Disallow: /service/empfehlen/ 12 | Disallow: /dokumente/suche/ 13 | Disallow: /mobil/ 14 | Disallow: /xml/ 15 | Disallow: /service/quiz/ 16 | Disallow: /presse/akkreditierung/bundesversammlung2010/ 17 | Disallow: /bundestag/gremien/enquete/wachstum/datenaustausch/ 18 | Disallow: /bundestag/ausschuesse17/a13/datenaustausch/ 19 | Disallow: /apps/ 20 | Disallow: /mediathek/ 21 | Allow: /mediathek/parlamentstv 22 | Disallow: /blueprint/WEB-INF 23 | Disallow: /htdocs_f/service/recommend/ 24 | Disallow: /htdocs_e/service/recommend/ 25 | 26 | -------------------------------------------------------------------------------- 
/R/is_suspect_robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Check if file is valid / parsable robots.txt file 2 | #' 3 | #' Function that checks if file is valid / parsable robots.txt file 4 | #' 5 | #' @param text content of a robots.txt file provides as character vector 6 | #' 7 | is_suspect_robotstxt <- function(text){ 8 | 9 | if ( length(text) > 0 ){ 10 | # check for html 11 | if( grepl(x = text[1], pattern = "^\\s*%") 7 | export(get_robotstxt) 8 | export(get_robotstxt_http_get) 9 | export(get_robotstxts) 10 | export(is_valid_robotstxt) 11 | export(on_client_error_default) 12 | export(on_domain_change_default) 13 | export(on_file_type_mismatch_default) 14 | export(on_not_found_default) 15 | export(on_redirect_default) 16 | export(on_server_error_default) 17 | export(on_sub_domain_change_default) 18 | export(on_suspect_content_default) 19 | export(parse_robotstxt) 20 | export(paths_allowed) 21 | export(request_handler_handler) 22 | export(robotstxt) 23 | export(rt_last_http) 24 | export(rt_request_handler) 25 | importFrom(magrittr,"%>%") 26 | importFrom(utils,modifyList) 27 | -------------------------------------------------------------------------------- /tests/testthat/test_get_robotstxt.R: -------------------------------------------------------------------------------- 1 | test_that( 2 | "NA in NA out", { 3 | expect_true({ 4 | is.na(get_robotstxt(domain = NA)) 5 | }) 6 | 7 | expect_true({ 8 | all( 9 | is.na( 10 | suppressMessages(get_robotstxts(domain = c(NA, NA))) 11 | ) 12 | ) 13 | }) 14 | } 15 | ) 16 | 17 | 18 | test_that( 19 | "standard usage works", { 20 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 21 | expect_true({ 22 | suppressWarnings(get_robotstxt(domain = "example.com")) 23 | TRUE 24 | }) 25 | 26 | expect_true({ 27 | suppressMessages( 28 | suppressWarnings( 29 | get_robotstxts(domain = c("example.com", "example.com")) 30 | ) 31 | ) 32 | TRUE 33 | }) 34 | } 35 | } 36 | ) 37 | -------------------------------------------------------------------------------- /inst/robotstxts/rbloggers.txt: -------------------------------------------------------------------------------- 1 | # source: http://www.r-bloggers.com 2 | 3 | sitemap: http://www.r-bloggers.com/sitemap.xml 4 | 5 | User-agent: * 6 | # disallow all files in these directories 7 | Disallow: /cgi-bin/ 8 | Disallow: /wp-admin/ 9 | Disallow: /wp-includes/ 10 | Disallow: /wp-content/ 11 | Disallow: /archives/ 12 | disallow: /*?* 13 | Disallow: *?replytocom 14 | Disallow: /wp-* 15 | # Disallow: /author 16 | Disallow: /comments/feed/ 17 | User-agent: Mediapartners-Google* 18 | Allow: / 19 | User-agent: Googlebot-Image 20 | Allow: /wp-content/uploads/ 21 | 22 | #User-agent: Adsbot-Google 23 | #Allow: / 24 | 25 | User-agent: Googlebot-Mobile 26 | Allow: / 27 | 28 | User-agent: Googlebot 29 | Crawl-delay: 1 30 | 31 | User-agent: spbot 32 | Crawl-delay: 2000 33 | 34 | User-agent: BLEXBot 35 | Crawl-delay: 2000 36 | 37 | User-Agent: AhrefsBot 38 | Crawl-delay: 2000 39 | 40 | User-agent: * 41 | Crawl-delay: 20 -------------------------------------------------------------------------------- /man/request_handler_handler.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/request_handler_handler.R 3 | \name{request_handler_handler} 4 | \alias{request_handler_handler} 5 | \title{Handle robotstxt handlers} 6 | \usage{ 7 | 
request_handler_handler(request, handler, res, info = TRUE, warn = TRUE) 8 | } 9 | \arguments{ 10 | \item{request}{the request object returned by call to httr::GET()} 11 | 12 | \item{handler}{the handler either a character string entailing various options or a function producing a specific list, see return.} 13 | 14 | \item{res}{a list with elements '[handler names], ...', 'rtxt', and 'cache'} 15 | 16 | \item{info}{info to add to problems list} 17 | 18 | \item{warn}{if FALSE warnings and messages are suppressed} 19 | } 20 | \value{ 21 | a list with elements '[handler name]', 'rtxt', and 'cache' 22 | } 23 | \description{ 24 | Helper function to handle robotstxt handlers. 25 | } 26 | -------------------------------------------------------------------------------- /R/parse_url.R: -------------------------------------------------------------------------------- 1 | #' Parse a URL 2 | #' 3 | #' @param url url to parse into its components 4 | #' 5 | #' @return data.frame with columns protocol, domain, path 6 | #' 7 | #' @keywords internal 8 | #' 9 | #' @examples 10 | #' 11 | #' \dontrun{ 12 | #' url <- 13 | #' c( 14 | #' "google.com", 15 | #' "google.com/", 16 | #' "www.google.com", 17 | #' "http://google.com", 18 | #' "https://google.com", 19 | #' "sub.domain.whatever.de" 20 | #' "s-u-b.dom-ain.what-ever.de" 21 | #' ) 22 | #' 23 | #' parse_url(url) 24 | #' } 25 | #' 26 | parse_url <- function(url){ 27 | match <- 28 | stringr::str_match( 29 | string = url, 30 | pattern = "(^\\w+://)?([^/]+)?(/.*)?" 31 | ) 32 | 33 | match <- match[, -1, drop = FALSE] 34 | 35 | df <- as.data.frame(match, stringsAsFactors = FALSE) 36 | names(df) <- c("protocol", "domain", "path") 37 | df$path[ is.na(df$path) ] <- "" 38 | 39 | # return 40 | df 41 | } 42 | 43 | 44 | -------------------------------------------------------------------------------- /R/print_robotstxt_text.R: -------------------------------------------------------------------------------- 1 | #' Print robotstxt's text 2 | #' @param x character vector aka robotstxt$text to be printed 3 | #' @param ... goes down the sink 4 | #' @export 5 | print.robotstxt_text <- function(x, ...){ 6 | 7 | # print part of the robots.txt file 8 | cat("[robots.txt]\n--------------------------------------\n\n") 9 | tmp <- unlist(strsplit(x, "\n")) 10 | cat(tmp[seq_len(min(length(tmp), 50))], sep ="\n") 11 | cat("\n\n\n") 12 | if(length(tmp) > 50){ 13 | cat("[...]\n\n") 14 | } 15 | 16 | # print problems 17 | problems <- attr(x, "problems") 18 | if ( length(problems) > 0){ 19 | cat("[events]\n--------------------------------------\n\n") 20 | cat("requested: ", attr(x, "request")$request$url, "\n") 21 | cat("downloaded: ", attr(x, "request")$url, "\n\n") 22 | cat(utils::capture.output(print(problems)), sep="\n") 23 | cat("[attributes]\n--------------------------------------\n\n") 24 | cat(names(attributes(x)), sep=", ") 25 | } 26 | 27 | cat("\n") 28 | 29 | # return 30 | invisible(x) 31 | } 32 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/paths_allowed.md: -------------------------------------------------------------------------------- 1 | # paths_allowed() works also with 'downloaded' robots.txt files 2 | 3 | Code 4 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", 5 | package = "robotstxt")) 6 | suppressMessages(paths_allowed(paths = "https://github.io/index.html", 7 | rt_robotstxt_http_getter = function(...) 
{ 8 | domain_change 9 | }, warn = FALSE)) 10 | Output 11 | [1] TRUE 12 | 13 | --- 14 | 15 | Code 16 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", 17 | package = "robotstxt")) 18 | suppressMessages(paths_allowed(paths = "https://github.io/index.html", 19 | rt_robotstxt_http_getter = function(...) { 20 | domain_change 21 | })) 22 | Condition 23 | Warning in `request_handler_handler()`: 24 | Event: on_domain_change 25 | Warning in `request_handler_handler()`: 26 | Event: on_file_type_mismatch 27 | Warning in `request_handler_handler()`: 28 | Event: on_suspect_content 29 | Output 30 | [1] TRUE 31 | 32 | -------------------------------------------------------------------------------- /R/tools.R: -------------------------------------------------------------------------------- 1 | #' Create a named list 2 | #' @param ... things to be put in list 3 | #' @keywords internal 4 | named_list <- function(...){ 5 | thelist <- list(...) 6 | names(thelist) <- as.character(substitute(list(...)))[-1] 7 | thelist 8 | } 9 | 10 | #' Load robots.txt files saved along with the package 11 | #' 12 | #' load robots.txt files saved along with the package: 13 | #' these functions are very handy for testing (not used otherwise) 14 | #' @param name name of the robots.txt files, defaults to a random drawn file ;-) 15 | #' @keywords internal 16 | rt_get_rtxt <- function(name=sample(rt_list_rtxt(),1)){ 17 | if( is.numeric(name) ){ 18 | name <- rt_list_rtxt()[name] 19 | } 20 | readLines( 21 | system.file( paste0("robotstxts/",name), package = "robotstxt" ), 22 | warn = FALSE 23 | ) 24 | } 25 | 26 | #' List robots.txt files saved along with the package 27 | #' 28 | #' list robots.txt files saved along with the package: 29 | #' these functions ar very handy for testing (not used otherwise) 30 | #' @keywords internal 31 | rt_list_rtxt <- function(){ 32 | list.files(system.file("robotstxts", package = "robotstxt" )) 33 | } 34 | -------------------------------------------------------------------------------- /man/get_robotstxt_http_get.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_robotstxt_http_get.R 3 | \docType{data} 4 | \name{rt_last_http} 5 | \alias{rt_last_http} 6 | \alias{get_robotstxt_http_get} 7 | \title{Storage for HTTP request response objects} 8 | \format{ 9 | An object of class \code{environment} of length 1. 10 | } 11 | \usage{ 12 | rt_last_http 13 | 14 | get_robotstxt_http_get( 15 | domain, 16 | user_agent = utils::sessionInfo()$R.version$version.string, 17 | ssl_verifypeer = 1 18 | ) 19 | } 20 | \arguments{ 21 | \item{domain}{the domain to get robots.txt file for.} 22 | 23 | \item{user_agent}{the user agent to use for HTTP request header. Defaults to current version of R. 
24 | If `NULL` is passed, httr will use software versions for the header, such as 25 | `libcurl/8.7.1 r-curl/5.2.3 httr/1.4.7`} 26 | 27 | \item{ssl_verifypeer}{either 1 (default) or 0, if 0 it disables SSL peer verification, which 28 | might help with robots.txt file retrieval} 29 | } 30 | \description{ 31 | Storage for HTTP request response objects 32 | 33 | Execute HTTP request for get_robotstxt() 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/paths_allowed_worker_spiderbar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/paths_allowed_worker_spiderbar.R 3 | \name{paths_allowed_worker_spiderbar} 4 | \alias{paths_allowed_worker_spiderbar} 5 | \title{Check if a spiderbar bot has permissions to access page(s)} 6 | \usage{ 7 | paths_allowed_worker_spiderbar(domain, bot, paths, robotstxt_list) 8 | } 9 | \arguments{ 10 | \item{domain}{Domain for which paths should be checked. Defaults to "auto". 11 | If set to "auto" function will try to guess the domain by parsing the paths 12 | argument. Note however, that these are educated guesses which might utterly 13 | fail. To be on the safe side, provide appropriate domains manually.} 14 | 15 | \item{bot}{name of the bot, defaults to "*"} 16 | 17 | \item{paths}{paths for which to check bot's permission, defaults to "/". Please note that path to a folder should end with a trailing slash ("/").} 18 | 19 | \item{robotstxt_list}{either NULL -- the default -- or a list of character 20 | vectors with one vector per path to check} 21 | } 22 | \description{ 23 | Check if a spiderbar bot has permissions to access page(s) 24 | } 25 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_cdc.txt: -------------------------------------------------------------------------------- 1 | # Ignore FrontPage files 2 | User-agent: * 3 | Disallow: /_borders 4 | Disallow: /_derived 5 | Disallow: /_fpclass 6 | Disallow: /_overlay 7 | Disallow: /_private 8 | Disallow: /_themes 9 | Disallow: /_vti_bin 10 | Disallow: /_vti_cnf 11 | Disallow: /_vti_log 12 | Disallow: /_vti_map 13 | Disallow: /_vti_pvt 14 | Disallow: /_vti_txt 15 | 16 | # Do not index the following URLs 17 | Disallow: /travel/ 18 | Disallow: /flu/espanol/ 19 | Disallow: /migration/ 20 | Disallow: /Features/SpinaBifidaProgram/ 21 | Disallow: /concussion/HeadsUp/training/ 22 | 23 | # Don't spider search pages 24 | Disallow: /search.do 25 | 26 | # Don't spider email-this-page pages 27 | Disallow: /email.do 28 | 29 | # Don't spider printer-friendly versions of pages 30 | Disallow: /print.do 31 | 32 | # Rover is a bad dog 33 | User-agent: Roverbot 34 | Disallow: / 35 | 36 | # EmailSiphon is a hunter/gatherer which extracts email addresses for spam-mailers to use 37 | User-agent: EmailSiphon 38 | Disallow: / 39 | 40 | # Exclude MindSpider since it appears to be ill-behaved 41 | User-agent: MindSpider 42 | Disallow: / 43 | 44 | # Sitemap link per CR14586 45 | Sitemap: http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml 46 | 47 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_cdc2.txt: -------------------------------------------------------------------------------- 1 | # Ignore FrontPage files 2 | User-agent: * 3 | Disallow: /_borders 4 | Disallow: /_derived 5 | Disallow: /_fpclass 6 | Disallow: /_overlay 7 | Disallow: 
/_private 8 | Disallow: /_themes 9 | Disallow: /_vti_bin 10 | Disallow: /_vti_cnf 11 | Disallow: /_vti_log 12 | Disallow: /_vti_map 13 | Disallow: /_vti_pvt 14 | Disallow: /_vti_txt 15 | 16 | # Do not index the following URLs 17 | Disallow: /travel/ 18 | Disallow: /flu/espanol/ 19 | Disallow: /migration/ 20 | Disallow: /Features/SpinaBifidaProgram/ 21 | Disallow: /concussion/HeadsUp/training/ 22 | 23 | # Don't spider search pages 24 | Disallow: /search.do 25 | 26 | # Don't spider email-this-page pages 27 | Disallow: /email.do 28 | 29 | # Don't spider printer-friendly versions of pages 30 | Disallow: /print.do 31 | 32 | # Rover is a bad dog 33 | User-agent: Roverbot 34 | Disallow: / 35 | 36 | # EmailSiphon is a hunter/gatherer which extracts email addresses for spam-mailers to use 37 | User-agent: EmailSiphon 38 | Disallow: / 39 | 40 | # Exclude MindSpider since it appears to be ill-behaved 41 | User-agent: MindSpider 42 | Disallow: / 43 | 44 | # Sitemap link per CR14586 45 | Sitemap: http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml 46 | 47 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_new_york_times.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: /ads/public/ 3 | Allow: /svc/news/v3/all/pshb.rss 4 | Disallow: /ads/ 5 | Disallow: /adx/bin/ 6 | Disallow: /archives/ 7 | Disallow: /auth/ 8 | Disallow: /cnet/ 9 | Disallow: /college/ 10 | Disallow: /external/ 11 | Disallow: /financialtimes/ 12 | Disallow: /idg/ 13 | Disallow: /indexes/ 14 | Disallow: /library/ 15 | Disallow: /nytimes-partners/ 16 | Disallow: /packages/flash/multimedia/TEMPLATES/ 17 | Disallow: /pages/college/ 18 | Disallow: /paidcontent/ 19 | Disallow: /partners/ 20 | Disallow: /restaurants/search* 21 | Disallow: /reuters/ 22 | Disallow: /register 23 | Disallow: /thestreet/ 24 | Disallow: /svc 25 | Disallow: /video/embedded/* 26 | Disallow: /web-services/ 27 | Disallow: /gst/travel/travsearch* 28 | 29 | User-agent: Mediapartners-Google 30 | Disallow: /restaurants/search* 31 | 32 | User-agent: AdsBot-Google 33 | Disallow: /restaurants/search* 34 | 35 | User-agent: adidxbot 36 | Disallow: /restaurants/search* 37 | 38 | Sitemap: http://spiderbites.nytimes.com/sitemaps/www.nytimes.com/sitemap.xml.gz 39 | Sitemap: http://www.nytimes.com/sitemaps/sitemap_news/sitemap.xml.gz 40 | Sitemap: http://spiderbites.nytimes.com/sitemaps/sitemap_video/sitemap.xml.gz 41 | -------------------------------------------------------------------------------- /R/get_robotstxt_http_get.R: -------------------------------------------------------------------------------- 1 | #' Storage for HTTP request response objects 2 | #' 3 | #' @rdname get_robotstxt_http_get 4 | #' @export 5 | rt_last_http <- new.env() 6 | rt_last_http$request <- list() 7 | 8 | #' Execute HTTP request for get_robotstxt() 9 | #' 10 | #' @param ssl_verifypeer either 1 (default) or 0, if 0 it disables SSL peer verification, which 11 | #' might help with robots.txt file retrieval 12 | #' @param domain the domain to get robots.txt file for. 13 | #' @param user_agent the user agent to use for HTTP request header. Defaults to current version of R. 
14 | #' If `NULL` is passed, httr will use software versions for the header, such as 15 | #' `libcurl/8.7.1 r-curl/5.2.3 httr/1.4.7` 16 | #' @export 17 | get_robotstxt_http_get <- function(domain, user_agent = utils::sessionInfo()$R.version$version.string, ssl_verifypeer = 1) { 18 | url <- fix_url(paste0(domain, "/robots.txt")) 19 | 20 | headers <- if (!is.null(user_agent)) { 21 | httr::add_headers("user-agent" = user_agent) 22 | } else { 23 | NULL 24 | } 25 | 26 | request <- httr::GET( 27 | url = url, 28 | config = httr::config(ssl_verifypeer = ssl_verifypeer), 29 | headers 30 | ) 31 | 32 | rt_last_http$request <- request 33 | 34 | request 35 | } 36 | -------------------------------------------------------------------------------- /R/paths_allowed_worker_spiderbar.R: -------------------------------------------------------------------------------- 1 | #' Check if a spiderbar bot has permissions to access page(s) 2 | #' 3 | #' @inheritParams paths_allowed 4 | #' 5 | paths_allowed_worker_spiderbar <- 6 | function( 7 | domain, 8 | bot, 9 | paths, 10 | robotstxt_list 11 | ){ 12 | 13 | 14 | # process inputs 15 | robotstxts <- 16 | unlist(lapply(robotstxt_list, paste, collapse="\n")) 17 | 18 | 19 | # apply permission checker to permission data 20 | worker <- 21 | function(path, robotstxt, domain, bot){ 22 | if( is.na(domain) ){ 23 | return(NA) 24 | }else{ 25 | rtxt_obj <- spiderbar::robxp(x = robotstxt) 26 | 27 | bot_can_fetch <- 28 | spiderbar::can_fetch( 29 | obj = rtxt_obj, 30 | path = path, 31 | user_agent = bot 32 | ) 33 | } 34 | return(bot_can_fetch) 35 | } 36 | 37 | tmp <- 38 | mapply( 39 | FUN = worker, 40 | path = paths, 41 | robotstxt = robotstxts, 42 | bot = bot, 43 | domain = domain 44 | ) 45 | 46 | names(tmp) <- NULL 47 | 48 | # return 49 | return(tmp) 50 | 51 | } 52 | 53 | 54 | -------------------------------------------------------------------------------- /R/print_robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Print robotstxt 2 | #' @param x robotstxt instance to be printed 3 | #' @param ... goes down the sink 4 | #' @export 5 | print.robotstxt <- function(x, ...){ 6 | tmp <- x 7 | tmp_text <- unlist(stringr::str_split(tmp$text, "\n")) 8 | if( length(tmp_text) > 15 ){ 9 | tmp$text <- 10 | paste0( c(tmp_text[1:10], "", paste0("[... ",length(tmp_text)-10," lines omitted ...]")), collapse = "\n") 11 | } 12 | if( length(tmp$bots) > 7 ){ 13 | tmp$bots <- 14 | c(utils::head(tmp$bots), "", paste("[... ", length(tmp$bots)-5, "items omitted ...]") ) 15 | } 16 | for(i in c("permissions", "crawl_delay", "host", "sitemap", "other") ){ 17 | if( dim(tmp[[i]])[1] > 7 ){ 18 | tmp_fill <- data.frame(cbind( paste("[... ", dim(tmp[[i]])[1]-5, "items omitted ...]"), "","")) 19 | names(tmp_fill) <- names(tmp[[i]]) 20 | tmp[[i]] <- rbind( utils::head(tmp[[i]]), "", tmp_fill ) 21 | } 22 | } 23 | if( dim(tmp$comments)[1] > 7 ){ 24 | tmp_fill <- data.frame(cbind( "", paste("[... 
", dim(tmp[["comments"]])[1]-5, "items omitted ...]"))) 25 | names(tmp_fill) <- names(tmp[["comments"]]) 26 | tmp[["comments"]] <- rbind( utils::head(tmp[["comments"]]), "", tmp_fill ) 27 | } 28 | print.default(tmp) 29 | invisible(x) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/list_merge.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/list_merge.R 3 | \name{list_merge} 4 | \alias{list_merge} 5 | \title{Merge a number of named lists in sequential order} 6 | \usage{ 7 | list_merge(...) 8 | } 9 | \arguments{ 10 | \item{...}{named lists} 11 | } 12 | \description{ 13 | Merge a number of named lists in sequential order 14 | } 15 | \details{ 16 | List merging is usually useful in the merging of program 17 | settings or configuraion with multiple versions across time, 18 | or multiple administrative levels. For example, a program 19 | settings may have an initial version in which most keys are 20 | defined and specified. In later versions, partial modifications 21 | are recorded. In this case, list merging can be useful to merge 22 | all versions of settings in release order of these versions. The 23 | result is an fully updated settings with all later modifications 24 | applied. 25 | } 26 | \author{ 27 | Kun Ren 28 | 29 | 30 | The function merges a number of lists in sequential order 31 | by \code{modifyList}, that is, the later list always 32 | modifies the former list and form a merged list, and the 33 | resulted list is again being merged with the next list. 34 | The process is repeated until all lists in \code{...} or 35 | \code{list} are exausted. 36 | } 37 | -------------------------------------------------------------------------------- /R/is_valid_robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Validate if a file is valid / parsable robots.txt file 2 | #' 3 | #' @param text content of a robots.txt file provided as character vector 4 | #' @param check_strickt_ascii whether or not to check if content does adhere to the specification of RFC to use plain text aka ASCII 5 | #' 6 | #' @export 7 | #' 8 | is_valid_robotstxt <- function(text, check_strickt_ascii = FALSE){ 9 | text <- unlist(strsplit(text, "\n")) 10 | 11 | 12 | # actually REF specifies files to be ASCII only 13 | # but one of the most frequently visited webpages worlld wide, namely wikipedia 14 | # does use UTF-8 within its robots.txt files 15 | if ( check_strickt_ascii ) { 16 | ascii <- text == "" | stringr::str_detect(pattern = "^[[:ascii:]]+$", string = text) 17 | } else { 18 | ascii <- rep(TRUE, length(text)) 19 | } 20 | 21 | all( 22 | # allow : 23 | 24 | # - spaces followed by # 25 | grepl( 26 | pattern = "^(\xef\xbb\xbf)*(\\s)*#", 27 | x = text, 28 | useBytes = TRUE 29 | ) | 30 | 31 | # - spaces followed by letter(s) followed by a double dot (somewhere) 32 | stringr::str_detect( 33 | pattern = "^(\xef\xbb\xbf)*( )*([^\\[\\] ,;<>()@/?=\\{\\}\t\\\\])+( )*:", 34 | string = text 35 | ) & ascii | 36 | 37 | # - spaces only or empty line 38 | grepl("^(\xef\xbb\xbf)*(\\s)*$", text, useBytes = TRUE) & ascii 39 | 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_yahoo.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /p/ 3 | Disallow: /r/ 4 | Disallow: /bin/ 5 | 
Disallow: /includes/ 6 | Disallow: /blank.html 7 | Disallow: /_td_api 8 | Disallow: /_tdpp_api 9 | Disallow: /_remote 10 | Disallow: /_multiremote 11 | Disallow: /_tdhl_api 12 | 13 | Sitemap: https://www.yahoo.com/food/sitemaps/sitemap_index_us_en-US.xml.gz 14 | Sitemap: https://www.yahoo.com/tech/sitemaps/sitemap_index_us_en-US.xml.gz 15 | Sitemap: https://www.yahoo.com/travel/sitemaps/sitemap_index_us_en-US.xml.gz 16 | Sitemap: https://www.yahoo.com/movies/sitemaps/sitemap_index_us_en-US.xml.gz 17 | Sitemap: https://www.yahoo.com/beauty/sitemaps/sitemap_index_us_en-US.xml.gz 18 | Sitemap: https://www.yahoo.com/health/sitemaps/sitemap_index_us_en-US.xml.gz 19 | Sitemap: https://www.yahoo.com/style/sitemaps/sitemap_index_us_en-US.xml.gz 20 | Sitemap: https://www.yahoo.com/makers/sitemaps/sitemap_index_us_en-US.xml.gz 21 | Sitemap: https://www.yahoo.com/parenting/sitemaps/sitemap_index_us_en-US.xml.gz 22 | Sitemap: https://www.yahoo.com/music/sitemaps/sitemap_index_us_en-US.xml.gz 23 | Sitemap: https://www.yahoo.com/tv/sitemaps/sitemap_index_us_en-US.xml.gz 24 | Sitemap: https://www.yahoo.com/politics/sitemaps/sitemap_index_us_en-US.xml.gz 25 | Sitemap: https://www.yahoo.com/autos/sitemaps/sitemap_index_us_en-US.xml.gz 26 | Sitemap: https://www.yahoo.com/katiecouric/sitemaps/sitemap_index_us_en-US.xml.gz 27 | Sitemap: https://www.yahoo.com/digest/sitemap.xml 28 | -------------------------------------------------------------------------------- /R/http_subdomain_changed.R: -------------------------------------------------------------------------------- 1 | #' Check if HTTP subdomain changed 2 | #' 3 | #' @param response an httr response object, e.g. from a call to httr::GET() 4 | #' 5 | #' @return logical of length 1 indicating whether or not any subdomain change 6 | #' happened during the HTTP request 7 | #' 8 | #' 9 | http_subdomain_changed <- 10 | function(response){ 11 | 12 | # get domain of original HTTP request 13 | orig_domain <- guess_domain(response$request$url) 14 | orig_domain <- stringr::str_replace(orig_domain, "www\\.", "") 15 | 16 | 17 | # extract location headers 18 | location <- 19 | unlist( 20 | lapply( 21 | X = response$all_headers, 22 | FUN = 23 | function(x){ 24 | x$headers$location 25 | } 26 | ) 27 | ) 28 | location <- utils::tail(location, 1) 29 | location <- stringr::str_replace(location, "www\\.", "") 30 | location_domain <- guess_domain(location) 31 | 32 | 33 | 34 | # if there is no location header nothing has changed 35 | if ( length(location) > 0 ) { 36 | orig_domain_regex <- 37 | stringr::regex( 38 | pattern = paste0("^", stringr::str_replace_all(orig_domain, "\\.", "\\\\."), "$"), 39 | ignore_case = TRUE 40 | ) 41 | 42 | return( 43 | !( stringr::str_detect(pattern = orig_domain_regex, string = location_domain) ) 44 | ) 45 | 46 | } else { 47 | return(FALSE) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /R/http_domain_changed.R: -------------------------------------------------------------------------------- 1 | #' Check if HTTP domain changed 2 | #' 3 | #' @param response an httr response object, e.g. 
from a call to httr::GET() 4 | #' 5 | #' @return logical of length 1 indicating whether or not any domain change 6 | #' happened during the HTTP request 7 | #' 8 | #' 9 | http_domain_changed <- 10 | function(response){ 11 | 12 | # get domain of original HTTP request 13 | orig_domain <- guess_domain(response$request$url) 14 | orig_domain <- stringr::str_replace(orig_domain, "www\\.", "") 15 | 16 | # extract location headers 17 | location <- 18 | unlist( 19 | lapply( 20 | X = response$all_headers, 21 | FUN = 22 | function(x){ 23 | x$headers$location 24 | } 25 | ) 26 | ) 27 | location <- utils::tail(location, 1) 28 | location <- stringr::str_replace(location, "www\\.", "") 29 | location_domain <- guess_domain(location) 30 | 31 | # if there is no location header nothing has changed 32 | # 33 | if ( length(location) > 0 ) { 34 | return( 35 | !( 36 | stringr::str_detect( 37 | string = guess_domain(location_domain), 38 | pattern = 39 | stringr::regex( 40 | stringr::str_replace_all(orig_domain, "\\.", "\\\\."), 41 | ignore_case = TRUE 42 | ) 43 | ) 44 | ) 45 | ) 46 | } else { 47 | return(FALSE) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /tests/testthat/test_issue50.R: -------------------------------------------------------------------------------- 1 | test_that( 2 | "robotstxt no scheme works", { 3 | expect_true({ 4 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 5 | suppressMessages(paths_allowed("www.google.com")) 6 | } else { 7 | TRUE 8 | } 9 | }) 10 | 11 | expect_true({ 12 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 13 | suppressMessages(paths_allowed("google.com")) 14 | } else { 15 | TRUE 16 | } 17 | }) 18 | 19 | } 20 | ) 21 | 22 | 23 | test_that( 24 | "robotstxt scheme works", { 25 | expect_true({ 26 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 27 | suppressMessages(paths_allowed("https://google.com")) 28 | } else { 29 | TRUE 30 | } 31 | }) 32 | 33 | expect_true({ 34 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 35 | suppressMessages(paths_allowed("https://www.google.com")) 36 | } else { 37 | TRUE 38 | } 39 | }) 40 | 41 | expect_true({ 42 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 43 | suppressMessages(paths_allowed("http://google.com")) 44 | } else { 45 | TRUE 46 | } 47 | }) 48 | 49 | expect_true({ 50 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 51 | suppressMessages(paths_allowed("http://www.google.com")) 52 | } else { 53 | TRUE 54 | } 55 | }) 56 | } 57 | ) 58 | -------------------------------------------------------------------------------- /R/list_merge.R: -------------------------------------------------------------------------------- 1 | 2 | #' @keywords internal 3 | #' @author Kun Ren 4 | reduce <- function(f, x, init, ...) { 5 | y <- init 6 | for (xi in x) y <- f(y, xi, ...) 7 | y 8 | } 9 | 10 | #' Merge a number of named lists in sequential order 11 | #' 12 | #' 13 | #' @author Kun Ren 14 | #' 15 | #' 16 | #' The function merges a number of lists in sequential order 17 | #' by \code{modifyList}, that is, the later list always 18 | #' modifies the former list and form a merged list, and the 19 | #' resulted list is again being merged with the next list. 20 | #' The process is repeated until all lists in \code{...} or 21 | #' \code{list} are exausted. 
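#'
#' A small illustration of these semantics (following the
#' \code{modifyList} behaviour described above):
#' \code{list_merge(list(a = 1, b = 2), list(b = 3))} returns
#' \code{list(a = 1, b = 3)}, i.e. the later list overwrites \code{b}
#' while \code{a} is carried over unchanged.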
22 | #' 23 | #' @details 24 | #' List merging is usually useful in the merging of program 25 | #' settings or configuraion with multiple versions across time, 26 | #' or multiple administrative levels. For example, a program 27 | #' settings may have an initial version in which most keys are 28 | #' defined and specified. In later versions, partial modifications 29 | #' are recorded. In this case, list merging can be useful to merge 30 | #' all versions of settings in release order of these versions. The 31 | #' result is an fully updated settings with all later modifications 32 | #' applied. 33 | #' @param ... named lists 34 | #' @importFrom utils modifyList 35 | list_merge <- function(...) { 36 | lists <- list(...) 37 | if (any(vapply(lists, function(x) is.null(names(x)), logical(1L)))) 38 | stop("All arguments must be named list", call. = FALSE) 39 | reduce(modifyList, lists, list()) 40 | } 41 | 42 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check.yaml 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | R-CMD-check: 15 | runs-on: ${{ matrix.config.os }} 16 | 17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | config: 23 | - {os: macos-latest, r: 'release'} 24 | - {os: windows-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 26 | - {os: ubuntu-latest, r: 'release'} 27 | - {os: ubuntu-latest, r: 'oldrel-1'} 28 | 29 | env: 30 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 31 | R_KEEP_PKG_SOURCE: yes 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - uses: r-lib/actions/setup-pandoc@v2 37 | 38 | - uses: r-lib/actions/setup-r@v2 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | http-user-agent: ${{ matrix.config.http-user-agent }} 42 | use-public-rspm: true 43 | 44 | - uses: r-lib/actions/setup-r-dependencies@v2 45 | with: 46 | extra-packages: any::rcmdcheck 47 | needs: check 48 | 49 | - uses: r-lib/actions/check-r-package@v2 50 | with: 51 | upload-snapshots: true 52 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 53 | -------------------------------------------------------------------------------- /R/rt_get_fields_worker.R: -------------------------------------------------------------------------------- 1 | #' Extract robotstxt fields 2 | #' @param txt content of the robots.txt file 3 | #' @param type name or names of the fields to be returned, defaults to all 4 | #' fields 5 | #' @param regex subsetting field names via regular expressions 6 | #' @param invert field selection 7 | #' @keywords internal 8 | rt_get_fields_worker <- function(txt, type="all", regex=NULL, invert=FALSE){ 9 | 10 | # handle empty file or no fields at all 11 | # (--> return empty data.frame) 12 | if( all(txt == "") | all(!grepl(":",txt)) ){ 13 | return(data.frame(field="", value="")[NULL,]) 14 | } 15 | 16 | # split lines int ovector elements 17 | txt_vec <- unlist(stringr::str_split(txt, "\r*\n")) 18 | 19 | # filter for fields ( ^= not a comment) 20 | fields <- grep("(^[ \t]{0,2}[^#]\\w.*)", txt_vec, value=TRUE) 21 | 22 | # split by ":" to 
get field_name, field_vlue pairs 23 | fields <- 24 | data.frame( 25 | do.call( 26 | rbind, 27 | stringr::str_split(fields, ":", n=2) 28 | ), 29 | stringsAsFactors = FALSE 30 | ) 31 | names(fields) <- c("field", "value") 32 | 33 | # some post processing and cleaning 34 | fields$value <- stringr::str_trim(fields$value) 35 | fields$field <- stringr::str_trim(fields$field) 36 | 37 | # subset fields by regex 38 | if ( !is.null(regex) ){ 39 | fields <- fields[ grep(regex, fields$field, invert=invert, ignore.case=TRUE) ,] 40 | } 41 | 42 | # subset by type 43 | if ( all(type == "all") ){ 44 | # do nothing 45 | }else{ 46 | fields <- fields[ tolower(fields$field) %in% tolower(type) ,] 47 | } 48 | 49 | # return 50 | return(fields) 51 | } 52 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: robotstxt 2 | Type: Package 3 | Title: A 'robots.txt' Parser and 'Webbot'/'Spider'/'Crawler' Permissions Checker 4 | Version: 0.7.15.9000 5 | Authors@R: c( 6 | person( 7 | "Pedro", "Baltazar", role = c("ctb"), 8 | email = "pedrobtz@gmail.com" 9 | ), 10 | person( 11 | "Jordan", "Bradford", role = c("cre"), 12 | email = "jrdnbradford@gmail.com" 13 | ), 14 | person( 15 | "Peter", "Meissner", role = c("aut"), 16 | email = "retep.meissner@gmail.com" 17 | ), 18 | person( 19 | "Kun", "Ren", email = "mail@renkun.me", role = c("aut", "cph"), 20 | comment = "Author and copyright holder of list_merge.R." 21 | ), 22 | person("Oliver", "Keys", role = "ctb", comment = "original release code review"), 23 | person("Rich", "Fitz John", role = "ctb", comment = "original release code review") 24 | ) 25 | Description: Provides functions to download and parse 'robots.txt' files. 26 | Ultimately the package makes it easy to check if bots 27 | (spiders, crawler, scrapers, ...) are allowed to access specific 28 | resources on a domain. 29 | License: MIT + file LICENSE 30 | BugReports: https://github.com/ropensci/robotstxt/issues 31 | URL: https://docs.ropensci.org/robotstxt/, https://github.com/ropensci/robotstxt 32 | Imports: 33 | stringr (>= 1.0.0), 34 | httr (>= 1.0.0), 35 | spiderbar (>= 0.2.0), 36 | future.apply (>= 1.0.0), 37 | magrittr, 38 | utils 39 | Suggests: 40 | knitr, 41 | rmarkdown, 42 | dplyr, 43 | testthat (>= 3.0.0), 44 | covr, 45 | curl 46 | Depends: 47 | R (>= 3.0.0) 48 | VignetteBuilder: knitr 49 | RoxygenNote: 7.3.2 50 | Encoding: UTF-8 51 | Config/testthat/edition: 3 52 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage.yaml 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | test-coverage: 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: r-lib/actions/setup-r@v2 23 | with: 24 | use-public-rspm: true 25 | 26 | - uses: r-lib/actions/setup-r-dependencies@v2 27 | with: 28 | extra-packages: any::covr, any::xml2 29 | needs: coverage 30 | 31 | - name: Test coverage 32 | run: | 33 | cov <- covr::package_coverage( 34 | quiet = FALSE, 35 | clean = FALSE, 36 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 37 | ) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v4 42 | with: 43 | fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} 44 | file: ./cobertura.xml 45 | plugin: noop 46 | disable_search: true 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | - name: Show testthat output 50 | if: always() 51 | run: | 52 | ## -------------------------------------------------------------------- 53 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 54 | shell: bash 55 | 56 | - name: Upload test results 57 | if: failure() 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: coverage-test-failures 61 | path: ${{ runner.temp }}/package 62 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing[^1] 2 | 3 | Development is a community effort, and we welcome participation. 4 | 5 | ## Code of Conduct 6 | 7 | Please note that this package is released with a [Contributor Code of Conduct](https://ropensci.org/code-of-conduct/). 8 | 9 | ## Issues 10 | 11 | [Issues](https://github.com/ropensci/robotstxt/issues) are for maintenance, tasks, and feature requests. 12 | 13 | ## Development 14 | 15 | External code and documentation contributions are extremely helpful in the right circumstances. Here are the recommended steps. 16 | 17 | 1. Prior to contribution, please propose your idea in a discussion thread so you and the maintainer can define the intent and scope of your work. 18 | 19 | 1. [Fork the repository](https://help.github.com/articles/fork-a-repo/). 20 | 21 | 1. Follow the [GitHub flow](https://guides.github.com/introduction/flow/index.html) to create a new branch, add commits, and open a pull request. 22 | 23 | 1. Discuss your code with the maintainer in the pull request thread. 24 | 25 | 1. If everything looks good, the maintainer will merge your code into the project. 26 | 27 | Please also follow these additional guidelines. 28 | 29 | * Respect the architecture and reasoning of the package. Depending on the scope of your work, you may want to read the design documents (package vignettes). 30 | * If possible, keep contributions small enough to easily review manually. It is okay to split up your work into multiple pull requests. 31 | * For new features or functionality, add tests in `tests`. 32 | * Check code coverage with `covr::package_coverage()`. Automated tests should cover all the new or changed functionality in your pull request. 33 | * Run overall package checks with `devtools::check()`. 
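
  A minimal pre-submission run from the package root might look like this
  (a sketch, assuming `devtools` and `covr` are installed):

  ```r
  devtools::document()       # regenerate Rd files from roxygen comments
  devtools::test()           # run the testthat suite
  covr::package_coverage()   # check coverage of new or changed code
  devtools::check()          # full R CMD check before opening the pull request
  ```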
34 | * Describe your contribution in the project's [`NEWS.md`](https://github.com/ropensci/robotstxt/blob/master/NEWS.md) file. Be sure to mention relevent GitHub issue numbers and your GitHub name as done in existing news entries. 35 | * If you feel contribution is substantial enough for official author or contributor status, please add yourself to the `Authors@R` field of the [`DESCRIPTION`](https://github.com/ropensci/robotstxt/blob/master/DESCRIPTION) file. 36 | 37 | [^1]: This `CONTRIBUTING` file is modified from [ropensci/targets](https://github.com/ropensci/targets/blob/main/CONTRIBUTING.md). 38 | -------------------------------------------------------------------------------- /R/rt_request_handler_defaults.R: -------------------------------------------------------------------------------- 1 | #' @docType data 2 | #' @rdname rt_request_handler 3 | #' @export 4 | on_server_error_default <- 5 | list( 6 | over_write_file_with = "User-agent: *\nDisallow: /", 7 | signal = "error", 8 | cache = FALSE, 9 | priority = 20 10 | ) 11 | 12 | #' @docType data 13 | #' @rdname rt_request_handler 14 | #' @export 15 | on_client_error_default <- 16 | list( 17 | over_write_file_with = "User-agent: *\nAllow: /", 18 | signal = "warning", 19 | cache = TRUE, 20 | priority = 19 21 | ) 22 | 23 | #' @docType data 24 | #' @rdname rt_request_handler 25 | #' @export 26 | on_not_found_default <- 27 | list( 28 | over_write_file_with = "User-agent: *\nAllow: /", 29 | signal = "warning", 30 | cache = TRUE, 31 | priority = 1 32 | ) 33 | 34 | #' @docType data 35 | #' @rdname rt_request_handler 36 | #' @export 37 | on_redirect_default <- 38 | list( 39 | #over_write_file_with = "User-agent: *\nAllow: /", 40 | #signal = "warning", 41 | cache = TRUE, 42 | priority = 3 43 | ) 44 | 45 | #' @docType data 46 | #' @rdname rt_request_handler 47 | #' @export 48 | on_domain_change_default <- 49 | list( 50 | # over_write_file_with = "User-agent: *\nAllow: /", 51 | signal = "warning", 52 | cache = TRUE, 53 | priority = 4 54 | ) 55 | 56 | #' @docType data 57 | #' @rdname rt_request_handler 58 | #' @export 59 | on_sub_domain_change_default <- 60 | list( 61 | # over_write_file_with = "User-agent: *\nAllow: /", 62 | # signal = "warning", 63 | cache = TRUE, 64 | priority = 5 65 | ) 66 | 67 | 68 | #' @docType data 69 | #' @rdname rt_request_handler 70 | #' @export 71 | on_file_type_mismatch_default <- 72 | list( 73 | over_write_file_with = "User-agent: *\nAllow: /", 74 | signal = "warning", 75 | cache = TRUE, 76 | priority = 6 77 | ) 78 | 79 | #' @docType data 80 | #' @rdname rt_request_handler 81 | #' @export 82 | on_suspect_content_default <- 83 | list( 84 | over_write_file_with = "User-agent: *\nAllow: /", 85 | signal = "warning", 86 | cache = TRUE, 87 | priority = 7 88 | ) 89 | -------------------------------------------------------------------------------- /benchmarks/spiderbar_and_futures.r: -------------------------------------------------------------------------------- 1 | library(spiderbar) 2 | library(robotstxt) 3 | library(future) 4 | 5 | # get file with urls 6 | urls_fname <- system.file("urls.txt", package="robotstxt") 7 | readLines(urls_fname)[1:3] 8 | urls <- readLines(urls_fname)[-c(1:5)][1:100] 9 | 10 | paths <- urls 11 | domain <- robotstxt:::guess_domain(paths) 12 | 13 | # tests for sequential 14 | plan("sequential") 15 | 16 | with_fetch_seq <- 17 | system.time( 18 | paths_allowed( 19 | urls, 20 | warn = FALSE, 21 | force = TRUE, 22 | use_futures = FALSE, 23 | check_method = "robotstxt" 24 | ) 25 | ) 26 | 27 | 
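# Note: the run above used force = TRUE, so every robots.txt file was
# freshly downloaded. The runs below use force = FALSE and should be
# answered from the in-memory cache, i.e. they mostly measure parsing and
# permission checking rather than HTTP round trips.
# Optional sanity check of how many entries were cached (this assumes the
# package's internal environment robotstxt:::rt_cache is where downloaded
# files are stored):
cached_entries <- length(ls(envir = robotstxt:::rt_cache))
cached_entries
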
wo_fetch_seq_robotstxt <- 28 | system.time( 29 | paths_allowed( 30 | urls, 31 | warn = FALSE, 32 | force = FALSE, 33 | use_futures = FALSE, 34 | check_method = "robotstxt" 35 | ) 36 | ) 37 | 38 | wo_fetch_seq_spiderbar <- 39 | system.time( 40 | paths_allowed( 41 | urls, 42 | warn = FALSE, 43 | force = FALSE, 44 | use_futures = FALSE, 45 | check_method = "spiderbar" 46 | ) 47 | ) 48 | 49 | 50 | # tests for parallel 51 | plan("multisession") 52 | 53 | with_fetch_parallel <- 54 | system.time( 55 | paths_allowed( 56 | urls, 57 | warn = FALSE, 58 | force = TRUE, 59 | use_futures = TRUE, 60 | check_method = "robotstxt" 61 | ) 62 | ) 63 | 64 | wo_fetch_parallel_robotstxt <- 65 | system.time( 66 | paths_allowed( 67 | urls, 68 | warn = FALSE, 69 | force = FALSE, 70 | use_futures = TRUE, 71 | check_method = "robotstxt" 72 | ) 73 | ) 74 | 75 | wo_fetch_parallel_spiderbar <- 76 | system.time( 77 | paths_allowed( 78 | urls, 79 | warn = FALSE, 80 | force = FALSE, 81 | use_futures = TRUE, 82 | check_method = "spiderbar" 83 | ) 84 | ) 85 | 86 | 87 | 88 | 89 | # results 90 | 91 | with_fetch_seq 92 | wo_fetch_seq_robotstxt 93 | wo_fetch_seq_spiderbar 94 | 95 | 96 | with_fetch_parallel 97 | wo_fetch_parallel_robotstxt 98 | wo_fetch_parallel_spiderbar 99 | 100 | 101 | with_fetch_seq 102 | with_fetch_parallel 103 | 104 | 105 | wo_fetch_seq_robotstxt 106 | wo_fetch_parallel_robotstxt 107 | 108 | 109 | wo_fetch_seq_spiderbar 110 | wo_fetch_parallel_spiderbar 111 | 112 | 113 | -------------------------------------------------------------------------------- /R/request_handler_handler.R: -------------------------------------------------------------------------------- 1 | #' Handle robotstxt handlers 2 | #' 3 | #' Helper function to handle robotstxt handlers. 4 | #' 5 | #' @param request the request object returned by call to httr::GET() 6 | #' @param handler the handler either a character string entailing various options or a function producing a specific list, see return. 
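#'   For illustration, the non-function handlers shipped with this package
#'   are plain lists with elements \code{over_write_file_with},
#'   \code{signal}, \code{cache} and \code{priority}; see
#'   \code{on_suspect_content_default} and the other defaults defined in
#'   rt_request_handler_defaults.R.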
7 | #' @param res a list with elements '[handler names], ...', 'rtxt', and 'cache' 8 | #' @param info info to add to problems list 9 | #' @param warn if FALSE warnings and messages are suppressed 10 | #' 11 | #' @return a list with elements '[handler name]', 'rtxt', and 'cache' 12 | #' @export 13 | #' 14 | request_handler_handler <- 15 | function(request, handler, res, info = TRUE, warn = TRUE){ 16 | # use handler function or simply go through options bit by bit 17 | if ( is.function(handler) ){ 18 | 19 | return(handler(request, handler, res, info, warn)) 20 | 21 | } else { 22 | 23 | # signaling 24 | if ( length(handler$signal) == 0 ){ 25 | # do nothing 26 | } else if ( handler$signal %in% "error" ) { 27 | 28 | stop(paste0("Event: ", deparse(substitute(handler)))) 29 | 30 | } else if ( handler$signal %in% "warning" & warn == TRUE) { 31 | 32 | warning(paste0("Event: ", deparse(substitute(handler)))) 33 | 34 | } else if ( handler$signal %in% "message" & warn == TRUE) { 35 | 36 | message(paste0("Event: ", deparse(substitute(handler)))) 37 | 38 | } 39 | 40 | 41 | # problems logging 42 | res$problems[[ deparse(substitute(handler)) ]] <- info 43 | 44 | 45 | # rtxt handling 46 | if ( is.null(handler$over_write_file_with) ) { 47 | # do nothing 48 | } else { 49 | if ( res$priority < handler$priority){ 50 | res$priority <- handler$priority 51 | res$rtxt <- 52 | paste0( 53 | "# robots.txt overwrite by: ", deparse(substitute(handler)), "\n\n", 54 | paste0(handler$over_write_file_with, collapse = "\n") 55 | ) 56 | } 57 | 58 | } 59 | 60 | # cache handling 61 | if ( handler$cache %in% TRUE ) { 62 | if ( res$priority < handler$priority){ 63 | res$cache <- TRUE 64 | } 65 | } else if ( handler$cache %in% FALSE ) { 66 | if ( res$priority < handler$priority){ 67 | res$priority <- handler$priority 68 | res$cache <- FALSE 69 | } 70 | } 71 | } 72 | 73 | # return 74 | res 75 | } 76 | -------------------------------------------------------------------------------- /man/get_robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_robotstxt.R 3 | \name{get_robotstxt} 4 | \alias{get_robotstxt} 5 | \title{Download a robots.txt file} 6 | \usage{ 7 | get_robotstxt( 8 | domain, 9 | warn = getOption("robotstxt_warn", TRUE), 10 | force = FALSE, 11 | user_agent = utils::sessionInfo()$R.version$version.string, 12 | ssl_verifypeer = c(1, 0), 13 | encoding = "UTF-8", 14 | verbose = FALSE, 15 | rt_request_handler = robotstxt::rt_request_handler, 16 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 17 | on_server_error = on_server_error_default, 18 | on_client_error = on_client_error_default, 19 | on_not_found = on_not_found_default, 20 | on_redirect = on_redirect_default, 21 | on_domain_change = on_domain_change_default, 22 | on_file_type_mismatch = on_file_type_mismatch_default, 23 | on_suspect_content = on_suspect_content_default 24 | ) 25 | } 26 | \arguments{ 27 | \item{domain}{domain from which to download robots.txt file} 28 | 29 | \item{warn}{warn about being unable to download domain/robots.txt because of} 30 | 31 | \item{force}{if TRUE instead of using possible cached results the function 32 | will re-download the robotstxt file HTTP response status 404. 
If this 33 | happens,} 34 | 35 | \item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt 36 | file from domain} 37 | 38 | \item{ssl_verifypeer}{either 1 (default) or 0, if 0 it disables SSL peer verification, which 39 | might help with robots.txt file retrieval} 40 | 41 | \item{encoding}{Encoding of the robots.txt file.} 42 | 43 | \item{verbose}{make function print out more information} 44 | 45 | \item{rt_request_handler}{handler function that handles request according to 46 | the event handlers specified} 47 | 48 | \item{rt_robotstxt_http_getter}{function that executes HTTP request} 49 | 50 | \item{on_server_error}{request state handler for any 5xx status} 51 | 52 | \item{on_client_error}{request state handler for any 4xx HTTP status that is 53 | not 404} 54 | 55 | \item{on_not_found}{request state handler for HTTP status 404} 56 | 57 | \item{on_redirect}{request state handler for any 3xx HTTP status} 58 | 59 | \item{on_domain_change}{request state handler for any 3xx HTTP status where 60 | domain did change as well} 61 | 62 | \item{on_file_type_mismatch}{request state handler for content type other 63 | than 'text/plain'} 64 | 65 | \item{on_suspect_content}{request state handler for content that seems to be 66 | something else than a robots.txt file (usually a JSON, XML or HTML)} 67 | } 68 | \description{ 69 | Download a robots.txt file 70 | } 71 | -------------------------------------------------------------------------------- /R/rt_get_fields.R: -------------------------------------------------------------------------------- 1 | #' Extract permissions from robots.txt 2 | #' @param txt content of the robots.txt file 3 | #' @param regex regular expression specify field 4 | #' @param invert invert selection made via regex? 5 | #' @keywords internal 6 | rt_get_fields <- function(txt, regex = "", invert = FALSE){ 7 | 8 | # split into text-parts to do all processing on 9 | txt_parts <- 10 | txt %>% 11 | stringr::str_replace_all("\r\n", "\n") %>% 12 | paste0(collapse = "\n") %>% 13 | stringr::str_replace_all( 14 | pattern = "#.*?(\n|$)", 15 | replacement = "" 16 | ) %>% 17 | stringr::str_replace_all( 18 | pattern = stringr::regex("(\nUser.+)", ignore_case = TRUE), 19 | replacement = "\n\\1" 20 | ) %>% 21 | stringr::str_replace_all( 22 | pattern = "\n", 23 | replacement = "/%~~~/%\n" 24 | ) %>% 25 | stringr::str_replace_all( 26 | pattern = stringr::regex("(User.+?/%~~~/%\n)/%~~~/%\n(User.+?)", ignore_case = TRUE), 27 | replacement = "\\1\\2" 28 | ) %>% 29 | stringr::str_replace_all( 30 | pattern = "/%~~~/%", 31 | replacement = "" 32 | ) %>% 33 | stringr::str_replace( 34 | pattern = "^\n", 35 | replacement = "" 36 | ) %>% 37 | stringr::str_split( 38 | pattern = "\n[ \t]*\n" 39 | ) %>% 40 | unlist() 41 | 42 | 43 | # get user agents per text-part 44 | useragents <- 45 | lapply( 46 | X = txt_parts, 47 | FUN = rt_get_useragent 48 | ) 49 | 50 | for(i in seq_along(useragents)){ 51 | if( length(useragents[[i]]) == 0 ){ 52 | useragents[[i]] <- "*" 53 | } 54 | } 55 | 56 | 57 | # get fields per part 58 | fields <- 59 | lapply( 60 | X = txt_parts, 61 | FUN = rt_get_fields_worker, 62 | regex = regex, 63 | invert = invert 64 | ) 65 | 66 | # put together user-agents and fields per text-part 67 | df <- data.frame() 68 | for ( i in seq_along(txt_parts) ){ 69 | df <- 70 | rbind( 71 | df, 72 | cbind( 73 | useragents[[i]][rep(seq_along(useragents[[i]]), length(fields[[i]][,1]))], 74 | fields[[i]][rep(seq_along(fields[[i]][,1]), length(useragents[[i]])),] 75 | ) 76 | ) 77 | } 78 | 79 | # getting 
df right 80 | names(df) <- c("useragent", "field", "value") 81 | df <- df[,c("field", "useragent", "value")] 82 | rownames(df) <- NULL 83 | 84 | # ensuring chracter columns 85 | for( i in seq_len(dim(df)[2]) ){ 86 | if( is.factor(df[,i]) ){ 87 | df[,i] <- as.character(df[,i]) 88 | } 89 | } 90 | 91 | # return 92 | return(df) 93 | } 94 | -------------------------------------------------------------------------------- /man/get_robotstxts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_robotstxts.R 3 | \name{get_robotstxts} 4 | \alias{get_robotstxts} 5 | \title{Download multiple robotstxt files} 6 | \usage{ 7 | get_robotstxts( 8 | domain, 9 | warn = TRUE, 10 | force = FALSE, 11 | user_agent = utils::sessionInfo()$R.version$version.string, 12 | ssl_verifypeer = c(1, 0), 13 | use_futures = FALSE, 14 | verbose = FALSE, 15 | rt_request_handler = robotstxt::rt_request_handler, 16 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 17 | on_server_error = on_server_error_default, 18 | on_client_error = on_client_error_default, 19 | on_not_found = on_not_found_default, 20 | on_redirect = on_redirect_default, 21 | on_domain_change = on_domain_change_default, 22 | on_file_type_mismatch = on_file_type_mismatch_default, 23 | on_suspect_content = on_suspect_content_default 24 | ) 25 | } 26 | \arguments{ 27 | \item{domain}{domain from which to download robots.txt file} 28 | 29 | \item{warn}{warn about being unable to download domain/robots.txt because of} 30 | 31 | \item{force}{if TRUE instead of using possible cached results the function 32 | will re-download the robotstxt file HTTP response status 404. If this 33 | happens,} 34 | 35 | \item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt 36 | file from domain} 37 | 38 | \item{ssl_verifypeer}{either 1 (default) or 0, if 0 it disables SSL peer verification, which 39 | might help with robots.txt file retrieval} 40 | 41 | \item{use_futures}{Should future::future_lapply be used for possible 42 | parallel/async retrieval or not. 
Note: check out help 43 | pages and vignettes of package future on how to set up 44 | plans for future execution because the robotstxt package 45 | does not do it on its own.} 46 | 47 | \item{verbose}{make function print out more information} 48 | 49 | \item{rt_request_handler}{handler function that handles request according to 50 | the event handlers specified} 51 | 52 | \item{rt_robotstxt_http_getter}{function that executes HTTP request} 53 | 54 | \item{on_server_error}{request state handler for any 5xx status} 55 | 56 | \item{on_client_error}{request state handler for any 4xx HTTP status that is 57 | not 404} 58 | 59 | \item{on_not_found}{request state handler for HTTP status 404} 60 | 61 | \item{on_redirect}{request state handler for any 3xx HTTP status} 62 | 63 | \item{on_domain_change}{request state handler for any 3xx HTTP status where 64 | domain did change as well} 65 | 66 | \item{on_file_type_mismatch}{request state handler for content type other 67 | than 'text/plain'} 68 | 69 | \item{on_suspect_content}{request state handler for content that seems to be 70 | something else than a robots.txt file (usually a JSON, XML or HTML)} 71 | } 72 | \description{ 73 | Download multiple robotstxt files 74 | } 75 | -------------------------------------------------------------------------------- /tests/testthat/test_attribute_handling.R: -------------------------------------------------------------------------------- 1 | test_that("get_robotstxt produces attributes", { 2 | 3 | expect_true({ 4 | www_redirect <- readRDS(system.file("http_requests/http_redirect_www.rds", package = "robotstxt")) 5 | 6 | suppressMessages( 7 | suppressWarnings( 8 | rtxt <- 9 | get_robotstxt( 10 | "http://google.com", 11 | rt_robotstxt_http_getter = function(...){www_redirect} 12 | ) 13 | ) 14 | ) 15 | 16 | "problems" %in% names(attributes(rtxt)) 17 | }) 18 | 19 | expect_true({ 20 | http_404 <- readRDS(system.file("http_requests/http_404.rds", package = "robotstxt")) 21 | 22 | suppressMessages( 23 | suppressWarnings( 24 | rtxt <- 25 | get_robotstxt( 26 | "http://google.com", 27 | rt_robotstxt_http_getter = function(...){http_404} 28 | ) 29 | ) 30 | ) 31 | 32 | "problems" %in% names(attributes(rtxt)) 33 | }) 34 | 35 | 36 | expect_true({ 37 | http_ok <- readRDS(system.file("http_requests/http_ok_1.rds", package = "robotstxt")) 38 | 39 | suppressMessages( 40 | suppressWarnings( 41 | rtxt <- 42 | get_robotstxt( 43 | "http://google.com", 44 | rt_robotstxt_http_getter = function(...){http_404} 45 | ) 46 | ) 47 | ) 48 | 49 | "problems" %in% names(attributes(rtxt)) 50 | }) 51 | 52 | 53 | expect_true({ 54 | http_ok <- readRDS(system.file("http_requests/http_ok_2.rds", package = "robotstxt")) 55 | 56 | suppressMessages( 57 | suppressWarnings( 58 | rtxt <- 59 | get_robotstxt( 60 | "http://google.com", 61 | rt_robotstxt_http_getter = function(...){http_404} 62 | ) 63 | ) 64 | ) 65 | 66 | "problems" %in% names(attributes(rtxt)) 67 | }) 68 | 69 | 70 | expect_true({ 71 | http_ok <- readRDS(system.file("http_requests/http_ok_3.rds", package = "robotstxt")) 72 | 73 | suppressMessages( 74 | suppressWarnings( 75 | rtxt <- 76 | get_robotstxt( 77 | "http://google.com", 78 | rt_robotstxt_http_getter = function(...){http_404} 79 | ) 80 | ) 81 | ) 82 | 83 | "problems" %in% names(attributes(rtxt)) 84 | }) 85 | 86 | 87 | expect_true({ 88 | http_ok <- readRDS(system.file("http_requests/http_ok_4.rds", package = "robotstxt")) 89 | 90 | suppressMessages( 91 | suppressWarnings( 92 | rtxt <- 93 | get_robotstxt( 94 | "http://google.com", 95 | 
rt_robotstxt_http_getter = function(...){http_404} 96 | ) 97 | ) 98 | ) 99 | 100 | "problems" %in% names(attributes(rtxt)) 101 | }) 102 | 103 | }) 104 | -------------------------------------------------------------------------------- /tests/testthat/test_tools.R: -------------------------------------------------------------------------------- 1 | rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") 2 | rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") 3 | rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") 4 | rtxt_dsfa <- rt_get_rtxt("disallow_some_for_all.txt") 5 | rtxt_empty <- rt_get_rtxt("empty.txt") 6 | rtxt_datao <- rt_get_rtxt("disallow_two_at_once.txt") 7 | rtxt_tcom <- rt_get_rtxt("testing_comments.txt") 8 | rtxt_amzn <- rt_get_rtxt("robots_amazon.txt") 9 | rtxt_bt <- rt_get_rtxt("robots_bundestag.txt") 10 | rtxt_ggl <- rt_get_rtxt("robots_google.txt") 11 | rtxt_nyt <- rt_get_rtxt("robots_new_york_times.txt") 12 | rtxt_spgl <- rt_get_rtxt("robots_spiegel.txt") 13 | rtxt_yh <- rt_get_rtxt("robots_yahoo.txt") 14 | rtxt_she <- rt_get_rtxt("selfhtml_Example.txt") 15 | rtxt_pm <- rt_get_rtxt("robots_pmeissner.txt") 16 | rtxt_wp <- rt_get_rtxt("robots_wikipedia.txt") 17 | 18 | rtxt_list <- 19 | list( 20 | rtxt_asb, rtxt_dafa, rtxt_dafbb, rtxt_dsfa, rtxt_empty, rtxt_datao, 21 | rtxt_tcom, rtxt_amzn, rtxt_bt, rtxt_ggl, rtxt_nyt, rtxt_spgl, 22 | rtxt_yh, rtxt_she, rtxt_pm, rtxt_wp 23 | ) 24 | 25 | 26 | test_that( 27 | "robotstxt print works", { 28 | expect_true({ 29 | res <- logical() 30 | 31 | for ( i in seq_along(rtxt_list) ){ 32 | rt <- robotstxt(text = rtxt_list[[i]]) 33 | rt_print <- capture.output(rt) 34 | res <- 35 | c( 36 | res, 37 | all( 38 | any(grepl("\\$domain", rt_print)), 39 | any(grepl("\\$bots", rt_print)), 40 | any(grepl("\\$comments", rt_print)), 41 | any(grepl("\\$permissions", rt_print)), 42 | any(grepl("\\$crawl_delay", rt_print)), 43 | any(grepl("\\$host", rt_print)), 44 | any(grepl("\\$sitemap", rt_print)), 45 | any(grepl("\\$other", rt_print)), 46 | any(grepl("\\$check", rt_print)) 47 | ) 48 | ) 49 | } 50 | 51 | all(res) 52 | }) 53 | } 54 | ) 55 | 56 | 57 | test_that( 58 | "robotstxt tools work", { 59 | 60 | expect_true({ 61 | a <- 1 62 | identical(named_list(1), list(`1` = 1)) & 63 | identical(named_list(a), list(a = 1)) 64 | }) 65 | 66 | expect_silent({ 67 | rt_get_rtxt(1) 68 | rt_get_rtxt("robots_wikipedia.txt") 69 | rt_get_rtxt() 70 | }) 71 | } 72 | ) 73 | 74 | 75 | test_that( 76 | "guess domain works", { 77 | 78 | expect_true({ 79 | is.na(guess_domain("")) 80 | }) 81 | 82 | expect_true({ 83 | guess_domain("google.com") == "google.com" 84 | }) 85 | 86 | expect_true({ 87 | guess_domain("www.google.com") == "www.google.com" 88 | }) 89 | 90 | expect_true({ 91 | guess_domain("www.domain-with-hyphen.tld") == "www.domain-with-hyphen.tld" 92 | }) 93 | 94 | expect_true({ 95 | guess_domain("tld-domain.tld") == "tld-domain.tld" 96 | }) 97 | 98 | } 99 | ) 100 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_amazon.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /exec/obidos/account-access-login 3 | Disallow: /exec/obidos/change-style 4 | Disallow: /exec/obidos/flex-sign-in 5 | Disallow: /exec/obidos/handle-buy-box 6 | Disallow: /exec/obidos/tg/cm/member/ 7 | Disallow: /exec/obidos/refer-a-friend-login 8 | Disallow: /exec/obidos/subst/partners/friends/access.html 9 | Disallow: /exec/obidos/subst/marketplace/sell-your-stuff.html 10 | Disallow: 
/exec/obidos/subst/marketplace/sell-your-collection.html 11 | Disallow: /exec/obidos/subst/associates/join 12 | Disallow: /gp/cart 13 | Disallow: /gp/customer-media/upload 14 | Disallow: /gp/flex 15 | Disallow: /gp/sign-in 16 | Disallow: /gp/slides/make-money 17 | Disallow: /gp/yourstore 18 | Disallow: /gp/content-form 19 | Disallow: /gp/customer-reviews/common/du 20 | Disallow: /gp/customer-reviews/write-a-review.html 21 | Disallow: /gp/vote 22 | Disallow: /gp/voting/ 23 | Disallow: /gp/product/product-availability 24 | Disallow: /gp/reader 25 | Disallow: /gp/gfix 26 | Disallow: /gp/music/wma-pop-up 27 | Disallow: /gp/customer-images 28 | Disallow: /gp/music/clipserve 29 | Disallow: /gp/offer-listing 30 | Disallow: /gp/richpub/listmania/createpipeline 31 | Disallow: /gp/history 32 | Disallow: /gp/item-dispatch 33 | Disallow: /gp/recsradio 34 | Disallow: /dp/twister-update/ 35 | Disallow: /dp/e-mail-friend/ 36 | Disallow: /dp/product-availability/ 37 | Disallow: /gp/registry/wishlist/*/reserve 38 | Disallow: /gp/structured-ratings/actions/get-experience.html 39 | Disallow: /gp/twitter/ 40 | Disallow: /ap/signin 41 | Disallow: /gp/registry/wishlist/ 42 | Disallow: /wishlist/ 43 | Allow: /wishlist/universal* 44 | Allow: /wishlist/vendor-button* 45 | Allow: /wishlist/get-button* 46 | Disallow: /gp/wishlist/ 47 | Allow: /gp/wishlist/universal* 48 | Allow: /gp/wishlist/vendor-button* 49 | Allow: /gp/wishlist/ipad-install* 50 | Disallow: /registry/wishlist/ 51 | Disallow: /gp/registry/search.html 52 | Disallow: /soleil-de-venise-Bekleidung/s?ie=UTF8&page=1&rh=n%3A77028031%2Ck%3Asoleil%20de%20venise 53 | Disallow: */b?ie=UTF8&node=1619732031 54 | Disallow: /gp/orc/rml/ 55 | Disallow: /gp/dmusic/mp3/player 56 | Disallow: /gp/entity-alert/external 57 | Disallow: /gp/customer-reviews/dynamic/sims-box 58 | Disallow: /review/dynamic/sims-box 59 | Disallow: /*stressless*fernsehsessel* 60 | Disallow: /*stressless*sessel* 61 | Disallow: /gp/redirect.html 62 | Disallow: /gp/customer-media/actions/delete/ 63 | Disallow: /gp/customer-media/actions/edit-caption/ 64 | Disallow: /gp/dmusic/ 65 | Disallow: /gp/help/customer/display.html?nodeId=200393390 66 | Disallow: /gp/customer-media/product-gallery/B007HCCOD0 67 | Disallow: /forum/kindle 68 | Disallow: /gp/aag 69 | Disallow: /gp/socialmedia/giveaways 70 | Disallow: /gp/aw/so.html 71 | Disallow: /gp/rentallist 72 | Disallow: /gp/video/dvd-rental/settings 73 | Disallow: /gp/rl/settings 74 | Disallow: /gp/video/settings 75 | Disallow: /gp/video/watchlist 76 | Disallow: /gp/video/library 77 | 78 | User-agent: EtaoSpider 79 | Disallow: / 80 | 81 | # Sitemap files 82 | Sitemap: http://www.amazon.de/sitemap-manual-index.xml 83 | Sitemap: http://www.amazon.de/sitemaps.f3053414d236e84.SitemapIndex_0.xml.gz 84 | Sitemap: http://www.amazon.de/sitemaps.1946f6b8171de60.SitemapIndex_0.xml.gz 85 | 86 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 
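#
# For reference, checks can also be submitted from a local R session with
# the rhub v2 client, e.g. (function names are those of the current 'rhub'
# package and may change):
#
#   rhub::rhub_doctor()                    # verify the GitHub/token setup
#   rhub::rhub_check(platforms = "linux")  # submit a run for one platform
#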
8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /tests/testthat/test_path_examples_from_rfc.R: -------------------------------------------------------------------------------- 1 | # tests for functions responsible for data gathering and transformation 2 | 3 | # This table illustrates some examples: 4 | # 5 | # Record Path URL path Matches 6 | # /tmp /tmp yes 7 | # /tmp /tmp.html yes 8 | # /tmp /tmp/a.html yes 9 | # /tmp/ /tmp no 10 | # /tmp/ /tmp/ yes 11 | # /tmp/ /tmp/a.html yes 12 | # 13 | # /a%3cd.html /a%3cd.html yes 14 | # /a%3Cd.html /a%3cd.html yes 15 | # /a%3cd.html /a%3Cd.html yes 16 | # /a%3Cd.html /a%3Cd.html yes 17 | # 18 | # /a%2fb.html /a%2fb.html yes 19 | # /a%2fb.html /a/b.html no 20 | # /a/b.html /a%2fb.html no 21 | # /a/b.html /a/b.html yes 22 | # 23 | # /%7ejoe/index.html /~joe/index.html yes 24 | # /~joe/index.html /%7Ejoe/index.html yes 25 | 26 | 27 | test_that( 28 | "simple check", { 29 | 
expect_true( 30 | paths_allowed( 31 | robotstxt_list = list(""), 32 | paths = "/temp/", 33 | bot = "mein-robot" 34 | ) 35 | ) 36 | } 37 | ) 38 | 39 | 40 | 41 | # A fictional site may have the following URLs: 42 | # 43 | # http://www.fict.org/ 44 | # http://www.fict.org/index.html 45 | # http://www.fict.org/robots.txt 46 | # http://www.fict.org/server.html 47 | # http://www.fict.org/services/fast.html 48 | # http://www.fict.org/services/slow.html 49 | # http://www.fict.org/orgo.gif 50 | # http://www.fict.org/org/about.html 51 | # http://www.fict.org/org/plans.html 52 | # http://www.fict.org/%7Ejim/jim.html 53 | # http://www.fict.org/%7Emak/mak.html 54 | # 55 | # The site may in the /robots.txt have specific rules for robots that 56 | # send a HTTP User-agent "UnhipBot/0.1", "WebCrawler/3.0", and 57 | # 58 | # 59 | # Koster draft-koster-robots-00.txt [Page 8] 60 | # 61 | # INTERNET DRAFT A Method for Robots Control December 4, 1996 62 | # 63 | # "Excite/1.0", and a set of default rules: 64 | # 65 | # # /robots.txt for http://www.fict.org/ 66 | # # comments to webmaster@fict.org 67 | # 68 | # User-agent: unhipbot 69 | # Disallow: / 70 | # 71 | # User-agent: webcrawler 72 | # User-agent: excite 73 | # Disallow: 74 | # 75 | # User-agent: * 76 | # Disallow: /org/plans.html 77 | # Allow: /org/ 78 | # Allow: /serv 79 | # Allow: /~mak 80 | # Disallow: / 81 | # 82 | # The following matrix shows which robots are allowed to access URLs: 83 | # 84 | # unhipbot webcrawler other 85 | # & excite 86 | # http://www.fict.org/ No Yes No 87 | # http://www.fict.org/index.html No Yes No 88 | # http://www.fict.org/robots.txt Yes Yes Yes 89 | # http://www.fict.org/server.html No Yes Yes 90 | # http://www.fict.org/services/fast.html No Yes Yes 91 | # http://www.fict.org/services/slow.html No Yes Yes 92 | # http://www.fict.org/orgo.gif No Yes No 93 | # http://www.fict.org/org/about.html No Yes Yes 94 | # http://www.fict.org/org/plans.html No Yes No 95 | # http://www.fict.org/%7Ejim/jim.html No Yes No 96 | # http://www.fict.org/%7Emak/mak.html No Yes Yes 97 | -------------------------------------------------------------------------------- /man/rt_request_handler.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rt_request_handler.R, 3 | % R/rt_request_handler_defaults.R 4 | \docType{data} 5 | \name{rt_request_handler} 6 | \alias{rt_request_handler} 7 | \alias{on_server_error_default} 8 | \alias{on_client_error_default} 9 | \alias{on_not_found_default} 10 | \alias{on_redirect_default} 11 | \alias{on_domain_change_default} 12 | \alias{on_sub_domain_change_default} 13 | \alias{on_file_type_mismatch_default} 14 | \alias{on_suspect_content_default} 15 | \title{Handle robotstxt object retrieved from HTTP request} 16 | \format{ 17 | An object of class \code{list} of length 4. 18 | 19 | An object of class \code{list} of length 4. 20 | 21 | An object of class \code{list} of length 4. 22 | 23 | An object of class \code{list} of length 2. 24 | 25 | An object of class \code{list} of length 3. 26 | 27 | An object of class \code{list} of length 2. 28 | 29 | An object of class \code{list} of length 4. 30 | 31 | An object of class \code{list} of length 4. 
32 | } 33 | \usage{ 34 | rt_request_handler( 35 | request, 36 | on_server_error = on_server_error_default, 37 | on_client_error = on_client_error_default, 38 | on_not_found = on_not_found_default, 39 | on_redirect = on_redirect_default, 40 | on_domain_change = on_domain_change_default, 41 | on_sub_domain_change = on_sub_domain_change_default, 42 | on_file_type_mismatch = on_file_type_mismatch_default, 43 | on_suspect_content = on_suspect_content_default, 44 | warn = TRUE, 45 | encoding = "UTF-8" 46 | ) 47 | 48 | on_server_error_default 49 | 50 | on_client_error_default 51 | 52 | on_not_found_default 53 | 54 | on_redirect_default 55 | 56 | on_domain_change_default 57 | 58 | on_sub_domain_change_default 59 | 60 | on_file_type_mismatch_default 61 | 62 | on_suspect_content_default 63 | } 64 | \arguments{ 65 | \item{request}{result of an HTTP request (e.g. httr::GET())} 66 | 67 | \item{on_server_error}{request state handler for any 5xx status} 68 | 69 | \item{on_client_error}{request state handler for any 4xx HTTP status that is 70 | not 404} 71 | 72 | \item{on_not_found}{request state handler for HTTP status 404} 73 | 74 | \item{on_redirect}{request state handler for any 3xx HTTP status} 75 | 76 | \item{on_domain_change}{request state handler for any 3xx HTTP status where 77 | domain did change as well} 78 | 79 | \item{on_sub_domain_change}{request state handler for any 3xx HTTP status where 80 | domain did change but only to www-sub_domain} 81 | 82 | \item{on_file_type_mismatch}{request state handler for content type other 83 | than 'text/plain'} 84 | 85 | \item{on_suspect_content}{request state handler for content that seems to be 86 | something else than a robots.txt file (usually a JSON, XML or HTML)} 87 | 88 | \item{warn}{suppress warnings} 89 | 90 | \item{encoding}{The text encoding to assume if no encoding is provided in the 91 | headers of the response} 92 | } 93 | \value{ 94 | a list with three items following the following schema: \cr \code{ 95 | list( rtxt = "", problems = list( "redirect" = list( status_code = 301 ), 96 | "domain" = list(from_url = "...", to_url = "...") ) ) } 97 | } 98 | \description{ 99 | A helper function for get_robotstxt() that will extract the robots.txt file 100 | from the HTTP request result object. It will inform get_robotstxt() if the 101 | request should be cached and which problems occurred. 
102 | } 103 | \keyword{datasets} 104 | -------------------------------------------------------------------------------- /man/paths_allowed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/paths_allowed.R 3 | \name{paths_allowed} 4 | \alias{paths_allowed} 5 | \title{Check if a bot has permissions to access page(s)} 6 | \usage{ 7 | paths_allowed( 8 | paths = "/", 9 | domain = "auto", 10 | bot = "*", 11 | user_agent = utils::sessionInfo()$R.version$version.string, 12 | check_method = c("spiderbar"), 13 | warn = getOption("robotstxt_warn", TRUE), 14 | force = FALSE, 15 | ssl_verifypeer = c(1, 0), 16 | use_futures = TRUE, 17 | robotstxt_list = NULL, 18 | verbose = FALSE, 19 | rt_request_handler = robotstxt::rt_request_handler, 20 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 21 | on_server_error = on_server_error_default, 22 | on_client_error = on_client_error_default, 23 | on_not_found = on_not_found_default, 24 | on_redirect = on_redirect_default, 25 | on_domain_change = on_domain_change_default, 26 | on_file_type_mismatch = on_file_type_mismatch_default, 27 | on_suspect_content = on_suspect_content_default 28 | ) 29 | } 30 | \arguments{ 31 | \item{paths}{paths for which to check bot's permission, defaults to "/". Please note that path to a folder should end with a trailing slash ("/").} 32 | 33 | \item{domain}{Domain for which paths should be checked. Defaults to "auto". 34 | If set to "auto" function will try to guess the domain by parsing the paths 35 | argument. Note however, that these are educated guesses which might utterly 36 | fail. To be on the safe side, provide appropriate domains manually.} 37 | 38 | \item{bot}{name of the bot, defaults to "*"} 39 | 40 | \item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt 41 | file from domain} 42 | 43 | \item{check_method}{at the moment only kept for backward compatibility reasons - do not use parameter anymore --> will let the function simply use the default} 44 | 45 | \item{warn}{suppress warnings} 46 | 47 | \item{force}{if TRUE instead of using possible cached results the function 48 | will re-download the robotstxt file HTTP response status 404. If this 49 | happens,} 50 | 51 | \item{ssl_verifypeer}{either 1 (default) or 0, if 0 it disables SSL peer verification, which 52 | might help with robots.txt file retrieval} 53 | 54 | \item{use_futures}{Should future::future_lapply be used for possible 55 | parallel/async retrieval or not. 
Note: check out help 56 | pages and vignettes of package future on how to set up 57 | plans for future execution because the robotstxt package 58 | does not do it on its own.} 59 | 60 | \item{robotstxt_list}{either NULL -- the default -- or a list of character 61 | vectors with one vector per path to check} 62 | 63 | \item{verbose}{make function print out more information} 64 | 65 | \item{rt_request_handler}{handler function that handles request according to 66 | the event handlers specified} 67 | 68 | \item{rt_robotstxt_http_getter}{function that executes HTTP request} 69 | 70 | \item{on_server_error}{request state handler for any 5xx status} 71 | 72 | \item{on_client_error}{request state handler for any 4xx HTTP status that is 73 | not 404} 74 | 75 | \item{on_not_found}{request state handler for HTTP status 404} 76 | 77 | \item{on_redirect}{request state handler for any 3xx HTTP status} 78 | 79 | \item{on_domain_change}{request state handler for any 3xx HTTP status where 80 | domain did change as well} 81 | 82 | \item{on_file_type_mismatch}{request state handler for content type other 83 | than 'text/plain'} 84 | 85 | \item{on_suspect_content}{request state handler for content that seems to be 86 | something else than a robots.txt file (usually a JSON, XML or HTML)} 87 | } 88 | \description{ 89 | Check if a bot has permissions to access page(s) 90 | } 91 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/http_event_handling.md: -------------------------------------------------------------------------------- 1 | # non www redirects are handled non silently 2 | 3 | Code 4 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", 5 | package = "robotstxt")) 6 | suppressMessages(get_robotstxt("http://google.com", rt_robotstxt_http_getter = function( 7 | ...) { 8 | domain_change 9 | }, warn = TRUE)) 10 | Condition 11 | Warning in `request_handler_handler()`: 12 | Event: on_file_type_mismatch 13 | Warning in `request_handler_handler()`: 14 | Event: on_suspect_content 15 | Output 16 | [robots.txt] 17 | -------------------------------------- 18 | 19 | # robots.txt overwrite by: on_suspect_content 20 | 21 | User-agent: * 22 | Allow: / 23 | 24 | 25 | 26 | [events] 27 | -------------------------------------- 28 | 29 | requested: www.petermeissner.de 30 | downloaded: https://petermeissner.de/ 31 | 32 | $on_redirect 33 | $on_redirect[[1]] 34 | $on_redirect[[1]]$status 35 | [1] 301 36 | 37 | $on_redirect[[1]]$location 38 | [1] "https://www.petermeissner.de/" 39 | 40 | 41 | $on_redirect[[2]] 42 | $on_redirect[[2]]$status 43 | [1] 301 44 | 45 | $on_redirect[[2]]$location 46 | [1] "https://petermeissner.de/" 47 | 48 | 49 | $on_redirect[[3]] 50 | $on_redirect[[3]]$status 51 | [1] 200 52 | 53 | $on_redirect[[3]]$location 54 | NULL 55 | 56 | 57 | 58 | $on_file_type_mismatch 59 | $on_file_type_mismatch$content_type 60 | [1] "text/html" 61 | 62 | 63 | $on_suspect_content 64 | $on_suspect_content$parsable 65 | [1] FALSE 66 | 67 | $on_suspect_content$content_suspect 68 | [1] TRUE 69 | 70 | 71 | [attributes] 72 | -------------------------------------- 73 | 74 | problems, cached, request, class 75 | 76 | # client error 77 | 78 | Code 79 | http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", 80 | package = "robotstxt")) 81 | suppressMessages(get_robotstxt("httpbin.org", rt_robotstxt_http_getter = function( 82 | ...) 
{ 83 | http_client_error 84 | })) 85 | Condition 86 | Warning in `request_handler_handler()`: 87 | Event: on_client_error 88 | Warning in `request_handler_handler()`: 89 | Event: on_file_type_mismatch 90 | Output 91 | [robots.txt] 92 | -------------------------------------- 93 | 94 | # robots.txt overwrite by: on_client_error 95 | 96 | User-agent: * 97 | Allow: / 98 | 99 | 100 | 101 | [events] 102 | -------------------------------------- 103 | 104 | requested: https://httpbin.org/status/400 105 | downloaded: https://httpbin.org/status/400 106 | 107 | $on_client_error 108 | $on_client_error$status_code 109 | [1] 400 110 | 111 | 112 | $on_file_type_mismatch 113 | $on_file_type_mismatch$content_type 114 | [1] "text/html; charset=utf-8" 115 | 116 | 117 | [attributes] 118 | -------------------------------------- 119 | 120 | problems, cached, request, class 121 | 122 | # server error 123 | 124 | Code 125 | res <- suppressMessages(get_robotstxt("httpbin.org", rt_robotstxt_http_getter = f, 126 | on_server_error = list(signal = "warning"), force = TRUE)) 127 | Condition 128 | Warning in `request_handler_handler()`: 129 | Event: on_server_error 130 | Warning in `request_handler_handler()`: 131 | Event: on_file_type_mismatch 132 | 133 | -------------------------------------------------------------------------------- /tests/testthat/test_robotstxt.R: -------------------------------------------------------------------------------- 1 | rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") 2 | rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") 3 | rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") 4 | rtxt_dsfa <- rt_get_rtxt("disallow_some_for_all.txt") 5 | rtxt_empty <- rt_get_rtxt("empty.txt") 6 | rtxt_datao <- rt_get_rtxt("disallow_two_at_once.txt") 7 | rtxt_tcom <- rt_get_rtxt("testing_comments.txt") 8 | rtxt_amzn <- rt_get_rtxt("robots_amazon.txt") 9 | rtxt_bt <- rt_get_rtxt("robots_bundestag.txt") 10 | rtxt_ggl <- rt_get_rtxt("robots_google.txt") 11 | rtxt_nyt <- rt_get_rtxt("robots_new_york_times.txt") 12 | rtxt_spgl <- rt_get_rtxt("robots_spiegel.txt") 13 | rtxt_yh <- rt_get_rtxt("robots_yahoo.txt") 14 | rtxt_she <- rt_get_rtxt("selfhtml_Example.txt") 15 | rtxt_pm <- rt_get_rtxt("robots_pmeissner.txt") 16 | rtxt_wp <- rt_get_rtxt("robots_wikipedia.txt") 17 | 18 | # test_that( 19 | # "get_robotstxt() can fetch a file", { 20 | # expect_true( 21 | # { 22 | # rt <- get_robotstxt(domain="pmeissner.com") 23 | # TRUE 24 | # } 25 | # ) 26 | # } 27 | # ) 28 | 29 | test_that( 30 | "initialisation works well", { 31 | expect_error( rt <- robotstxt() ) 32 | expect_error( rt <- robotstxt("") ) 33 | expect_true( all(class(robotstxt(text=rtxt_she)) %in% c("robotstxt")) ) 34 | } 35 | ) 36 | 37 | 38 | test_that( 39 | "robotstxt check method works well", { 40 | expect_true( robotstxt(text=rtxt_she)$check() ) 41 | expect_true( robotstxt(text=rtxt_she)$check("blah") ) 42 | } 43 | ) 44 | 45 | 46 | test_that( 47 | "robotstxt check method works well", { 48 | expect_true( robotstxt(text=rtxt_she)$check() ) 49 | expect_true( robotstxt(text=rtxt_she)$check("blah") ) 50 | } 51 | ) 52 | 53 | 54 | test_that( 55 | "robotstxt parsing multi agent records without newline", { 56 | expect_true({ 57 | rbtx <- spiderbar::robxp(" 58 | User-agent: * 59 | Disallow: /*/print$ 60 | # Don't allow indexing of user needs pages 61 | Disallow: /info/* 62 | Sitemap: https://www.gov.uk/sitemap.xml 63 | # https://ahrefs.com/robot/ crawls the site frequently 64 | User-agent: dooby 65 | User-agent: AhrefsBot 66 | Crawl-delay: 10 67 | # 
https://www.deepcrawl.com/bot/ makes lots of requests. Ideally 68 | # we'd slow it down rather than blocking it but it doesn't mention 69 | # whether or not it supports crawl-delay. 70 | User-agent: deepcrawl 71 | Disallow: / 72 | # Complaints of 429 'Too many requests' seem to be coming from SharePoint servers 73 | # (https://social.msdn.microsoft.com/Forums/en-US/3ea268ed-58a6-4166-ab40-d3f4fc55fef4) 74 | # The robot doesn't recognise its User-Agent string, see the MS support article: 75 | # https://support.microsoft.com/en-us/help/3019711/the-sharepoint-server-crawler-ignores-directives-in-robots-txt 76 | User-agent: MS Search 6.0 Robot 77 | Disallow: / 78 | " 79 | ) 80 | sum(spiderbar::crawl_delays(rbtx)$crawl_delay==10)==2 81 | }) 82 | 83 | expect_true({ 84 | robot <- robotstxt(text = " 85 | User-agent: * 86 | Disallow: /*/print$ 87 | # Don't allow indexing of user needs pages 88 | Disallow: /info/* 89 | Sitemap: https://www.gov.uk/sitemap.xml 90 | # https://ahrefs.com/robot/ crawls the site frequently 91 | User-agent: dooby 92 | User-agent: AhrefsBot 93 | Crawl-delay: 10 94 | # https://www.deepcrawl.com/bot/ makes lots of requests. Ideally 95 | # we'd slow it down rather than blocking it but it doesn't mention 96 | # whether or not it supports crawl-delay. 97 | User-agent: deepcrawl 98 | Disallow: / 99 | # Complaints of 429 'Too many requests' seem to be coming from SharePoint servers 100 | # (https://social.msdn.microsoft.com/Forums/en-US/3ea268ed-58a6-4166-ab40-d3f4fc55fef4) 101 | # The robot doesn't recognise its User-Agent string, see the MS support article: 102 | # https://support.microsoft.com/en-us/help/3019711/the-sharepoint-server-crawler-ignores-directives-in-robots-txt 103 | User-agent: MS Search 6.0 Robot 104 | Disallow: / 105 | ") 106 | nrow(robot$crawl_delay) == 2 107 | }) 108 | } 109 | ) 110 | -------------------------------------------------------------------------------- /R/get_robotstxts.R: -------------------------------------------------------------------------------- 1 | #' Download multiple robotstxt files 2 | #' 3 | #' @inheritParams get_robotstxt 4 | #' @param use_futures Should future::future_lapply be used for possible 5 | #' parallel/async retrieval or not. Note: check out help 6 | #' pages and vignettes of package future on how to set up 7 | #' plans for future execution because the robotstxt package 8 | #' does not do it on its own. 
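#' A minimal sketch of parallel retrieval, assuming the future and
#' future.apply packages are installed and using purely illustrative domains:
#' \code{future::plan(future::multisession)} followed by
#' \code{get_robotstxts(c("google.com", "wikipedia.org"), use_futures = TRUE)}.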
9 | #' @param ssl_verifypeer either 1 (default) or 0, if 0 it disables SSL peer verification, which 10 | #' might help with robots.txt file retrieval 11 | #' @param rt_request_handler handler function that handles request according to 12 | #' the event handlers specified 13 | #' 14 | #' @export 15 | #' 16 | get_robotstxts <- 17 | function( 18 | domain, 19 | warn = TRUE, 20 | force = FALSE, 21 | user_agent = utils::sessionInfo()$R.version$version.string, 22 | ssl_verifypeer = c(1,0), 23 | use_futures = FALSE, 24 | verbose = FALSE, 25 | rt_request_handler = robotstxt::rt_request_handler, 26 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 27 | on_server_error = on_server_error_default, 28 | on_client_error = on_client_error_default, 29 | on_not_found = on_not_found_default, 30 | on_redirect = on_redirect_default, 31 | on_domain_change = on_domain_change_default, 32 | on_file_type_mismatch = on_file_type_mismatch_default, 33 | on_suspect_content = on_suspect_content_default 34 | ){ 35 | 36 | 37 | # combine parameter 38 | if ( length(user_agent) == 0 ) { 39 | 40 | parameter <- 41 | data.frame( 42 | domain = domain, 43 | warn = warn[1], 44 | force = force[1], 45 | ssl_verifypeer = ssl_verifypeer[1], 46 | verbose = verbose, 47 | stringsAsFactors = FALSE 48 | ) 49 | 50 | } else { 51 | 52 | parameter <- 53 | data.frame( 54 | domain = domain, 55 | user_agent = user_agent, 56 | warn = warn[1], 57 | force = force[1], 58 | ssl_verifypeer = ssl_verifypeer[1], 59 | verbose = verbose, 60 | stringsAsFactors = FALSE 61 | ) 62 | } 63 | 64 | parameter_list <- 65 | lapply( 66 | split(parameter, seq_len(nrow(parameter))), 67 | as.list 68 | ) 69 | 70 | 71 | # prepare execution of get_robotstxt() 72 | apply_fun <- 73 | if ( isTRUE(use_futures) ) { 74 | future.apply::future_lapply 75 | } else { 76 | lapply 77 | } 78 | 79 | to_be_applied_fun <- 80 | function(x){ 81 | 82 | message( 83 | paste("\r", x$domain, " "), 84 | appendLF = FALSE 85 | ) 86 | 87 | get_robotstxt( 88 | domain = x$domain, 89 | warn = x$warn, 90 | force = x$force, 91 | user_agent = x$user_agent, 92 | ssl_verifypeer = x$ssl_verifypeer, 93 | verbose = x$verbose, 94 | rt_request_handler = rt_request_handler, 95 | rt_robotstxt_http_getter = rt_robotstxt_http_getter, 96 | on_server_error = on_server_error, 97 | on_client_error = on_client_error, 98 | on_not_found = on_not_found, 99 | on_redirect = on_redirect, 100 | on_domain_change = on_domain_change, 101 | on_file_type_mismatch = on_file_type_mismatch, 102 | on_suspect_content = on_suspect_content 103 | ) 104 | 105 | } 106 | 107 | # execute get_robotstxt to parameter grid 108 | rtxt_list <- 109 | apply_fun( 110 | parameter_list, 111 | FUN = to_be_applied_fun 112 | ) 113 | names(rtxt_list) <- domain 114 | message("\n") 115 | 116 | # return 117 | return(rtxt_list) 118 | } 119 | 120 | 121 | -------------------------------------------------------------------------------- /man/robotstxt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/robotstxt.R 3 | \name{robotstxt} 4 | \alias{robotstxt} 5 | \title{Generate a representation of a robots.txt file} 6 | \usage{ 7 | robotstxt( 8 | domain = NULL, 9 | text = NULL, 10 | user_agent = NULL, 11 | warn = getOption("robotstxt_warn", TRUE), 12 | force = FALSE, 13 | ssl_verifypeer = c(1, 0), 14 | encoding = "UTF-8", 15 | verbose = FALSE, 16 | on_server_error = on_server_error_default, 17 | on_client_error = on_client_error_default, 18 
| on_not_found = on_not_found_default, 19 | on_redirect = on_redirect_default, 20 | on_domain_change = on_domain_change_default, 21 | on_file_type_mismatch = on_file_type_mismatch_default, 22 | on_suspect_content = on_suspect_content_default 23 | ) 24 | } 25 | \arguments{ 26 | \item{domain}{Domain for which to generate a representation. If text equals to NULL, 27 | the function will download the file from server - the default.} 28 | 29 | \item{text}{If automatic download of the robots.txt is not preferred, the text can be 30 | supplied directly.} 31 | 32 | \item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt 33 | file from domain} 34 | 35 | \item{warn}{warn about being unable to download domain/robots.txt because of} 36 | 37 | \item{force}{if TRUE instead of using possible cached results the function 38 | will re-download the robotstxt file HTTP response status 404. If this 39 | happens,} 40 | 41 | \item{ssl_verifypeer}{either 1 (default) or 0, if 0 it disables SSL peer verification, which 42 | might help with robots.txt file retrieval} 43 | 44 | \item{encoding}{Encoding of the robots.txt file.} 45 | 46 | \item{verbose}{make function print out more information} 47 | 48 | \item{on_server_error}{request state handler for any 5xx status} 49 | 50 | \item{on_client_error}{request state handler for any 4xx HTTP status that is 51 | not 404} 52 | 53 | \item{on_not_found}{request state handler for HTTP status 404} 54 | 55 | \item{on_redirect}{request state handler for any 3xx HTTP status} 56 | 57 | \item{on_domain_change}{request state handler for any 3xx HTTP status where 58 | domain did change as well} 59 | 60 | \item{on_file_type_mismatch}{request state handler for content type other 61 | than 'text/plain'} 62 | 63 | \item{on_suspect_content}{request state handler for content that seems to be 64 | something else than a robots.txt file (usually a JSON, XML or HTML)} 65 | } 66 | \value{ 67 | Object (list) of class robotstxt with parsed data from a 68 | robots.txt (domain, text, bots, permissions, host, sitemap, other) and one 69 | function to (check()) to check resource permissions. 70 | } 71 | \description{ 72 | The function generates a list that entails data resulting from parsing a robots.txt file 73 | as well as a function called check that enables to ask the representation if bot (or 74 | particular bots) are allowed to access a resource on the domain. 75 | } 76 | \section{Fields}{ 77 | 78 | \describe{ 79 | \item{\code{domain}}{character vector holding domain name for which the robots.txt 80 | file is valid; will be set to NA if not supplied on initialization} 81 | 82 | \item{\code{character}}{vector of text of robots.txt file; either supplied on 83 | initialization or automatically downloaded from domain supplied on 84 | initialization} 85 | 86 | \item{\code{bots}}{character vector of bot names mentioned in robots.txt} 87 | 88 | \item{\code{permissions}}{data.frame of bot permissions found in robots.txt file} 89 | 90 | \item{\code{host}}{data.frame of host fields found in robots.txt file} 91 | 92 | \item{\code{sitemap}}{data.frame of sitemap fields found in robots.txt file} 93 | 94 | \item{\code{other}}{data.frame of other - none of the above - fields found in 95 | robots.txt file} 96 | 97 | \item{\code{check()}}{Method to check for bot permissions. Defaults to the 98 | domains root and no bot in particular. check() has two arguments: 99 | paths and bot. 
The first is for supplying the paths for which to check 100 | permissions and the latter for supplying the name of the bot. 101 | Please note that path to a folder should end with a trailing slash ("/").} 102 | }} 103 | 104 | \examples{ 105 | \dontrun{ 106 | rt <- robotstxt(domain="google.com") 107 | rt$bots 108 | rt$permissions 109 | rt$check( paths = c("/", "forbidden"), bot="*") 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /R/paths_allowed.R: -------------------------------------------------------------------------------- 1 | #' Check if a bot has permissions to access page(s) 2 | #' 3 | #' 4 | #' @param domain Domain for which paths should be checked. Defaults to "auto". 5 | #' If set to "auto" the function will try to guess the domain by parsing the paths 6 | #' argument. Note, however, that these are educated guesses which might utterly 7 | #' fail. To be on the safe side, provide appropriate domains manually. 8 | #' @param bot name of the bot, defaults to "*" 9 | #' @param paths paths for which to check the bot's permission, defaults to "/". Please note that path to a folder should end with a trailing slash ("/"). 10 | #' @param check_method kept for backward compatibility reasons only - do not use this parameter anymore; the function will simply use the default check method 11 | #' @param robotstxt_list either NULL -- the default -- or a list of character 12 | #' vectors with one vector per path to check 13 | #' 14 | #' @inheritParams rt_request_handler 15 | #' 16 | #' @inheritParams get_robotstxt 17 | #' @inheritParams get_robotstxts 18 | #' 19 | #' 20 | #' @export 21 | paths_allowed <- 22 | function( 23 | paths = "/", 24 | domain = "auto", 25 | bot = "*", 26 | user_agent = utils::sessionInfo()$R.version$version.string, 27 | check_method = c("spiderbar"), 28 | warn = getOption("robotstxt_warn", TRUE), 29 | force = FALSE, 30 | ssl_verifypeer = c(1,0), 31 | use_futures = TRUE, 32 | robotstxt_list = NULL, 33 | verbose = FALSE, 34 | rt_request_handler = robotstxt::rt_request_handler, 35 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 36 | on_server_error = on_server_error_default, 37 | on_client_error = on_client_error_default, 38 | on_not_found = on_not_found_default, 39 | on_redirect = on_redirect_default, 40 | on_domain_change = on_domain_change_default, 41 | on_file_type_mismatch = on_file_type_mismatch_default, 42 | on_suspect_content = on_suspect_content_default 43 | ){ 44 | 45 | # process inputs 46 | if( all(domain == "auto") ){ 47 | domain <- guess_domain(paths) 48 | paths <- remove_domain(paths) 49 | } 50 | 51 | if( all(is.na(domain)) & !is.null(robotstxt_list) ){ 52 | domain <- "auto" 53 | } 54 | 55 | # get robots.txt files 56 | if( is.null(robotstxt_list) ){ 57 | robotstxt_list <- 58 | get_robotstxts( 59 | domain = domain, 60 | warn = warn, 61 | force = force, 62 | user_agent = user_agent, 63 | ssl_verifypeer = ssl_verifypeer, 64 | use_futures = use_futures, 65 | verbose = verbose, 66 | rt_request_handler = rt_request_handler, 67 | rt_robotstxt_http_getter = rt_robotstxt_http_getter, 68 | on_server_error = on_server_error, 69 | on_client_error = on_client_error, 70 | on_not_found = on_not_found, 71 | on_redirect = on_redirect, 72 | on_domain_change = on_domain_change, 73 | on_file_type_mismatch = on_file_type_mismatch, 74 | on_suspect_content = on_suspect_content 75 | ) 76 | names(robotstxt_list) <- domain 77 | } 78 | 79 | # check paths 80 | if ( check_method[1] == "robotstxt"){ 81 | warning( 82 | " 83 | 
This check method is deprecated, 84 | please stop using it - 85 | use 'spiderbar' instead 86 | or do not specify check_method parameter at all. 87 | " 88 | ) 89 | } 90 | res <- 91 | paths_allowed_worker_spiderbar( 92 | domain = domain, 93 | bot = bot, 94 | paths = paths, 95 | robotstxt_list = robotstxt_list 96 | ) 97 | 98 | # return 99 | return(res) 100 | } 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /R/get_robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Download a robots.txt file 2 | #' 3 | #' @param domain domain from which to download robots.txt file 4 | #' @param warn warn about problems encountered when retrieving the robots.txt file from the domain 5 | #' @param force if TRUE instead of using possibly cached results the function 6 | #' will re-download the robots.txt file 7 | #' 8 | #' @param user_agent HTTP user-agent string to be used to retrieve robots.txt 9 | #' file from domain 10 | #' 11 | #' @param ssl_verifypeer either 1 (default) or 0, if 0 it disables SSL peer verification, which 12 | #' might help with robots.txt file retrieval 13 | #' @param rt_robotstxt_http_getter function that executes HTTP request 14 | #' @param rt_request_handler handler function that handles request according to 15 | #' the event handlers specified 16 | #' 17 | #' @param verbose make function print out more information 18 | #' 19 | #' 20 | #' @inheritParams rt_request_handler 21 | #' 22 | #' @param encoding Encoding of the robots.txt file. 23 | #' 24 | #' @export 25 | 26 | get_robotstxt <- 27 | function( 28 | domain, 29 | warn = getOption("robotstxt_warn", TRUE), 30 | force = FALSE, 31 | user_agent = utils::sessionInfo()$R.version$version.string, 32 | ssl_verifypeer = c(1,0), 33 | encoding = "UTF-8", 34 | verbose = FALSE, 35 | rt_request_handler = robotstxt::rt_request_handler, 36 | rt_robotstxt_http_getter = robotstxt::get_robotstxt_http_get, 37 | on_server_error = on_server_error_default, 38 | on_client_error = on_client_error_default, 39 | on_not_found = on_not_found_default, 40 | on_redirect = on_redirect_default, 41 | on_domain_change = on_domain_change_default, 42 | on_file_type_mismatch = on_file_type_mismatch_default, 43 | on_suspect_content = on_suspect_content_default 44 | ){ 45 | 46 | # pre checking input 47 | if( is.na(domain) ){ 48 | return(NA) 49 | } 50 | 51 | # get data from cache or do download 52 | if( force ){ 53 | 54 | request <- 55 | rt_robotstxt_http_getter( 56 | domain = domain, 57 | user_agent = user_agent, 58 | ssl_verifypeer = ssl_verifypeer[1] 59 | ) 60 | 61 | if ( verbose == TRUE ){ 62 | message("rt_robotstxt_http_getter: force http get") 63 | } 64 | 65 | } else if ( !is.null(rt_cache[[domain]]) ) { 66 | 67 | # get cache content 68 | request <- 69 | rt_cache[[domain]] 70 | 71 | # mark content as cached 72 | request$rt_from_cache <- TRUE 73 | 74 | if ( verbose == TRUE ){ 75 | message("rt_robotstxt_http_getter: cached http get") 76 | } 77 | 78 | } else if ( is.null(rt_cache[[domain]]) ){ 79 | 80 | # retrieve http content 81 | request <- 82 | rt_robotstxt_http_getter( 83 | domain = domain, 84 | user_agent = user_agent, 85 | ssl_verifypeer = ssl_verifypeer[1] 86 | ) 87 | 88 | # mark content as not cached 89 | request$rt_from_cache <- FALSE 90 | 91 | rt_cache[[domain]] <- request 92 | 93 | if ( verbose == TRUE ){ 
94 | message("rt_robotstxt_http_getter: normal http get") 95 | } 96 | 97 | } 98 | 99 | 100 | # handle request 101 | res <- 102 | rt_request_handler( 103 | request = request, 104 | on_redirect = on_redirect, 105 | on_domain_change = on_domain_change, 106 | on_not_found = on_not_found, 107 | on_client_error = on_client_error, 108 | on_server_error = on_server_error, 109 | warn = warn, 110 | encoding = encoding 111 | ) 112 | 113 | # check if cache has to be emptied 114 | if ( length(res$cache) == 0 || res$cache == TRUE ){ 115 | rt_cache[[domain]] <- request 116 | } else { 117 | rt_cache[[domain]] <- NULL 118 | } 119 | 120 | 121 | # collect info and return 122 | rtxt <- res$rtxt 123 | 124 | attributes(rtxt) <- 125 | list( 126 | problems = res$problems, 127 | cached = request$rt_from_cache, 128 | request = request 129 | ) 130 | 131 | class(rtxt) <- 132 | c("robotstxt_text", "character") 133 | 134 | return(rtxt) 135 | } 136 | 137 | -------------------------------------------------------------------------------- /R/robotstxt.R: -------------------------------------------------------------------------------- 1 | #' Generate a representation of a robots.txt file 2 | #' 3 | #' The function generates a list that contains the data resulting from parsing a robots.txt file 4 | #' as well as a function called check() that allows asking the representation whether a bot (or 5 | #' a particular set of bots) is allowed to access a resource on the domain. 6 | #' 7 | #' @param domain Domain for which to generate a representation. If text is NULL, 8 | #' the function will download the file from the server - the default. 9 | #' 10 | #' @param text If automatic download of the robots.txt is not preferred, the text can be 11 | #' supplied directly. 12 | #' @inheritParams get_robotstxt 13 | #' 14 | #' @export 15 | #' 16 | #' 17 | #' @return Object (list) of class robotstxt with parsed data from a 18 | #' robots.txt (domain, text, bots, permissions, host, sitemap, other) and one 19 | #' function, check(), to check resource permissions. 20 | #' 21 | #' @field domain character vector holding domain name for which the robots.txt 22 | #' file is valid; will be set to NA if not supplied on initialization 23 | #' 24 | #' @field text character vector of text of robots.txt file; either supplied on 25 | #' initialization or automatically downloaded from domain supplied on 26 | #' initialization 27 | #' 28 | #' @field bots character vector of bot names mentioned in robots.txt 29 | #' 30 | #' @field permissions data.frame of bot permissions found in robots.txt file 31 | #' 32 | #' @field host data.frame of host fields found in robots.txt file 33 | #' 34 | #' @field sitemap data.frame of sitemap fields found in robots.txt file 35 | #' 36 | #' @field other data.frame of other - none of the above - fields found in 37 | #' robots.txt file 38 | #' 39 | #' 40 | #' 41 | #' @field check() Method to check for bot permissions. Defaults to the 42 | #' domain's root and no bot in particular. check() has two arguments: 43 | #' paths and bot. The first is for supplying the paths for which to check 44 | #' permissions and the latter for supplying the name of the bot. 45 | #' Please note that path to a folder should end with a trailing slash ("/"). 
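#' A minimal offline sketch, assuming \code{txt} already holds the text of a
#' robots.txt file (the variable name is illustrative):
#' \code{rt <- robotstxt(text = txt)} followed by
#' \code{rt$check(paths = c("/", "/images/"), bot = "*")}.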
46 | #' 47 | #' 48 | #' @examples 49 | #' \dontrun{ 50 | #' rt <- robotstxt(domain="google.com") 51 | #' rt$bots 52 | #' rt$permissions 53 | #' rt$check( paths = c("/", "forbidden"), bot="*") 54 | #' } 55 | #' 56 | robotstxt <- 57 | function( 58 | domain = NULL, 59 | text = NULL, 60 | user_agent = NULL, 61 | warn = getOption("robotstxt_warn", TRUE), 62 | force = FALSE, 63 | ssl_verifypeer = c(1,0), 64 | encoding = "UTF-8", 65 | verbose = FALSE, 66 | on_server_error = on_server_error_default, 67 | on_client_error = on_client_error_default, 68 | on_not_found = on_not_found_default, 69 | on_redirect = on_redirect_default, 70 | on_domain_change = on_domain_change_default, 71 | on_file_type_mismatch = on_file_type_mismatch_default, 72 | on_suspect_content = on_suspect_content_default 73 | ) { 74 | 75 | ## check input 76 | self <- list() 77 | 78 | if (is.null(domain)) { 79 | self$domain <- NA 80 | } 81 | 82 | if (!is.null(text)){ 83 | 84 | self$text <- text 85 | if(!is.null(domain)){ 86 | self$domain <- domain 87 | } 88 | 89 | }else{ 90 | 91 | if(!is.null(domain)){ 92 | 93 | self$domain <- domain 94 | self$text <- 95 | get_robotstxt( 96 | domain = domain, 97 | warn = warn, 98 | force = force, 99 | user_agent = user_agent, 100 | ssl_verifypeer = ssl_verifypeer, 101 | encoding = encoding, 102 | verbose = verbose, 103 | on_server_error = on_server_error , 104 | on_client_error = on_client_error , 105 | on_not_found = on_not_found , 106 | on_redirect = on_redirect , 107 | on_domain_change = on_domain_change , 108 | on_file_type_mismatch = on_file_type_mismatch , 109 | on_suspect_content = on_suspect_content 110 | ) 111 | 112 | }else{ 113 | 114 | stop("robotstxt: I need text to initialize.") 115 | 116 | } 117 | } 118 | 119 | ## fill fields with default data 120 | 121 | tmp <- parse_robotstxt(self$text) 122 | self$robexclobj <- spiderbar::robxp(self$text) 123 | self$bots <- tmp$useragents 124 | self$comments <- tmp$comments 125 | self$permissions <- tmp$permissions 126 | self$crawl_delay <- tmp$crawl_delay 127 | self$host <- tmp$host 128 | self$sitemap <- tmp$sitemap 129 | self$other <- tmp$other 130 | 131 | self$check <- 132 | function(paths="/", bot="*"){ 133 | spiderbar::can_fetch( 134 | obj = self$robexclobj, 135 | path = paths, 136 | user_agent = bot 137 | ) 138 | } 139 | 140 | # return 141 | class(self) <- "robotstxt" 142 | return(self) 143 | } 144 | 145 | 146 | -------------------------------------------------------------------------------- /vignettes/style.css: -------------------------------------------------------------------------------- 1 | code{ 2 | white-space: pre; 3 | } 4 | 5 | a.sourceLine { 6 | display: inline-block; min-height: 1.25em; 7 | } 8 | 9 | a.sourceLine { 10 | pointer-events: none; color: inherit; text-decoration: inherit; 11 | } 12 | 13 | .sourceCode { 14 | overflow: visible; 15 | 16 | } 17 | 18 | code.sourceCode { 19 | white-space: pre; 20 | 21 | } 22 | 23 | @media print { 24 | code.sourceCode { white-space: pre-wrap; } 25 | a.sourceLine { 26 | text-indent: -1em; padding-left: 1em; 27 | } 28 | } 29 | 30 | pre.numberSource a.sourceLine { 31 | position: relative; 32 | } 33 | 34 | pre.numberSource a.sourceLine::before 35 | { 36 | content: attr(data-line-number); 37 | position: absolute; left: -5em; text-align: right; vertical-align: baseline; 38 | border: none; pointer-events: all; 39 | -webkit-touch-callout: none; -webkit-user-select: none; 40 | -khtml-user-select: none; -moz-user-select: none; 41 | -ms-user-select: none; user-select: none; 42 | padding: 0 4px; width: 4em; 
43 | color: #aaaaaa; 44 | } 45 | 46 | pre.numberSource { 47 | margin-left: 3em; 48 | border-left: 1px solid #aaaaaa; 49 | padding-left: 4px; 50 | } 51 | 52 | @media screen { 53 | a.sourceLine::before { 54 | text-decoration: underline; 55 | color: initial; 56 | } 57 | } 58 | 59 | code span.kw { color: #007020; font-weight: bold; } /* Keyword */ 60 | code span.dt { color: #902000; } /* DataType */ 61 | code span.dv { color: #40a070; } /* DecVal */ 62 | code span.bn { color: #40a070; } /* BaseN */ 63 | code span.fl { color: #40a070; } /* Float */ 64 | code span.ch { color: #4070a0; } /* Char */ 65 | code span.st { color: #4070a0; } /* String */ 66 | code span.co { color: #60a0b0; font-style: italic; } /* Comment */ 67 | code span.ot { color: #007020; } /* Other */ 68 | code span.al { color: #ff0000; font-weight: bold; } /* Alert */ 69 | code span.fu { color: #06287e; } /* Function */ 70 | code span.er { color: #ff0000; font-weight: bold; } /* Error */ 71 | code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */ 72 | code span.cn { color: #880000; } /* Constant */ 73 | code span.sc { color: #4070a0; } /* SpecialChar */ 74 | code span.vs { color: #4070a0; } /* VerbatimString */ 75 | code span.ss { color: #bb6688; } /* SpecialString */ 76 | code span.im { } /* Import */ 77 | code span.va { color: #19177c; } /* Variable */ 78 | code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ 79 | code span.op { color: #666666; } /* Operator */ 80 | code span.bu { } /* BuiltIn */ 81 | code span.ex { } /* Extension */ 82 | code span.pp { color: #bc7a00; } /* Preprocessor */ 83 | code span.at { color: #7d9029; } /* Attribute */ 84 | code span.do { color: #ba2121; font-style: italic; } /* Documentation */ 85 | code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ 86 | code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ 87 | code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ 88 | 89 | 90 | 91 | body { 92 | background-color: #fff; 93 | margin: 1em auto; 94 | max-width: 700px; 95 | overflow: visible; 96 | padding-left: 2em; 97 | padding-right: 2em; 98 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; 99 | font-size: 90%; 100 | line-height: 1.8; 101 | } 102 | 103 | #header { 104 | text-align: center; 105 | } 106 | 107 | #TOC { 108 | width: 100%; 109 | clear: both; 110 | margin-top: 20px; 111 | margin-bottom: 20px; 112 | padding: 4px; 113 | border: 1px solid #CCCCCC; 114 | border-radius: 5px; 115 | background-color: #f6f6f6; 116 | } 117 | 118 | #TOC .toctitle { 119 | font-weight: bold; 120 | font-size: 80%; 121 | margin-left: 5px; 122 | } 123 | 124 | #TOC ul { 125 | padding-left: 40px; 126 | margin-left: -1.5em; 127 | margin-top: 5px; 128 | margin-bottom: 5px; 129 | } 130 | #TOC ul ul { 131 | margin-left: -2em; 132 | } 133 | #TOC li { 134 | line-height: 16px; 135 | } 136 | 137 | table { 138 | margin: 1em auto; 139 | border-width: 1px; 140 | border-color: #DDDDDD; 141 | border-style: outset; 142 | border-collapse: collapse; 143 | } 144 | 145 | table th { 146 | border-width: 2px; 147 | padding: 5px; 148 | border-style: inset; 149 | } 150 | 151 | table td { 152 | border-width: 1px; 153 | border-style: inset; 154 | line-height: 18px; 155 | padding: 5px 5px; 156 | } 157 | 158 | table, table th, table td { 159 | border-left-style: none; 160 | border-right-style: none; 161 | } 162 | 163 | table thead, table tr.even { 164 | background-color: 
#f7f7f7; 165 | } 166 | 167 | p { 168 | margin-top: 1.5em; 169 | margin-bottom: 1.5em; 170 | } 171 | 172 | blockquote { 173 | background-color: #f6f6f6; 174 | padding: 0.25em 0.75em; 175 | } 176 | 177 | hr { 178 | border-style: solid; 179 | border: none; 180 | border-top: 1px solid #777; 181 | margin: 28px 0; 182 | } 183 | 184 | dl { 185 | margin-left: 0; 186 | } 187 | 188 | dl dd { 189 | margin-bottom: 13px; 190 | margin-left: 13px; 191 | } 192 | 193 | dl dt { 194 | font-weight: bold; 195 | } 196 | 197 | ul { 198 | margin-top: 0; 199 | } 200 | ul li { 201 | list-style: circle outside; 202 | } 203 | ul ul { 204 | margin-bottom: 0; 205 | } 206 | pre, code { 207 | background-color: #f7f7f7; 208 | border-radius: 3px; 209 | color: #333; 210 | white-space: pre-wrap; 211 | } 212 | pre { 213 | border-radius: 3px; 214 | margin: 5px 0px 10px 0px; 215 | padding: 10px; 216 | } 217 | pre:not([class]) { 218 | background-color: #f7f7f7; 219 | } 220 | code { 221 | font-family: Consolas, Monaco, 'Courier New', monospace; 222 | font-size: 85%; 223 | } 224 | p > code, li > code { 225 | padding: 2px 0px; 226 | } 227 | div.figure { 228 | text-align: center; 229 | } 230 | img { 231 | background-color: #FFFFFF; 232 | padding: 2px; 233 | border: 1px solid #DDDDDD; 234 | border-radius: 3px; 235 | border: 1px solid #CCCCCC; 236 | margin: 0 5px; 237 | } 238 | 239 | h1,h2,h3,h4,h5,h6 { 240 | margin-top: 1.5em; 241 | } 242 | 243 | a { 244 | color: #0033dd; 245 | text-decoration: none; 246 | } 247 | 248 | a:hover { 249 | color: #6666ff; } 250 | 251 | a:visited { 252 | color: #800080; } 253 | 254 | a:visited:hover { 255 | color: #BB00BB; } 256 | 257 | 258 | 259 | code > span.kw { color: #555; font-weight: bold; } 260 | code > span.dt { color: #902000; } 261 | code > span.dv { color: #40a070; } 262 | code > span.bn { color: #d14; } 263 | code > span.fl { color: #d14; } 264 | code > span.ch { color: #d14; } 265 | code > span.st { color: #d14; } 266 | code > span.co { color: #888888; font-style: italic; } 267 | code > span.ot { color: #007020; } 268 | code > span.al { color: #ff0000; font-weight: bold; } 269 | code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } 270 | -------------------------------------------------------------------------------- /tests/testthat/test_http_event_handling.R: -------------------------------------------------------------------------------- 1 | test_that("www redirects are handled silently", { 2 | expect_true({ 3 | request <- readRDS(system.file("http_requests/http_redirect_www.rds", package = "robotstxt")) 4 | rt <- 5 | get_robotstxt( 6 | "www.petermeissner.de", 7 | rt_robotstxt_http_getter = function(...){request}, 8 | warn = FALSE 9 | ) 10 | 11 | !("on_domain_change" %in% names(attr(rt, "problems"))) && 12 | !("on_subdomain_change" %in% names(attr(rt, "problems"))) 13 | }) 14 | }) 15 | 16 | 17 | test_that("on_redirect detected", { 18 | expect_true({ 19 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) 20 | rt <- 21 | suppressMessages( 22 | get_robotstxt( 23 | "http://google.com", 24 | rt_robotstxt_http_getter = function(...){domain_change}, 25 | warn = FALSE 26 | )) 27 | "on_redirect" %in% names(attr(rt, "problems")) 28 | }) 29 | }) 30 | 31 | test_that("on_domain_change_detected", { 32 | expect_true({ 33 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) 34 | rt <- 35 | suppressMessages( 36 | get_robotstxt( 37 | "github.io", 38 
| rt_robotstxt_http_getter = function(...){domain_change}, 39 | warn = FALSE 40 | )) 41 | "on_domain_change" %in% names(attr(rt, "problems")) 42 | }) 43 | }) 44 | 45 | 46 | test_that("non www redirects are handled non silently", { 47 | expect_snapshot({ 48 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) 49 | suppressMessages( 50 | get_robotstxt( 51 | "http://google.com", 52 | rt_robotstxt_http_getter = function(...){domain_change}, 53 | warn = TRUE 54 | )) 55 | }) 56 | }) 57 | 58 | 59 | test_that("warn = FALSE does silences warnings", { 60 | expect_silent({ 61 | domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) 62 | suppressMessages( 63 | get_robotstxt( 64 | "github.io", 65 | rt_robotstxt_http_getter = function(...){domain_change}, 66 | warn = FALSE 67 | )) 68 | }) 69 | }) 70 | 71 | 72 | test_that("suspect content", { 73 | expect_true({ 74 | suppressWarnings({ 75 | suspect_content <- readRDS(system.file("http_requests/http_html_content.rds", package = "robotstxt")) 76 | rtxt <- 77 | suppressMessages( 78 | get_robotstxt( 79 | "pages.github.com", 80 | rt_robotstxt_http_getter = function(...){suspect_content} 81 | )) 82 | problems <- attr(rtxt, "problems") 83 | }) 84 | 85 | !is.null(problems$on_file_type_mismatch) & problems$on_suspect_content$content_suspect 86 | }) 87 | }) 88 | 89 | 90 | test_that("all ok", { 91 | expect_silent({ 92 | http_ok <- readRDS(system.file("http_requests/http_ok_1.rds", package = "robotstxt")) 93 | suppressMessages(get_robotstxt( 94 | "google.com", 95 | rt_robotstxt_http_getter = function(...){http_ok} 96 | )) 97 | }) 98 | 99 | expect_silent({ 100 | http_ok <- readRDS(system.file("http_requests/http_ok_2.rds", package = "robotstxt")) 101 | suppressMessages( 102 | get_robotstxt( 103 | "google.com", 104 | rt_robotstxt_http_getter = function(...){http_ok} 105 | )) 106 | }) 107 | 108 | expect_silent({ 109 | http_ok <- readRDS(system.file("http_requests/http_ok_3.rds", package = "robotstxt")) 110 | suppressMessages( 111 | get_robotstxt( 112 | "google.com", 113 | rt_robotstxt_http_getter = function(...){http_ok} 114 | )) 115 | }) 116 | 117 | expect_silent({ 118 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 119 | suppressMessages( 120 | get_robotstxt( 121 | "google.com" 122 | )) 123 | } 124 | }) 125 | 126 | 127 | expect_silent({ 128 | if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ 129 | suppressMessages( 130 | get_robotstxt( 131 | "google.com", 132 | force = TRUE 133 | )) 134 | } 135 | }) 136 | }) 137 | 138 | 139 | test_that("client error", { 140 | expect_snapshot({ 141 | http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) 142 | suppressMessages( 143 | get_robotstxt( 144 | "httpbin.org", 145 | rt_robotstxt_http_getter = function(...){http_client_error} 146 | ) 147 | ) 148 | }) 149 | 150 | expect_true({ 151 | http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) 152 | res <- 153 | suppressMessages( 154 | get_robotstxt( 155 | "httpbin.org", 156 | rt_robotstxt_http_getter = function(...){http_client_error}, 157 | warn = FALSE 158 | )) 159 | problems <- attr(res, "problems") 160 | problems$on_client_error$status_code == 400 161 | }) 162 | 163 | expect_true({ 164 | http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) 165 | res <- 166 | suppressMessages( 167 | paths_allowed( 168 | paths = 
c("", "/", "here/I/stand/chopping/lops"), 169 | domain = "httpbin.org", 170 | rt_robotstxt_http_getter = function(...){http_client_error}, 171 | warn = FALSE 172 | )) 173 | all(res) 174 | }) 175 | }) 176 | 177 | 178 | test_that("server error", { 179 | http_server_error <- readRDS(system.file("http_requests/http_server_error.rds", package = "robotstxt")) 180 | f <- function(...){http_server_error} 181 | 182 | expect_error({ 183 | rt <- 184 | suppressMessages( 185 | get_robotstxt( 186 | "httpbin.org", 187 | rt_robotstxt_http_getter = f, 188 | warn = FALSE, 189 | force = TRUE 190 | )) 191 | }) 192 | 193 | expect_snapshot({ 194 | res <- 195 | suppressMessages( 196 | get_robotstxt( 197 | "httpbin.org", 198 | rt_robotstxt_http_getter = f, 199 | on_server_error = list(signal = "warning"), 200 | force = TRUE 201 | )) 202 | }) 203 | 204 | expect_true({ 205 | res <- 206 | suppressMessages( 207 | paths_allowed( 208 | paths = c("", "/", "here/I/stand/chopping/lops"), 209 | domain = "httpbin.org", 210 | rt_robotstxt_http_getter = f, 211 | on_server_error = list(signal = "nothing"), 212 | warn = FALSE, 213 | force = TRUE 214 | )) 215 | all(!res) 216 | }) 217 | }) 218 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_facebook.txt: -------------------------------------------------------------------------------- 1 | # Notice: Crawling Facebook is prohibited unless you have express written 2 | # permission. See: http://www.facebook.com/apps/site_scraping_tos_terms.php 3 | 4 | User-agent: Applebot 5 | Disallow: /ajax/ 6 | Disallow: /album.php 7 | Disallow: /checkpoint/ 8 | Disallow: /contact_importer/ 9 | Disallow: /feeds/ 10 | Disallow: /file_download.php 11 | Disallow: /hashtag/ 12 | Disallow: /l.php 13 | Disallow: /live/ 14 | Disallow: /moments_app/ 15 | Disallow: /p.php 16 | Disallow: /photo.php 17 | Disallow: /photos.php 18 | Disallow: /sharer/ 19 | 20 | User-agent: baiduspider 21 | Disallow: /ajax/ 22 | Disallow: /album.php 23 | Disallow: /checkpoint/ 24 | Disallow: /contact_importer/ 25 | Disallow: /feeds/ 26 | Disallow: /file_download.php 27 | Disallow: /hashtag/ 28 | Disallow: /l.php 29 | Disallow: /live/ 30 | Disallow: /moments_app/ 31 | Disallow: /p.php 32 | Disallow: /photo.php 33 | Disallow: /photos.php 34 | Disallow: /sharer/ 35 | 36 | User-agent: Bingbot 37 | Disallow: /ajax/ 38 | Disallow: /album.php 39 | Disallow: /checkpoint/ 40 | Disallow: /contact_importer/ 41 | Disallow: /feeds/ 42 | Disallow: /file_download.php 43 | Disallow: /hashtag/ 44 | Disallow: /l.php 45 | Disallow: /live/ 46 | Disallow: /moments_app/ 47 | Disallow: /p.php 48 | Disallow: /photo.php 49 | Disallow: /photos.php 50 | Disallow: /sharer/ 51 | 52 | User-agent: Googlebot 53 | Disallow: /ajax/ 54 | Disallow: /album.php 55 | Disallow: /checkpoint/ 56 | Disallow: /contact_importer/ 57 | Disallow: /feeds/ 58 | Disallow: /file_download.php 59 | Disallow: /hashtag/ 60 | Disallow: /l.php 61 | Disallow: /live/ 62 | Disallow: /moments_app/ 63 | Disallow: /p.php 64 | Disallow: /photo.php 65 | Disallow: /photos.php 66 | Disallow: /sharer/ 67 | 68 | User-agent: ia_archiver 69 | Disallow: / 70 | Disallow: /ajax/ 71 | Disallow: /album.php 72 | Disallow: /checkpoint/ 73 | Disallow: /contact_importer/ 74 | Disallow: /feeds/ 75 | Disallow: /file_download.php 76 | Disallow: /hashtag/ 77 | Disallow: /l.php 78 | Disallow: /live/ 79 | Disallow: /moments_app/ 80 | Disallow: /p.php 81 | Disallow: /photo.php 82 | Disallow: /photos.php 83 | Disallow: /sharer/ 84 | 85 | User-agent: msnbot 86 
| Disallow: /ajax/ 87 | Disallow: /album.php 88 | Disallow: /checkpoint/ 89 | Disallow: /contact_importer/ 90 | Disallow: /feeds/ 91 | Disallow: /file_download.php 92 | Disallow: /hashtag/ 93 | Disallow: /l.php 94 | Disallow: /live/ 95 | Disallow: /moments_app/ 96 | Disallow: /p.php 97 | Disallow: /photo.php 98 | Disallow: /photos.php 99 | Disallow: /sharer/ 100 | 101 | User-agent: Naverbot 102 | Disallow: /ajax/ 103 | Disallow: /album.php 104 | Disallow: /checkpoint/ 105 | Disallow: /contact_importer/ 106 | Disallow: /feeds/ 107 | Disallow: /file_download.php 108 | Disallow: /hashtag/ 109 | Disallow: /l.php 110 | Disallow: /live/ 111 | Disallow: /moments_app/ 112 | Disallow: /p.php 113 | Disallow: /photo.php 114 | Disallow: /photos.php 115 | Disallow: /sharer/ 116 | 117 | User-agent: seznambot 118 | Disallow: /ajax/ 119 | Disallow: /album.php 120 | Disallow: /checkpoint/ 121 | Disallow: /contact_importer/ 122 | Disallow: /feeds/ 123 | Disallow: /file_download.php 124 | Disallow: /hashtag/ 125 | Disallow: /l.php 126 | Disallow: /live/ 127 | Disallow: /moments_app/ 128 | Disallow: /p.php 129 | Disallow: /photo.php 130 | Disallow: /photos.php 131 | Disallow: /sharer/ 132 | 133 | User-agent: Slurp 134 | Disallow: /ajax/ 135 | Disallow: /album.php 136 | Disallow: /checkpoint/ 137 | Disallow: /contact_importer/ 138 | Disallow: /feeds/ 139 | Disallow: /file_download.php 140 | Disallow: /hashtag/ 141 | Disallow: /l.php 142 | Disallow: /live/ 143 | Disallow: /moments_app/ 144 | Disallow: /p.php 145 | Disallow: /photo.php 146 | Disallow: /photos.php 147 | Disallow: /sharer/ 148 | 149 | User-agent: teoma 150 | Disallow: /ajax/ 151 | Disallow: /album.php 152 | Disallow: /checkpoint/ 153 | Disallow: /contact_importer/ 154 | Disallow: /feeds/ 155 | Disallow: /file_download.php 156 | Disallow: /hashtag/ 157 | Disallow: /l.php 158 | Disallow: /live/ 159 | Disallow: /moments_app/ 160 | Disallow: /p.php 161 | Disallow: /photo.php 162 | Disallow: /photos.php 163 | Disallow: /sharer/ 164 | 165 | User-agent: Twitterbot 166 | Disallow: /ajax/ 167 | Disallow: /album.php 168 | Disallow: /checkpoint/ 169 | Disallow: /contact_importer/ 170 | Disallow: /feeds/ 171 | Disallow: /file_download.php 172 | Disallow: /hashtag/ 173 | Disallow: /l.php 174 | Disallow: /live/ 175 | Disallow: /moments_app/ 176 | Disallow: /p.php 177 | Disallow: /photo.php 178 | Disallow: /photos.php 179 | Disallow: /sharer/ 180 | 181 | User-agent: Yandex 182 | Disallow: /ajax/ 183 | Disallow: /album.php 184 | Disallow: /checkpoint/ 185 | Disallow: /contact_importer/ 186 | Disallow: /feeds/ 187 | Disallow: /file_download.php 188 | Disallow: /hashtag/ 189 | Disallow: /l.php 190 | Disallow: /live/ 191 | Disallow: /moments_app/ 192 | Disallow: /p.php 193 | Disallow: /photo.php 194 | Disallow: /photos.php 195 | Disallow: /sharer/ 196 | 197 | User-agent: Yeti 198 | Disallow: /ajax/ 199 | Disallow: /album.php 200 | Disallow: /checkpoint/ 201 | Disallow: /contact_importer/ 202 | Disallow: /feeds/ 203 | Disallow: /file_download.php 204 | Disallow: /hashtag/ 205 | Disallow: /l.php 206 | Disallow: /live/ 207 | Disallow: /moments_app/ 208 | Disallow: /p.php 209 | Disallow: /photo.php 210 | Disallow: /photos.php 211 | Disallow: /sharer/ 212 | 213 | User-agent: Applebot 214 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 215 | Allow: /safetycheck/ 216 | 217 | User-agent: baiduspider 218 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 219 | Allow: /safetycheck/ 220 | 221 | User-agent: Bingbot 222 | Allow: 
/ajax/pagelet/generic.php/PagePostsSectionPagelet 223 | Allow: /safetycheck/ 224 | 225 | User-agent: Googlebot 226 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 227 | Allow: /safetycheck/ 228 | 229 | User-agent: ia_archiver 230 | Allow: /about/privacy 231 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 232 | Allow: /full_data_use_policy 233 | Allow: /legal/terms 234 | Allow: /policy.php 235 | Allow: /safetycheck/ 236 | 237 | User-agent: msnbot 238 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 239 | Allow: /safetycheck/ 240 | 241 | User-agent: Naverbot 242 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 243 | Allow: /safetycheck/ 244 | 245 | User-agent: seznambot 246 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 247 | Allow: /safetycheck/ 248 | 249 | User-agent: Slurp 250 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 251 | Allow: /safetycheck/ 252 | 253 | User-agent: teoma 254 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 255 | Allow: /safetycheck/ 256 | 257 | User-agent: Twitterbot 258 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 259 | Allow: /safetycheck/ 260 | 261 | User-agent: Yandex 262 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 263 | Allow: /safetycheck/ 264 | 265 | User-agent: Yeti 266 | Allow: /ajax/pagelet/generic.php/PagePostsSectionPagelet 267 | Allow: /safetycheck/ 268 | 269 | User-agent: * 270 | Disallow: / 271 | -------------------------------------------------------------------------------- /tests/testthat/test_parser.R: -------------------------------------------------------------------------------- 1 | rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") 2 | rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") 3 | rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") 4 | rtxt_dsfa <- rt_get_rtxt("disallow_some_for_all.txt") 5 | rtxt_empty <- rt_get_rtxt("empty.txt") 6 | rtxt_datao <- rt_get_rtxt("disallow_two_at_once.txt") 7 | rtxt_tcom <- rt_get_rtxt("testing_comments.txt") 8 | rtxt_amzn <- rt_get_rtxt("robots_amazon.txt") 9 | rtxt_bt <- rt_get_rtxt("robots_bundestag.txt") 10 | rtxt_ggl <- rt_get_rtxt("robots_google.txt") 11 | rtxt_nyt <- rt_get_rtxt("robots_new_york_times.txt") 12 | rtxt_spgl <- rt_get_rtxt("robots_spiegel.txt") 13 | rtxt_yh <- rt_get_rtxt("robots_yahoo.txt") 14 | rtxt_she <- rt_get_rtxt("selfhtml_Example.txt") 15 | rtxt_pm <- rt_get_rtxt("robots_pmeissner.txt") 16 | rtxt_wp <- rt_get_rtxt("robots_wikipedia.txt") 17 | rtxt_cd <- rt_get_rtxt("crawl_delay.txt") 18 | rtxt_host <- rt_get_rtxt("host.txt") 19 | rtxt_fb_nsp <- rt_get_rtxt("robots_facebook_unsupported.txt") 20 | rtxt_cdc <- rt_get_rtxt("robots_cdc.txt") 21 | rtxt_cdc2 <- paste(rt_get_rtxt("robots_cdc2.txt"), collapse = "\r\n") 22 | rtxt_rbloggers <- rt_get_rtxt("rbloggers.txt") 23 | rtxt_ct <- rt_get_rtxt("robots_commented_token.txt") 24 | 25 | 26 | valid_rtxt_files <- c( 27 | rtxt_asb, rtxt_dafa, rtxt_dafbb, rtxt_dsfa, rtxt_empty, 28 | rtxt_datao, rtxt_tcom, rtxt_amzn, rtxt_bt, rtxt_ggl, 29 | rtxt_nyt, rtxt_spgl, rtxt_yh, rtxt_she, rtxt_pm, 30 | rtxt_wp, rtxt_cd, rtxt_host, rtxt_cdc, rtxt_ct, 31 | "\n\n\n" 32 | ) 33 | 34 | test_that("all robots.txt files are valid with check_strickt_ascii = F", { 35 | expect_true(is_valid_robotstxt(valid_rtxt_files)) 36 | }) 37 | 38 | 39 | valid_rtxt_files_ascii <- c( 40 | rtxt_asb, rtxt_dafa, rtxt_dafbb, rtxt_dsfa, rtxt_empty, 41 | rtxt_datao, rtxt_tcom, rtxt_amzn, rtxt_bt, rtxt_ggl, 42 | rtxt_nyt, rtxt_spgl, rtxt_yh, rtxt_she, rtxt_pm, 43 | 
rtxt_cd, rtxt_host, rtxt_cdc, rtxt_ct, 44 | "\n\n\n" 45 | ) 46 | 47 | test_that("all robots.txt files are valid with check_strickt_ascii = T", { 48 | expect_true( 49 | is_valid_robotstxt(valid_rtxt_files_ascii, check_strickt_ascii = TRUE) 50 | ) 51 | }) 52 | 53 | 54 | test_that("broken robots.txt files are invalid", { 55 | expect_false(is_valid_robotstxt(rtxt_fb_nsp)) 56 | 57 | expect_false( 58 | is_valid_robotstxt( 59 | " # dings\nbums\n dings" 60 | ) 61 | ) 62 | }) 63 | 64 | 65 | for (char in c(" ", "\t", "(", ")", "<", ">", "@", ",", ";", "<", ">", "/", "[", "]", "?", "=", "{", "}") ) { 66 | 67 | txt <- 68 | gsub( 69 | x = "extension<>field: some value", 70 | pattern = "<>", 71 | replacement = char 72 | ) 73 | 74 | if (is_valid_robotstxt(txt)) { 75 | cat("CHAR: ", "'", char,"'; ", sep = "") 76 | } 77 | 78 | test_that("field name has no special character", { 79 | expect_false(is_valid_robotstxt(txt)) 80 | }) 81 | } 82 | 83 | 84 | test_that("field name has no special character", { 85 | expect_false( 86 | is_valid_robotstxt("extension\\field: some value", check_strickt_ascii = TRUE) 87 | ) 88 | }) 89 | 90 | 91 | test_that("field name has no special character", { 92 | expect_false( 93 | is_valid_robotstxt("Error in curl::curl_fetch_memory(url, handle = handle) : Could not resolve host: domain.tld", check_strickt_ascii = TRUE) 94 | ) 95 | }) 96 | 97 | 98 | test_that("broken robots.txt files are invalid", { 99 | expect_false(is_valid_robotstxt(rtxt_fb_nsp, check_strickt_ascii = TRUE)) 100 | 101 | expect_false( 102 | is_valid_robotstxt( 103 | " # dings\nbums\n dings", check_strickt_ascii = TRUE 104 | ) 105 | ) 106 | }) 107 | 108 | 109 | test_that( 110 | "all user agents are extracted", { 111 | expect_true(all( parse_robotstxt(rtxt_asb )$useragents %in% c("*", "Google") )) 112 | expect_true(all( parse_robotstxt(rtxt_dafa )$useragents %in% c("*") )) 113 | expect_true(all( parse_robotstxt(rtxt_dafbb )$useragents %in% c("BadBot") )) 114 | expect_true(all( parse_robotstxt(rtxt_dsfa )$useragents %in% c("*") )) 115 | expect_true(all( length(parse_robotstxt(rtxt_empty )$useragents) == 0 )) 116 | expect_true(all( parse_robotstxt(rtxt_amzn )$useragents %in% c("EtaoSpider", "*") )) 117 | expect_true(all( parse_robotstxt(rtxt_bt )$useragents %in% c("*") )) 118 | expect_true(all( parse_robotstxt(rtxt_ggl )$useragents %in% c("*") )) 119 | expect_true(all( parse_robotstxt(rtxt_nyt )$useragents %in% c("*", "Mediapartners-Google", "AdsBot-Google", "adidxbot" ) )) 120 | expect_true(all( parse_robotstxt(rtxt_spgl )$useragents %in% c("WebReaper", "Slurp") )) 121 | expect_true(all( parse_robotstxt(rtxt_yh )$useragents %in% c("*") )) 122 | expect_true(all( parse_robotstxt(rtxt_she )$useragents %in% c("*","mein-Robot", "UniversalRobot/1.0") )) 123 | expect_true(all( parse_robotstxt(rtxt_datao )$useragents %in% c("BadBot","Googlebot") )) 124 | } 125 | ) 126 | 127 | 128 | test_that( 129 | "specification of more than one user agent gets interpreted right", { 130 | expect_true( dim(parse_robotstxt(rtxt_datao )$permissions)[1]==2) 131 | expect_true( all(parse_robotstxt(rtxt_datao )$permissions$value=="/private/")) 132 | } 133 | ) 134 | 135 | 136 | test_that( 137 | "comments get extracted right", { 138 | expect_true(dim(parse_robotstxt(rtxt_tcom )$comments)[1]==3) 139 | } 140 | ) 141 | 142 | 143 | test_that( 144 | "craw-delay gets extracted", { 145 | expect_true(parse_robotstxt(rtxt_host)$host$value=="www.whatever.com") 146 | } 147 | ) 148 | 149 | 150 | test_that( 151 | "craw-delay gets extracted", { 152 | 
expect_true(parse_robotstxt(rtxt_cd)$crawl_delay$value==10) 153 | } 154 | ) 155 | 156 | 157 | classes <- function(x){ 158 | unlist(lapply(x, class)) 159 | } 160 | 161 | 162 | test_that( 163 | "data.frames contain no factors", { 164 | expect_false( any( classes( parse_robotstxt(rtxt_datao)$useragents ) %in% "factor") ) 165 | expect_false( any( classes( parse_robotstxt(rtxt_datao)$comments ) %in% "factor") ) 166 | expect_false( any( classes( parse_robotstxt(rtxt_datao)$permissions) %in% "factor") ) 167 | expect_false( any( classes( parse_robotstxt(rtxt_datao)$sitemap ) %in% "factor") ) 168 | expect_false( any( classes( parse_robotstxt(rtxt_datao)$other ) %in% "factor") ) 169 | 170 | expect_false( any( classes( parse_robotstxt(rtxt_empty)$useragents ) %in% "factor") ) 171 | expect_false( any( classes( parse_robotstxt(rtxt_empty)$comments ) %in% "factor") ) 172 | expect_false( any( classes( parse_robotstxt(rtxt_empty)$permissions) %in% "factor") ) 173 | expect_false( any( classes( parse_robotstxt(rtxt_empty)$sitemap ) %in% "factor") ) 174 | expect_false( any( classes( parse_robotstxt(rtxt_empty)$other ) %in% "factor") ) 175 | } 176 | ) 177 | 178 | 179 | test_that( 180 | "cdc gets parsed correctly", { 181 | expect_true( 182 | nrow(parse_robotstxt(rtxt_cdc)$permissions) == 23 183 | ) 184 | 185 | expect_true( 186 | nrow(parse_robotstxt(rtxt_cdc2)$permissions) == 23 187 | ) 188 | } 189 | ) 190 | 191 | 192 | test_that( 193 | "can handle varIOUs cases for robots.txt fields - issue #55", { 194 | expect_true({ 195 | cd <- parse_robotstxt(rtxt_rbloggers)$crawl_delay 196 | sum(cd$useragent == "AhrefsBot") == 1 197 | }) 198 | } 199 | ) 200 | 201 | 202 | test_that( 203 | "Commented-out tokens get ignored", { 204 | expect_true( 205 | nrow(parse_robotstxt(rtxt_ct)$permissions) == 1 206 | ) 207 | } 208 | ) 209 | -------------------------------------------------------------------------------- /R/rt_request_handler.R: -------------------------------------------------------------------------------- 1 | #' Handle robotstxt object retrieved from HTTP request 2 | #' 3 | #' A helper function for get_robotstxt() that will extract the robots.txt file 4 | #' from the HTTP request result object. It will inform get_robotstxt() if the 5 | #' request should be cached and which problems occurred. 6 | #' 7 | #' 8 | #' 9 | #' @param request result of an HTTP request (e.g. 
httr::GET()) 10 | #' 11 | #' @param on_server_error request state handler for any 5xx status 12 | #' 13 | #' @param on_client_error request state handler for any 4xx HTTP status that is 14 | #' not 404 15 | #' 16 | #' @param on_not_found request state handler for HTTP status 404 17 | #' 18 | #' @param on_redirect request state handler for any 3xx HTTP status 19 | #' 20 | #' @param on_domain_change request state handler for any 3xx HTTP status where 21 | #' domain did change as well 22 | #' 23 | #' @param on_sub_domain_change request state handler for any 3xx HTTP status where 24 | #' domain did change but only to www-sub_domain 25 | #' 26 | #' @param on_file_type_mismatch request state handler for content type other 27 | #' than 'text/plain' 28 | #' 29 | #' @param on_suspect_content request state handler for content that seems to be 30 | #' something else than a robots.txt file (usually a JSON, XML or HTML) 31 | #' 32 | #' @param warn suppress warnings 33 | #' 34 | #' @param encoding The text encoding to assume if no encoding is provided in the 35 | #' headers of the response 36 | #' 37 | #' @return a list with three items following the following schema: \cr \code{ 38 | #' list( rtxt = "", problems = list( "redirect" = list( status_code = 301 ), 39 | #' "domain" = list(from_url = "...", to_url = "...") ) ) } 40 | #' 41 | #' @export 42 | #' 43 | #' 44 | rt_request_handler <- 45 | function( 46 | request, 47 | on_server_error = on_server_error_default, 48 | on_client_error = on_client_error_default, 49 | on_not_found = on_not_found_default, 50 | on_redirect = on_redirect_default, 51 | on_domain_change = on_domain_change_default, 52 | on_sub_domain_change = on_sub_domain_change_default, 53 | on_file_type_mismatch = on_file_type_mismatch_default, 54 | on_suspect_content = on_suspect_content_default, 55 | warn = TRUE, 56 | encoding = "UTF-8" 57 | ){ 58 | 59 | # apply options to defaults 60 | on_server_error <- list_merge(on_server_error_default, on_server_error) 61 | on_client_error <- list_merge(on_client_error_default, on_client_error) 62 | on_not_found <- list_merge(on_not_found_default, on_not_found) 63 | on_redirect <- list_merge(on_redirect_default, on_redirect) 64 | on_domain_change <- list_merge(on_domain_change_default, on_domain_change) 65 | on_file_type_mismatch <- list_merge(on_file_type_mismatch_default, on_file_type_mismatch) 66 | on_suspect_content <- list_merge(on_suspect_content_default, on_suspect_content) 67 | 68 | 69 | # storage for output 70 | res <- 71 | list( 72 | rtxt = NULL, 73 | problems = list(), 74 | cache = NULL, 75 | priority = 0 76 | ) 77 | 78 | 79 | # encoding suplied or not 80 | encoding_supplied <- 81 | grepl("charset", null_to_default(request$headers$`content-type`, "")) 82 | 83 | 84 | if ( encoding_supplied == TRUE ) { 85 | rtxt_not_handled <- 86 | httr::content( 87 | request, 88 | as = "text" 89 | ) 90 | } else { 91 | rtxt_not_handled <- 92 | httr::content( 93 | request, 94 | encoding = encoding, 95 | as = "text" 96 | ) 97 | } 98 | 99 | 100 | ## server error 101 | server_error <- 102 | request$status_code >= 500 103 | 104 | if ( server_error == TRUE ){ 105 | res <- 106 | request_handler_handler( 107 | request = request, 108 | handler = on_server_error, 109 | res = res, 110 | info = list(status_code = request$status_code), 111 | warn = warn 112 | ) 113 | } 114 | 115 | ## http 404 not found 116 | not_found <- 117 | request$status_code == 404 118 | 119 | if ( not_found == TRUE ){ 120 | res <- 121 | request_handler_handler( 122 | request = request, 123 | handler 
= on_not_found, 124 | res = res, 125 | info = list(status_code = request$status_code), 126 | warn = warn 127 | ) 128 | } 129 | 130 | 131 | ## other client error 132 | client_error <- 133 | request$status_code >= 400 & 134 | request$status_code != 404 & 135 | request$status_code < 500 136 | 137 | if ( client_error == TRUE ){ 138 | res <- 139 | request_handler_handler( 140 | request = request, 141 | handler = on_client_error, 142 | res = res, 143 | info = list(status_code = request$status_code), 144 | warn = warn 145 | ) 146 | } 147 | 148 | 149 | ## redirect 150 | redirected <- 151 | http_was_redirected(request) 152 | 153 | ## domain change 154 | domain_change <- 155 | http_domain_changed(request) 156 | 157 | ## subdomain changed to www 158 | subdomain_changed <- 159 | http_subdomain_changed(request) 160 | 161 | 162 | if ( redirected == TRUE ){ 163 | res <- 164 | request_handler_handler( 165 | request = request, 166 | handler = on_redirect, 167 | res = res, 168 | info = 169 | { 170 | tmp <- list() 171 | for ( i in seq_along(request$all_headers)){ 172 | tmp[[length(tmp)+1]] <- 173 | list( 174 | status = request$all_headers[[i]]$status, 175 | location = request$all_headers[[i]]$headers$location 176 | ) 177 | } 178 | tmp 179 | } 180 | , 181 | warn = warn 182 | ) 183 | 184 | if ( domain_change == TRUE && subdomain_changed == TRUE ){ 185 | res <- 186 | request_handler_handler( 187 | request = request, 188 | handler = on_domain_change, 189 | res = res, 190 | info = "domain change", 191 | warn = warn 192 | ) 193 | } else if ( domain_change == TRUE ) { 194 | res <- 195 | request_handler_handler( 196 | request = request, 197 | handler = on_sub_domain_change, 198 | res = res, 199 | info = "subdomain change", 200 | warn = warn 201 | ) 202 | } 203 | } 204 | 205 | 206 | ## file type mismatch 207 | file_type_mismatch <- 208 | !(grepl("text/plain", null_to_default(request$headers$`content-type`, ""))) 209 | 210 | if ( file_type_mismatch == TRUE ){ 211 | res <- 212 | request_handler_handler( 213 | request = request, 214 | handler = on_file_type_mismatch, 215 | res = res, 216 | info = list(content_type = request$headers$`content-type`), 217 | warn = warn 218 | ) 219 | } 220 | 221 | 222 | ## content suspect 223 | parsable <- is_valid_robotstxt(rtxt_not_handled) 224 | content_suspect <- is_suspect_robotstxt(rtxt_not_handled) 225 | 226 | if ( parsable == FALSE | content_suspect == TRUE ){ 227 | res <- 228 | request_handler_handler( 229 | request = request, 230 | handler = on_suspect_content, 231 | res = res, 232 | info = list(parsable = parsable, content_suspect = content_suspect), 233 | warn = warn 234 | ) 235 | } 236 | 237 | ## default robotstxt if not handled otherwise 238 | if ( is.null(res$rtxt) ){ 239 | res$rtxt <- rtxt_not_handled 240 | } 241 | 242 | 243 | # return 244 | res 245 | } 246 | -------------------------------------------------------------------------------- /inst/robotstxts/robots_google.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /search 3 | Allow: /search/about 4 | Disallow: /sdch 5 | Disallow: /groups 6 | Disallow: /catalogs 7 | Allow: /catalogs/about 8 | Allow: /catalogs/p? 9 | Disallow: /catalogues 10 | Allow: /newsalerts 11 | Disallow: /news 12 | Allow: /news/directory 13 | Disallow: /nwshp 14 | Disallow: /setnewsprefs? 15 | Disallow: /index.html? 16 | Disallow: /? 
17 | Allow: /?hl= 18 | Disallow: /?hl=*& 19 | Allow: /?hl=*&gws_rd=ssl$ 20 | Disallow: /?hl=*&*&gws_rd=ssl 21 | Allow: /?gws_rd=ssl$ 22 | Allow: /?pt1=true$ 23 | Disallow: /addurl/image? 24 | Allow: /mail/help/ 25 | Disallow: /mail/ 26 | Disallow: /pagead/ 27 | Disallow: /relpage/ 28 | Disallow: /relcontent 29 | Disallow: /imgres 30 | Disallow: /imglanding 31 | Disallow: /sbd 32 | Disallow: /keyword/ 33 | Disallow: /u/ 34 | Disallow: /univ/ 35 | Disallow: /cobrand 36 | Disallow: /custom 37 | Disallow: /advanced_group_search 38 | Disallow: /googlesite 39 | Disallow: /preferences 40 | Disallow: /setprefs 41 | Disallow: /swr 42 | Disallow: /url 43 | Disallow: /default 44 | Disallow: /m? 45 | Disallow: /m/ 46 | Allow: /m/finance 47 | Disallow: /wml? 48 | Disallow: /wml/? 49 | Disallow: /wml/search? 50 | Disallow: /xhtml? 51 | Disallow: /xhtml/? 52 | Disallow: /xhtml/search? 53 | Disallow: /xml? 54 | Disallow: /imode? 55 | Disallow: /imode/? 56 | Disallow: /imode/search? 57 | Disallow: /jsky? 58 | Disallow: /jsky/? 59 | Disallow: /jsky/search? 60 | Disallow: /pda? 61 | Disallow: /pda/? 62 | Disallow: /pda/search? 63 | Disallow: /sprint_xhtml 64 | Disallow: /sprint_wml 65 | Disallow: /pqa 66 | Disallow: /palm 67 | Disallow: /gwt/ 68 | Disallow: /purchases 69 | Disallow: /bsd? 70 | Disallow: /linux? 71 | Disallow: /mac? 72 | Disallow: /microsoft? 73 | Disallow: /unclesam? 74 | Disallow: /answers/search?q= 75 | Disallow: /local? 76 | Disallow: /local_url 77 | Disallow: /shihui? 78 | Disallow: /shihui/ 79 | Disallow: /froogle? 80 | Disallow: /products? 81 | Disallow: /froogle_ 82 | Disallow: /product_ 83 | Disallow: /products_ 84 | Disallow: /products; 85 | Disallow: /print 86 | Disallow: /books/ 87 | Disallow: /bkshp?*q=* 88 | Disallow: /books?*q=* 89 | Disallow: /books?*output=* 90 | Disallow: /books?*pg=* 91 | Disallow: /books?*jtp=* 92 | Disallow: /books?*jscmd=* 93 | Disallow: /books?*buy=* 94 | Disallow: /books?*zoom=* 95 | Allow: /books?*q=related:* 96 | Allow: /books?*q=editions:* 97 | Allow: /books?*q=subject:* 98 | Allow: /books/about 99 | Allow: /booksrightsholders 100 | Allow: /books?*zoom=1* 101 | Allow: /books?*zoom=5* 102 | Disallow: /ebooks/ 103 | Disallow: /ebooks?*q=* 104 | Disallow: /ebooks?*output=* 105 | Disallow: /ebooks?*pg=* 106 | Disallow: /ebooks?*jscmd=* 107 | Disallow: /ebooks?*buy=* 108 | Disallow: /ebooks?*zoom=* 109 | Allow: /ebooks?*q=related:* 110 | Allow: /ebooks?*q=editions:* 111 | Allow: /ebooks?*q=subject:* 112 | Allow: /ebooks?*zoom=1* 113 | Allow: /ebooks?*zoom=5* 114 | Disallow: /patents? 115 | Disallow: /patents/download/ 116 | Disallow: /patents/pdf/ 117 | Disallow: /patents/related/ 118 | Disallow: /scholar 119 | Disallow: /citations? 120 | Allow: /citations?user= 121 | Disallow: /citations?*cstart= 122 | Allow: /citations?view_op=new_profile 123 | Allow: /citations?view_op=top_venues 124 | Disallow: /complete 125 | Disallow: /s? 126 | Disallow: /sponsoredlinks 127 | Disallow: /videosearch? 128 | Disallow: /videopreview? 129 | Disallow: /videoprograminfo? 130 | Allow: /maps?*output=classic* 131 | Allow: /maps/api/js? 132 | Allow: /maps/d/ 133 | Disallow: /maps? 134 | Disallow: /mapstt? 135 | Disallow: /mapslt? 136 | Disallow: /maps/stk/ 137 | Disallow: /maps/br? 138 | Disallow: /mapabcpoi? 139 | Disallow: /maphp? 140 | Disallow: /mapprint? 141 | Disallow: /maps/api/js/ 142 | Disallow: /maps/api/staticmap? 143 | Disallow: /mld? 144 | Disallow: /staticmap? 
145 | Disallow: /places/ 146 | Allow: /places/$ 147 | Disallow: /maps/preview 148 | Disallow: /maps/place 149 | Disallow: /help/maps/streetview/partners/welcome/ 150 | Disallow: /help/maps/indoormaps/partners/ 151 | Disallow: /lochp? 152 | Disallow: /center 153 | Disallow: /ie? 154 | Disallow: /sms/demo? 155 | Disallow: /katrina? 156 | Disallow: /blogsearch? 157 | Disallow: /blogsearch/ 158 | Disallow: /blogsearch_feeds 159 | Disallow: /advanced_blog_search 160 | Disallow: /uds/ 161 | Disallow: /chart? 162 | Disallow: /transit? 163 | Disallow: /mbd? 164 | Disallow: /extern_js/ 165 | Disallow: /xjs/ 166 | Disallow: /calendar/feeds/ 167 | Disallow: /calendar/ical/ 168 | Disallow: /cl2/feeds/ 169 | Disallow: /cl2/ical/ 170 | Disallow: /coop/directory 171 | Disallow: /coop/manage 172 | Disallow: /trends? 173 | Disallow: /trends/music? 174 | Disallow: /trends/hottrends? 175 | Disallow: /trends/viz? 176 | Disallow: /trends/embed.js? 177 | Disallow: /trends/fetchComponent? 178 | Disallow: /notebook/search? 179 | Disallow: /musica 180 | Disallow: /musicad 181 | Disallow: /musicas 182 | Disallow: /musicl 183 | Disallow: /musics 184 | Disallow: /musicsearch 185 | Disallow: /musicsp 186 | Disallow: /musiclp 187 | Disallow: /browsersync 188 | Disallow: /call 189 | Disallow: /archivesearch? 190 | Disallow: /archivesearch/url 191 | Disallow: /archivesearch/advanced_search 192 | Disallow: /base/reportbadoffer 193 | Disallow: /urchin_test/ 194 | Disallow: /movies? 195 | Disallow: /codesearch? 196 | Disallow: /codesearch/feeds/search? 197 | Disallow: /wapsearch? 198 | Disallow: /safebrowsing 199 | Allow: /safebrowsing/diagnostic 200 | Allow: /safebrowsing/report_badware/ 201 | Allow: /safebrowsing/report_error/ 202 | Allow: /safebrowsing/report_phish/ 203 | Disallow: /reviews/search? 204 | Disallow: /orkut/albums 205 | Allow: /jsapi 206 | Disallow: /views? 207 | Disallow: /c/ 208 | Disallow: /cbk 209 | Allow: /cbk?output=tile&cb_client=maps_sv 210 | Disallow: /recharge/dashboard/car 211 | Disallow: /recharge/dashboard/static/ 212 | Disallow: /translate_a/ 213 | Disallow: /translate_c 214 | Disallow: /translate_f 215 | Disallow: /translate_static/ 216 | Disallow: /translate_suggestion 217 | Disallow: /profiles/me 218 | Allow: /profiles 219 | Disallow: /s2/profiles/me 220 | Allow: /s2/profiles 221 | Allow: /s2/oz 222 | Allow: /s2/photos 223 | Allow: /s2/search/social 224 | Allow: /s2/static 225 | Disallow: /s2 226 | Disallow: /transconsole/portal/ 227 | Disallow: /gcc/ 228 | Disallow: /aclk 229 | Disallow: /cse? 230 | Disallow: /cse/home 231 | Disallow: /cse/panel 232 | Disallow: /cse/manage 233 | Disallow: /tbproxy/ 234 | Disallow: /imesync/ 235 | Disallow: /shenghuo/search? 236 | Disallow: /support/forum/search? 237 | Disallow: /reviews/polls/ 238 | Disallow: /hosted/images/ 239 | Disallow: /ppob/? 240 | Disallow: /ppob? 241 | Disallow: /adwordsresellers 242 | Disallow: /accounts/ClientLogin 243 | Disallow: /accounts/ClientAuth 244 | Disallow: /accounts/o8 245 | Allow: /accounts/o8/id 246 | Disallow: /topicsearch?q= 247 | Disallow: /xfx7/ 248 | Disallow: /squared/api 249 | Disallow: /squared/search 250 | Disallow: /squared/table 251 | Disallow: /toolkit/ 252 | Allow: /toolkit/*.html 253 | Disallow: /globalmarketfinder/ 254 | Allow: /globalmarketfinder/*.html 255 | Disallow: /qnasearch? 256 | Disallow: /app/updates 257 | Disallow: /sidewiki/entry/ 258 | Disallow: /quality_form? 
259 | Disallow: /labs/popgadget/search 260 | Disallow: /buzz/post 261 | Disallow: /compressiontest/ 262 | Disallow: /analytics/reporting/ 263 | Disallow: /analytics/admin/ 264 | Disallow: /analytics/web/ 265 | Disallow: /analytics/feeds/ 266 | Disallow: /analytics/settings/ 267 | Allow: /alerts/manage 268 | Allow: /alerts/remove 269 | Disallow: /alerts/ 270 | Allow: /alerts/$ 271 | Disallow: /ads/search? 272 | Disallow: /ads/plan/action_plan? 273 | Disallow: /ads/plan/api/ 274 | Disallow: /ads/hotels/partners 275 | Disallow: /phone/compare/? 276 | Disallow: /travel/clk 277 | Disallow: /hotelfinder/rpc 278 | Disallow: /hotels/rpc 279 | Disallow: /flights/rpc 280 | Disallow: /commercesearch/services/ 281 | Disallow: /evaluation/ 282 | Disallow: /chrome/browser/mobile/tour 283 | Disallow: /compare/*/apply* 284 | Disallow: /forms/perks/ 285 | Disallow: /baraza/*/search 286 | Disallow: /baraza/*/report 287 | Disallow: /shopping/suppliers/search 288 | Disallow: /ct/ 289 | Disallow: /edu/cs4hs/ 290 | Disallow: /trustedstores/s/ 291 | Disallow: /trustedstores/tm2 292 | Disallow: /trustedstores/verify 293 | Disallow: /adwords/proposal 294 | Disallow: /shopping/product/ 295 | Disallow: /shopping/seller 296 | Disallow: /shopping/reviewer 297 | Disallow: /about/careers/apply/ 298 | Disallow: /about/careers/applications/ 299 | Disallow: /landing/signout.html 300 | Disallow: /webmasters/sitemaps/ping? 301 | Disallow: /ping? 302 | Allow: /gb/images 303 | Allow: /gb/js 304 | Disallow: /gallery/ 305 | Allow: /chromecast/setup$ 306 | Allow: /chromecast/setup/$ 307 | Sitemap: http://www.gstatic.com/culturalinstitute/sitemaps/www_google_com_culturalinstitute/sitemap-index.xml 308 | Sitemap: https://www.google.com/edu/sitemap.xml 309 | Sitemap: https://www.google.com/work/sitemap.xml 310 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 311 | Sitemap: http://www.google.com/maps/views/sitemap.xml 312 | Sitemap: http://www.google.com/sitemaps_webmasters.xml 313 | Sitemap: http://www.google.com/ventures/sitemap_ventures.xml 314 | Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml 315 | Sitemap: http://www.gstatic.com/earth/gallery/sitemaps/sitemap.xml 316 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 317 | Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml 318 | Sitemap: http://www.google.com/adwords/sitemap.xml 319 | Sitemap: http://www.google.com/drive/sitemap.xml 320 | Sitemap: http://www.google.com/docs/sitemaps.xml 321 | Sitemap: http://www.google.com/sheets/sitemaps.xml 322 | Sitemap: http://www.google.com/slides/sitemaps.xml 323 | Sitemap: http://www.google.com/forms/sitemaps.xml 324 | Sitemap: https://www.google.com/mail/sitemap.xml 325 | 326 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | NEWS robotstxt 2 | ========================================================================== 3 | 4 | 0.7.15.9000 5 | -------------------------------------------------------------------------- 6 | 7 | - `null_to_default` typo fixed 8 | - Updates to function documentation 9 | 10 | 0.7.15 | 2024-08-24 11 | -------------------------------------------------------------------------- 12 | 13 | - CRAN compliance - Packages which use Internet resources should fail gracefully 14 | - CRAN compliance - fix R CMD check NOTES. 
15 | 16 | 17 | 0.7.14 | 2024-08-24 18 | -------------------------------------------------------------------------- 19 | 20 | - CRAN compliance - Packages which use Internet resources should fail gracefully 21 | 22 | 23 | 0.7.13 | 2020-09-03 24 | -------------------------------------------------------------------------- 25 | 26 | - CRAN compliance - prevent URL forwarding (HTTP 301): add www to URLs 27 | 28 | 29 | 0.7.12 | 2020-09-03 30 | -------------------------------------------------------------------------- 31 | 32 | - CRAN compliance - prevent URL forwarding (HTTP 301): add trailing slashes to URLs 33 | 34 | 35 | 36 | 0.7.11 | 2020-09-02 37 | -------------------------------------------------------------------------- 38 | 39 | - CRAN compliance - LICENCE file wording; prevent URL forwarding (HTTP 301) 40 | 41 | 42 | 43 | 44 | 0.7.10 | 2020-08-19 45 | -------------------------------------------------------------------------- 46 | 47 | - fix problem in parse_robotstxt() - comment in last line of robots.txt file would lead to erroneous parsing - reported by @gittaca, https://github.com/ropensci/robotstxt/pull/59 and https://github.com/ropensci/robotstxt/issues/60 48 | 49 | 50 | 51 | 52 | 53 | 0.7.9 | 2020-08-02 54 | -------------------------------------------------------------------------- 55 | 56 | - fix problem in is_valid_robotstxt() - robots.txt validity check was too lax - reported by @gittaca, https://github.com/ropensci/robotstxt/issues/58 57 | 58 | 59 | 60 | 61 | 62 | 0.7.8 | 2020-07-22 63 | -------------------------------------------------------------------------- 64 | 65 | - fix problem with domain name extraction - reported by @gittaca, https://github.com/ropensci/robotstxt/issues/57 66 | - fix problem with vArYING CasE in robots.txt field names - reported by @steffilazerte, https://github.com/ropensci/robotstxt/issues/55 67 | 68 | 69 | 70 | 71 | 72 | 0.7.7 | 2020-06-17 73 | -------------------------------------------------------------------------- 74 | 75 | - fix problem in rt_request_handler - reported by @MHWauben https://github.com/dmi3kno/polite/issues/28 - patch by @dmi3kno 76 | 77 | 78 | 79 | 80 | 81 | 0.7.6 | 2020-06-13 82 | -------------------------------------------------------------------------- 83 | 84 | - make info about whether or not results were cached available - requested by @dmi3kno, https://github.com/ropensci/robotstxt/issues/53 85 | 86 | 87 | 88 | 89 | 0.7.5 | 2020-06-07 90 | -------------------------------------------------------------------------- 91 | 92 | - **fix** passing through more parameters from robotstxt() to get_robotstxt() - reported and implemented by @dmi3kno 93 | 94 | 95 | 96 | 97 | 98 | 0.7.3 | 2020-05-29 99 | -------------------------------------------------------------------------- 100 | 101 | - **minor** : improve printing of robots.txt 102 | - add request data as attribute to robots.txt 103 | - add `as.list()` method for robots.txt 104 | - adding several paragraphs to the README file 105 | - **major** : finishing handlers - quality check, documentation 106 | - **fix** : Partial matching warnings #51 - reported by @mine-cetinkaya-rundel 107 | 108 | 109 | 110 | 111 | 112 | 0.7.2 | 2020-05-04 113 | -------------------------------------------------------------------------- 114 | 115 | - **minor** : changes in dependencies were introducing errors when no scheme/protocol was provided in URL -- fixed https://github.com/ropensci/robotstxt/issues/50 116 | 117 | 118 | 119 | 120 | 121 | 0.7.1 | 2018-01-09 122 | 
-------------------------------------------------------------------------- 123 | 124 | - **minor** : modifying robots.txt parser to be more robust against different formatting of robots.txt files -- fixed https://github.com/ropensci/robotstxt/issues/48 125 | 126 | 127 | 128 | 129 | 130 | 0.7.0 | 2018-11-27 131 | -------------------------------------------------------------------------- 132 | 133 | - **major** : introducing http handler to allow for better interpretation of robots.txt files in case of certain events: redirects, server error, client error, suspicious content, ... 134 | 135 | 136 | 137 | 0.6.4 | 2018-09-14 138 | -------------------------------------------------------------------------- 139 | 140 | - **minor** : pass through of parameter for content encoding 141 | 142 | 143 | 144 | 0.6.3 | 2018-09-14 145 | -------------------------------------------------------------------------- 146 | 147 | - **minor** : introduced parameter encoding to `get_robotstxt()` that defaults to "UTF-8", which is what the content function does anyway - but now it will not complain about it 148 | - **minor** : added comment to help files specifying use of trailing slash in paths pointing to folders in `paths_allowed` and `robotstxt`. 149 | 150 | 151 | 152 | 153 | 0.6.2 | 2018-07-18 154 | -------------------------------------------------------------------------- 155 | 156 | - **minor** : changed from `future::future_lapply()` to `future.apply::future_lapply()` to make package compatible with versions of future after 1.8.1 157 | 158 | 159 | 160 | 161 | 0.6.1 | 2018-05-30 162 | -------------------------------------------------------------------------- 163 | 164 | - **minor** : package was moved to other repo location and project status badge was added 165 | 166 | 167 | 168 | 0.6.0 | 2018-02-10 169 | -------------------------------------------------------------------------- 170 | 171 | - **change/fix** check function paths_allowed() would not return correct result in some edge cases, indicating that spiderbar/rep-cpp check method is more reliable and shall be the default and only method: [see 1](https://github.com/ropensci/robotstxt/issues/22), [see 2](https://github.com/hrbrmstr/spiderbar/issues/2), [see 3](https://github.com/seomoz/rep-cpp/issues/33) 172 | 173 | 174 | 175 | 176 | 0.5.2 | 2017-11-12 177 | -------------------------------------------------------------------------- 178 | 179 | - **fix** : rt_get_rtxt() would break on Windows due to trying to readLines() from a folder 180 | 181 | 182 | 183 | 184 | 0.5.1 | 2017-11-11 185 | -------------------------------------------------------------------------- 186 | 187 | - **change** : spiderbar is now non-default second (experimental) check method 188 | - **fix** : there were warnings in case of multiple domain guessing 189 | 190 | 191 | 192 | 0.5.0 | 2017-10-07 193 | -------------------------------------------------------------------------- 194 | 195 | - **feature** : spiderbar's can_fetch() was added, now one can choose which check method to use for checking access rights 196 | - **feature** : use futures (from package future) to speed up retrieval and parsing 197 | - **feature** : now there is a `get_robotstxts()` function which is a 'vectorized' version of `get_robotstxt()` 198 | - **feature** : `paths_allowed()` now allows checking via either robotstxt parsed robots.txt files or via functionality provided by the spiderbar package (the latter should be faster by approximately a factor of 10) 199 | - **feature** : various functions now have an ssl_verifypeer option (analogous 
to CURL option https://curl.haxx.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html) which might help with robots.txt file retrieval in some cases 200 | - **change** : user_agent for robots.txt file retrieval will now default to: `sessionInfo()$R.version$version.string` 201 | - **change** : robotstxt now assumes it knows how to parse --> if it cannot parse it assumes that it got no valid robots.txt file meaning that there are no restrictions 202 | - **fix** : valid_robotstxt would not accept some actual valid robotstxt files 203 | 204 | 205 | 206 | 0.4.1 | 2017-08-20 207 | -------------------------------------------------------------------------- 208 | 209 | - **restructure** : put each function in a separate file 210 | - **fix** : parsing would go bonkers for robots.txt of cdc.gov (e.g. combining all robots with all permissions) due to erroneous handling of carriage return character (reported by @hrbrmstr - thanks) 211 | 212 | 213 | 214 | 0.4.0 | 2017-07-14 215 | -------------------------------------------------------------------------- 216 | 217 | - **user_agent** parameter **added** to robotstxt() and paths_allowed to allow for a user-defined HTTP user-agent sent when retrieving the robots.txt file from a domain 218 | 219 | 220 | 221 | 0.3.4 | 2017-07-08 222 | -------------------------------------------------------------------------- 223 | 224 | - **fix** : non robots.txt files (e.g. html files returned by server instead of the requested robots.txt / facebook.com) would be handled as if they were non-existent / empty files (reported by @simonmunzert - thanks) 225 | - **fix** : UTF-8 encoded robots.txt with BOM (byte order mark) would break parsing although files were otherwise valid robots.txt files 226 | 227 | 228 | 229 | 230 | 0.3.3 | 2016-12-10 231 | -------------------------------------------------------------------------- 232 | 233 | - updating NEWS file and switching to NEWS.md 234 | 235 | 236 | 237 | 238 | 239 | 0.3.2 | 2016-04-28 240 | -------------------------------------------------------------------------- 241 | 242 | - CRAN publication 243 | 244 | 245 | 246 | 247 | 248 | 0.3.1 | 2016-04-27 249 | -------------------------------------------------------------------------- 250 | 251 | - get_robotstxt() tests for HTTP errors and handles them, warnings might be suppressed while implausible HTTP status codes will lead to stopping the function https://github.com/ropenscilabs/robotstxt#5 252 | 253 | - dropping R6 dependency and using list implementation instead https://github.com/ropenscilabs/robotstxt#6 254 | 255 | - use caching for get_robotstxt() https://github.com/ropenscilabs/robotstxt#7 / https://github.com/ropenscilabs/robotstxt/commit/90ad735b8c2663367db6a9d5dedbad8df2bc0d23 256 | 257 | - make explicit, less error prone usage of httr::content(rtxt) https://github.com/ropenscilabs/robotstxt# 258 | 259 | - replace usage of missing for parameter check with explicit NULL as default value for parameter https://github.com/ropenscilabs/robotstxt#9 260 | 261 | - partial match useragent / useragents https://github.com/ropenscilabs/robotstxt#10 262 | 263 | - explicit declaration encoding: encoding="UTF-8" in httr::content() https://github.com/ropenscilabs/robotstxt#11 264 | 265 | 266 | 267 | 268 | 269 | version 0.1.2 // 2016-02-08 ... 
270 | -------------------------------------------------------------------------- 271 | 272 | - first feature complete version on CRAN 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /vignettes/using_robotstxt.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using Robotstxt" 3 | author: "Peter Meissner" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | css: style.css 9 | vignette: > 10 | %\VignetteIndexEntry{using_robotstxt} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | # Description 15 | 16 | The package provides a simple ‘robotstxt’ class and accompanying methods 17 | to parse and check ‘robots.txt’ files. Data fields are provided as data 18 | frames and vectors. Permissions can be checked by providing path 19 | character vectors and optional bot names. 20 | 21 | # Robots.txt files 22 | 23 | Robots.txt files are a way to kindly ask webbots, spiders, crawlers, 24 | wanderers and the like to access or not access certain parts of a 25 | webpage. The de facto ‘standard’ never made it beyond an informal 26 | [“Network Working Group INTERNET 27 | DRAFT”](http://www.robotstxt.org/norobots-rfc.txt). Nonetheless, the use 28 | of robots.txt files is widespread 29 | (e.g. , 30 | ) and bots from Google, Yahoo and the 31 | like will adhere to the rules defined in robots.txt files - although, 32 | their interpretation of those rules might differ (e.g. [rules for 33 | googlebot](https://developers.google.com/search/reference/robots_txt)). 34 | 35 | As the name of the files already suggests, robots.txt files are plain 36 | text and always found at the root of a domain. The syntax of the files 37 | in essence follows a `fieldname: value` scheme with optional preceding 38 | `user-agent: ...` lines to indicate the scope of the following rule 39 | block. Blocks are separated by blank lines and the omission of a 40 | user-agent field (which directly corresponds to the HTTP user-agent 41 | field) is seen as referring to all bots. `#` serves to comment lines and 42 | parts of lines. Everything after `#` until the end of line is regarded as a 43 | comment. Possible field names are: user-agent, disallow, allow, 44 | crawl-delay, sitemap, and host. 45 | 46 | Let us have an example file to get an idea of what a robots.txt file might 47 | look like. The file below starts with a comment line followed by a line 48 | disallowing access to any content – everything that is contained in root 49 | (“`/`”) – for all bots. The next block concerns GoodBot and NiceBot. 50 | Those two get the previous permissions lifted by being disallowed 51 | nothing. The third block is for PrettyBot. PrettyBot likes shiny stuff 52 | and therefore gets a special permission for everything contained in the 53 | “`/shinystuff/`” folder while all other restrictions still hold. In the 54 | last block all bots are asked to pause at least 5 seconds between two 55 | visits. 56 | 57 | # this is a comment 58 | # a made up example of a robots.txt file 59 | 60 | Disallow: / 61 | 62 | User-agent: GoodBot # another comment 63 | User-agent: NiceBot 64 | Disallow: 65 | 66 | User-agent: PrettyBot 67 | Allow: /shinystuff/ 68 | 69 | Crawl-Delay: 5 70 | 71 | For more information have a look at 72 | http://www.robotstxt.org/norobots-rfc.txt, where the robots.txt file 73 | ‘standard’ is described formally. Valuable introductions can be found at 74 | as well as at 75 | - of course.
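To see the rules above in action, the example file can also be handed to the package directly. The following is a minimal sketch (not evaluated in this vignette); it assumes the `text` argument of `robotstxt()` and the `check()` method, both introduced in the sections below, and `example_txt` / `rtxt_example` are made-up names used only for illustration.

    library(robotstxt)

    # rebuild the example file from above as a character string
    example_txt <- paste(
      "Disallow: /",
      "",
      "User-agent: GoodBot",
      "User-agent: NiceBot",
      "Disallow:",
      "",
      "User-agent: PrettyBot",
      "Allow: /shinystuff/",
      "",
      "Crawl-Delay: 5",
      sep = "\n"
    )

    # parse the text instead of downloading a robots.txt file from a domain
    rtxt_example <- robotstxt(text = example_txt)

    # PrettyBot should be allowed into /shinystuff/ but not into the rest
    rtxt_example$check(paths = c("/", "/shinystuff/"), bot = "PrettyBot")

Given the rules described above, the last call should report that PrettyBot may access `/shinystuff/` while the root path stays disallowed.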
76 | 77 | # Fast food usage for the uninterested 78 | 79 | library(robotstxt) 80 | paths_allowed("http://google.com/") 81 | 82 | ## [1] TRUE 83 | 84 | paths_allowed("http://google.com/search") 85 | 86 | ## [1] FALSE 87 | 88 | # Example Usage 89 | 90 | First, let us load the package. In addition we load the dplyr package to 91 | be able to use the magrittr pipe operator `%>%` and some easy to read 92 | and remember data manipulation functions. 93 | 94 | library(robotstxt) 95 | library(dplyr) 96 | 97 | ## object oriented style 98 | 99 | The first step is to create an instance of the robotstxt class provided 100 | by the package. The instance has to be initiated via providing either 101 | domain or the actual text of the robots.txt file. If only the domain is 102 | provided, the robots.txt file will be downloaded automatically. Have a 103 | look at `?robotstxt` for descriptions of all data fields and methods as 104 | well as their parameters. 105 | 106 | rtxt <- robotstxt(domain="wikipedia.org") 107 | 108 | `rtxt` is of class `robotstxt`. 109 | 110 | class(rtxt) 111 | 112 | ## [1] "robotstxt" 113 | 114 | Printing the object lets us glance at all data fields and methods in 115 | `rtxt` - we have access to the text as well as all common fields. 116 | Non-standard fields are collected in `other`. 117 | 118 | rtxt 119 | 120 | ## $text 121 | ## [1] "#\n# robots.txt for http://www.wikipedia.org/ and friends\n#\n# Please note: There are a lot of pages on this site, and there are\n# some misbehaved spiders out there that go _way_ too fast. If you're\n# irresponsible, your access to the site may be blocked.\n#\n\n# advertising-related bots:\nUser-agent: Mediapartners-Google*\n\n[... 653 lines omitted ...]" 122 | ## 123 | ## $domain 124 | ## [1] "wikipedia.org" 125 | ## 126 | ## $robexclobj 127 | ## 128 | ## $bots 129 | ## [1] "Mediapartners-Google*" "IsraBot" "Orthogaffe" "UbiCrawler" 130 | ## [5] "DOC" "Zao" "" "[... 28 items omitted ...]" 131 | ## 132 | ## $comments 133 | ## line comment 134 | ## 1 1 # 135 | ## 2 2 # robots.txt for http://www.wikipedia.org/ and friends 136 | ## 3 3 # 137 | ## 4 4 # Please note: There are a lot of pages on this site, and there are 138 | ## 5 5 # some misbehaved spiders out there that go _way_ too fast. If you're 139 | ## 6 6 # irresponsible, your access to the site may be blocked. 140 | ## 7 141 | ## 8 [... 173 items omitted ...] 142 | ## 143 | ## $permissions 144 | ## field useragent value 145 | ## 1 Disallow Mediapartners-Google* / 146 | ## 2 Disallow IsraBot 147 | ## 3 Disallow Orthogaffe 148 | ## 4 Disallow UbiCrawler / 149 | ## 5 Disallow DOC / 150 | ## 6 Disallow Zao / 151 | ## 7 152 | ## 8 [... 370 items omitted ...] 153 | ## 154 | ## $crawl_delay 155 | ## [1] field useragent value 156 | ## <0 rows> (or 0-length row.names) 157 | ## 158 | ## $host 159 | ## [1] field useragent value 160 | ## <0 rows> (or 0-length row.names) 161 | ## 162 | ## $sitemap 163 | ## [1] field useragent value 164 | ## <0 rows> (or 0-length row.names) 165 | ## 166 | ## $other 167 | ## [1] field useragent value 168 | ## <0 rows> (or 0-length row.names) 169 | ## 170 | ## $check 171 | ## function (paths = "/", bot = "*") 172 | ## { 173 | ## spiderbar::can_fetch(obj = self$robexclobj, path = paths, 174 | ## user_agent = bot) 175 | ## } 176 | ## 177 | ## 178 | ## 179 | ## attr(,"class") 180 | ## [1] "robotstxt" 181 | 182 | Checking permissions works via `rtxt`’s `check` method by providing one 183 | or more paths. If no bot name is provided `"*"` - meaning any bot - is 184 | assumed. 
185 | 186 | # checking for access permissions 187 | rtxt$check(paths = c("/","api/"), bot = "*") 188 | 189 | ## [1] TRUE FALSE 190 | 191 | rtxt$check(paths = c("/","api/"), bot = "Orthogaffe") 192 | 193 | ## [1] TRUE TRUE 194 | 195 | rtxt$check(paths = c("/","api/"), bot = "Mediapartners-Google* ") 196 | 197 | ## [1] TRUE FALSE 198 | 199 | ## functional style 200 | 201 | While working with the robotstxt class is recommended the checking can 202 | be done with functions only as well. In the following we (1) download 203 | the robots.txt file; (2) parse it and (3) check permissions. 204 | 205 | r_text <- get_robotstxt("nytimes.com") 206 | 207 | r_parsed <- parse_robotstxt(r_text) 208 | r_parsed 209 | 210 | ## $useragents 211 | ## [1] "*" "Mediapartners-Google" "AdsBot-Google" "adidxbot" 212 | ## 213 | ## $comments 214 | ## [1] line comment 215 | ## <0 rows> (or 0-length row.names) 216 | ## 217 | ## $permissions 218 | ## field useragent value 219 | ## 1 Allow * /ads/public/ 220 | ## 2 Allow * /svc/news/v3/all/pshb.rss 221 | ## 3 Disallow * /ads/ 222 | ## 4 Disallow * /adx/bin/ 223 | ## 5 Disallow * /archives/ 224 | ## 6 Disallow * /auth/ 225 | ## 7 Disallow * /cnet/ 226 | ## 8 Disallow * /college/ 227 | ## 9 Disallow * /external/ 228 | ## 10 Disallow * /financialtimes/ 229 | ## 11 Disallow * /idg/ 230 | ## 12 Disallow * /indexes/ 231 | ## 13 Disallow * /library/ 232 | ## 14 Disallow * /nytimes-partners/ 233 | ## 15 Disallow * /packages/flash/multimedia/TEMPLATES/ 234 | ## 16 Disallow * /pages/college/ 235 | ## 17 Disallow * /paidcontent/ 236 | ## 18 Disallow * /partners/ 237 | ## 19 Disallow * /restaurants/search* 238 | ## 20 Disallow * /reuters/ 239 | ## 21 Disallow * /register 240 | ## 22 Disallow * /thestreet/ 241 | ## 23 Disallow * /svc 242 | ## 24 Disallow * /video/embedded/* 243 | ## 25 Disallow * /web-services/ 244 | ## 26 Disallow * /gst/travel/travsearch* 245 | ## 27 Disallow Mediapartners-Google /restaurants/search* 246 | ## 28 Disallow AdsBot-Google /restaurants/search* 247 | ## 29 Disallow adidxbot /restaurants/search* 248 | ## 249 | ## $crawl_delay 250 | ## [1] field useragent value 251 | ## <0 rows> (or 0-length row.names) 252 | ## 253 | ## $sitemap 254 | ## field useragent value 255 | ## 1 Sitemap * http://spiderbites.nytimes.com/sitemaps/www.nytimes.com/sitemap.xml.gz 256 | ## 2 Sitemap * http://www.nytimes.com/sitemaps/sitemap_news/sitemap.xml.gz 257 | ## 3 Sitemap * http://spiderbites.nytimes.com/sitemaps/sitemap_video/sitemap.xml.gz 258 | ## 259 | ## $host 260 | ## [1] field useragent value 261 | ## <0 rows> (or 0-length row.names) 262 | ## 263 | ## $other 264 | ## [1] field useragent value 265 | ## <0 rows> (or 0-length row.names) 266 | 267 | paths_allowed( 268 | paths = c("images/","/search"), 269 | domain = c("wikipedia.org", "google.com"), 270 | bot = "Orthogaffe" 271 | ) 272 | 273 | ## wikipedia.org google.com 274 | 275 | ## [1] TRUE FALSE 276 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r, echo = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "##", 9 | fig.path = "README-" 10 | ) 11 | ``` 12 | 13 | # robotstxt Hand-drawn robot inside a hex sticker 14 | 15 | 16 | [![R-CMD-check](https://github.com/ropensci/robotstxt/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ropensci/robotstxt/actions/workflows/R-CMD-check.yaml) 17 | [![Peer 
Reviewed](https://badges.ropensci.org/25_status.svg)](https://github.com/ropensci/software-review/issues/25) 18 | [![Monthly Downloads](https://cranlogs.r-pkg.org/badges/robotstxt)](https://cran.r-project.org/web/packages/robotstxt/index.html) 19 | [![Total Downloads](https://cranlogs.r-pkg.org/badges/grand-total/robotstxt)](https://cran.r-project.org/web/packages/robotstxt/index.html) 20 | [![Cran Checks](https://badges.cranchecks.info/summary/robotstxt.svg)](https://cran.r-project.org/web/checks/check_results_robotstxt.html) 21 | [![Lifecycle: Stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable) 22 | [![Codecov test coverage](https://codecov.io/gh/ropensci/robotstxt/graph/badge.svg)](https://app.codecov.io/gh/ropensci/robotstxt) 23 | 24 | 25 | ```{r, echo=FALSE, include=FALSE} 26 | devtools::load_all() 27 | ``` 28 | 29 | ```{r, include=FALSE} 30 | options("width"=100) 31 | tmp <- packageDescription( basename(getwd()) ) 32 | ``` 33 | 34 | ```{r, results='asis', echo=FALSE} 35 | cat(paste0(tmp$Title, ". ")) 36 | cat(tmp$Description) 37 | ``` 38 | 39 | 40 | ## Installation 41 | 42 | Install from CRAN using 43 | ```{r, eval=FALSE} 44 | install.packages("robotstxt") 45 | ``` 46 | 47 | Or install the development version: 48 | ```{r, eval=FALSE} 49 | devtools::install_github("ropensci/robotstxt") 50 | ``` 51 | 52 | 53 | ## License 54 | 55 | ```{r, results='asis', echo=FALSE} 56 | cat(tmp$License, "
") 57 | cat(tmp$Author) 58 | ``` 59 | 60 | 61 | ## Citation 62 | 63 | ```{r, comment = "", eval=TRUE, echo=FALSE} 64 | citation("robotstxt") 65 | ``` 66 | 67 | 68 | ## Usage 69 | 70 | Review the [package index reference](https://docs.ropensci.org/robotstxt/reference/index.html) or use 71 | ```{r, eval=FALSE} 72 | ?robotstxt 73 | ``` 74 | for documentation. 75 | 76 | Simple path access right checking (the functional way) ... 77 | 78 | ```{r} 79 | options(robotstxt_warn = FALSE) 80 | 81 | paths_allowed( 82 | paths = c("/api/rest_v1/?doc", "/w/"), 83 | domain = "wikipedia.org", 84 | bot = "*" 85 | ) 86 | 87 | paths_allowed( 88 | paths = c( 89 | "https://wikipedia.org/api/rest_v1/?doc", 90 | "https://wikipedia.org/w/" 91 | ) 92 | ) 93 | ``` 94 | 95 | ... or (the object oriented way) ... 96 | 97 | ```{r} 98 | options(robotstxt_warn = FALSE) 99 | 100 | rtxt <- robotstxt(domain = "wikipedia.org") 101 | 102 | rtxt$check( 103 | paths = c("/api/rest_v1/?doc", "/w/"), 104 | bot = "*" 105 | ) 106 | ``` 107 | 108 | 109 | ### Retrieval 110 | 111 | Retrieving the robots.txt file for a domain: 112 | 113 | ```{r} 114 | # retrieval 115 | rt <- get_robotstxt("https://petermeissner.de") 116 | 117 | # printing 118 | rt 119 | ``` 120 | 121 | 122 | ### Interpretation 123 | 124 | Checking whether or not one is supposadly allowed to access some resource from a 125 | web server is - unfortunately - not just a matter of downloading and parsing a 126 | simple robots.txt file. 127 | 128 | First there is no official specification for robots.txt files so every robots.txt 129 | file written and every robots.txt file read and used is an interpretation. Most of 130 | the time we all have a common understanding on how things are supposed to work 131 | but things get more complicated at the edges. 132 | 133 | Some interpretation problems: 134 | 135 | - finding no robots.txt file at the server (e.g. HTTP status code 404) implies that everything is allowed 136 | - subdomains should have there own robots.txt file if not it is assumed that everything is allowed 137 | - redirects involving protocol changes - e.g. upgrading from http to https - are followed and considered no domain or subdomain change - so whatever is found at the end of the redirect is considered to be the robots.txt file for the original domain 138 | - redirects from subdomain www to the domain is considered no domain change - so whatever is found at the end of the redirect is considered to be the robots.txt file for the subdomain originally requested 139 | 140 | 141 | ### Event Handling 142 | 143 | Because the interpretation of robots.txt rules not just depends on the rules specified within the file, 144 | the package implements an event handler system that allows to interpret and re-interpret events into rules. 145 | 146 | Under the hood the `rt_request_handler()` function is called within `get_robotstxt()`. 147 | This function takes an {httr} request-response object and a set of event handlers. 148 | Processing the request and the handlers it checks for various events and states 149 | around getting the file and reading in its content. If an event/state happened 150 | the event handlers are passed on to the `request_handler_handler()` along for 151 | problem resolution and collecting robots.txt file transformations: 152 | 153 | - rule priorities decide if rules are applied given the current state priority 154 | - if rules specify signals those are emitted (e.g. 
error, message, warning) 155 | - often rules imply overwriting the raw content with a suitable interpretation given the circumstances under which the file was (or was not) retrieved 156 | 157 | 158 | Event handler rules can either consist of 4 items or can be functions - the former being the usual case and the one used throughout the package itself. 159 | Functions like `paths_allowed()` have parameters that allow passing 160 | along handler rules or handler functions. 161 | 162 | Handler rules are lists with the following items: 163 | 164 | - `over_write_file_with`: if the rule is triggered and has higher priority than those rules applied beforehand (i.e. the new priority has a higher value than the old priority) then the robots.txt file retrieved will be overwritten by this character vector 165 | - `signal`: might be `"message"`, `"warning"`, or `"error"` and will use the signal function to signal the event/state just handled. Signaling a warning or a message might be suppressed by setting the function parameter `warn = FALSE`. 166 | - `cache`: should the package be allowed to cache the results of the retrieval or not 167 | - `priority`: the priority of the rule specified as a numeric value; rules with higher priority will be allowed to overwrite robots.txt file content changed by rules with lower priority 168 | 169 | 170 | The package knows the following rules with the following defaults: 171 | 172 | - `on_server_error` : 173 | - given a server error - the server is unable to serve a file - we assume that something is terribly wrong and forbid all paths for the time being, but do not cache the result so that we might get an updated file later on 174 | 175 | ```{r} 176 | on_server_error_default 177 | ``` 178 | 179 | 180 | - `on_client_error` : 181 | - client errors encompass all HTTP 4xx status codes except 404, which is handled separately 182 | - despite the fact that there are a lot of codes that might indicate that the client has to take action (authentication, billing, ... 
see: https://de.wikipedia.org/wiki/HTTP-Statuscode) in the case of retrieving robots.txt with a simple GET request things should just work, and any client error is treated as if there is no file available and thus scraping is generally allowed 183 | 184 | ```{r} 185 | on_client_error_default 186 | ``` 187 | 188 | - `on_not_found` : 189 | - HTTP status code 404 has its own handler but is treated the same way as other client errors: as if there is no file available and thus scraping is generally allowed 190 | 191 | 192 | ```{r} 193 | on_not_found_default 194 | ``` 195 | 196 | - `on_redirect` : 197 | - redirects are ok - often redirects redirect from the HTTP scheme to HTTPS - {robotstxt} will use whatever content it has been redirected to 198 | 199 | ```{r} 200 | on_redirect_default 201 | ``` 202 | 203 | - `on_domain_change` : 204 | - domain changes are handled as if the robots.txt file did not exist and thus scraping is generally allowed 205 | 206 | ```{r} 207 | on_domain_change_default 208 | ``` 209 | 210 | - `on_file_type_mismatch` : 211 | - if {robotstxt} gets content with a content type other than text, it probably is not a robots.txt file; this situation is handled as if no file was provided and thus scraping is generally allowed 212 | 213 | ```{r} 214 | on_file_type_mismatch_default 215 | ``` 216 | 217 | - `on_suspect_content` : 218 | - if {robotstxt} cannot parse the content, it probably is not a robots.txt file; this situation is handled as if no file was provided and thus scraping is generally allowed 219 | 220 | 221 | ```{r} 222 | on_suspect_content_default 223 | ``` 224 | 225 | 226 | ### Design Map for Event/State Handling 227 | 228 | **from version 0.7.x onwards** 229 | 230 | While previous releases were concerned with implementing parsing and permission checking and improving performance, the 0.7.x release will be about robots.txt retrieval foremost. While retrieval was implemented, there are corner cases in the retrieval stage that may very well influence the interpretation of permissions granted. 231 | 232 | 233 | **Features and Problems handled:** 234 | 235 | - now handles corner cases of retrieving robots.txt files 236 | - e.g. if no robots.txt file is available this basically means "you can scrape it all" 237 | - but there are further corner cases (what if there is a server error, what if redirection takes place, what if redirection takes place to different domains, what if a file is returned but it is not parsable, or is of format HTML or JSON, ...) 238 | 239 | 240 | **Design Decisions** 241 | 242 | 1. the whole HTTP request-response-chain is checked for certain event/state types 243 | - server error 244 | - client error 245 | - file not found (404) 246 | - redirection 247 | - redirection to another domain 248 | 2. the content returned by the HTTP request is checked against 249 | - mime type / file type specification mismatch 250 | - suspicious content (file content seems to be JSON, HTML, or XML instead of robots.txt) 251 | 3. state/event handlers define how these states and events are handled 252 | 4. a handler handler (`request_handler_handler()`) executes the rules defined in individual handlers 253 | 5. handlers can be overwritten 254 | 6. handler defaults are defined such that they should always do the right thing 255 | 7. handlers can ... 256 | - overwrite the content of a robots.txt file (e.g. allow/disallow all) 257 | - modify how problems should be signaled: error, warning, message, none 258 | - decide if robots.txt file retrieval should be cached or not 259 | 8. 
problems (no matter how they were handled) are attached to the robots.txt as attributes, allowing for ... 260 | - transparency 261 | - reacting post-mortem to the problems that occurred 262 | 9. all handlers (even the actual execution of the HTTP request) can be overwritten at runtime to inject user-defined behaviour beforehand 263 | 264 | 265 | ### Warnings 266 | 267 | By default all functions retrieving robots.txt files will warn if 268 | 269 | - any HTTP events happened while retrieving the file (e.g. redirects) or 270 | - the content of the file does not seem to be a valid robots.txt file. 271 | 272 | Warnings can be turned off in several ways: 273 | 274 | ```{r, eval=FALSE} 275 | suppressWarnings({ 276 | paths_allowed("PATH_WITH_WARNING") 277 | }) 278 | ``` 279 | 280 | ```{r, eval=FALSE} 281 | paths_allowed("PATH_WITH_WARNING", warn = FALSE) 282 | ``` 283 | 284 | ```{r, eval=FALSE} 285 | options(robotstxt_warn = FALSE) 286 | paths_allowed("PATH_WITH_WARNING") 287 | ``` 288 | 289 | 290 | ### Inspection and Debugging 291 | 292 | The robots.txt files retrieved are basically mere character vectors: 293 | 294 | ```{r} 295 | rt <- get_robotstxt("petermeissner.de") 296 | 297 | as.character(rt) 298 | 299 | cat(rt) 300 | ``` 301 | 302 | The last HTTP request is stored in an object 303 | 304 | ```{r} 305 | rt_last_http$request 306 | ``` 307 | 308 | But they also have some additional information stored as attributes. 309 | 310 | ```{r} 311 | names(attributes(rt)) 312 | ``` 313 | 314 | Events that might change the interpretation of the rules found in the robots.txt file: 315 | 316 | ```{r} 317 | attr(rt, "problems") 318 | ``` 319 | 320 | The {httr} request-response object allows us to dig into what exactly was going on in the client-server exchange. 321 | 322 | ```{r} 323 | attr(rt, "request") 324 | ``` 325 | 326 | ... or lets us retrieve the original content given back by the server: 327 | 328 | ```{r} 329 | httr::content( 330 | x = attr(rt, "request"), 331 | as = "text", 332 | encoding = "UTF-8" 333 | ) 334 | ``` 335 | 336 | ... or have a look at the actual HTTP request issued and all response headers given back by the server: 337 | 338 | ```{r} 339 | # extract request-response object 340 | rt_req <- attr(rt, "request") 341 | 342 | # HTTP request 343 | rt_req$request 344 | 345 | # response headers 346 | rt_req$all_headers 347 | ``` 348 | 349 | 350 | ### Transformation 351 | 352 | For convenience the package also includes an `as.list()` method for robots.txt files. 353 | 354 | ```{r} 355 | as.list(rt) 356 | ``` 357 | 358 | 359 | ### Caching 360 | 361 | The retrieval of robots.txt files is cached on a per R-session basis. 362 | Restarting an R-session will invalidate the cache. Also, using the 363 | function parameter `force = TRUE` will force the package to re-retrieve the 364 | robots.txt file. 
365 | 366 | ```{r} 367 | paths_allowed("petermeissner.de/I_want_to_scrape_this_now", force = TRUE, verbose = TRUE) 368 | paths_allowed("petermeissner.de/I_want_to_scrape_this_now", verbose = TRUE) 369 | ``` 370 | 371 | 372 | ## More information 373 | 374 | - https://www.robotstxt.org/norobots-rfc.txt 375 | - [Have a look at the vignette at https://cran.r-project.org/package=robotstxt/vignettes/using_robotstxt.html ](https://cran.r-project.org/package=robotstxt/vignettes/using_robotstxt.html) 376 | - [Google on robots.txt](https://developers.google.com/search/reference/robots_txt?hl=en) 377 | - https://wiki.selfhtml.org/wiki/Grundlagen/Robots.txt 378 | - https://support.google.com/webmasters/answer/6062608?hl=en 379 | - https://www.robotstxt.org/robotstxt.html 380 | 381 | 382 | [![ropensci\_footer](https://ropensci.org/public_images/github_footer.png)](https://ropensci.org/) 383 | 384 | --------------------------------------------------------------------------------