├── .gitignore ├── BUILD ├── CMakeLists.txt ├── CMakeLists.txt.in ├── CONTRIBUTING.md ├── LICENSE ├── MODULE.bazel ├── README.md ├── protocol-draft └── draft-illyes-repext-00.xml ├── reporting_robots.cc ├── reporting_robots.h ├── reporting_robots_test.cc ├── robots.cc ├── robots.h ├── robots_main.cc └── robots_test.cc /.gitignore: -------------------------------------------------------------------------------- 1 | c-build/* 2 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | licenses(["notice"]) 6 | 7 | exports_files(["LICENSE"]) 8 | 9 | cc_library( 10 | name = "robots", 11 | srcs = [ 12 | "robots.cc", 13 | ], 14 | hdrs = [ 15 | "robots.h", 16 | ], 17 | deps = [ 18 | "@abseil-cpp//absl/base:core_headers", 19 | "@abseil-cpp//absl/container:fixed_array", 20 | "@abseil-cpp//absl/strings", 21 | ], 22 | ) 23 | 24 | cc_library( 25 | name = "reporting_robots", 26 | srcs = ["reporting_robots.cc"], 27 | hdrs = ["reporting_robots.h"], 28 | deps = [ 29 | ":robots", 30 | "@abseil-cpp//absl/container:btree", 31 | "@abseil-cpp//absl/strings", 32 | ], 33 | ) 34 | 35 | cc_test( 36 | name = "robots_test", 37 | srcs = ["robots_test.cc"], 38 | deps = [ 39 | ":robots", 40 | "@abseil-cpp//absl/strings", 41 | "@googletest//:gtest_main", 42 | ], 43 | ) 44 | 45 | cc_test( 46 | name = "reporting_robots_test", 47 | srcs = ["reporting_robots_test.cc"], 48 | deps = [ 49 | ":reporting_robots", 50 | ":robots", 51 | "@abseil-cpp//absl/strings", 52 | "@googletest//:gtest_main", 53 | ], 54 | ) 55 | 56 | cc_binary( 57 | name = "robots_main", 58 | srcs = ["robots_main.cc"], 59 | deps = [ 60 | ":robots", 61 | ], 62 | ) 63 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | PROJECT(robots) 2 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0) 3 | 4 | SET(CMAKE_CXX_STANDARD 14) 5 | SET(CMAKE_POSITION_INDEPENDENT_CODE ON) 6 | 7 | SET(VERSION "0.0.0") 8 | 9 | STRING(REGEX MATCHALL "([0-9]+)" VERSION_DIGITS "${VERSION}") 10 | 11 | LIST(GET VERSION_DIGITS 0 CPACK_PACKAGE_VERSION_MAJOR) 12 | LIST(GET VERSION_DIGITS 1 CPACK_PACKAGE_VERSION_MINOR) 13 | LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH) 14 | 15 | SET(CPACK_PACKAGE_NAME "robots") 16 | SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library") 17 | SET(CPACK_PACKAGE_VENDOR "Google Inc.") 18 | SET(CPACK_PACAKGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md") 19 | SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE") 20 | 21 | SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") 22 | SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") 23 | 24 | SET(base_with_ver "robots-[0-9]+\\\\.[0-9]+\\\\.[0-9]+") 25 | SET(CPACK_SOURCE_IGNORE_FILES 26 | "/_CPack_Packages/" 27 | "/CMakeFiles/" 28 | "/.deps/" 29 | "^${base_with_ver}(-Source|-Linux)?/" 30 | "${base_with_ver}.tar\\\\.(gz|bz2|Z|lzma|xz)$" 31 | "\\\\.o$" 32 | "~$" 33 | "/\\\\.svn/" 34 | "/CMakeCache\\\\.txt$" 35 | "/CTestTestfile\\\\.cmake$" 36 | "/cmake_install\\\\.cmake$" 37 | 
"/CPackConfig\\\\.cmake$" 38 | "/CPackSourceConfig\\\\.cmake$" 39 | "/tags$" 40 | "^config\\\\.h$" 41 | "/install_manifest\\\\.txt$" 42 | "/Testing/" 43 | "ids-whitelist\\\\.txt" 44 | "/_Inline/" 45 | "/(B|build|BUILD)/" 46 | "/autom4te.cache/" 47 | ) 48 | 49 | ############ build options ############## 50 | 51 | OPTION(ROBOTS_BUILD_STATIC "If ON, robots will build also the static library" ON) 52 | OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF) 53 | OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON) 54 | 55 | ############ helper libs ############ 56 | 57 | INCLUDE(CPack) 58 | INCLUDE(CheckCCompilerFlag) 59 | INCLUDE(ExternalProject) 60 | 61 | ############ dependencies ############## 62 | 63 | CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt) 64 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) 65 | 66 | IF(result) 67 | MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") 68 | ENDIF() 69 | 70 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) 71 | 72 | IF(result) 73 | MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") 74 | ENDIF() 75 | 76 | # abseil-cpp 77 | IF(MSVC) 78 | # /wd4005 macro-redefinition 79 | # /wd4068 unknown pragma 80 | # /wd4244 conversion from 'type1' to 'type2' 81 | # /wd4267 conversion from 'size_t' to 'type2' 82 | # /wd4800 force value to bool 'true' or 'false' (performance warning) 83 | ADD_COMPILE_OPTIONS(/wd4005 /wd4068 /wd4244 /wd4267 /wd4800) 84 | ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS) 85 | ENDIF(MSVC) 86 | 87 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src 88 | ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build 89 | EXCLUDE_FROM_ALL) 90 | 91 | IF(ROBOTS_BUILD_TESTS) 92 | INCLUDE(CTest) 93 | 94 | # googletest 95 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src 96 | ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build 97 | EXCLUDE_FROM_ALL) 98 | 99 | SET(INSTALL_GTEST 0) 100 | SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 101 | 102 | IF(CMAKE_VERSION VERSION_LESS 2.8.11) 103 | INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include) 104 | ENDIF() 105 | ENDIF(ROBOTS_BUILD_TESTS) 106 | 107 | ########### compiler flags ############## 108 | 109 | 110 | SET(COMPILER_FLAGS_TO_CHECK 111 | "-Wall" "-Werror=implicit-function-declaration" 112 | ) 113 | 114 | IF(CPU_ARCH) 115 | LIST(APPEND COMPILER_FLAGS_TO_CHECK "-march=${CPU_ARCH}") 116 | ENDIF(CPU_ARCH) 117 | 118 | ########### project files ############### 119 | 120 | INCLUDE_DIRECTORIES(.) 
121 | 122 | ######### targets ########### 123 | 124 | SET(LIBROBOTS_LIBS) 125 | 126 | SET(robots_SRCS ./robots.cc) 127 | SET(robots_LIBS absl::base absl::strings) 128 | 129 | ADD_LIBRARY(robots SHARED ${robots_SRCS}) 130 | TARGET_LINK_LIBRARIES(robots ${robots_LIBS}) 131 | LIST(APPEND LIBROBOTS_LIBS "robots") 132 | 133 | IF(ROBOTS_BUILD_STATIC) 134 | ADD_LIBRARY(robots-static STATIC ${robots_SRCS}) 135 | TARGET_LINK_LIBRARIES(robots-static ${robots_LIBS}) 136 | 137 | LIST(APPEND LIBROBOTS_LIBS "robots-static") 138 | 139 | SET_TARGET_PROPERTIES(robots-static PROPERTIES OUTPUT_NAME "robots") 140 | 141 | SET_TARGET_PROPERTIES(${LIBROBOTS_LIBS} PROPERTIES CLEAN_DIRECT_OUTPUT 1) 142 | ENDIF(ROBOTS_BUILD_STATIC) 143 | 144 | IF(WIN_32) 145 | SET_TARGET_PROPERTIES(robots PROPERTIES DEFINE_SYMBOL DLL_EXPORT) 146 | ENDIF(WIN_32) 147 | 148 | ADD_EXECUTABLE(robots-main ./robots_main.cc) 149 | TARGET_LINK_LIBRARIES(robots-main ${LIBROBOTS_LIBS}) 150 | SET_TARGET_PROPERTIES(robots-main PROPERTIES OUTPUT_NAME "robots") 151 | 152 | ############ installation ############ 153 | 154 | IF(ROBOTS_INSTALL) 155 | INSTALL(TARGETS ${LIBROBOTS_LIBS} 156 | LIBRARY DESTINATION lib 157 | ARCHIVE DESTINATION lib 158 | ) 159 | 160 | INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include) 161 | 162 | INSTALL(TARGETS robots-main DESTINATION bin) 163 | ENDIF(ROBOTS_INSTALL) 164 | 165 | ############ tests ############## 166 | 167 | IF(ROBOTS_BUILD_TESTS) 168 | ENABLE_TESTING() 169 | 170 | ADD_EXECUTABLE(robots-test ./robots_test.cc) 171 | TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main) 172 | ADD_TEST(NAME robots-test COMMAND robots-test) 173 | ENDIF(ROBOTS_BUILD_TESTS) 174 | 175 | -------------------------------------------------------------------------------- /CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | 2 | PROJECT(dependency-downloader NONE) 3 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0) 4 | 5 | INCLUDE(ExternalProject) 6 | 7 | ExternalProject_Add(abseilcpp 8 | GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git 9 | GIT_TAG master 10 | GIT_PROGRESS 1 11 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src" 12 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build" 13 | CONFIGURE_COMMAND "" 14 | BUILD_COMMAND "" 15 | INSTALL_COMMAND "" 16 | TEST_COMMAND "" 17 | ) 18 | 19 | ExternalProject_Add(googletest 20 | GIT_REPOSITORY https://github.com/google/googletest.git 21 | GIT_TAG main 22 | GIT_PROGRESS 1 23 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src" 24 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build" 25 | CONFIGURE_COMMAND "" 26 | BUILD_COMMAND "" 27 | INSTALL_COMMAND "" 28 | TEST_COMMAND "" 29 | ) 30 | 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 
13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Keep in mind that this library is 22 | used in Google's production systems, so we need to be very strict about what 23 | changes we accept. Consult 24 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 25 | information on using pull requests. 26 | 27 | ## Community Guidelines 28 | 29 | This project follows [Google's Open Source Community 30 | Guidelines](https://opensource.google.com/conduct/). 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 204 | -------------------------------------------------------------------------------- /MODULE.bazel: -------------------------------------------------------------------------------- 1 | module( 2 | name = "com_google_robotstxt", 3 | version = "head", 4 | ) 5 | 6 | # -- bazel_dep definitions -- # 7 | bazel_dep( 8 | name = "bazel_skylib", 9 | version = "1.5.0", 10 | ) 11 | 12 | bazel_dep( 13 | name = "abseil-cpp", 14 | version = "20240116.2", 15 | ) 16 | 17 | bazel_dep( 18 | name = "googletest", 19 | version = "1.14.0.bcr.1", 20 | ) 21 | 22 | bazel_dep( 23 | name = "rules_cc", 24 | version = "0.0.9", 25 | ) 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Google Robots.txt Parser and Matcher Library 3 | 4 | The repository contains Google's robots.txt parser and matcher as a C++ library 5 | (compliant to C++14). 6 | 7 | ## About the library 8 | 9 | The Robots Exclusion Protocol (REP) is a standard that enables website owners to 10 | control which URLs may be accessed by automated clients (i.e. crawlers) through 11 | a simple text file with a specific syntax. It's one of the basic building blocks 12 | of the internet as we know it and what allows search engines to operate. 13 | 14 | Because the REP was only a de-facto standard for the past 25 years, different 15 | implementers implement parsing of robots.txt slightly differently, leading to 16 | confusion. This project aims to fix that by releasing the parser that Google 17 | uses. 18 | 19 | The library is slightly modified (i.e. some internal headers and equivalent 20 | symbols) production code used by Googlebot, Google's crawler, to determine which 21 | URLs it may access based on rules provided by webmasters in robots.txt files. 22 | The library is released open-source to help developers build tools that better 23 | reflect Google's robots.txt parsing and matching. 24 | 25 | For webmasters, we included a small binary in the project that allows testing a 26 | single URL and user-agent against a robots.txt. 27 | 28 | ## Building the library 29 | 30 | ### Quickstart 31 | 32 | We included with the library a small binary to test a local robots.txt against a 33 | user-agent and URL. Running the included binary requires: 34 | 35 | * A compatible platform (e.g. Windows, macOS, Linux, etc.). Most platforms are 36 | fully supported. 37 | * A compatible C++ compiler supporting at least C++14. Most major compilers 38 | are supported. 39 | * [Git](https://git-scm.com/) for interacting with the source code repository. 40 | To install Git, consult the 41 | [Set Up Git](https://help.github.com/articles/set-up-git/) guide on 42 | [GitHub](https://github.com/). 43 | * Although you are free to use your own build system, most of the 44 | documentation within this guide will assume you are using 45 | [Bazel](https://bazel.build/). To download and install Bazel (and any of its 46 | dependencies), consult the 47 | [Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html) 48 | 49 | #### Building with Bazel 50 | 51 | [Bazel](https://bazel.build/) is the official build system for the library, 52 | which is supported on most major platforms (Linux, Windows, MacOS, for example) 53 | and compilers. 54 | 55 | To build and run the binary: 56 | 57 | ```bash 58 | $ git clone https://github.com/google/robotstxt.git robotstxt 59 | Cloning into 'robotstxt'... 60 | ... 
61 | $ cd robotstxt/ 62 | bazel-robots$ bazel test :robots_test 63 | ... 64 | /:robots_test PASSED in 0.1s 65 | 66 | Executed 1 out of 1 test: 1 test passes. 67 | ... 68 | bazel-robots$ bazel build :robots_main 69 | ... 70 | Target //:robots_main up-to-date: 71 | bazel-bin/robots_main 72 | ... 73 | bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url 74 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES 75 | ``` 76 | 77 | #### Building with CMake 78 | 79 | [CMake](https://cmake.org) is the community-supported build system for the 80 | library. 81 | 82 | To build the library using CMake, just follow the steps below: 83 | 84 | ```bash 85 | $ git clone https://github.com/google/robotstxt.git robotstxt 86 | Cloning into 'robotstxt'... 87 | ... 88 | $ cd robotstxt/ 89 | ... 90 | $ mkdir c-build && cd c-build 91 | ... 92 | $ cmake .. -DROBOTS_BUILD_TESTS=ON 93 | ... 94 | $ make 95 | ... 96 | $ make test 97 | Running tests... 98 | Test project robotstxt/c-build 99 | Start 1: robots-test 100 | 1/1 Test #1: robots-test ...................... Passed 0.02 sec 101 | 102 | 100% tests passed, 0 tests failed out of 1 103 | 104 | Total Test time (real) = 0.02 sec 105 | ... 106 | $ robots ~/local/path/to/robots.txt YourBot https://example.com/url 107 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES 108 | ``` 109 | 110 | ## Notes 111 | 112 | Parsing of robots.txt files themselves is done exactly as in the production 113 | version of Googlebot, including how percent codes and unicode characters in 114 | patterns are handled. The user must ensure however that the URI passed to the 115 | AllowedByRobots and OneAgentAllowedByRobots functions, or to the URI parameter 116 | of the robots tool, follows the format specified by RFC3986, since this library 117 | will not perform full normalization of those URI parameters. Only if the URI is 118 | in this format, the matching will be done according to the REP specification. 119 | 120 | Also note that the library, and the included binary, do not handle 121 | implementation logic that a crawler might apply outside of parsing and matching, 122 | for example: `Googlebot-Image` respecting the rules specified for `User-agent: 123 | Googlebot` if not explicitly defined in the robots.txt file being tested. 124 | 125 | ## License 126 | 127 | The robots.txt parser and matcher C++ library is licensed under the terms of the 128 | Apache license. See LICENSE for more information. 129 | 130 | ## Links 131 | 132 | To learn more about this project: 133 | 134 | * check out the 135 | [Robots Exclusion Protocol standard](https://www.rfc-editor.org/rfc/rfc9309.html), 136 | * how 137 | [Google Handles robots.txt](https://developers.google.com/search/reference/robots_txt), 138 | * or for a high level overview, the 139 | [robots.txt page on Wikipedia](https://en.wikipedia.org/wiki/Robots_exclusion_standard). 140 | -------------------------------------------------------------------------------- /protocol-draft/draft-illyes-repext-00.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | ]> 9 | 10 | 11 | 12 | 13 | Robots Exclusion Protocol Extension for URI Level Control 14 | 15 | 16 | Google LLC. 17 |
18 | 19 | Brandschenkestrasse 110 20 | Zurich 21 | 8002 22 | Switzerland 23 | 24 | garyillyes@google.com 25 |
26 |
27 | 28 | Internet Engineering Task Force (IETF) 29 | 30 | 31 | This document extends RFC9309 by specifying additional URI level 32 | controls through application level header and HTML meta tags 33 | originally developed in 1996. Additionally it moves the response 34 | header out of the experimental header space (i.e. "X-") and defines 35 | the combinability of multiple headers, which was previously not 36 | possible. 37 | 38 | 39 | About this Document 40 | 41 | This note is to be removed before publishing as an RFC. 42 | TODO(illyes): add commentable reference on github robotstxt repo. 43 | 44 |
45 | 46 |
47 | Introduction 48 | 49 | While the Robots Exclusion Protocol enables service owners to control 50 | how, if at all, automated clients known as crawlers may access the 51 | URIs on their services as defined by [RFC8288], the protocol doesn't 52 | provide controls on how the data returned by their service may be 53 | used upon allowed access. 54 | 55 | Originally developed in 1996 and widely adopted since, the use-case 56 | control is left to URI level controls implemented in the response 57 | headers, or in case of HTML in the form of a meta tag. This document 58 | specifies these control tags, and in case of the response header 59 | field, brings it to standards compliance with [RFC9110]. 60 | 61 | Application developers are requested to honor these tags. The tags 62 | are not a form of access authorization however. 63 |
64 | Requirements Language 65 | 66 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 67 | "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", 68 | and "OPTIONAL" in this document are to be interpreted as described 69 | in BCP 14 [RFC8174] when, 70 | and only when, they appear in all capitals, as shown here. 71 |
72 |
73 |
74 | Specification 75 |
76 | Robots control 77 | 78 | The URI level crawler controls are key-value pairs that 79 | can be specified in two ways: 80 |
    81 |
  • an application level response header.
  82 |
  • in case of HTML, one or more meta tags as defined by the 83 | HTML specification.
  84 |
85 |
86 | Application Layer Response Header 87 | 88 | The application level response header field name is "robots-tag" 89 | and contains rules applicable to either all accessors or 90 | specifically named ones in the value. For historical reasons, 91 | implementors should support the experimental field name also 92 | — "x-robots-tag". 93 | 94 | The value is a semicolon (";", 0x3B, 0x20) separated list of 95 | key-value pairs that represent a comma separated list of rules. 96 | The rules are specific to a single product token as defined by 97 | [RFC9309] or a global identifier — "*". The global identifier 98 | may be omitted. The product token is separated by a "=" from its 99 | rules. 100 | 101 | Duplicate product tokens must be merged and the rules deduplicated. 102 | 112 | 113 | For example, the following response header field specifies 114 | "noindex" and "nosnippet" rules for all accessors, however 115 | specifies no rules for the product token "ExampleBot": 116 | 119 | 120 | The global product identifier "*" in the value may be omitted; for 121 | example, this field is equivalent to the previous example: 122 | 124 | 125 | Implementors should impose a parsing limit on the field value to 126 | protect their systems. The parsing limit MUST be at least 8 127 | kibibytes [KiB]. 128 |
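The example field values referenced just above did not survive extraction, so as a rough illustration of the format this section describes, here is a minimal C++ sketch in the style of the library elsewhere in this repository. It is not part of this draft or of the robotstxt library; the function name and the sample value in the comment are made up, and it only assumes the rules stated above: entries separated by ";", an optional product token followed by "=", rules separated by ",", a global token "*" that may be omitted, and merging of duplicate tokens.

```cpp
// Illustrative sketch only; not part of draft-illyes-repext or of the
// robotstxt library. Parses a hypothetical field value such as
//   "noindex, nosnippet; examplebot="
// into a map from lower-cased product token (or "*") to its rules.
#include <map>
#include <set>
#include <string>

#include "absl/strings/ascii.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"

std::map<std::string, std::set<std::string>> ParseRobotsTagValue(
    absl::string_view field_value) {
  std::map<std::string, std::set<std::string>> rules_by_token;
  // Entries for individual product tokens are separated by ";".
  for (absl::string_view entry :
       absl::StrSplit(field_value, ';', absl::SkipWhitespace())) {
    // An entry is "token=rule,rule"; a missing "=" means the global "*".
    std::string token = "*";
    absl::string_view rule_list = absl::StripAsciiWhitespace(entry);
    auto eq = rule_list.find('=');
    if (eq != absl::string_view::npos) {
      token = absl::AsciiStrToLower(
          absl::StripAsciiWhitespace(rule_list.substr(0, eq)));
      rule_list = rule_list.substr(eq + 1);
    }
    // Duplicate product tokens are merged; inserting into the same set
    // deduplicates their rules.
    std::set<std::string>& rules = rules_by_token[token];
    for (absl::string_view rule :
         absl::StrSplit(rule_list, ',', absl::SkipWhitespace())) {
      rules.insert(absl::AsciiStrToLower(absl::StripAsciiWhitespace(rule)));
    }
  }
  return rules_by_token;
}
```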
129 |
130 | HTML meta element 131 | 132 | For historical reasons the robots-tag header may be specified by 133 | service owners as an HTML meta tag. In case of the meta tag, the 134 | name attribute is used to specify the product token, and the 135 | content attribute to specify the comma separated robots-tag rules. 136 | 137 | As with the header, the product token may be a global token, 138 | "robots", which signifies that the rules apply to all requestors, 139 | or a specific product token applicable to a single requestor. For 140 | example: 141 | 144 | 145 | Multiple robots meta elements may appear in a single HTML document. 146 | Requestors must obey the sum of negative rules specific to their 147 | product token and the global product token. 148 |
149 |
150 |
151 | Robots control rules 152 | The possible values of the rules are: 153 |
    154 |
  • 155 | noindex - instructs the parser to not store the served 156 | data in its publicly accessible index. 157 |
  158 |
  • 159 | nosnippet - instructs the parser to not reproduce any 160 | stored data as an excerpt snippet. 161 |
  162 |
163 | 164 | The values are case insensitive. Unsupported rules must be ignored. 165 | 166 | Implementors may support other rules as specified in Section 2.2.4 167 | of [RFC9309]. 168 |
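Tying the two sketches above together, the check below (again a hypothetical helper, not library code) asks whether a particular rule such as "noindex" applies to a requestor, honouring the case insensitivity and the merge of global and token specific rules described in this section; rules it does not know about simply never match, which is one way of ignoring unsupported rules.

```cpp
// Illustrative sketch only, reusing the rules_by_token map produced by the
// hypothetical ParseRobotsTagValue() shown earlier.
#include <initializer_list>
#include <map>
#include <set>
#include <string>

#include "absl/strings/ascii.h"
#include "absl/strings/string_view.h"

bool RuleApplies(
    const std::map<std::string, std::set<std::string>>& rules_by_token,
    absl::string_view product_token, absl::string_view rule) {
  const std::string wanted = absl::AsciiStrToLower(rule);  // case insensitive
  // The requestor obeys both the global "*" rules and its own.
  for (const std::string& token :
       {std::string("*"), absl::AsciiStrToLower(product_token)}) {
    auto it = rules_by_token.find(token);
    if (it != rules_by_token.end() && it->second.count(wanted) > 0) {
      return true;
    }
  }
  return false;
}

// For example, RuleApplies(rules, "ExampleBot", "noindex") would tell an
// indexer not to store the fetched document in its public index.
```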
169 |
170 | Caching of values 171 | 172 | The rules specified for a specific product token must be obeyed 173 | until the rules have changed. Implementors MAY use standard cache 174 | control as defined in [RFC9110] for caching robots-tag rules. 175 | Implementors SHOULD refresh their caches within a reasonable time 176 | frame. 177 |
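As a small, equally hypothetical illustration of the MAY above, an implementor relying on [RFC9110] cache control might derive a refresh deadline for the cached rules from the response's Cache-Control max-age directive; the one-day fallback below is an arbitrary assumption, not something this draft prescribes.

```cpp
// Illustrative sketch only; the default lifetime is an assumption.
#include <cstdint>

#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"

// Returns how many seconds cached robots-tag rules may be reused, based on
// the Cache-Control header of the response that carried them.
int64_t RobotsTagCacheSeconds(absl::string_view cache_control_header) {
  constexpr int64_t kDefaultSeconds = 24 * 60 * 60;  // assumed: one day
  const absl::string_view kMaxAge = "max-age=";
  for (absl::string_view directive :
       absl::StrSplit(cache_control_header, ',', absl::SkipWhitespace())) {
    directive = absl::StripAsciiWhitespace(directive);
    if (absl::StartsWithIgnoreCase(directive, kMaxAge)) {
      int64_t seconds = 0;
      if (absl::SimpleAtoi(directive.substr(kMaxAge.size()), &seconds)) {
        return seconds;
      }
    }
  }
  return kDefaultSeconds;
}
```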
178 |
179 |
180 | IANA considerations 181 | 185 |
186 |
187 | Security considerations 188 | 189 | The robots-tag is not a substitute for valid content security 190 | measures. To control access to the URI paths in a robots.txt file, 191 | users of the protocol should employ a valid security measure relevant 192 | to the application layer on which the robots.txt file is served — 193 | for example, in the case of HTTP, HTTP Authentication as defined in 194 | [RFC9110]. 195 | 196 | The content of the robots-tag header field is not secure, private or 197 | integrity-guaranteed, and due caution should be exercised when using 198 | it. Use of Transport Layer Security (TLS) with HTTP ([RFC9110] and 199 | [RFC2817]) is currently the only end-to-end way to provide such 200 | protection. 201 | 202 | In case of a robots-tag specified in an HTML meta element, implementors 203 | should consider only the meta elements specified in the head element 204 | of the HTML document, which is generally only accessible to the 205 | service owner. 206 | 207 | To protect against memory overflow attacks, implementors should 208 | enforce a limit on how much data they will parse; see section N 209 | for the lower limit. 210 | 211 |
212 |
213 | 214 | 215 | References 216 | 217 | Normative References 218 | 219 | 220 | Key words for use in RFCs to Indicate Requirement Levels 221 | 222 | 223 | 224 | In many standards track documents several words are used to signify the requirements in the specification. These words are often capitalized. This document defines these words as they should be interpreted in IETF documents. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements. 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | Upgrading to TLS Within HTTP/1.1 234 | 235 | 236 | 237 | 238 | This memo explains how to use the Upgrade mechanism in HTTP/1.1 to initiate Transport Layer Security (TLS) over an existing TCP connection. [STANDARDS-TRACK] 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | Informative References 247 | 248 | 249 | Kibibyte - Simple English Wikipedia, the free encyclopedia 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 |
260 | -------------------------------------------------------------------------------- /reporting_robots.cc: -------------------------------------------------------------------------------- 1 | #include "reporting_robots.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "absl/strings/ascii.h" 8 | #include "absl/strings/string_view.h" 9 | 10 | namespace googlebot { 11 | // The kUnsupportedTags tags are popular tags in robots.txt files, but Google 12 | // doesn't use them for anything. Other search engines may, however, so we 13 | // parse them out so users of the library can highlight them for their own 14 | // users if they so wish. 15 | // These are different from the "unknown" tags, since we know that these may 16 | // have some use cases; to the best of our knowledge other tags we find, don't. 17 | // (for example, "unicorn" from "unicorn: /value") 18 | static const std::vector kUnsupportedTags = { 19 | "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"}; 20 | 21 | void RobotsParsingReporter::Digest(int line_num, 22 | RobotsParsedLine::RobotsTagName parsed_tag) { 23 | if (line_num > last_line_seen_) { 24 | last_line_seen_ = line_num; 25 | } 26 | if (parsed_tag != RobotsParsedLine::kUnknown && 27 | parsed_tag != RobotsParsedLine::kUnused) { 28 | ++valid_directives_; 29 | } 30 | 31 | RobotsParsedLine& line = robots_parse_results_[line_num]; 32 | line.line_num = line_num; 33 | line.tag_name = parsed_tag; 34 | } 35 | 36 | void RobotsParsingReporter::ReportLineMetadata(int line_num, 37 | const LineMetadata& metadata) { 38 | if (line_num > last_line_seen_) { 39 | last_line_seen_ = line_num; 40 | } 41 | RobotsParsedLine& line = robots_parse_results_[line_num]; 42 | line.line_num = line_num; 43 | line.is_typo = metadata.is_acceptable_typo; 44 | line.metadata = metadata; 45 | } 46 | 47 | void RobotsParsingReporter::HandleRobotsStart() { 48 | last_line_seen_ = 0; 49 | valid_directives_ = 0; 50 | unused_directives_ = 0; 51 | } 52 | void RobotsParsingReporter::HandleRobotsEnd() {} 53 | void RobotsParsingReporter::HandleUserAgent(int line_num, 54 | absl::string_view line_value) { 55 | Digest(line_num, RobotsParsedLine::kUserAgent); 56 | } 57 | void RobotsParsingReporter::HandleAllow(int line_num, 58 | absl::string_view line_value) { 59 | Digest(line_num, RobotsParsedLine::kAllow); 60 | } 61 | void RobotsParsingReporter::HandleDisallow(int line_num, 62 | absl::string_view line_value) { 63 | Digest(line_num, RobotsParsedLine::kDisallow); 64 | } 65 | void RobotsParsingReporter::HandleSitemap(int line_num, 66 | absl::string_view line_value) { 67 | Digest(line_num, RobotsParsedLine::kSitemap); 68 | } 69 | void RobotsParsingReporter::HandleUnknownAction(int line_num, 70 | absl::string_view action, 71 | absl::string_view line_value) { 72 | RobotsParsedLine::RobotsTagName rtn = 73 | std::count(kUnsupportedTags.begin(), kUnsupportedTags.end(), 74 | absl::AsciiStrToLower(action)) > 0 75 | ? 
RobotsParsedLine::kUnused 76 | : RobotsParsedLine::kUnknown; 77 | unused_directives_++; 78 | Digest(line_num, rtn); 79 | } 80 | 81 | } // namespace googlebot 82 | -------------------------------------------------------------------------------- /reporting_robots.h: -------------------------------------------------------------------------------- 1 | #ifndef THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_ 2 | #define THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_ 3 | 4 | #include 5 | 6 | #include "absl/container/btree_map.h" 7 | #include "absl/strings/string_view.h" 8 | #include "robots.h" 9 | 10 | namespace googlebot { 11 | struct RobotsParsedLine { 12 | enum RobotsTagName { 13 | // Identifier for skipped lines. A line may be skipped because it's 14 | // unparseable, or because it contains no recognizable key. Note that 15 | // comment lines are also skipped, they're no-op for parsing. For example: 16 | // random characters 17 | // unicorn: 18 | // # comment line 19 | // Same thing for empty lines. 20 | kUnknown = 0, 21 | kUserAgent = 1, 22 | kAllow = 2, 23 | kDisallow = 3, 24 | kSitemap = 4, 25 | // Identifier for parseable lines whose key is recognized, but unused. 26 | // See kUnsupportedTags in reporting_robots.cc for a list of recognized, 27 | // but unused keys. For example: 28 | // noindex: 29 | // noarchive: 30 | kUnused = 5, 31 | }; 32 | 33 | int line_num = 0; 34 | RobotsTagName tag_name = kUnknown; 35 | bool is_typo = false; 36 | RobotsParseHandler::LineMetadata metadata; 37 | }; 38 | 39 | class RobotsParsingReporter : public googlebot::RobotsParseHandler { 40 | public: 41 | void HandleRobotsStart() override; 42 | void HandleRobotsEnd() override; 43 | void HandleUserAgent(int line_num, absl::string_view line_value) override; 44 | void HandleAllow(int line_num, absl::string_view line_value) override; 45 | void HandleDisallow(int line_num, absl::string_view line_value) override; 46 | void HandleSitemap(int line_num, absl::string_view line_value) override; 47 | void HandleUnknownAction(int line_num, absl::string_view action, 48 | absl::string_view line_value) override; 49 | void ReportLineMetadata(int line_num, const LineMetadata& metadata) override; 50 | 51 | int last_line_seen() const { return last_line_seen_; } 52 | int valid_directives() const { return valid_directives_; } 53 | int unused_directives() const { return unused_directives_; } 54 | std::vector parse_results() const { 55 | std::vector vec; 56 | for (const auto& entry : robots_parse_results_) { 57 | vec.push_back(entry.second); 58 | } 59 | return vec; 60 | } 61 | 62 | private: 63 | void Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag); 64 | 65 | // Indexed and sorted by line number. 
66 | absl::btree_map robots_parse_results_; 67 | int last_line_seen_ = 0; 68 | int valid_directives_ = 0; 69 | int unused_directives_ = 0; 70 | }; 71 | } // namespace googlebot 72 | #endif // THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_ 73 | -------------------------------------------------------------------------------- /reporting_robots_test.cc: -------------------------------------------------------------------------------- 1 | #include "reporting_robots.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "gtest/gtest.h" 8 | #include "absl/strings/str_cat.h" 9 | #include "absl/strings/str_split.h" 10 | #include "absl/strings/string_view.h" 11 | #include "robots.h" 12 | 13 | using ::googlebot::RobotsParsedLine; 14 | using ::googlebot::RobotsParseHandler; 15 | using ::googlebot::RobotsParsingReporter; 16 | 17 | namespace { 18 | // Allows debugging the contents of the LineMetadata struct. 19 | std::string LineMetadataToString(const RobotsParseHandler::LineMetadata& line) { 20 | // clang-format off 21 | return absl::StrCat( 22 | "{ is_empty: ", line.is_empty, 23 | " has_directive: ", line.has_directive, 24 | " has_comment: ", line.has_comment, 25 | " is_comment: ", line.is_comment, 26 | " is_acceptable_typo: ", line.is_acceptable_typo, 27 | " is_line_too_long: ", line.is_line_too_long, 28 | " is_missing_colon_separator: ", line.is_missing_colon_separator, " }"); 29 | // clang-format on 30 | } 31 | 32 | std::string TagNameToString(RobotsParsedLine::RobotsTagName tag_name) { 33 | switch (tag_name) { 34 | case RobotsParsedLine::RobotsTagName::kUnknown: 35 | return "Unknown"; 36 | case RobotsParsedLine::RobotsTagName::kUserAgent: 37 | return "UserAgent"; 38 | case RobotsParsedLine::RobotsTagName::kAllow: 39 | return "Allow"; 40 | case RobotsParsedLine::RobotsTagName::kDisallow: 41 | return "Disallow"; 42 | case RobotsParsedLine::RobotsTagName::kSitemap: 43 | return "Sitemap"; 44 | case RobotsParsedLine::RobotsTagName::kUnused: 45 | return "Unused"; 46 | } 47 | } 48 | 49 | // Allows debugging the contents of the RobotsParsedLine struct. 50 | std::string RobotsParsedLineToString(const RobotsParsedLine& line) { 51 | return absl::StrCat("{\n lin_num:", line.line_num, 52 | "\n tag_name: ", TagNameToString(line.tag_name), 53 | "\n is_typo: ", line.is_typo, 54 | "\n metadata: ", LineMetadataToString(line.metadata), 55 | "\n}"); 56 | } 57 | 58 | void expectLineToParseTo(const std::vector& lines, 59 | const std::vector& parse_results, 60 | const RobotsParsedLine& expected_result) { 61 | int line_num = expected_result.line_num; 62 | EXPECT_EQ(parse_results[line_num - 1], expected_result) 63 | << "For line " << line_num << ": '" << lines[line_num - 1] << "'"; 64 | } 65 | } // namespace 66 | 67 | namespace googlebot { 68 | // This allows us to get a debug content of the object in the test. Without 69 | // this, it would say something like this when a test fails: 70 | // Expected equality of these values: 71 | // parse_results[line_num - 1] 72 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 01-00 00-00> 73 | // expected_result 74 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 00-25 00-00> 75 | std::ostream& operator<<(std::ostream& o, const RobotsParsedLine& line) { 76 | o << RobotsParsedLineToString(line); 77 | return o; 78 | } 79 | 80 | // These 2 `operator==` are needed for `EXPECT_EQ` to work. 
81 | bool operator==(const RobotsParseHandler::LineMetadata& lhs, 82 | const RobotsParseHandler::LineMetadata& rhs) { 83 | return lhs.is_empty == rhs.is_empty && 84 | lhs.has_directive == rhs.has_directive && 85 | lhs.has_comment == rhs.has_comment && 86 | lhs.is_comment == rhs.is_comment && 87 | lhs.is_acceptable_typo == rhs.is_acceptable_typo && 88 | lhs.is_line_too_long == rhs.is_line_too_long && 89 | lhs.is_missing_colon_separator == rhs.is_missing_colon_separator; 90 | } 91 | 92 | bool operator==(const RobotsParsedLine& lhs, const RobotsParsedLine& rhs) { 93 | return lhs.line_num == rhs.line_num && lhs.tag_name == rhs.tag_name && 94 | lhs.is_typo == rhs.is_typo && lhs.metadata == rhs.metadata; 95 | } 96 | } // namespace googlebot 97 | 98 | TEST(RobotsUnittest, LinesNumbersAreCountedCorrectly) { 99 | RobotsParsingReporter report; 100 | static const char kSimpleFile[] = 101 | "User-Agent: foo\n" // 1 102 | "Allow: /some/path\n" // 2 103 | "User-Agent bar # no\n" // 3 104 | "absolutely random line\n" // 4 105 | "#so comment, much wow\n" // 5 106 | "\n" // 6 107 | "unicorns: /extinct\n" // 7 108 | "noarchive: /some\n" // 8 109 | "Disallow: /\n" // 9 110 | "Error #and comment\n" // 10 111 | "useragent: baz\n" // 11 112 | "disallaw: /some\n" // 12 113 | "site-map: https://e/s.xml #comment\n" // 13 114 | "sitemap: https://e/t.xml\n" // 14 115 | "Noarchive: /someCapital\n"; // 15 116 | // 16 (from \n) 117 | googlebot::ParseRobotsTxt(kSimpleFile, &report); 118 | EXPECT_EQ(8, report.valid_directives()); 119 | EXPECT_EQ(16, report.last_line_seen()); 120 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen()); 121 | std::vector lines = absl::StrSplit(kSimpleFile, '\n'); 122 | 123 | // For line "User-Agent: foo\n" // 1 124 | expectLineToParseTo( 125 | lines, report.parse_results(), 126 | RobotsParsedLine{.line_num = 1, 127 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent, 128 | .is_typo = false, 129 | .metadata = RobotsParseHandler::LineMetadata{ 130 | .is_empty = false, 131 | .has_comment = false, 132 | .is_comment = false, 133 | .has_directive = true, 134 | .is_missing_colon_separator = false, 135 | }}); 136 | // For line "Allow: /some/path\n" // 2 137 | expectLineToParseTo( 138 | lines, report.parse_results(), 139 | RobotsParsedLine{.line_num = 2, 140 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow, 141 | .is_typo = false, 142 | .metadata = RobotsParseHandler::LineMetadata{ 143 | .is_empty = false, 144 | .has_comment = false, 145 | .is_comment = false, 146 | .has_directive = true, 147 | }}); 148 | // For line "User-Agent bar # no\n" // 3 149 | expectLineToParseTo( 150 | lines, report.parse_results(), 151 | RobotsParsedLine{.line_num = 3, 152 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent, 153 | .is_typo = false, 154 | .metadata = RobotsParseHandler::LineMetadata{ 155 | .is_empty = false, 156 | .has_comment = true, 157 | .is_comment = false, 158 | .has_directive = true, 159 | .is_missing_colon_separator = true, 160 | }}); 161 | // For line "absolutely random line\n" // 4 162 | expectLineToParseTo( 163 | lines, report.parse_results(), 164 | RobotsParsedLine{.line_num = 4, 165 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 166 | .is_typo = false, 167 | .metadata = RobotsParseHandler::LineMetadata{ 168 | .is_empty = false, 169 | .has_comment = false, 170 | .is_comment = false, 171 | .has_directive = false, 172 | .is_missing_colon_separator = false, 173 | }}); 174 | // For line "#so comment, much wow\n" // 5 175 | expectLineToParseTo( 176 | lines, 
report.parse_results(), 177 | RobotsParsedLine{.line_num = 5, 178 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 179 | .is_typo = false, 180 | .metadata = RobotsParseHandler::LineMetadata{ 181 | .is_empty = false, 182 | .has_comment = true, 183 | .is_comment = true, 184 | .has_directive = false, 185 | }}); 186 | // For line "\n" // 6 187 | expectLineToParseTo( 188 | lines, report.parse_results(), 189 | RobotsParsedLine{.line_num = 6, 190 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 191 | .is_typo = false, 192 | .metadata = RobotsParseHandler::LineMetadata{ 193 | .is_empty = true, 194 | .has_comment = false, 195 | .is_comment = false, 196 | .has_directive = false, 197 | }}); 198 | // For line "unicorns: /extinct\n" // 7 199 | expectLineToParseTo( 200 | lines, report.parse_results(), 201 | RobotsParsedLine{.line_num = 7, 202 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 203 | .is_typo = false, 204 | .metadata = RobotsParseHandler::LineMetadata{ 205 | .is_empty = false, 206 | .has_comment = false, 207 | .is_comment = false, 208 | .has_directive = true, 209 | }}); 210 | // For line "noarchive: /some\n" // 8 211 | expectLineToParseTo( 212 | lines, report.parse_results(), 213 | RobotsParsedLine{.line_num = 8, 214 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused, 215 | .is_typo = false, 216 | .metadata = RobotsParseHandler::LineMetadata{ 217 | .is_empty = false, 218 | .has_comment = false, 219 | .is_comment = false, 220 | .has_directive = true, 221 | }}); 222 | // For line "Disallow: /\n" // 9 223 | expectLineToParseTo( 224 | lines, report.parse_results(), 225 | RobotsParsedLine{.line_num = 9, 226 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow, 227 | .is_typo = false, 228 | .metadata = RobotsParseHandler::LineMetadata{ 229 | .is_empty = false, 230 | .has_comment = false, 231 | .is_comment = false, 232 | .has_directive = true, 233 | }}); 234 | // For line "Error #and comment\n"; // 10 235 | expectLineToParseTo( 236 | lines, report.parse_results(), 237 | RobotsParsedLine{.line_num = 10, 238 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 239 | .is_typo = false, 240 | .metadata = RobotsParseHandler::LineMetadata{ 241 | .is_empty = false, 242 | .has_comment = true, 243 | .is_comment = false, 244 | .has_directive = false, 245 | .is_missing_colon_separator = false, 246 | }}); 247 | // For line "useragent: baz\n"; // 11 248 | expectLineToParseTo( 249 | lines, report.parse_results(), 250 | RobotsParsedLine{.line_num = 11, 251 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent, 252 | .is_typo = true, 253 | .metadata = RobotsParseHandler::LineMetadata{ 254 | .is_empty = false, 255 | .has_comment = false, 256 | .is_comment = false, 257 | .has_directive = true, 258 | .is_acceptable_typo = true, 259 | }}); 260 | // For line "disallaw: /some\n" // 12 261 | expectLineToParseTo( 262 | lines, report.parse_results(), 263 | RobotsParsedLine{.line_num = 12, 264 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow, 265 | .is_typo = true, 266 | .metadata = RobotsParseHandler::LineMetadata{ 267 | .is_empty = false, 268 | .has_comment = false, 269 | .is_comment = false, 270 | .has_directive = true, 271 | .is_acceptable_typo = true, 272 | }}); 273 | // For line "site-map: https://e/s.xml #comment\n" // 13; 274 | expectLineToParseTo( 275 | lines, report.parse_results(), 276 | RobotsParsedLine{.line_num = 13, 277 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap, 278 | .is_typo = true, 279 | .metadata = RobotsParseHandler::LineMetadata{ 280 | 
.is_empty = false, 281 | .has_comment = true, 282 | .is_comment = false, 283 | .has_directive = true, 284 | .is_acceptable_typo = true, 285 | }}); 286 | // For line "sitemap: https://e/t.xml\n" // 14; 287 | expectLineToParseTo( 288 | lines, report.parse_results(), 289 | RobotsParsedLine{.line_num = 14, 290 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap, 291 | .is_typo = false, 292 | .metadata = RobotsParseHandler::LineMetadata{ 293 | .is_empty = false, 294 | .has_comment = false, 295 | .is_comment = false, 296 | .has_directive = true, 297 | .is_acceptable_typo = false, 298 | }}); 299 | // For line "Noarchive: /someCapital\n" // 15 300 | expectLineToParseTo( 301 | lines, report.parse_results(), 302 | RobotsParsedLine{.line_num = 15, 303 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused, 304 | .is_typo = false, 305 | .metadata = RobotsParseHandler::LineMetadata{ 306 | .is_empty = false, 307 | .has_comment = false, 308 | .is_comment = false, 309 | .has_directive = true, 310 | }}); 311 | // For line 16 (which is empty and comes from the last \n) 312 | expectLineToParseTo( 313 | lines, report.parse_results(), 314 | RobotsParsedLine{.line_num = 16, 315 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown, 316 | .is_typo = false, 317 | .metadata = RobotsParseHandler::LineMetadata{ 318 | .is_empty = true, 319 | .has_comment = false, 320 | .is_comment = false, 321 | .has_directive = false, 322 | }}); 323 | 324 | static const char kDosFile[] = 325 | "User-Agent: foo\r\n" 326 | "Allow: /some/path\r\n" 327 | "User-Agent: bar\r\n" 328 | "\r\n" 329 | "\r\n" 330 | "Disallow: /\r\n"; 331 | googlebot::ParseRobotsTxt(kDosFile, &report); 332 | EXPECT_EQ(4, report.valid_directives()); 333 | EXPECT_EQ(7, report.last_line_seen()); 334 | 335 | static const char kMacFile[] = 336 | "User-Agent: foo\r" 337 | "Allow: /some/path\r" 338 | "User-Agent: bar\r" 339 | "\r" 340 | "\r" 341 | "Disallow: /\r"; 342 | googlebot::ParseRobotsTxt(kMacFile, &report); 343 | EXPECT_EQ(4, report.valid_directives()); 344 | EXPECT_EQ(7, report.last_line_seen()); 345 | } 346 | 347 | TEST(RobotsUnittest, LinesTooLongReportedCorrectly) { 348 | RobotsParsingReporter report; 349 | const int kMaxLineLen = 2084 * 8; 350 | std::string allow = "allow: /\n"; 351 | std::string disallow = "disallow: "; 352 | std::string robotstxt = "user-agent: foo\n"; 353 | std::string longline = "/x/"; 354 | while (longline.size() < kMaxLineLen) { 355 | absl::StrAppend(&longline, "a"); 356 | } 357 | absl::StrAppend(&robotstxt, disallow, longline, "\n", allow); 358 | 359 | googlebot::ParseRobotsTxt(robotstxt, &report); 360 | EXPECT_EQ(3, report.valid_directives()); 361 | EXPECT_EQ(4, report.last_line_seen()); 362 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen()); 363 | std::vector lines = absl::StrSplit(robotstxt, '\n'); 364 | 365 | // For line "user-agent: foo\n" // 1 366 | expectLineToParseTo( 367 | lines, report.parse_results(), 368 | RobotsParsedLine{.line_num = 1, 369 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent, 370 | .is_typo = false, 371 | .metadata = RobotsParseHandler::LineMetadata{ 372 | .is_empty = false, 373 | .has_comment = false, 374 | .is_comment = false, 375 | .has_directive = true, 376 | .is_line_too_long = false, 377 | }}); 378 | // For line "disallow: /x/a[...]a\n" // 2 379 | expectLineToParseTo( 380 | lines, report.parse_results(), 381 | RobotsParsedLine{.line_num = 2, 382 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow, 383 | .is_typo = false, 384 | .metadata = 
RobotsParseHandler::LineMetadata{ 385 | .is_empty = false, 386 | .has_comment = false, 387 | .is_comment = false, 388 | .has_directive = true, 389 | .is_line_too_long = true, 390 | }}); 391 | // For line "allow: /\n" // 3 392 | expectLineToParseTo( 393 | lines, report.parse_results(), 394 | RobotsParsedLine{.line_num = 3, 395 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow, 396 | .is_typo = false, 397 | .metadata = RobotsParseHandler::LineMetadata{ 398 | .is_empty = false, 399 | .has_comment = false, 400 | .is_comment = false, 401 | .has_directive = true, 402 | .is_line_too_long = false, 403 | }}); 404 | } 405 | -------------------------------------------------------------------------------- /robots.cc: -------------------------------------------------------------------------------- 1 | // Copyright 1999 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // ----------------------------------------------------------------------------- 16 | // File: robots.cc 17 | // ----------------------------------------------------------------------------- 18 | // 19 | // Implements expired internet draft 20 | // http://www.robotstxt.org/norobots-rfc.txt 21 | // with Google-specific optimizations detailed at 22 | // https://developers.google.com/search/reference/robots_txt 23 | 24 | #include "robots.h" 25 | 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "absl/base/macros.h" 36 | #include "absl/container/fixed_array.h" 37 | #include "absl/strings/ascii.h" 38 | #include "absl/strings/match.h" 39 | #include "absl/strings/numbers.h" 40 | #include "absl/strings/string_view.h" 41 | 42 | // Allow for typos such as DISALOW in robots.txt. 43 | static bool kAllowFrequentTypos = true; 44 | 45 | namespace googlebot { 46 | 47 | // A RobotsMatchStrategy defines a strategy for matching individual lines in a 48 | // robots.txt file. Each Match* method should return a match priority, which is 49 | // interpreted as: 50 | // 51 | // match priority < 0: 52 | // No match. 53 | // 54 | // match priority == 0: 55 | // Match, but treat it as if matched an empty pattern. 56 | // 57 | // match priority > 0: 58 | // Match. 59 | class RobotsMatchStrategy { 60 | public: 61 | virtual ~RobotsMatchStrategy() {} 62 | 63 | virtual int MatchAllow(absl::string_view path, 64 | absl::string_view pattern) = 0; 65 | virtual int MatchDisallow(absl::string_view path, 66 | absl::string_view pattern) = 0; 67 | 68 | protected: 69 | // Implements robots.txt pattern matching. 70 | static bool Matches(absl::string_view path, absl::string_view pattern); 71 | }; 72 | 73 | // Returns true if URI path matches the specified pattern. Pattern is anchored 74 | // at the beginning of path. '$' is special only at the end of pattern. 75 | // 76 | // Since 'path' and 'pattern' are both externally determined (by the webmaster), 77 | // we make sure to have acceptable worst-case performance. 
78 | /* static */ bool RobotsMatchStrategy::Matches( 79 | absl::string_view path, absl::string_view pattern) { 80 | const size_t pathlen = path.length(); 81 | absl::FixedArray pos(pathlen + 1); 82 | int numpos; 83 | 84 | // The pos[] array holds a sorted list of indexes of 'path', with length 85 | // 'numpos'. At the start and end of each iteration of the main loop below, 86 | // the pos[] array will hold a list of the prefixes of the 'path' which can 87 | // match the current prefix of 'pattern'. If this list is ever empty, 88 | // return false. If we reach the end of 'pattern' with at least one element 89 | // in pos[], return true. 90 | 91 | pos[0] = 0; 92 | numpos = 1; 93 | 94 | for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) { 95 | if (*pat == '$' && pat + 1 == pattern.end()) { 96 | return (pos[numpos - 1] == pathlen); 97 | } 98 | if (*pat == '*') { 99 | numpos = pathlen - pos[0] + 1; 100 | for (int i = 1; i < numpos; i++) { 101 | pos[i] = pos[i-1] + 1; 102 | } 103 | } else { 104 | // Includes '$' when not at end of pattern. 105 | int newnumpos = 0; 106 | for (int i = 0; i < numpos; i++) { 107 | if (pos[i] < pathlen && path[pos[i]] == *pat) { 108 | pos[newnumpos++] = pos[i] + 1; 109 | } 110 | } 111 | numpos = newnumpos; 112 | if (numpos == 0) return false; 113 | } 114 | } 115 | 116 | return true; 117 | } 118 | 119 | static const char* kHexDigits = "0123456789ABCDEF"; 120 | 121 | // GetPathParamsQuery is not in anonymous namespace to allow testing. 122 | // 123 | // Extracts path (with params) and query part from URL. Removes scheme, 124 | // authority, and fragment. Result always starts with "/". 125 | // Returns "/" if the url doesn't have a path or is not valid. 126 | std::string GetPathParamsQuery(const std::string& url) { 127 | std::string path; 128 | 129 | // Initial two slashes are ignored. 130 | size_t search_start = 0; 131 | if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2; 132 | 133 | size_t early_path = url.find_first_of("/?;", search_start); 134 | size_t protocol_end = url.find("://", search_start); 135 | if (early_path < protocol_end) { 136 | // If path, param or query starts before ://, :// doesn't indicate protocol. 137 | protocol_end = std::string::npos; 138 | } 139 | if (protocol_end == std::string::npos) { 140 | protocol_end = search_start; 141 | } else { 142 | protocol_end += 3; 143 | } 144 | 145 | size_t path_start = url.find_first_of("/?;", protocol_end); 146 | if (path_start != std::string::npos) { 147 | size_t hash_pos = url.find('#', search_start); 148 | if (hash_pos < path_start) return "/"; 149 | size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos; 150 | if (url[path_start] != '/') { 151 | // Prepend a slash if the result would start e.g. with '?'. 152 | return "/" + url.substr(path_start, path_end - path_start); 153 | } 154 | return url.substr(path_start, path_end - path_start); 155 | } 156 | 157 | return "/"; 158 | } 159 | 160 | // MaybeEscapePattern is not in anonymous namespace to allow testing. 161 | // 162 | // Canonicalize the allowed/disallowed paths. For example: 163 | // /SanJoséSellers ==> /Sanjos%C3%A9Sellers 164 | // %aa ==> %AA 165 | // When the function returns, (*dst) either points to src, or is newly 166 | // allocated. 167 | // Returns true if dst was newly allocated. 168 | bool MaybeEscapePattern(const char* src, char** dst) { 169 | int num_to_escape = 0; 170 | bool need_capitalize = false; 171 | 172 | // First, scan the buffer to see if changes are needed. Most don't. 
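// Illustrative sketch of what this scan detects, reusing the examples from the
// function comment above: in "/Sanjos%c3%a9Sellers" the lowercase hex digits
// after '%' set need_capitalize, the raw bytes of the unescaped "é" in
// "/SanJoséSellers" have their high bit set and increment num_to_escape, and a
// plain ASCII path such as "/fish" (made up for the example) needs neither, so
// *dst is simply pointed at src and false is returned.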
173 | for (int i = 0; src[i] != 0; i++) { 174 | // (a) % escape sequence. 175 | if (src[i] == '%' && 176 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) { 177 | if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) { 178 | need_capitalize = true; 179 | } 180 | i += 2; 181 | // (b) needs escaping. 182 | } else if (src[i] & 0x80) { 183 | num_to_escape++; 184 | } 185 | // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F). 186 | } 187 | // Return if no changes needed. 188 | if (!num_to_escape && !need_capitalize) { 189 | (*dst) = const_cast(src); 190 | return false; 191 | } 192 | (*dst) = new char[num_to_escape * 2 + strlen(src) + 1]; 193 | int j = 0; 194 | for (int i = 0; src[i] != 0; i++) { 195 | // (a) Normalize %-escaped sequence (eg. %2f -> %2F). 196 | if (src[i] == '%' && 197 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) { 198 | (*dst)[j++] = src[i++]; 199 | (*dst)[j++] = absl::ascii_toupper(src[i++]); 200 | (*dst)[j++] = absl::ascii_toupper(src[i]); 201 | // (b) %-escape octets whose highest bit is set. These are outside the 202 | // ASCII range. 203 | } else if (src[i] & 0x80) { 204 | (*dst)[j++] = '%'; 205 | (*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf]; 206 | (*dst)[j++] = kHexDigits[src[i] & 0xf]; 207 | // (c) Normal character, no modification needed. 208 | } else { 209 | (*dst)[j++] = src[i]; 210 | } 211 | } 212 | (*dst)[j] = 0; 213 | return true; 214 | } 215 | 216 | // Internal helper classes and functions. 217 | namespace { 218 | 219 | // A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents 220 | // a key. This class can parse a text-representation (including common typos) 221 | // and represent them as an enumeration which allows for faster processing 222 | // afterwards. 223 | // For unparsable keys, the original string representation is kept. 224 | class ParsedRobotsKey { 225 | public: 226 | enum KeyType { 227 | // Generic highlevel fields. 228 | USER_AGENT, 229 | SITEMAP, 230 | 231 | // Fields within a user-agent. 232 | ALLOW, 233 | DISALLOW, 234 | 235 | // Unrecognized field; kept as-is. High number so that additions to the 236 | // enumeration above does not change the serialization. 237 | UNKNOWN = 128 238 | }; 239 | 240 | ParsedRobotsKey() : type_(UNKNOWN) {} 241 | 242 | // Disallow copying and assignment. 243 | ParsedRobotsKey(const ParsedRobotsKey&) = delete; 244 | ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete; 245 | 246 | // Parse given key text and report in the is_acceptable_typo output parameter 247 | // whether the key is one of the accepted typo-variants of a supported key. 248 | // Does not copy the text, so the text_key must stay valid for the object's 249 | // life-time or the next Parse() call. 250 | void Parse(absl::string_view key, bool* is_acceptable_typo); 251 | 252 | // Returns the type of key. 253 | KeyType type() const { return type_; } 254 | 255 | // If this is an unknown key, get the text. 256 | absl::string_view GetUnknownText() const; 257 | 258 | private: 259 | // The KeyIs* methods return whether a passed in key is one of the a supported 260 | // keys, and report in the is_acceptable_typo output parameter whether the key 261 | // is one of the accepted typo-variants of a supported key. 
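// For illustration, the typo variants accepted by the definitions near the end
// of this file (when kAllowFrequentTypos is enabled) are:
//   user-agent: "useragent", "user agent"
//   disallow:   "dissallow", "dissalow", "disalow", "diasllow", "disallaw"
//   sitemap:    "site-map"
//   allow:      no typo variants are accepted.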
262 | static bool KeyIsUserAgent(absl::string_view key, bool* is_acceptable_typo); 263 | static bool KeyIsAllow(absl::string_view key, bool* is_acceptable_typo); 264 | static bool KeyIsDisallow(absl::string_view key, bool* is_acceptable_typo); 265 | static bool KeyIsSitemap(absl::string_view key, bool* is_acceptable_typo); 266 | 267 | KeyType type_; 268 | absl::string_view key_text_; 269 | }; 270 | 271 | void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key, 272 | absl::string_view value, 273 | RobotsParseHandler* handler) { 274 | typedef ParsedRobotsKey Key; 275 | switch (key.type()) { 276 | case Key::USER_AGENT: handler->HandleUserAgent(line, value); break; 277 | case Key::ALLOW: handler->HandleAllow(line, value); break; 278 | case Key::DISALLOW: handler->HandleDisallow(line, value); break; 279 | case Key::SITEMAP: handler->HandleSitemap(line, value); break; 280 | case Key::UNKNOWN: 281 | handler->HandleUnknownAction(line, key.GetUnknownText(), value); 282 | break; 283 | // No default case Key:: to have the compiler warn about new values. 284 | } 285 | } 286 | 287 | class RobotsTxtParser { 288 | public: 289 | typedef ParsedRobotsKey Key; 290 | 291 | RobotsTxtParser(absl::string_view robots_body, 292 | RobotsParseHandler* handler) 293 | : robots_body_(robots_body), handler_(handler) { 294 | } 295 | 296 | void Parse(); 297 | 298 | private: 299 | // Note that `key` and `value` are only set when `metadata->has_directive 300 | // == true`. 301 | static void GetKeyAndValueFrom(char** key, char** value, char* line, 302 | RobotsParseHandler::LineMetadata* metadata); 303 | static void StripWhitespaceSlowly(char ** s); 304 | 305 | void ParseAndEmitLine(int current_line, char* line, 306 | bool* line_too_long_strict); 307 | bool NeedEscapeValueForKey(const Key& key); 308 | 309 | absl::string_view robots_body_; 310 | RobotsParseHandler* const handler_; 311 | }; 312 | 313 | bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) { 314 | switch (key.type()) { 315 | case RobotsTxtParser::Key::USER_AGENT: 316 | case RobotsTxtParser::Key::SITEMAP: 317 | return false; 318 | default: 319 | return true; 320 | } 321 | } 322 | 323 | // Removes leading and trailing whitespace from null-terminated string s. 324 | /* static */ void RobotsTxtParser::StripWhitespaceSlowly(char ** s) { 325 | absl::string_view stripped = absl::StripAsciiWhitespace(*s); 326 | *s = const_cast(stripped.data()); 327 | (*s)[stripped.size()] = '\0'; 328 | } 329 | 330 | void RobotsTxtParser::GetKeyAndValueFrom( 331 | char** key, char** value, char* line, 332 | RobotsParseHandler::LineMetadata* metadata) { 333 | // Remove comments from the current robots.txt line. 334 | char* const comment = strchr(line, '#'); 335 | if (nullptr != comment) { 336 | metadata->has_comment = true; 337 | *comment = '\0'; 338 | } 339 | StripWhitespaceSlowly(&line); 340 | // If the line became empty after removing the comment, return. 341 | if (strlen(line) == 0) { 342 | if (metadata->has_comment) { 343 | metadata->is_comment = true; 344 | } else { 345 | metadata->is_empty = true; 346 | } 347 | return; 348 | } 349 | 350 | // Rules must match the following pattern: 351 | // [ \t]*:[ \t]* 352 | char* sep = strchr(line, ':'); 353 | if (nullptr == sep) { 354 | // Google-specific optimization: some people forget the colon, so we need to 355 | // accept whitespace in its stead. 
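// For example (sketch): "disallow /x/" has no ':' but splits into exactly two
// whitespace-separated tokens, so it is accepted below as key "disallow" and
// value "/x/", with is_missing_colon_separator set. A line such as
// "foo bar baz" has three tokens and is dropped. This mirrors the
// robotstxt_incorrect_accepted case of ID_LineSyntax_Line in robots_test.cc.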
356 | static const char * const kWhite = " \t"; 357 | sep = strpbrk(line, kWhite); 358 | if (nullptr != sep) { 359 | const char* const val = sep + strspn(sep, kWhite); 360 | assert(*val); // since we dropped trailing whitespace above. 361 | if (nullptr != strpbrk(val, kWhite)) { 362 | // We only accept whitespace as a separator if there are exactly two 363 | // sequences of non-whitespace characters. If we get here, there were 364 | // more than 2 such sequences since we stripped trailing whitespace 365 | // above. 366 | return; 367 | } 368 | metadata->is_missing_colon_separator = true; 369 | } 370 | } 371 | if (nullptr == sep) { 372 | return; // Couldn't find a separator. 373 | } 374 | 375 | *key = line; // Key starts at beginning of line. 376 | *sep = '\0'; // And stops at the separator. 377 | StripWhitespaceSlowly(key); // Get rid of any trailing whitespace. 378 | 379 | if (strlen(*key) > 0) { 380 | *value = 1 + sep; // Value starts after the separator. 381 | StripWhitespaceSlowly(value); // Get rid of any leading whitespace. 382 | metadata->has_directive = true; 383 | return; 384 | } 385 | } 386 | 387 | void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line, 388 | bool* line_too_long_strict) { 389 | char* string_key; 390 | char* value; 391 | RobotsParseHandler::LineMetadata line_metadata; 392 | // Note that `string_key` and `value` are only set when 393 | // `line_metadata->has_directive == true`. 394 | GetKeyAndValueFrom(&string_key, &value, line, &line_metadata); 395 | line_metadata.is_line_too_long = *line_too_long_strict; 396 | if (!line_metadata.has_directive) { 397 | handler_->ReportLineMetadata(current_line, line_metadata); 398 | return; 399 | } 400 | Key key; 401 | key.Parse(string_key, &line_metadata.is_acceptable_typo); 402 | if (NeedEscapeValueForKey(key)) { 403 | char* escaped_value = nullptr; 404 | const bool is_escaped = MaybeEscapePattern(value, &escaped_value); 405 | EmitKeyValueToHandler(current_line, key, escaped_value, handler_); 406 | if (is_escaped) delete[] escaped_value; 407 | } else { 408 | EmitKeyValueToHandler(current_line, key, value, handler_); 409 | } 410 | // Finish adding metadata to the line. 411 | handler_->ReportLineMetadata(current_line, line_metadata); 412 | } 413 | 414 | void RobotsTxtParser::Parse() { 415 | // UTF-8 byte order marks. 416 | static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF}; 417 | 418 | // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's 419 | // fairly safe to assume any valid line isn't going to be more than many times 420 | // that max url length of 2KB. We want some padding for 421 | // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well. 422 | // If so, we can ignore the chars on a line past that. 423 | const int kBrowserMaxLineLen = 2083; 424 | const int kMaxLineLen = kBrowserMaxLineLen * 8; 425 | // Allocate a buffer used to process the current line. 426 | char* const line_buffer = new char[kMaxLineLen]; 427 | // last_line_pos is the last writeable pos within the line array 428 | // (only a final '\0' may go here). 
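// Sketch of the resulting behaviour: characters past the end of this buffer
// are dropped rather than stored, line_too_long_strict is set below, and the
// condition surfaces to handlers as LineMetadata::is_line_too_long (see
// LinesTooLongReportedCorrectly in reporting_robots_test.cc).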
429 | const char* const line_buffer_end = line_buffer + kMaxLineLen - 1; 430 | bool line_too_long_strict = false; 431 | char* line_pos = line_buffer; 432 | int line_num = 0; 433 | size_t bom_pos = 0; 434 | bool last_was_carriage_return = false; 435 | handler_->HandleRobotsStart(); 436 | 437 | { 438 | for (const unsigned char ch : robots_body_) { 439 | ABSL_ASSERT(line_pos <= line_buffer_end); 440 | // Google-specific optimization: UTF-8 byte order marks should never 441 | // appear in a robots.txt file, but they do nevertheless. Skipping 442 | // possible BOM-prefix in the first bytes of the input. 443 | if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) { 444 | continue; 445 | } 446 | bom_pos = sizeof(utf_bom); 447 | if (ch != 0x0A && ch != 0x0D) { // Non-line-ending char case. 448 | // Put in next spot on current line, as long as there's room. 449 | if (line_pos < line_buffer_end) { 450 | *(line_pos++) = ch; 451 | } else { 452 | line_too_long_strict = true; 453 | } 454 | } else { // Line-ending character char case. 455 | *line_pos = '\0'; 456 | // Only emit an empty line if this was not due to the second character 457 | // of the DOS line-ending \r\n . 458 | const bool is_CRLF_continuation = 459 | (line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A; 460 | if (!is_CRLF_continuation) { 461 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict); 462 | line_too_long_strict = false; 463 | } 464 | line_pos = line_buffer; 465 | last_was_carriage_return = (ch == 0x0D); 466 | } 467 | } 468 | } 469 | *line_pos = '\0'; 470 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict); 471 | handler_->HandleRobotsEnd(); 472 | delete [] line_buffer; 473 | } 474 | 475 | // Implements the default robots.txt matching strategy. The maximum number of 476 | // characters matched by a pattern is returned as its match priority. 477 | class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy { 478 | public: 479 | LongestMatchRobotsMatchStrategy() { } 480 | 481 | // Disallow copying and assignment. 482 | LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) = 483 | delete; 484 | LongestMatchRobotsMatchStrategy& operator=( 485 | const LongestMatchRobotsMatchStrategy&) = delete; 486 | 487 | int MatchAllow(absl::string_view path, absl::string_view pattern) override; 488 | int MatchDisallow(absl::string_view path, absl::string_view pattern) override; 489 | }; 490 | } // end anonymous namespace 491 | 492 | void ParseRobotsTxt(absl::string_view robots_body, 493 | RobotsParseHandler* parse_callback) { 494 | RobotsTxtParser parser(robots_body, parse_callback); 495 | parser.Parse(); 496 | } 497 | 498 | RobotsMatcher::RobotsMatcher() 499 | : seen_global_agent_(false), 500 | seen_specific_agent_(false), 501 | ever_seen_specific_agent_(false), 502 | seen_separator_(false), 503 | path_(nullptr), 504 | user_agents_(nullptr) { 505 | match_strategy_ = new LongestMatchRobotsMatchStrategy(); 506 | } 507 | 508 | RobotsMatcher::~RobotsMatcher() { 509 | delete match_strategy_; 510 | } 511 | 512 | bool RobotsMatcher::ever_seen_specific_agent() const { 513 | return ever_seen_specific_agent_; 514 | } 515 | 516 | void RobotsMatcher::InitUserAgentsAndPath( 517 | const std::vector* user_agents, const char* path) { 518 | // The RobotsParser object doesn't own path_ or user_agents_, so overwriting 519 | // these pointers doesn't cause a memory leak. 
520 | path_ = path; 521 | ABSL_ASSERT('/' == *path_); 522 | user_agents_ = user_agents; 523 | } 524 | 525 | bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body, 526 | const std::vector* user_agents, 527 | const std::string& url) { 528 | // The url is not normalized (escaped, percent encoded) here because the user 529 | // is asked to provide it in escaped form already. 530 | std::string path = GetPathParamsQuery(url); 531 | InitUserAgentsAndPath(user_agents, path.c_str()); 532 | ParseRobotsTxt(robots_body, this); 533 | return !disallow(); 534 | } 535 | 536 | bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt, 537 | const std::string& user_agent, 538 | const std::string& url) { 539 | std::vector v; 540 | v.push_back(user_agent); 541 | return AllowedByRobots(robots_txt, &v, url); 542 | } 543 | 544 | bool RobotsMatcher::disallow() const { 545 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) { 546 | return (disallow_.specific.priority() > allow_.specific.priority()); 547 | } 548 | 549 | if (ever_seen_specific_agent_) { 550 | // Matching group for user-agent but either without disallow or empty one, 551 | // i.e. priority == 0. 552 | return false; 553 | } 554 | 555 | if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) { 556 | return disallow_.global.priority() > allow_.global.priority(); 557 | } 558 | return false; 559 | } 560 | 561 | bool RobotsMatcher::disallow_ignore_global() const { 562 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) { 563 | return disallow_.specific.priority() > allow_.specific.priority(); 564 | } 565 | return false; 566 | } 567 | 568 | int RobotsMatcher::matching_line() const { 569 | if (ever_seen_specific_agent_) { 570 | return Match::HigherPriorityMatch(disallow_.specific, allow_.specific) 571 | .line(); 572 | } 573 | return Match::HigherPriorityMatch(disallow_.global, allow_.global).line(); 574 | } 575 | 576 | void RobotsMatcher::HandleRobotsStart() { 577 | // This is a new robots.txt file, so we need to reset all the instance member 578 | // variables. We do it in the same order the instance member variables are 579 | // declared, so it's easier to keep track of which ones we have (or maybe 580 | // haven't!) done. 581 | allow_.Clear(); 582 | disallow_.Clear(); 583 | 584 | seen_global_agent_ = false; 585 | seen_specific_agent_ = false; 586 | ever_seen_specific_agent_ = false; 587 | seen_separator_ = false; 588 | } 589 | 590 | /*static*/ absl::string_view RobotsMatcher::ExtractUserAgent( 591 | absl::string_view user_agent) { 592 | // Allowed characters in user-agent are [a-zA-Z_-]. 593 | const char* end = user_agent.data(); 594 | while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') { 595 | ++end; 596 | } 597 | return user_agent.substr(0, end - user_agent.data()); 598 | } 599 | 600 | /*static*/ bool RobotsMatcher::IsValidUserAgentToObey( 601 | absl::string_view user_agent) { 602 | return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent; 603 | } 604 | 605 | void RobotsMatcher::HandleUserAgent(int line_num, 606 | absl::string_view user_agent) { 607 | if (seen_separator_) { 608 | seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false; 609 | } 610 | 611 | // Google-specific optimization: a '*' followed by space and more characters 612 | // in a user-agent record is still regarded a global rule. 
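// For example (sketch): "User-agent: * baz" is treated like "User-agent: *"
// and opens the global group, while "User-agent: Foo Bar" is reduced by
// ExtractUserAgent() below to "Foo" and compared case-insensitively against
// the caller's user-agents (see GoogleOnly_AcceptUserAgentUpToFirstSpace in
// robots_test.cc).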
613 | if (user_agent.length() >= 1 && user_agent[0] == '*' && 614 | (user_agent.length() == 1 || isspace(user_agent[1]))) { 615 | seen_global_agent_ = true; 616 | } else { 617 | user_agent = ExtractUserAgent(user_agent); 618 | for (const auto& agent : *user_agents_) { 619 | if (absl::EqualsIgnoreCase(user_agent, agent)) { 620 | ever_seen_specific_agent_ = seen_specific_agent_ = true; 621 | break; 622 | } 623 | } 624 | } 625 | } 626 | 627 | void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) { 628 | if (!seen_any_agent()) return; 629 | seen_separator_ = true; 630 | const int priority = match_strategy_->MatchAllow(path_, value); 631 | if (priority >= 0) { 632 | if (seen_specific_agent_) { 633 | if (allow_.specific.priority() < priority) { 634 | allow_.specific.Set(priority, line_num); 635 | } 636 | } else { 637 | assert(seen_global_agent_); 638 | if (allow_.global.priority() < priority) { 639 | allow_.global.Set(priority, line_num); 640 | } 641 | } 642 | } else { 643 | // Google-specific optimization: 'index.htm' and 'index.html' are normalized 644 | // to '/'. 645 | const size_t slash_pos = value.find_last_of('/'); 646 | 647 | if (slash_pos != absl::string_view::npos && 648 | absl::StartsWith(absl::ClippedSubstr(value, slash_pos), 649 | "/index.htm")) { 650 | const int len = slash_pos + 1; 651 | absl::FixedArray newpattern(len + 1); 652 | strncpy(newpattern.data(), value.data(), len); 653 | newpattern[len] = '$'; 654 | HandleAllow(line_num, 655 | absl::string_view(newpattern.data(), newpattern.size())); 656 | } 657 | } 658 | } 659 | 660 | void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) { 661 | if (!seen_any_agent()) return; 662 | seen_separator_ = true; 663 | const int priority = match_strategy_->MatchDisallow(path_, value); 664 | if (priority >= 0) { 665 | if (seen_specific_agent_) { 666 | if (disallow_.specific.priority() < priority) { 667 | disallow_.specific.Set(priority, line_num); 668 | } 669 | } else { 670 | assert(seen_global_agent_); 671 | if (disallow_.global.priority() < priority) { 672 | disallow_.global.Set(priority, line_num); 673 | } 674 | } 675 | } 676 | } 677 | 678 | int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path, 679 | absl::string_view pattern) { 680 | return Matches(path, pattern) ? pattern.length() : -1; 681 | } 682 | 683 | int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path, 684 | absl::string_view pattern) { 685 | return Matches(path, pattern) ? 
pattern.length() : -1; 686 | } 687 | 688 | void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {} 689 | 690 | void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action, 691 | absl::string_view value) {} 692 | 693 | void ParsedRobotsKey::Parse(absl::string_view key, bool* is_acceptable_typo) { 694 | key_text_ = absl::string_view(); 695 | if (KeyIsUserAgent(key, is_acceptable_typo)) { 696 | type_ = USER_AGENT; 697 | } else if (KeyIsAllow(key, is_acceptable_typo)) { 698 | type_ = ALLOW; 699 | } else if (KeyIsDisallow(key, is_acceptable_typo)) { 700 | type_ = DISALLOW; 701 | } else if (KeyIsSitemap(key, is_acceptable_typo)) { 702 | type_ = SITEMAP; 703 | } else { 704 | type_ = UNKNOWN; 705 | key_text_ = key; 706 | } 707 | } 708 | 709 | absl::string_view ParsedRobotsKey::GetUnknownText() const { 710 | ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty()); 711 | return key_text_; 712 | } 713 | 714 | bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key, 715 | bool* is_acceptable_typo) { 716 | *is_acceptable_typo = 717 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") || 718 | absl::StartsWithIgnoreCase(key, "user agent"))); 719 | return (absl::StartsWithIgnoreCase(key, "user-agent") || *is_acceptable_typo); 720 | } 721 | 722 | bool ParsedRobotsKey::KeyIsAllow(absl::string_view key, 723 | bool* is_acceptable_typo) { 724 | // We don't support typos for the "allow" key. 725 | *is_acceptable_typo = false; 726 | return absl::StartsWithIgnoreCase(key, "allow"); 727 | } 728 | 729 | bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key, 730 | bool* is_acceptable_typo) { 731 | *is_acceptable_typo = 732 | (kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) || 733 | (absl::StartsWithIgnoreCase(key, "dissalow")) || 734 | (absl::StartsWithIgnoreCase(key, "disalow")) || 735 | (absl::StartsWithIgnoreCase(key, "diasllow")) || 736 | (absl::StartsWithIgnoreCase(key, "disallaw")))); 737 | return (absl::StartsWithIgnoreCase(key, "disallow") || *is_acceptable_typo); 738 | } 739 | 740 | bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key, 741 | bool* is_acceptable_typo) { 742 | *is_acceptable_typo = 743 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "site-map"))); 744 | return absl::StartsWithIgnoreCase(key, "sitemap") || *is_acceptable_typo; 745 | } 746 | 747 | } // namespace googlebot 748 | -------------------------------------------------------------------------------- /robots.h: -------------------------------------------------------------------------------- 1 | // Copyright 1999 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // ----------------------------------------------------------------------------- 16 | // File: robots.h 17 | // ----------------------------------------------------------------------------- 18 | // 19 | // This file implements the standard defined by the Robots Exclusion Protocol 20 | // (REP) internet draft (I-D). 21 | // https://www.rfc-editor.org/rfc/rfc9309.html 22 | // 23 | // Google doesn't follow the standard strictly, because there are a lot of 24 | // non-conforming robots.txt files out there, and we err on the side of 25 | // disallowing when this seems intended. 26 | // 27 | // An more user-friendly description of how Google handles robots.txt can be 28 | // found at: 29 | // https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt 30 | // 31 | // This library provides a low-level parser for robots.txt (ParseRobotsTxt()), 32 | // and a matcher for URLs against a robots.txt (class RobotsMatcher). 33 | 34 | #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__ 35 | #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__ 36 | 37 | #include 38 | #include 39 | 40 | #include "absl/strings/string_view.h" 41 | 42 | namespace googlebot { 43 | // Handler for directives found in robots.txt. These callbacks are called by 44 | // ParseRobotsTxt() in the sequence they have been found in the file. 45 | class RobotsParseHandler { 46 | public: 47 | RobotsParseHandler() {} 48 | virtual ~RobotsParseHandler() {} 49 | 50 | // Disallow copying and assignment. 51 | RobotsParseHandler(const RobotsParseHandler&) = delete; 52 | RobotsParseHandler& operator=(const RobotsParseHandler&) = delete; 53 | 54 | virtual void HandleRobotsStart() = 0; 55 | virtual void HandleRobotsEnd() = 0; 56 | 57 | virtual void HandleUserAgent(int line_num, absl::string_view value) = 0; 58 | virtual void HandleAllow(int line_num, absl::string_view value) = 0; 59 | virtual void HandleDisallow(int line_num, absl::string_view value) = 0; 60 | 61 | virtual void HandleSitemap(int line_num, absl::string_view value) = 0; 62 | 63 | // Any other unrecognized name/value pairs. 64 | virtual void HandleUnknownAction(int line_num, absl::string_view action, 65 | absl::string_view value) = 0; 66 | 67 | struct LineMetadata { 68 | // Indicates if the line is totally empty. 69 | bool is_empty = false; 70 | // Indicates if the line has a comment (may have content before it). 71 | bool has_comment = false; 72 | // Indicates if the whole line is a comment. 73 | bool is_comment = false; 74 | // Indicates that the line has a valid robots.txt directive and one of the 75 | // `Handle*` methods will be called. 76 | bool has_directive = false; 77 | // Indicates that the found directive is one of the acceptable typo variants 78 | // of the directive. See the key functions in ParsedRobotsKey for accepted 79 | // typos. 80 | bool is_acceptable_typo = false; 81 | // Indicates that the line is too long, specifically over 2083 * 8 bytes. 82 | bool is_line_too_long = false; 83 | // Indicates that the key-value pair is missing the colon separator. 84 | bool is_missing_colon_separator = false; 85 | }; 86 | 87 | virtual void ReportLineMetadata(int line_num, const LineMetadata& metadata) {} 88 | }; 89 | 90 | // Parses body of a robots.txt and emits parse callbacks. This will accept 91 | // typical typos found in robots.txt, such as 'disalow'. 92 | // 93 | // Note, this function will accept all kind of input but will skip 94 | // everything that does not look like a robots directive. 
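As an illustration of the callback interface above, here is a minimal sketch of a handler that could be passed to the ParseRobotsTxt() declared just below. The class name, the printed format, and the sample robots.txt body are invented for the example; only the overridden methods come from RobotsParseHandler.

    #include <iostream>
    #include "absl/strings/string_view.h"
    #include "robots.h"

    // Sketch only: prints every directive the parser reports.
    class PrintingHandler : public googlebot::RobotsParseHandler {
     public:
      void HandleRobotsStart() override {}
      void HandleRobotsEnd() override {}
      void HandleUserAgent(int line_num, absl::string_view value) override {
        std::cout << line_num << " user-agent: " << value << "\n";
      }
      void HandleAllow(int line_num, absl::string_view value) override {
        std::cout << line_num << " allow: " << value << "\n";
      }
      void HandleDisallow(int line_num, absl::string_view value) override {
        std::cout << line_num << " disallow: " << value << "\n";
      }
      void HandleSitemap(int line_num, absl::string_view value) override {
        std::cout << line_num << " sitemap: " << value << "\n";
      }
      void HandleUnknownAction(int line_num, absl::string_view action,
                               absl::string_view value) override {
        std::cout << line_num << " unknown: " << action << " " << value << "\n";
      }
    };

    int main() {
      PrintingHandler handler;
      googlebot::ParseRobotsTxt("user-agent: *\ndisallow: /private\n", &handler);
    }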
95 | void ParseRobotsTxt(absl::string_view robots_body, 96 | RobotsParseHandler* parse_callback); 97 | 98 | // RobotsMatcher - matches robots.txt against URLs. 99 | // 100 | // The Matcher uses a default match strategy for Allow/Disallow patterns which 101 | // is the official way of Google crawler to match robots.txt. It is also 102 | // possible to provide a custom match strategy. 103 | // 104 | // The entry point for the user is to call one of the *AllowedByRobots() 105 | // methods that return directly if a URL is being allowed according to the 106 | // robots.txt and the crawl agent. 107 | // The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe. 108 | class RobotsMatchStrategy; 109 | class RobotsMatcher : protected RobotsParseHandler { 110 | public: 111 | // Create a RobotsMatcher with the default matching strategy. The default 112 | // matching strategy is longest-match as opposed to the former internet draft 113 | // that provisioned first-match strategy. Analysis shows that longest-match, 114 | // while more restrictive for crawlers, is what webmasters assume when writing 115 | // directives. For example, in case of conflicting matches (both Allow and 116 | // Disallow), the longest match is the one the user wants. For example, in 117 | // case of a robots.txt file that has the following rules 118 | // Allow: / 119 | // Disallow: /cgi-bin 120 | // it's pretty obvious what the webmaster wants: they want to allow crawl of 121 | // every URI except /cgi-bin. However, according to the expired internet 122 | // standard, crawlers should be allowed to crawl everything with such a rule. 123 | RobotsMatcher(); 124 | 125 | ~RobotsMatcher() override; 126 | 127 | // Disallow copying and assignment. 128 | RobotsMatcher(const RobotsMatcher&) = delete; 129 | RobotsMatcher& operator=(const RobotsMatcher&) = delete; 130 | 131 | // Verifies that the given user agent is valid to be matched against 132 | // robots.txt. Valid user agent strings only contain the characters 133 | // [a-zA-Z_-]. 134 | static bool IsValidUserAgentToObey(absl::string_view user_agent); 135 | 136 | // Returns true iff 'url' is allowed to be fetched by any member of the 137 | // "user_agents" vector. 'url' must be %-encoded according to RFC3986. 138 | bool AllowedByRobots(absl::string_view robots_body, 139 | const std::vector* user_agents, 140 | const std::string& url); 141 | 142 | // Do robots check for 'url' when there is only one user agent. 'url' must 143 | // be %-encoded according to RFC3986. 144 | bool OneAgentAllowedByRobots(absl::string_view robots_txt, 145 | const std::string& user_agent, 146 | const std::string& url); 147 | 148 | // Returns true if we are disallowed from crawling a matching URI. 149 | bool disallow() const; 150 | 151 | // Returns true if we are disallowed from crawling a matching URI. Ignores any 152 | // rules specified for the default user agent, and bases its results only on 153 | // the specified user agents. 154 | bool disallow_ignore_global() const; 155 | 156 | // Returns true iff, when AllowedByRobots() was called, the robots file 157 | // referred explicitly to one of the specified user agents. 158 | bool ever_seen_specific_agent() const; 159 | 160 | // Returns the line that matched or 0 if none matched. 161 | int matching_line() const; 162 | 163 | protected: 164 | // Parse callbacks. 165 | // Protected because used in unittest. Never override RobotsMatcher, implement 166 | // googlebot::RobotsParseHandler instead. 
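// Worked example of the longest-match rule described in the constructor
// comment above (a sketch; with the default strategy the priority of a
// matching pattern is its length in characters):
//   Allow: /            -> priority 1 for the path "/cgi-bin/faq"
//   Disallow: /cgi-bin  -> priority 8 for the same path
// The Disallow match is longer, so "/cgi-bin/faq" is disallowed, while a path
// such as "/docs/" matches only "Allow: /" and is allowed.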
167 | void HandleRobotsStart() override; 168 | void HandleRobotsEnd() override {} 169 | 170 | void HandleUserAgent(int line_num, absl::string_view user_agent) override; 171 | void HandleAllow(int line_num, absl::string_view value) override; 172 | void HandleDisallow(int line_num, absl::string_view value) override; 173 | 174 | void HandleSitemap(int line_num, absl::string_view value) override; 175 | void HandleUnknownAction(int line_num, absl::string_view action, 176 | absl::string_view value) override; 177 | 178 | protected: 179 | // Extract the matchable part of a user agent string, essentially stopping at 180 | // the first invalid character. 181 | // Example: 'Googlebot/2.1' becomes 'Googlebot' 182 | static absl::string_view ExtractUserAgent(absl::string_view user_agent); 183 | 184 | // Initialize next path and user-agents to check. Path must contain only the 185 | // path, params, and query (if any) of the url and must start with a '/'. 186 | void InitUserAgentsAndPath(const std::vector* user_agents, 187 | const char* path); 188 | 189 | // Returns true if any user-agent was seen. 190 | bool seen_any_agent() const { 191 | return seen_global_agent_ || seen_specific_agent_; 192 | } 193 | 194 | // Instead of just maintaining a Boolean indicating whether a given line has 195 | // matched, we maintain a count of the maximum number of characters matched by 196 | // that pattern. 197 | // 198 | // This structure stores the information associated with a match (e.g. when a 199 | // Disallow is matched) as priority of the match and line matching. 200 | // 201 | // The priority is initialized with a negative value to make sure that a match 202 | // of priority 0 is higher priority than no match at all. 203 | class Match { 204 | private: 205 | static const int kNoMatchPriority = -1; 206 | 207 | public: 208 | Match(int priority, int line) : priority_(priority), line_(line) {} 209 | Match() : priority_(kNoMatchPriority), line_(0) {} 210 | 211 | void Set(int priority, int line) { 212 | priority_ = priority; 213 | line_ = line; 214 | } 215 | 216 | void Clear() { Set(kNoMatchPriority, 0); } 217 | 218 | int line() const { return line_; } 219 | int priority() const { return priority_; } 220 | 221 | static const Match& HigherPriorityMatch(const Match& a, const Match& b) { 222 | if (a.priority() > b.priority()) { 223 | return a; 224 | } else { 225 | return b; 226 | } 227 | } 228 | 229 | private: 230 | int priority_; 231 | int line_; 232 | }; 233 | 234 | // For each of the directives within user-agents, we keep global and specific 235 | // match scores. 236 | struct MatchHierarchy { 237 | Match global; // Match for '*' 238 | Match specific; // Match for queried agent. 239 | void Clear() { 240 | global.Clear(); 241 | specific.Clear(); 242 | } 243 | }; 244 | MatchHierarchy allow_; // Characters of 'url' matching Allow. 245 | MatchHierarchy disallow_; // Characters of 'url' matching Disallow. 246 | 247 | bool seen_global_agent_; // True if processing global agent rules. 248 | bool seen_specific_agent_; // True if processing our specific agent. 249 | bool ever_seen_specific_agent_; // True if we ever saw a block for our agent. 250 | bool seen_separator_; // True if saw any key: value pair. 251 | 252 | // The path we want to pattern match. Not owned and only a valid pointer 253 | // during the lifetime of *AllowedByRobots calls. 254 | const char* path_; 255 | // The User-Agents we are interested in. Not owned and only a valid 256 | // pointer during the lifetime of *AllowedByRobots calls. 
257 | const std::vector* user_agents_; 258 | 259 | RobotsMatchStrategy* match_strategy_; 260 | }; 261 | 262 | } // namespace googlebot 263 | #endif // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__ 264 | -------------------------------------------------------------------------------- /robots_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // ----------------------------------------------------------------------------- 16 | // File: robots_main.cc 17 | // ----------------------------------------------------------------------------- 18 | // 19 | // Simple binary to assess whether a URL is accessible to a user-agent according 20 | // to records found in a local robots.txt file, based on Google's robots.txt 21 | // parsing and matching algorithms. 22 | // Usage: 23 | // robots_main 24 | // Arguments: 25 | // local_path_to_robotstxt: local path to a file containing robots.txt records. 26 | // For example: /home/users/username/robots.txt 27 | // user_agent: a token to be matched against records in the robots.txt. 28 | // For example: Googlebot 29 | // url: a url to be matched against records in the robots.txt. The URL must be 30 | // %-encoded according to RFC3986. 31 | // For example: https://example.com/accessible/url.html 32 | // Output: Prints a sentence with verdict about whether 'user_agent' is allowed 33 | // to access 'url' based on records in 'local_path_to_robotstxt'. 34 | // Return code: 35 | // 0 when the url is ALLOWED for the user_agent. 36 | // 1 when the url is DISALLOWED for the user_agent. 37 | // 2 when --help is requested or if there is something invalid in the flags 38 | // passed. 39 | // 40 | #include 41 | #include 42 | 43 | #include "robots.h" 44 | 45 | bool LoadFile(const std::string& filename, std::string* result) { 46 | std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate); 47 | if (file.is_open()) { 48 | size_t size = file.tellg(); 49 | std::vector buffer(size); 50 | file.seekg(0, std::ios::beg); 51 | file.read(buffer.data(), size); 52 | file.close(); 53 | if (!file) return false; // file reading error (failbit or badbit). 54 | result->assign(buffer.begin(), buffer.end()); 55 | return true; 56 | } 57 | return false; 58 | } 59 | 60 | void ShowHelp(int argc, char** argv) { 61 | std::cerr << "Shows whether the given user_agent and URI combination" 62 | << " is allowed or disallowed by the given robots.txt file. " 63 | << std::endl 64 | << std::endl; 65 | std::cerr << "Usage: " << std::endl 66 | << " " << argv[0] << " " 67 | << std::endl 68 | << std::endl; 69 | std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl 70 | << std::endl; 71 | std::cerr << "Example: " << std::endl 72 | << " " << argv[0] << " robots.txt FooBot http://example.com/foo" 73 | << std::endl; 74 | } 75 | 76 | int main(int argc, char** argv) { 77 | std::string filename = argc >= 2 ? 
argv[1] : ""; 78 | if (filename == "-h" || filename == "-help" || filename == "--help") { 79 | ShowHelp(argc, argv); 80 | return 2; 81 | } 82 | if (argc != 4) { 83 | std::cerr << "Invalid amount of arguments. Showing help." << std::endl 84 | << std::endl; 85 | ShowHelp(argc, argv); 86 | return 2; 87 | } 88 | std::string robots_content; 89 | if (!(LoadFile(filename, &robots_content))) { 90 | std::cerr << "failed to read file \"" << filename << "\"" << std::endl; 91 | return 2; 92 | } 93 | 94 | std::string user_agent = argv[2]; 95 | std::vector user_agents(1, user_agent); 96 | googlebot::RobotsMatcher matcher; 97 | std::string url = argv[3]; 98 | bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url); 99 | 100 | std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3] 101 | << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl; 102 | if (robots_content.empty()) { 103 | std::cout << "notice: robots file is empty so all user-agents are allowed" 104 | << std::endl; 105 | } 106 | 107 | return allowed ? 0 : 1; 108 | } 109 | -------------------------------------------------------------------------------- /robots_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // This file tests the robots.txt parsing and matching code found in robots.cc 16 | // against the current Robots Exclusion Protocol (REP) RFC. 17 | // https://www.rfc-editor.org/rfc/rfc9309.html 18 | #include "robots.h" 19 | 20 | #include 21 | 22 | #include "gtest/gtest.h" 23 | #include "absl/strings/str_cat.h" 24 | #include "absl/strings/string_view.h" 25 | 26 | namespace { 27 | 28 | using ::googlebot::RobotsMatcher; 29 | 30 | bool IsUserAgentAllowed(const absl::string_view robotstxt, 31 | const std::string& useragent, const std::string& url) { 32 | RobotsMatcher matcher; 33 | return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url); 34 | } 35 | 36 | // Google-specific: system test. 37 | TEST(RobotsUnittest, GoogleOnly_SystemTest) { 38 | const absl::string_view robotstxt = 39 | "user-agent: FooBot\n" 40 | "disallow: /\n"; 41 | // Empty robots.txt: everything allowed. 42 | EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", "")); 43 | 44 | // Empty user-agent to be matched: everything allowed. 45 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", "")); 46 | 47 | // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery 48 | // in robots.cc. 49 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "")); 50 | 51 | // All params empty: same as robots.txt empty, everything allowed. 52 | EXPECT_TRUE(IsUserAgentAllowed("", "", "")); 53 | } 54 | // Rules are colon separated name-value pairs. The following names are 55 | // provisioned: 56 | // user-agent: 57 | // allow: 58 | // disallow: 59 | // See REP RFC section "Protocol Definition". 
60 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1 61 | // 62 | // Google specific: webmasters sometimes miss the colon separator, but it's 63 | // obvious what they mean by "disallow /", so we assume the colon if it's 64 | // missing. 65 | TEST(RobotsUnittest, ID_LineSyntax_Line) { 66 | const absl::string_view robotstxt_correct = 67 | "user-agent: FooBot\n" 68 | "disallow: /\n"; 69 | const absl::string_view robotstxt_incorrect = 70 | "foo: FooBot\n" 71 | "bar: /\n"; 72 | const absl::string_view robotstxt_incorrect_accepted = 73 | "user-agent FooBot\n" 74 | "disallow /\n"; 75 | const std::string url = "http://foo.bar/x/y"; 76 | 77 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url)); 78 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url)); 79 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url)); 80 | } 81 | 82 | // A group is one or more user-agent line followed by rules, and terminated 83 | // by a another user-agent line. Rules for same user-agents are combined 84 | // opaquely into one group. Rules outside groups are ignored. 85 | // See REP RFC section "Protocol Definition". 86 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1 87 | TEST(RobotsUnittest, ID_LineSyntax_Groups) { 88 | const absl::string_view robotstxt = 89 | "allow: /foo/bar/\n" 90 | "\n" 91 | "user-agent: FooBot\n" 92 | "disallow: /\n" 93 | "allow: /x/\n" 94 | "user-agent: BarBot\n" 95 | "disallow: /\n" 96 | "allow: /y/\n" 97 | "\n" 98 | "\n" 99 | "allow: /w/\n" 100 | "user-agent: BazBot\n" 101 | "\n" 102 | "user-agent: FooBot\n" 103 | "allow: /z/\n" 104 | "disallow: /\n"; 105 | 106 | const std::string url_w = "http://foo.bar/w/a"; 107 | const std::string url_x = "http://foo.bar/x/b"; 108 | const std::string url_y = "http://foo.bar/y/c"; 109 | const std::string url_z = "http://foo.bar/z/d"; 110 | const std::string url_foo = "http://foo.bar/foo/bar/"; 111 | 112 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x)); 113 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z)); 114 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y)); 115 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y)); 116 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w)); 117 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z)); 118 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z)); 119 | 120 | // Lines with rules outside groups are ignored. 121 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo)); 122 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo)); 123 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo)); 124 | } 125 | 126 | // Group must not be closed by rules not explicitly defined in the REP RFC. 127 | // See REP RFC section "Protocol Definition". 
128 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1 129 | TEST(RobotsUnittest, ID_LineSyntax_Groups_OtherRules) { 130 | { 131 | const absl::string_view robotstxt = 132 | "User-agent: BarBot\n" 133 | "Sitemap: https://foo.bar/sitemap\n" 134 | "User-agent: *\n" 135 | "Disallow: /\n"; 136 | std::string url = "http://foo.bar/"; 137 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 138 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url)); 139 | } 140 | { 141 | const absl::string_view robotstxt = 142 | "User-agent: FooBot\n" 143 | "Invalid-Unknown-Line: unknown\n" 144 | "User-agent: *\n" 145 | "Disallow: /\n"; 146 | std::string url = "http://foo.bar/"; 147 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 148 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url)); 149 | } 150 | } 151 | 152 | // REP lines are case insensitive. See REP RFC section "Protocol Definition". 153 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1 154 | TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) { 155 | const absl::string_view robotstxt_upper = 156 | "USER-AGENT: FooBot\n" 157 | "ALLOW: /x/\n" 158 | "DISALLOW: /\n"; 159 | const absl::string_view robotstxt_lower = 160 | "user-agent: FooBot\n" 161 | "allow: /x/\n" 162 | "disallow: /\n"; 163 | const absl::string_view robotstxt_camel = 164 | "uSeR-aGeNt: FooBot\n" 165 | "AlLoW: /x/\n" 166 | "dIsAlLoW: /\n"; 167 | const std::string url_allowed = "http://foo.bar/x/y"; 168 | const std::string url_disallowed = "http://foo.bar/a/b"; 169 | 170 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed)); 171 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed)); 172 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed)); 173 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed)); 174 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed)); 175 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed)); 176 | } 177 | 178 | // A user-agent line is expected to contain only [a-zA-Z_-] characters and must 179 | // not be empty. See REP RFC section "The user-agent line". 180 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1 181 | TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) { 182 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot")); 183 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar")); 184 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar")); 185 | 186 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view())); 187 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("")); 188 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ")); 189 | 190 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*")); 191 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot ")); 192 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1")); 193 | 194 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar")); 195 | } 196 | 197 | // User-agent line values are case insensitive. See REP RFC section "The 198 | // user-agent line". 
199 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1 200 | TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) { 201 | const absl::string_view robotstxt_upper = 202 | "User-Agent: FOO BAR\n" 203 | "Allow: /x/\n" 204 | "Disallow: /\n"; 205 | const absl::string_view robotstxt_lower = 206 | "User-Agent: foo bar\n" 207 | "Allow: /x/\n" 208 | "Disallow: /\n"; 209 | const absl::string_view robotstxt_camel = 210 | "User-Agent: FoO bAr\n" 211 | "Allow: /x/\n" 212 | "Disallow: /\n"; 213 | const std::string url_allowed = "http://foo.bar/x/y"; 214 | const std::string url_disallowed = "http://foo.bar/a/b"; 215 | 216 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed)); 217 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed)); 218 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed)); 219 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed)); 220 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed)); 221 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed)); 222 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed)); 223 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed)); 224 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed)); 225 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed)); 226 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed)); 227 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed)); 228 | } 229 | 230 | // Google specific: accept user-agent value up to the first space. Space is not 231 | // allowed in user-agent values, but that doesn't stop webmasters from using 232 | // them. This is more restrictive than the RFC, since in case of the bad value 233 | // "Googlebot Images" we'd still obey the rules with "Googlebot". 234 | // Extends REP RFC section "The user-agent line" 235 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1 236 | TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) { 237 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar")); 238 | const absl::string_view robotstxt = 239 | "User-Agent: *\n" 240 | "Disallow: /\n" 241 | "User-Agent: Foo Bar\n" 242 | "Allow: /x/\n" 243 | "Disallow: /\n"; 244 | const std::string url = "http://foo.bar/x/y"; 245 | 246 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url)); 247 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url)); 248 | } 249 | 250 | // If no group matches the user-agent, crawlers must obey the first group with a 251 | // user-agent line with a "*" value, if present. If no group satisfies either 252 | // condition, or no groups are present at all, no rules apply. 253 | // See REP RFC section "The user-agent line". 
254 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1 255 | TEST(RobotsUnittest, ID_GlobalGroups_Secondary) { 256 | const absl::string_view robotstxt_empty = ""; 257 | const absl::string_view robotstxt_global = 258 | "user-agent: *\n" 259 | "allow: /\n" 260 | "user-agent: FooBot\n" 261 | "disallow: /\n"; 262 | const absl::string_view robotstxt_only_specific = 263 | "user-agent: FooBot\n" 264 | "allow: /\n" 265 | "user-agent: BarBot\n" 266 | "disallow: /\n" 267 | "user-agent: BazBot\n" 268 | "disallow: /\n"; 269 | const std::string url = "http://foo.bar/x/y"; 270 | 271 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url)); 272 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url)); 273 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url)); 274 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url)); 275 | } 276 | 277 | // Matching rules against URIs is case sensitive. 278 | // See REP RFC section "The Allow and Disallow lines". 279 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2 280 | TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) { 281 | const absl::string_view robotstxt_lowercase_url = 282 | "user-agent: FooBot\n" 283 | "disallow: /x/\n"; 284 | const absl::string_view robotstxt_uppercase_url = 285 | "user-agent: FooBot\n" 286 | "disallow: /X/\n"; 287 | const std::string url = "http://foo.bar/x/y"; 288 | 289 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url)); 290 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url)); 291 | } 292 | 293 | // The most specific match found MUST be used. The most specific match is the 294 | // match that has the most octets. In case of multiple rules with the same 295 | // length, the least strict rule must be used. 296 | // See REP RFC section "The Allow and Disallow lines". 297 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2 298 | TEST(RobotsUnittest, ID_LongestMatch) { 299 | const std::string url = "http://foo.bar/x/page.html"; 300 | { 301 | const absl::string_view robotstxt = 302 | "user-agent: FooBot\n" 303 | "disallow: /x/page.html\n" 304 | "allow: /x/\n"; 305 | 306 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 307 | } 308 | { 309 | const absl::string_view robotstxt = 310 | "user-agent: FooBot\n" 311 | "allow: /x/page.html\n" 312 | "disallow: /x/\n"; 313 | 314 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 315 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/")); 316 | } 317 | { 318 | const absl::string_view robotstxt = 319 | "user-agent: FooBot\n" 320 | "disallow: \n" 321 | "allow: \n"; 322 | // In case of equivalent disallow and allow patterns for the same 323 | // user-agent, allow is used. 324 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 325 | } 326 | { 327 | const absl::string_view robotstxt = 328 | "user-agent: FooBot\n" 329 | "disallow: /\n" 330 | "allow: /\n"; 331 | // In case of equivalent disallow and allow patterns for the same 332 | // user-agent, allow is used. 
333 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 334 | } 335 | { 336 | std::string url_a = "http://foo.bar/x"; 337 | std::string url_b = "http://foo.bar/x/"; 338 | const absl::string_view robotstxt = 339 | "user-agent: FooBot\n" 340 | "disallow: /x\n" 341 | "allow: /x/\n"; 342 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a)); 343 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b)); 344 | } 345 | 346 | { 347 | const absl::string_view robotstxt = 348 | "user-agent: FooBot\n" 349 | "disallow: /x/page.html\n" 350 | "allow: /x/page.html\n"; 351 | // In case of equivalent disallow and allow patterns for the same 352 | // user-agent, allow is used. 353 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 354 | } 355 | { 356 | const absl::string_view robotstxt = 357 | "user-agent: FooBot\n" 358 | "allow: /page\n" 359 | "disallow: /*.html\n"; 360 | // Longest match wins. 361 | EXPECT_FALSE( 362 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html")); 363 | EXPECT_TRUE( 364 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page")); 365 | } 366 | { 367 | const absl::string_view robotstxt = 368 | "user-agent: FooBot\n" 369 | "allow: /x/page.\n" 370 | "disallow: /*.html\n"; 371 | // Longest match wins. 372 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 373 | EXPECT_FALSE( 374 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html")); 375 | } 376 | { 377 | const absl::string_view robotstxt = 378 | "User-agent: *\n" 379 | "Disallow: /x/\n" 380 | "User-agent: FooBot\n" 381 | "Disallow: /y/\n"; 382 | // Most specific group for FooBot allows implicitly /x/page. 383 | EXPECT_TRUE( 384 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page")); 385 | EXPECT_FALSE( 386 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page")); 387 | } 388 | } 389 | 390 | // Octets in the URI and robots.txt paths outside the range of the US-ASCII 391 | // coded character set, and those in the reserved range defined by RFC3986, 392 | // MUST be percent-encoded as defined by RFC3986 prior to comparison. 393 | // See REP RFC section "The Allow and Disallow lines". 394 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2 395 | // 396 | // NOTE: It's up to the caller to percent encode a URL before passing it to the 397 | // parser. Percent encoding URIs in the rules is unnecessary. 398 | TEST(RobotsUnittest, ID_Encoding) { 399 | // /foo/bar?baz=http://foo.bar stays unencoded. 400 | { 401 | const absl::string_view robotstxt = 402 | "User-agent: FooBot\n" 403 | "Disallow: /\n" 404 | "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n"; 405 | EXPECT_TRUE(IsUserAgentAllowed( 406 | robotstxt, "FooBot", 407 | "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par")); 408 | } 409 | 410 | // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84 411 | { 412 | const absl::string_view robotstxt = 413 | "User-agent: FooBot\n" 414 | "Disallow: /\n" 415 | "Allow: /foo/bar/ツ\n"; 416 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 417 | "http://foo.bar/foo/bar/%E3%83%84")); 418 | // The parser encodes the 3-byte character, but the URL is not %-encoded. 
419 | EXPECT_FALSE( 420 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ")); 421 | } 422 | // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84 423 | { 424 | const absl::string_view robotstxt = 425 | "User-agent: FooBot\n" 426 | "Disallow: /\n" 427 | "Allow: /foo/bar/%E3%83%84\n"; 428 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 429 | "http://foo.bar/foo/bar/%E3%83%84")); 430 | EXPECT_FALSE( 431 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ")); 432 | } 433 | // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL 434 | // This is illegal according to RFC3986 and while it may work here due to 435 | // simple string matching, it should not be relied on. 436 | { 437 | const absl::string_view robotstxt = 438 | "User-agent: FooBot\n" 439 | "Disallow: /\n" 440 | "Allow: /foo/bar/%62%61%7A\n"; 441 | EXPECT_FALSE( 442 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz")); 443 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 444 | "http://foo.bar/foo/bar/%62%61%7A")); 445 | } 446 | } 447 | 448 | // The REP RFC defines the following characters that have special meaning in 449 | // robots.txt: 450 | // # - inline comment. 451 | // $ - end of pattern. 452 | // * - any number of characters. 453 | // See REP RFC section "Special Characters". 454 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.3 455 | TEST(RobotsUnittest, ID_SpecialCharacters) { 456 | { 457 | const absl::string_view robotstxt = 458 | "User-agent: FooBot\n" 459 | "Disallow: /foo/bar/quz\n" 460 | "Allow: /foo/*/qux\n"; 461 | EXPECT_FALSE( 462 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz")); 463 | EXPECT_TRUE( 464 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz")); 465 | EXPECT_TRUE( 466 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz")); 467 | EXPECT_TRUE( 468 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz")); 469 | } 470 | { 471 | const absl::string_view robotstxt = 472 | "User-agent: FooBot\n" 473 | "Disallow: /foo/bar$\n" 474 | "Allow: /foo/bar/qux\n"; 475 | EXPECT_FALSE( 476 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar")); 477 | EXPECT_TRUE( 478 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux")); 479 | EXPECT_TRUE( 480 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/")); 481 | EXPECT_TRUE( 482 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz")); 483 | } 484 | { 485 | const absl::string_view robotstxt = 486 | "User-agent: FooBot\n" 487 | "# Disallow: /\n" 488 | "Disallow: /foo/quz#qux\n" 489 | "Allow: /\n"; 490 | EXPECT_TRUE( 491 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar")); 492 | EXPECT_FALSE( 493 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz")); 494 | } 495 | } 496 | 497 | // Google-specific: "index.html" (and only that) at the end of a pattern is 498 | // equivalent to "/". 499 | TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) { 500 | const absl::string_view robotstxt = 501 | "User-Agent: *\n" 502 | "Allow: /allowed-slash/index.html\n" 503 | "Disallow: /\n"; 504 | // If index.html is allowed, we interpret this as / being allowed too. 505 | EXPECT_TRUE( 506 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/")); 507 | // Does not exactly match. 508 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot", 509 | "http://foo.com/allowed-slash/index.htm")); 510 | // Exact match. 
511 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot", 512 | "http://foo.com/allowed-slash/index.html")); 513 | EXPECT_FALSE( 514 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url")); 515 | } 516 | 517 | // Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in 518 | // RobotsTxtParser::Parse(). 519 | TEST(RobotsUnittest, GoogleOnly_LineTooLong) { 520 | size_t kEOLLen = std::string("\n").length(); 521 | int kMaxLineLen = 2083 * 8; 522 | std::string allow = "allow: "; 523 | std::string disallow = "disallow: "; 524 | 525 | // Disallow rule pattern matches the URL after being cut off at kMaxLineLen. 526 | { 527 | std::string robotstxt = "user-agent: FooBot\n"; 528 | std::string longline = "/x/"; 529 | size_t max_length = 530 | kMaxLineLen - longline.length() - disallow.length() + kEOLLen; 531 | while (longline.size() < max_length) { 532 | absl::StrAppend(&longline, "a"); 533 | } 534 | absl::StrAppend(&robotstxt, disallow, longline, "/qux\n"); 535 | 536 | // Matches nothing, so URL is allowed. 537 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux")); 538 | // Matches cut off disallow rule. 539 | EXPECT_FALSE(IsUserAgentAllowed( 540 | robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux"))); 541 | } 542 | 543 | { 544 | std::string robotstxt = 545 | "user-agent: FooBot\n" 546 | "disallow: /\n"; 547 | std::string longline_a = "/x/"; 548 | std::string longline_b = "/x/"; 549 | size_t max_length = 550 | kMaxLineLen - longline_a.length() - allow.length() + kEOLLen; 551 | while (longline_a.size() < max_length) { 552 | absl::StrAppend(&longline_a, "a"); 553 | absl::StrAppend(&longline_b, "b"); 554 | } 555 | absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n"); 556 | absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n"); 557 | 558 | // URL matches the disallow rule. 559 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/")); 560 | // Matches the allow rule exactly. 561 | EXPECT_TRUE( 562 | IsUserAgentAllowed(robotstxt, "FooBot", 563 | absl::StrCat("http://foo.bar", longline_a, "/qux"))); 564 | // Matches cut off allow rule. 565 | EXPECT_TRUE( 566 | IsUserAgentAllowed(robotstxt, "FooBot", 567 | absl::StrCat("http://foo.bar", longline_b, "/fux"))); 568 | } 569 | } 570 | 571 | TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) { 572 | // Test documentation from 573 | // https://developers.google.com/search/reference/robots_txt 574 | // Section "URL matching based on path values". 
575 | { 576 | std::string robotstxt = 577 | "user-agent: FooBot\n" 578 | "disallow: /\n" 579 | "allow: /fish\n"; 580 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 581 | 582 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 583 | EXPECT_TRUE( 584 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 585 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 586 | "http://foo.bar/fish/salmon.html")); 587 | EXPECT_TRUE( 588 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads")); 589 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 590 | "http://foo.bar/fishheads/yummy.html")); 591 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 592 | "http://foo.bar/fish.html?id=anything")); 593 | 594 | EXPECT_FALSE( 595 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp")); 596 | EXPECT_FALSE( 597 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish")); 598 | EXPECT_FALSE( 599 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish")); 600 | } 601 | // "/fish*" equals "/fish" 602 | { 603 | std::string robotstxt = 604 | "user-agent: FooBot\n" 605 | "disallow: /\n" 606 | "allow: /fish*\n"; 607 | EXPECT_FALSE( 608 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 609 | 610 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 611 | EXPECT_TRUE( 612 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 613 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 614 | "http://foo.bar/fish/salmon.html")); 615 | EXPECT_TRUE( 616 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads")); 617 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 618 | "http://foo.bar/fishheads/yummy.html")); 619 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 620 | "http://foo.bar/fish.html?id=anything")); 621 | 622 | EXPECT_FALSE( 623 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar")); 624 | EXPECT_FALSE( 625 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish")); 626 | EXPECT_FALSE( 627 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish")); 628 | } 629 | // "/fish/" does not equal "/fish" 630 | { 631 | std::string robotstxt = 632 | "user-agent: FooBot\n" 633 | "disallow: /\n" 634 | "allow: /fish/\n"; 635 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 636 | 637 | EXPECT_TRUE( 638 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/")); 639 | EXPECT_TRUE( 640 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon")); 641 | EXPECT_TRUE( 642 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon")); 643 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 644 | "http://foo.bar/fish/salmon.html")); 645 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 646 | "http://foo.bar/fish/?id=anything")); 647 | 648 | EXPECT_FALSE( 649 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 650 | EXPECT_FALSE( 651 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 652 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 653 | "http://foo.bar/Fish/Salmon.html")); 654 | } 655 | // "/*.php" 656 | { 657 | std::string robotstxt = 658 | "user-agent: FooBot\n" 659 | "disallow: /\n" 660 | "allow: /*.php\n"; 661 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 662 | 663 | EXPECT_TRUE( 664 | IsUserAgentAllowed(robotstxt, "FooBot", 
"http://foo.bar/filename.php")); 665 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 666 | "http://foo.bar/folder/filename.php")); 667 | EXPECT_TRUE(IsUserAgentAllowed( 668 | robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters")); 669 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 670 | "http://foo.bar//folder/any.php.file.html")); 671 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 672 | "http://foo.bar/filename.php/")); 673 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 674 | "http://foo.bar/index?f=filename.php/")); 675 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 676 | "http://foo.bar/php/")); 677 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 678 | "http://foo.bar/index?php")); 679 | 680 | EXPECT_FALSE( 681 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP")); 682 | } 683 | // "/*.php$" 684 | { 685 | std::string robotstxt = 686 | "user-agent: FooBot\n" 687 | "disallow: /\n" 688 | "allow: /*.php$\n"; 689 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 690 | 691 | EXPECT_TRUE( 692 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php")); 693 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", 694 | "http://foo.bar/folder/filename.php")); 695 | 696 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 697 | "http://foo.bar/filename.php?parameters")); 698 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 699 | "http://foo.bar/filename.php/")); 700 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 701 | "http://foo.bar/filename.php5")); 702 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 703 | "http://foo.bar/php/")); 704 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 705 | "http://foo.bar/filename?php")); 706 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", 707 | "http://foo.bar/aaaphpaaa")); 708 | EXPECT_FALSE( 709 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP")); 710 | } 711 | // "/fish*.php" 712 | { 713 | std::string robotstxt = 714 | "user-agent: FooBot\n" 715 | "disallow: /\n" 716 | "allow: /fish*.php\n"; 717 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 718 | 719 | EXPECT_TRUE( 720 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php")); 721 | EXPECT_TRUE( 722 | IsUserAgentAllowed(robotstxt, "FooBot", 723 | "http://foo.bar/fishheads/catfish.php?parameters")); 724 | 725 | EXPECT_FALSE( 726 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP")); 727 | } 728 | // Section "Order of precedence for group-member records". 
729 | { 730 | std::string robotstxt = 731 | "user-agent: FooBot\n" 732 | "allow: /p\n" 733 | "disallow: /\n"; 734 | std::string url = "http://example.com/page"; 735 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 736 | } 737 | { 738 | std::string robotstxt = 739 | "user-agent: FooBot\n" 740 | "allow: /folder\n" 741 | "disallow: /folder\n"; 742 | std::string url = "http://example.com/folder/page"; 743 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 744 | } 745 | { 746 | std::string robotstxt = 747 | "user-agent: FooBot\n" 748 | "allow: /page\n" 749 | "disallow: /*.htm\n"; 750 | std::string url = "http://example.com/page.htm"; 751 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 752 | } 753 | { 754 | std::string robotstxt = 755 | "user-agent: FooBot\n" 756 | "allow: /$\n" 757 | "disallow: /\n"; 758 | std::string url = "http://example.com/"; 759 | std::string url_page = "http://example.com/page.html"; 760 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url)); 761 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page)); 762 | } 763 | } 764 | 765 | class RobotsStatsReporter : public googlebot::RobotsParseHandler { 766 | public: 767 | void HandleRobotsStart() override { 768 | last_line_seen_ = 0; 769 | valid_directives_ = 0; 770 | unknown_directives_ = 0; 771 | sitemap_.clear(); 772 | } 773 | void HandleRobotsEnd() override {} 774 | 775 | void HandleUserAgent(int line_num, absl::string_view value) override { 776 | Digest(line_num); 777 | } 778 | void HandleAllow(int line_num, absl::string_view value) override { 779 | Digest(line_num); 780 | } 781 | void HandleDisallow(int line_num, absl::string_view value) override { 782 | Digest(line_num); 783 | } 784 | 785 | void HandleSitemap(int line_num, absl::string_view value) override { 786 | Digest(line_num); 787 | sitemap_.append(value.data(), value.length()); 788 | } 789 | 790 | // Any other unrecognized name/value pairs. 791 | void HandleUnknownAction(int line_num, absl::string_view action, 792 | absl::string_view value) override { 793 | last_line_seen_ = line_num; 794 | unknown_directives_++; 795 | } 796 | 797 | int last_line_seen() const { return last_line_seen_; } 798 | 799 | // Number of valid directives found; unknown directives are counted separately. 800 | int valid_directives() const { return valid_directives_; } 801 | 802 | // Number of unknown directives. 803 | int unknown_directives() const { return unknown_directives_; } 804 | 805 | // Parsed sitemap line. 
806 | std::string sitemap() const { return sitemap_; } 807 | 808 | private: 809 | void Digest(int line_num) { 810 | ASSERT_GE(line_num, last_line_seen_); 811 | last_line_seen_ = line_num; 812 | valid_directives_++; 813 | } 814 | 815 | int last_line_seen_ = 0; 816 | int valid_directives_ = 0; 817 | int unknown_directives_ = 0; 818 | std::string sitemap_; 819 | }; 820 | 821 | // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A 822 | TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) { 823 | RobotsStatsReporter report; 824 | static const char kUnixFile[] = 825 | "User-Agent: foo\n" 826 | "Allow: /some/path\n" 827 | "User-Agent: bar\n" 828 | "\n" 829 | "\n" 830 | "Disallow: /\n"; 831 | googlebot::ParseRobotsTxt(kUnixFile, &report); 832 | EXPECT_EQ(4, report.valid_directives()); 833 | EXPECT_EQ(6, report.last_line_seen()); 834 | 835 | static const char kDosFile[] = 836 | "User-Agent: foo\r\n" 837 | "Allow: /some/path\r\n" 838 | "User-Agent: bar\r\n" 839 | "\r\n" 840 | "\r\n" 841 | "Disallow: /\r\n"; 842 | googlebot::ParseRobotsTxt(kDosFile, &report); 843 | EXPECT_EQ(4, report.valid_directives()); 844 | EXPECT_EQ(6, report.last_line_seen()); 845 | 846 | static const char kMacFile[] = 847 | "User-Agent: foo\r" 848 | "Allow: /some/path\r" 849 | "User-Agent: bar\r" 850 | "\r" 851 | "\r" 852 | "Disallow: /\r"; 853 | googlebot::ParseRobotsTxt(kMacFile, &report); 854 | EXPECT_EQ(4, report.valid_directives()); 855 | EXPECT_EQ(6, report.last_line_seen()); 856 | 857 | static const char kNoFinalNewline[] = 858 | "User-Agent: foo\n" 859 | "Allow: /some/path\n" 860 | "User-Agent: bar\n" 861 | "\n" 862 | "\n" 863 | "Disallow: /"; 864 | googlebot::ParseRobotsTxt(kNoFinalNewline, &report); 865 | EXPECT_EQ(4, report.valid_directives()); 866 | EXPECT_EQ(6, report.last_line_seen()); 867 | 868 | static const char kMixedFile[] = 869 | "User-Agent: foo\n" 870 | "Allow: /some/path\r\n" 871 | "User-Agent: bar\n" 872 | "\r\n" 873 | "\n" 874 | "Disallow: /"; 875 | googlebot::ParseRobotsTxt(kMixedFile, &report); 876 | EXPECT_EQ(4, report.valid_directives()); 877 | EXPECT_EQ(6, report.last_line_seen()); 878 | } 879 | 880 | // BOM characters are unparseable and thus skipped. The rules following the line 881 | // are used. 882 | TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) { 883 | RobotsStatsReporter report; 884 | static const char kUtf8FileFullBOM[] = 885 | "\xEF\xBB\xBF" 886 | "User-Agent: foo\n" 887 | "Allow: /AnyValue\n"; 888 | googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report); 889 | EXPECT_EQ(2, report.valid_directives()); 890 | EXPECT_EQ(0, report.unknown_directives()); 891 | 892 | // We allow as well partial ByteOrderMarks. 893 | static const char kUtf8FilePartial2BOM[] = 894 | "\xEF\xBB" 895 | "User-Agent: foo\n" 896 | "Allow: /AnyValue\n"; 897 | googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report); 898 | EXPECT_EQ(2, report.valid_directives()); 899 | EXPECT_EQ(0, report.unknown_directives()); 900 | 901 | static const char kUtf8FilePartial1BOM[] = 902 | "\xEF" 903 | "User-Agent: foo\n" 904 | "Allow: /AnyValue\n"; 905 | googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report); 906 | EXPECT_EQ(2, report.valid_directives()); 907 | EXPECT_EQ(0, report.unknown_directives()); 908 | 909 | // If the BOM is not the right sequence, the first line looks like garbage 910 | // that is skipped (we essentially see "\x11\xBFUser-Agent"). 
911 | static const char kUtf8FileBrokenBOM[] = 912 | "\xEF\x11\xBF" 913 | "User-Agent: foo\n" 914 | "Allow: /AnyValue\n"; 915 | googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report); 916 | EXPECT_EQ(1, report.valid_directives()); 917 | EXPECT_EQ(1, report.unknown_directives()); // We get one broken line. 918 | 919 | // Some other messed up file: BOMs only valid in the beginning of the file. 920 | static const char kUtf8BOMSomewhereInMiddleOfFile[] = 921 | "User-Agent: foo\n" 922 | "\xEF\xBB\xBF" 923 | "Allow: /AnyValue\n"; 924 | googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report); 925 | EXPECT_EQ(1, report.valid_directives()); 926 | EXPECT_EQ(1, report.unknown_directives()); 927 | } 928 | 929 | // Google specific: the RFC allows any line that crawlers might need, such as 930 | // sitemaps, which Google supports. 931 | // See REP RFC section "Other records". 932 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.4 933 | TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) { 934 | RobotsStatsReporter report; 935 | { 936 | std::string sitemap_loc = "http://foo.bar/sitemap.xml"; 937 | std::string robotstxt = 938 | "User-Agent: foo\n" 939 | "Allow: /some/path\n" 940 | "User-Agent: bar\n" 941 | "\n" 942 | "\n"; 943 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n"); 944 | 945 | googlebot::ParseRobotsTxt(robotstxt, &report); 946 | EXPECT_EQ(sitemap_loc, report.sitemap()); 947 | } 948 | // A sitemap line may appear anywhere in the file. 949 | { 950 | std::string robotstxt; 951 | std::string sitemap_loc = "http://foo.bar/sitemap.xml"; 952 | std::string robotstxt_temp = 953 | "User-Agent: foo\n" 954 | "Allow: /some/path\n" 955 | "User-Agent: bar\n" 956 | "\n" 957 | "\n"; 958 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp); 959 | 960 | googlebot::ParseRobotsTxt(robotstxt, &report); 961 | EXPECT_EQ(sitemap_loc, report.sitemap()); 962 | } 963 | } 964 | 965 | } // namespace 966 | 967 | // Integrity tests. These functions are available to the linker, but not in the 968 | // header, because they should only be used for testing. 969 | namespace googlebot { 970 | std::string GetPathParamsQuery(const std::string& url); 971 | bool MaybeEscapePattern(const char* src, char** dst); 972 | } // namespace googlebot 973 | 974 | void TestPath(const std::string& url, const std::string& expected_path) { 975 | EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url)); 976 | } 977 | 978 | void TestEscape(const std::string& url, const std::string& expected) { 979 | char* escaped_value = nullptr; 980 | const bool is_escaped = 981 | googlebot::MaybeEscapePattern(url.c_str(), &escaped_value); 982 | const std::string escaped = escaped_value; 983 | if (is_escaped) delete[] escaped_value; 984 | 985 | EXPECT_EQ(expected, escaped); 986 | } 987 | 988 | TEST(RobotsUnittest, TestGetPathParamsQuery) { 989 | // Only testing URLs that are already correctly escaped here. 
990 | TestPath("", "/"); 991 | TestPath("http://www.example.com", "/"); 992 | TestPath("http://www.example.com/", "/"); 993 | TestPath("http://www.example.com/a", "/a"); 994 | TestPath("http://www.example.com/a/", "/a/"); 995 | TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/"); 996 | TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f"); 997 | TestPath("example.com", "/"); 998 | TestPath("example.com/", "/"); 999 | TestPath("example.com/a", "/a"); 1000 | TestPath("example.com/a/", "/a/"); 1001 | TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f"); 1002 | TestPath("a", "/"); 1003 | TestPath("a/", "/"); 1004 | TestPath("/a", "/a"); 1005 | TestPath("a/b", "/b"); 1006 | TestPath("example.com?a", "/?a"); 1007 | TestPath("example.com/a;b#c", "/a;b"); 1008 | TestPath("//a/b/c", "/b/c"); 1009 | } 1010 | 1011 | TEST(RobotsUnittest, TestMaybeEscapePattern) { 1012 | TestEscape("http://www.example.com", "http://www.example.com"); 1013 | TestEscape("/a/b/c", "/a/b/c"); 1014 | TestEscape("á", "%C3%A1"); 1015 | TestEscape("%aa", "%AA"); 1016 | } 1017 | --------------------------------------------------------------------------------
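Taken together, the tests above exercise two entry points declared in robots.h: a matcher that answers allow/disallow queries and a parser that reports each robots.txt line to a RobotsParseHandler. The sketch below is not part of the repository; it is a minimal illustration of how a caller might drive both, assuming the tests' IsUserAgentAllowed helper wraps googlebot::RobotsMatcher::OneAgentAllowedByRobots. The RuleCounter class, the example rules, and the example.com URLs are purely illustrative.

// sketch_main.cc -- illustrative only; link against the robots library from this repository.
#include <iostream>
#include <string>

#include "robots.h"

// Counts allow/disallow lines, mirroring the RobotsStatsReporter pattern in the tests.
class RuleCounter : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override { allow_rules_ = disallow_rules_ = 0; }
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int, absl::string_view) override {}
  void HandleAllow(int, absl::string_view) override { ++allow_rules_; }
  void HandleDisallow(int, absl::string_view) override { ++disallow_rules_; }
  void HandleSitemap(int, absl::string_view) override {}
  void HandleUnknownAction(int, absl::string_view, absl::string_view) override {}

  int allow_rules() const { return allow_rules_; }
  int disallow_rules() const { return disallow_rules_; }

 private:
  int allow_rules_ = 0;
  int disallow_rules_ = 0;
};

int main() {
  const std::string robotstxt =
      "user-agent: FooBot\n"
      "disallow: /private/\n"
      "allow: /private/open/\n";

  // Matching: the most specific (longest) rule wins, as in ID_LongestMatch.
  googlebot::RobotsMatcher matcher;
  std::cout << matcher.OneAgentAllowedByRobots(
                   robotstxt, "FooBot", "http://example.com/private/open/a")
            << "\n";  // 1: the longer allow rule wins.
  std::cout << matcher.OneAgentAllowedByRobots(
                   robotstxt, "FooBot", "http://example.com/private/a")
            << "\n";  // 0: only the disallow rule matches.

  // Parsing callbacks: the same surface RobotsStatsReporter implements above.
  RuleCounter counter;
  googlebot::ParseRobotsTxt(robotstxt, &counter);
  std::cout << counter.allow_rules() << " allow rule(s), "
            << counter.disallow_rules() << " disallow rule(s)\n";
  return 0;
}

The same ParseRobotsTxt entry point drives both the matcher and custom handlers, which is why the tests can verify line counting, sitemap extraction, and BOM handling through a reporting handler without touching the matching logic.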