├── .gitignore
├── BUILD
├── CMakeLists.txt
├── CMakeLists.txt.in
├── CONTRIBUTING.md
├── LICENSE
├── MODULE.bazel
├── README.md
├── protocol-draft
│   └── draft-illyes-repext-00.xml
├── reporting_robots.cc
├── reporting_robots.h
├── reporting_robots_test.cc
├── robots.cc
├── robots.h
├── robots_main.cc
└── robots_test.cc
/.gitignore:
--------------------------------------------------------------------------------
1 | c-build/*
2 |
--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
2 |
3 | package(default_visibility = ["//visibility:public"])
4 |
5 | licenses(["notice"])
6 |
7 | exports_files(["LICENSE"])
8 |
9 | cc_library(
10 | name = "robots",
11 | srcs = [
12 | "robots.cc",
13 | ],
14 | hdrs = [
15 | "robots.h",
16 | ],
17 | deps = [
18 | "@abseil-cpp//absl/base:core_headers",
19 | "@abseil-cpp//absl/container:fixed_array",
20 | "@abseil-cpp//absl/strings",
21 | ],
22 | )
23 |
24 | cc_library(
25 | name = "reporting_robots",
26 | srcs = ["reporting_robots.cc"],
27 | hdrs = ["reporting_robots.h"],
28 | deps = [
29 | ":robots",
30 | "@abseil-cpp//absl/container:btree",
31 | "@abseil-cpp//absl/strings",
32 | ],
33 | )
34 |
35 | cc_test(
36 | name = "robots_test",
37 | srcs = ["robots_test.cc"],
38 | deps = [
39 | ":robots",
40 | "@abseil-cpp//absl/strings",
41 | "@googletest//:gtest_main",
42 | ],
43 | )
44 |
45 | cc_test(
46 | name = "reporting_robots_test",
47 | srcs = ["reporting_robots_test.cc"],
48 | deps = [
49 | ":reporting_robots",
50 | ":robots",
51 | "@abseil-cpp//absl/strings",
52 | "@googletest//:gtest_main",
53 | ],
54 | )
55 |
56 | cc_binary(
57 | name = "robots_main",
58 | srcs = ["robots_main.cc"],
59 | deps = [
60 | ":robots",
61 | ],
62 | )
63 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | PROJECT(robots)
2 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
3 |
4 | SET(CMAKE_CXX_STANDARD 14)
5 | SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
6 |
7 | SET(VERSION "0.0.0")
8 |
9 | STRING(REGEX MATCHALL "([0-9]+)" VERSION_DIGITS "${VERSION}")
10 |
11 | LIST(GET VERSION_DIGITS 0 CPACK_PACKAGE_VERSION_MAJOR)
12 | LIST(GET VERSION_DIGITS 1 CPACK_PACKAGE_VERSION_MINOR)
13 | LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH)
14 |
15 | SET(CPACK_PACKAGE_NAME "robots")
16 | SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library")
17 | SET(CPACK_PACKAGE_VENDOR "Google Inc.")
18 | SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md")
19 | SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE")
20 |
21 | SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
22 | SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
23 |
24 | SET(base_with_ver "robots-[0-9]+\\\\.[0-9]+\\\\.[0-9]+")
25 | SET(CPACK_SOURCE_IGNORE_FILES
26 | "/_CPack_Packages/"
27 | "/CMakeFiles/"
28 | "/.deps/"
29 | "^${base_with_ver}(-Source|-Linux)?/"
30 | "${base_with_ver}.tar\\\\.(gz|bz2|Z|lzma|xz)$"
31 | "\\\\.o$"
32 | "~$"
33 | "/\\\\.svn/"
34 | "/CMakeCache\\\\.txt$"
35 | "/CTestTestfile\\\\.cmake$"
36 | "/cmake_install\\\\.cmake$"
37 | "/CPackConfig\\\\.cmake$"
38 | "/CPackSourceConfig\\\\.cmake$"
39 | "/tags$"
40 | "^config\\\\.h$"
41 | "/install_manifest\\\\.txt$"
42 | "/Testing/"
43 | "ids-whitelist\\\\.txt"
44 | "/_Inline/"
45 | "/(B|build|BUILD)/"
46 | "/autom4te.cache/"
47 | )
48 |
49 | ############ build options ##############
50 |
51 | OPTION(ROBOTS_BUILD_STATIC "If ON, robots will also build the static library" ON)
52 | OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF)
53 | OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON)
54 |
55 | ############ helper libs ############
56 |
57 | INCLUDE(CPack)
58 | INCLUDE(CheckCCompilerFlag)
59 | INCLUDE(ExternalProject)
60 |
61 | ############ dependencies ##############
62 |
63 | CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt)
64 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
65 |
66 | IF(result)
67 | MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
68 | ENDIF()
69 |
70 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
71 |
72 | IF(result)
73 | MESSAGE(FATAL_ERROR "Failed to build dependencies: ${result}")
74 | ENDIF()
75 |
76 | # abseil-cpp
77 | IF(MSVC)
78 | # /wd4005 macro-redefinition
79 | # /wd4068 unknown pragma
80 | # /wd4244 conversion from 'type1' to 'type2'
81 | # /wd4267 conversion from 'size_t' to 'type2'
82 | # /wd4800 force value to bool 'true' or 'false' (performance warning)
83 | ADD_COMPILE_OPTIONS(/wd4005 /wd4068 /wd4244 /wd4267 /wd4800)
84 | ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS)
85 | ENDIF(MSVC)
86 |
87 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src
88 | ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build
89 | EXCLUDE_FROM_ALL)
90 |
91 | IF(ROBOTS_BUILD_TESTS)
92 | INCLUDE(CTest)
93 |
94 | # googletest
95 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src
96 | ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build
97 | EXCLUDE_FROM_ALL)
98 |
99 | SET(INSTALL_GTEST 0)
100 | SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
101 |
102 | IF(CMAKE_VERSION VERSION_LESS 2.8.11)
103 | INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include)
104 | ENDIF()
105 | ENDIF(ROBOTS_BUILD_TESTS)
106 |
107 | ########### compiler flags ##############
108 |
109 |
110 | SET(COMPILER_FLAGS_TO_CHECK
111 | "-Wall" "-Werror=implicit-function-declaration"
112 | )
113 |
114 | IF(CPU_ARCH)
115 | LIST(APPEND COMPILER_FLAGS_TO_CHECK "-march=${CPU_ARCH}")
116 | ENDIF(CPU_ARCH)
117 |
118 | ########### project files ###############
119 |
120 | INCLUDE_DIRECTORIES(.)
121 |
122 | ######### targets ###########
123 |
124 | SET(LIBROBOTS_LIBS)
125 |
126 | SET(robots_SRCS ./robots.cc)
127 | SET(robots_LIBS absl::base absl::strings)
128 |
129 | ADD_LIBRARY(robots SHARED ${robots_SRCS})
130 | TARGET_LINK_LIBRARIES(robots ${robots_LIBS})
131 | LIST(APPEND LIBROBOTS_LIBS "robots")
132 |
133 | IF(ROBOTS_BUILD_STATIC)
134 | ADD_LIBRARY(robots-static STATIC ${robots_SRCS})
135 | TARGET_LINK_LIBRARIES(robots-static ${robots_LIBS})
136 |
137 | LIST(APPEND LIBROBOTS_LIBS "robots-static")
138 |
139 | SET_TARGET_PROPERTIES(robots-static PROPERTIES OUTPUT_NAME "robots")
140 |
141 | SET_TARGET_PROPERTIES(${LIBROBOTS_LIBS} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
142 | ENDIF(ROBOTS_BUILD_STATIC)
143 |
144 | IF(WIN32)
145 | SET_TARGET_PROPERTIES(robots PROPERTIES DEFINE_SYMBOL DLL_EXPORT)
146 | ENDIF(WIN32)
147 |
148 | ADD_EXECUTABLE(robots-main ./robots_main.cc)
149 | TARGET_LINK_LIBRARIES(robots-main ${LIBROBOTS_LIBS})
150 | SET_TARGET_PROPERTIES(robots-main PROPERTIES OUTPUT_NAME "robots")
151 |
152 | ############ installation ############
153 |
154 | IF(ROBOTS_INSTALL)
155 | INSTALL(TARGETS ${LIBROBOTS_LIBS}
156 | LIBRARY DESTINATION lib
157 | ARCHIVE DESTINATION lib
158 | )
159 |
160 | INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include)
161 |
162 | INSTALL(TARGETS robots-main DESTINATION bin)
163 | ENDIF(ROBOTS_INSTALL)
164 |
165 | ############ tests ##############
166 |
167 | IF(ROBOTS_BUILD_TESTS)
168 | ENABLE_TESTING()
169 |
170 | ADD_EXECUTABLE(robots-test ./robots_test.cc)
171 | TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main)
172 | ADD_TEST(NAME robots-test COMMAND robots-test)
173 | ENDIF(ROBOTS_BUILD_TESTS)
174 |
175 |
--------------------------------------------------------------------------------
/CMakeLists.txt.in:
--------------------------------------------------------------------------------
1 |
2 | PROJECT(dependency-downloader NONE)
3 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
4 |
5 | INCLUDE(ExternalProject)
6 |
7 | ExternalProject_Add(abseilcpp
8 | GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
9 | GIT_TAG master
10 | GIT_PROGRESS 1
11 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src"
12 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build"
13 | CONFIGURE_COMMAND ""
14 | BUILD_COMMAND ""
15 | INSTALL_COMMAND ""
16 | TEST_COMMAND ""
17 | )
18 |
19 | ExternalProject_Add(googletest
20 | GIT_REPOSITORY https://github.com/google/googletest.git
21 | GIT_TAG main
22 | GIT_PROGRESS 1
23 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src"
24 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build"
25 | CONFIGURE_COMMAND ""
26 | BUILD_COMMAND ""
27 | INSTALL_COMMAND ""
28 | TEST_COMMAND ""
29 | )
30 |
31 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Keep in mind that this library is
22 | used in Google's production systems, so we need to be very strict about what
23 | changes we accept. Consult
24 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
25 | information on using pull requests.
26 |
27 | ## Community Guidelines
28 |
29 | This project follows [Google's Open Source Community
30 | Guidelines](https://opensource.google.com/conduct/).
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | https://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | https://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
--------------------------------------------------------------------------------
/MODULE.bazel:
--------------------------------------------------------------------------------
1 | module(
2 | name = "com_google_robotstxt",
3 | version = "head",
4 | )
5 |
6 | # -- bazel_dep definitions -- #
7 | bazel_dep(
8 | name = "bazel_skylib",
9 | version = "1.5.0",
10 | )
11 |
12 | bazel_dep(
13 | name = "abseil-cpp",
14 | version = "20240116.2",
15 | )
16 |
17 | bazel_dep(
18 | name = "googletest",
19 | version = "1.14.0.bcr.1",
20 | )
21 |
22 | bazel_dep(
23 | name = "rules_cc",
24 | version = "0.0.9",
25 | )
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Google Robots.txt Parser and Matcher Library
3 |
4 | The repository contains Google's robots.txt parser and matcher as a C++ library
5 | (compliant with C++14).
6 |
7 | ## About the library
8 |
9 | The Robots Exclusion Protocol (REP) is a standard that enables website owners to
10 | control which URLs may be accessed by automated clients (i.e. crawlers) through
11 | a simple text file with a specific syntax. It's one of the basic building blocks
12 | of the internet as we know it and what allows search engines to operate.
13 |
14 | Because the REP was only a de-facto standard for the past 25 years, different
15 | implementers parse robots.txt slightly differently, leading to
16 | confusion. This project aims to fix that by releasing the parser that Google
17 | uses.
18 |
19 | The library is slightly modified production code (i.e. some internal headers were
20 | replaced with equivalent symbols) used by Googlebot, Google's crawler, to determine
21 | which URLs it may access based on rules provided by webmasters in robots.txt files.
22 | The library is released open-source to help developers build tools that better
23 | reflect Google's robots.txt parsing and matching.
24 |
25 | For webmasters, we included a small binary in the project that allows testing a
26 | single URL and user-agent against a robots.txt.
27 |
28 | ## Building the library
29 |
30 | ### Quickstart
31 |
32 | We included with the library a small binary to test a local robots.txt against a
33 | user-agent and URL. Running the included binary requires:
34 |
35 | * A compatible platform (e.g. Windows, macOS, Linux, etc.). Most platforms are
36 | fully supported.
37 | * A compatible C++ compiler supporting at least C++14. Most major compilers
38 | are supported.
39 | * [Git](https://git-scm.com/) for interacting with the source code repository.
40 | To install Git, consult the
41 | [Set Up Git](https://help.github.com/articles/set-up-git/) guide on
42 | [GitHub](https://github.com/).
43 | * Although you are free to use your own build system, most of the
44 | documentation within this guide will assume you are using
45 | [Bazel](https://bazel.build/). To download and install Bazel (and any of its
46 | dependencies), consult the
47 | [Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html).
48 |
49 | #### Building with Bazel
50 |
51 | [Bazel](https://bazel.build/) is the official build system for the library,
52 | which is supported on most major platforms (Linux, Windows, macOS, for example)
53 | and compilers.
54 |
55 | To build and run the binary:
56 |
57 | ```bash
58 | $ git clone https://github.com/google/robotstxt.git robotstxt
59 | Cloning into 'robotstxt'...
60 | ...
61 | $ cd robotstxt/
62 | bazel-robots$ bazel test :robots_test
63 | ...
64 | /:robots_test PASSED in 0.1s
65 |
66 | Executed 1 out of 1 test: 1 test passes.
67 | ...
68 | bazel-robots$ bazel build :robots_main
69 | ...
70 | Target //:robots_main up-to-date:
71 | bazel-bin/robots_main
72 | ...
73 | bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url
74 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
75 | ```
76 |
77 | #### Building with CMake
78 |
79 | [CMake](https://cmake.org) is the community-supported build system for the
80 | library.
81 |
82 | To build the library using CMake, just follow the steps below:
83 |
84 | ```bash
85 | $ git clone https://github.com/google/robotstxt.git robotstxt
86 | Cloning into 'robotstxt'...
87 | ...
88 | $ cd robotstxt/
89 | ...
90 | $ mkdir c-build && cd c-build
91 | ...
92 | $ cmake .. -DROBOTS_BUILD_TESTS=ON
93 | ...
94 | $ make
95 | ...
96 | $ make test
97 | Running tests...
98 | Test project robotstxt/c-build
99 | Start 1: robots-test
100 | 1/1 Test #1: robots-test ...................... Passed 0.02 sec
101 |
102 | 100% tests passed, 0 tests failed out of 1
103 |
104 | Total Test time (real) = 0.02 sec
105 | ...
106 | $ robots ~/local/path/to/robots.txt YourBot https://example.com/url
107 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
108 | ```
109 |
110 | ## Notes
111 |
112 | Parsing of robots.txt files themselves is done exactly as in the production
113 | version of Googlebot, including how percent codes and Unicode characters in
114 | patterns are handled. The user must ensure, however, that the URI passed to the
115 | AllowedByRobots and OneAgentAllowedByRobots functions, or to the URI parameter
116 | of the robots tool, follows the format specified by RFC 3986, since this library
117 | will not perform full normalization of those URI parameters. Only if the URI is
118 | in this format will the matching be done according to the REP specification.
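
For example, here is a minimal C++ sketch of checking one user-agent and URL
against a robots.txt body in memory; it assumes the `RobotsMatcher` class and its
`OneAgentAllowedByRobots` method as declared in `robots.h`:

```cpp
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  const std::string robots_content =
      "user-agent: YourBot\n"
      "disallow: /private/\n";
  const std::string user_agent = "YourBot";
  // The URL must already follow RFC 3986; the library does not normalize it.
  const std::string url = "https://example.com/private/page.html";

  googlebot::RobotsMatcher matcher;
  const bool allowed =
      matcher.OneAgentAllowedByRobots(robots_content, user_agent, url);
  std::cout << "allowed: " << (allowed ? "YES" : "NO") << std::endl;
  return 0;
}
```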
119 |
120 | Also note that the library, and the included binary, do not handle
121 | implementation logic that a crawler might apply outside of parsing and matching,
122 | for example: `Googlebot-Image` respecting the rules specified for `User-agent:
123 | Googlebot` if not explicitly defined in the robots.txt file being tested.
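
For instance, given a robots.txt such as the hypothetical one below, falling back
from `Googlebot-Image` to the `Googlebot` group is a crawler-side convention; the
parser and matcher only evaluate the groups as written, so callers that want the
fallback behavior must implement it themselves:

```
User-agent: Googlebot
Disallow: /private/
```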
124 |
125 | ## License
126 |
127 | The robots.txt parser and matcher C++ library is licensed under the terms of the
128 | Apache license. See LICENSE for more information.
129 |
130 | ## Links
131 |
132 | To learn more about this project:
133 |
134 | * check out the
135 | [Robots Exclusion Protocol standard](https://www.rfc-editor.org/rfc/rfc9309.html),
136 | * how
137 | [Google Handles robots.txt](https://developers.google.com/search/reference/robots_txt),
138 | * or for a high level overview, the
139 | [robots.txt page on Wikipedia](https://en.wikipedia.org/wiki/Robots_exclusion_standard).
140 |
--------------------------------------------------------------------------------
/protocol-draft/draft-illyes-repext-00.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 | ]>
9 |
10 |
11 |
12 |
13 | Robots Exclusion Protocol Extension for URI Level Control
14 |
15 |
16 | Google LLC.
17 |
18 |
19 | Brandschenkestrasse 110
20 | Zurich
21 | 8002
22 | Switzerland
23 |
24 | garyillyes@google.com
25 |
26 |
27 |
28 | Internet Engineering Task Force (IETF)
29 |
30 |
31 | This document extends RFC9309 by specifying additional URI level
32 | controls through application level header and HTML meta tags
33 | originally developed in 1996. Additionally it moves the response
34 | header out of the experimental header space (i.e. "X-") and defines
35 | the combinability of multiple headers, which was previously not
36 | possible.
37 |
38 |
39 | About this Document
40 |
41 | This note is to be removed before publishing as an RFC.
42 | TODO(illyes): add commentable reference on github robotstxt repo.
43 |
44 |
45 |
46 |
47 | Introduction
48 |
49 | While the Robots Exclusion Protocol enables service owners to control
50 | how, if at all, automated clients known as crawlers may access the
51 | URIs on their services as defined by [RFC8288], the protocol doesn't
52 | provide controls on how the data returned by their service may be
53 | used upon allowed access.
54 |
55 | Originally developed in 1996 and widely adopted since, the use-case
56 | control is left to URI level controls implemented in the response
57 | headers, or in case of HTML in the form of a meta tag. This document
58 | specifies these control tags, and in case of the response header
59 | field, brings it to standards compliance with [RFC9110].
60 |
61 | Application developers are requested to honor these tags. The tags
62 | are not a form of access authorization however.
63 |
64 | Requirements Language
65 |
66 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
67 | "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY",
68 | and "OPTIONAL" in this document are to be interpreted as described
69 | in BCP 14 [RFC8174] when,
70 | and only when, they appear in all capitals, as shown here.
71 |
72 |
73 |
74 | Specification
75 |
76 | Robots control
77 |
78 | The URI level crawler controls are a key-value pair that
79 | can be specified two ways:
80 |
81 | - an application level response header.
82 | - in case of HTML, one or more meta tags as defined by the
83 | HTML specification.
84 |
85 |
86 | Application Layer Response Header
87 |
88 | The application level response header field name is "robots-tag"
89 | and contains rules applicable to either all accessors or
90 | specifically named ones in the value. For historical reasons,
91 | implementors should support the experimental field name also
92 | — "x-robots-tag".
93 |
94 | The value is a semicolon (";", 0x3B, 0x20) separated list of
95 | key-value pairs that represent a comma separated list of rules.
96 | The rules are specific to a single product token as defined by
97 | [RFC9309] or a global identifier — "*". The global identifier
98 | may be omitted. The product token is separated by a "=" from its
99 | rules.
100 |
101 | Duplicate product tokens must be merged and the rules deduplicated.
102 |
112 |
113 | For example, the following response header field specifies
114 | "noindex" and "nosnippet" rules for all accessors, however
115 | specifies no rules for the product token "ExampleBot":
116 |
119 |
120 | The global product identifier "*" in the value may be omitted; for
121 | example, this field is equivalent to the previous example:
122 |
124 |
125 | Implementors should impose a parsing limit on the field value to
126 | protect their systems. The parsing limit MUST be at least 8
127 | kibibytes [KiB].
128 |
129 |
130 | HTML meta element
131 |
132 | For historical reasons the robots-tag header may be specified by
133 | service owners as an HTML meta tag. In case of the meta tag, the
134 | name attribute is used to specify the product token, and the
135 | content attribute to specify the comma separated robots-tag rules.
136 |
137 | As with the header, the product token may be a global token,
138 | "robots", which signifies that the rules apply to all requestors,
139 | or a specific product token applicable to a single requestor. For
140 | example:
141 |
144 |
145 | Multiple robots meta elements may appear in a single HTML document.
146 | Requestors must obey the sum of negative rules specific to their
147 | product token and the global product token.
148 |
149 |
150 |
151 | Robots control rules
152 | The possible values of the rules are:
153 |
154 | -
155 | noindex - instructs the parser to not store the served
156 | data in its publicly accessible index.
157 |
158 | -
159 | nosnippet - instructs the parser to not reproduce any
160 | stored data as an excerpt snippet.
161 |
162 |
163 |
164 | The values are case insensitive. Unsupported rules must be ignored.
165 |
166 | Implementors may support other rules as specified in Section 2.2.4
167 | of [RFC9309].
168 |
169 |
170 | Caching of values
171 |
172 | The rules specified for a specific product token must be obeyed
173 | until the rules have changed. Implementors MAY use standard cache
174 | control as defined in [RFC9110] for caching robots-tag rules.
175 | Implementors SHOULD refresh their caches within a reasonable time
176 | frame.
177 |
178 |
179 |
180 | IANA considerations
181 |
185 |
186 |
187 | Security considerations
188 |
189 | The robots-tag is not a substitute for valid content security
190 | measures. To control access to the URI paths in a robots.txt file,
191 | users of the protocol should employ a valid security measure relevant
192 | to the application layer on which the robots.txt file is served —
193 | for example, in the case of HTTP, HTTP Authentication as defined in
194 | [RFC9110].
195 |
196 | The content of the robots-tag header field is not secure, private or
197 | integrity-guaranteed, and due caution should be exercised when using
198 | it. Use of Transport Layer Security (TLS) with HTTP ([RFC9110] and
199 | [RFC2817]) is currently the only end-to-end way to provide such
200 | protection.
201 |
202 | In case of a robots-tag specified in a HTML meta element, implementors
203 | should consider only the meta elements specified in the head element
204 | of the HTML document, which is generally only accessible to the
205 | service owner.
206 |
207 | To protect against memory overflow attacks, implementers should
208 | enforce a limit on how much data they will parse; see section N
209 | for the lower limit.
210 |
211 |
212 |
213 |
214 |
215 | References
216 |
217 | Normative References
218 |
219 |
220 | Key words for use in RFCs to Indicate Requirement Levels
221 |
222 |
223 |
224 | In many standards track documents several words are used to signify the requirements in the specification. These words are often capitalized. This document defines these words as they should be interpreted in IETF documents. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 | Upgrading to TLS Within HTTP/1.1
234 |
235 |
236 |
237 |
238 | This memo explains how to use the Upgrade mechanism in HTTP/1.1 to initiate Transport Layer Security (TLS) over an existing TCP connection. [STANDARDS-TRACK]
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 | Informative References
247 |
248 |
249 | Kibibyte - Simple English Wikipedia, the free encyclopedia
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
/reporting_robots.cc:
--------------------------------------------------------------------------------
1 | #include "reporting_robots.h"
2 |
3 | #include <algorithm>
4 | #include <string>
5 | #include <vector>
6 |
7 | #include "absl/strings/ascii.h"
8 | #include "absl/strings/string_view.h"
9 |
10 | namespace googlebot {
11 | // The kUnsupportedTags tags are popular tags in robots.txt files, but Google
12 | // doesn't use them for anything. Other search engines may, however, so we
13 | // parse them out so users of the library can highlight them for their own
14 | // users if they so wish.
15 | // These are different from the "unknown" tags, since we know that these may
16 | // have some use cases; to the best of our knowledge, other tags we find don't.
17 | // (for example, "unicorn" from "unicorn: /value")
18 | static const std::vector<absl::string_view> kUnsupportedTags = {
19 | "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"};
20 |
21 | void RobotsParsingReporter::Digest(int line_num,
22 | RobotsParsedLine::RobotsTagName parsed_tag) {
23 | if (line_num > last_line_seen_) {
24 | last_line_seen_ = line_num;
25 | }
26 | if (parsed_tag != RobotsParsedLine::kUnknown &&
27 | parsed_tag != RobotsParsedLine::kUnused) {
28 | ++valid_directives_;
29 | }
30 |
31 | RobotsParsedLine& line = robots_parse_results_[line_num];
32 | line.line_num = line_num;
33 | line.tag_name = parsed_tag;
34 | }
35 |
36 | void RobotsParsingReporter::ReportLineMetadata(int line_num,
37 | const LineMetadata& metadata) {
38 | if (line_num > last_line_seen_) {
39 | last_line_seen_ = line_num;
40 | }
41 | RobotsParsedLine& line = robots_parse_results_[line_num];
42 | line.line_num = line_num;
43 | line.is_typo = metadata.is_acceptable_typo;
44 | line.metadata = metadata;
45 | }
46 |
47 | void RobotsParsingReporter::HandleRobotsStart() {
48 | last_line_seen_ = 0;
49 | valid_directives_ = 0;
50 | unused_directives_ = 0;
51 | }
52 | void RobotsParsingReporter::HandleRobotsEnd() {}
53 | void RobotsParsingReporter::HandleUserAgent(int line_num,
54 | absl::string_view line_value) {
55 | Digest(line_num, RobotsParsedLine::kUserAgent);
56 | }
57 | void RobotsParsingReporter::HandleAllow(int line_num,
58 | absl::string_view line_value) {
59 | Digest(line_num, RobotsParsedLine::kAllow);
60 | }
61 | void RobotsParsingReporter::HandleDisallow(int line_num,
62 | absl::string_view line_value) {
63 | Digest(line_num, RobotsParsedLine::kDisallow);
64 | }
65 | void RobotsParsingReporter::HandleSitemap(int line_num,
66 | absl::string_view line_value) {
67 | Digest(line_num, RobotsParsedLine::kSitemap);
68 | }
69 | void RobotsParsingReporter::HandleUnknownAction(int line_num,
70 | absl::string_view action,
71 | absl::string_view line_value) {
72 | RobotsParsedLine::RobotsTagName rtn =
73 | std::count(kUnsupportedTags.begin(), kUnsupportedTags.end(),
74 | absl::AsciiStrToLower(action)) > 0
75 | ? RobotsParsedLine::kUnused
76 | : RobotsParsedLine::kUnknown;
77 | unused_directives_++;
78 | Digest(line_num, rtn);
79 | }
80 |
81 | } // namespace googlebot
82 |
--------------------------------------------------------------------------------
/reporting_robots.h:
--------------------------------------------------------------------------------
1 | #ifndef THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
2 | #define THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
3 |
4 | #include <vector>
5 |
6 | #include "absl/container/btree_map.h"
7 | #include "absl/strings/string_view.h"
8 | #include "robots.h"
9 |
10 | namespace googlebot {
11 | struct RobotsParsedLine {
12 | enum RobotsTagName {
13 | // Identifier for skipped lines. A line may be skipped because it's
14 | // unparseable, or because it contains no recognizable key. Note that
15 | // comment lines are also skipped; they're a no-op for parsing. For example:
16 | // random characters
17 | // unicorn:
18 | // # comment line
19 | // Same thing for empty lines.
20 | kUnknown = 0,
21 | kUserAgent = 1,
22 | kAllow = 2,
23 | kDisallow = 3,
24 | kSitemap = 4,
25 | // Identifier for parseable lines whose key is recognized, but unused.
26 | // See kUnsupportedTags in reporting_robots.cc for a list of recognized,
27 | // but unused keys. For example:
28 | // noindex:
29 | // noarchive:
30 | kUnused = 5,
31 | };
32 |
33 | int line_num = 0;
34 | RobotsTagName tag_name = kUnknown;
35 | bool is_typo = false;
36 | RobotsParseHandler::LineMetadata metadata;
37 | };
38 |
39 | class RobotsParsingReporter : public googlebot::RobotsParseHandler {
40 | public:
41 | void HandleRobotsStart() override;
42 | void HandleRobotsEnd() override;
43 | void HandleUserAgent(int line_num, absl::string_view line_value) override;
44 | void HandleAllow(int line_num, absl::string_view line_value) override;
45 | void HandleDisallow(int line_num, absl::string_view line_value) override;
46 | void HandleSitemap(int line_num, absl::string_view line_value) override;
47 | void HandleUnknownAction(int line_num, absl::string_view action,
48 | absl::string_view line_value) override;
49 | void ReportLineMetadata(int line_num, const LineMetadata& metadata) override;
50 |
51 | int last_line_seen() const { return last_line_seen_; }
52 | int valid_directives() const { return valid_directives_; }
53 | int unused_directives() const { return unused_directives_; }
54 | std::vector<RobotsParsedLine> parse_results() const {
55 | std::vector<RobotsParsedLine> vec;
56 | for (const auto& entry : robots_parse_results_) {
57 | vec.push_back(entry.second);
58 | }
59 | return vec;
60 | }
61 |
62 | private:
63 | void Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag);
64 |
65 | // Indexed and sorted by line number.
66 | absl::btree_map<int, RobotsParsedLine> robots_parse_results_;
67 | int last_line_seen_ = 0;
68 | int valid_directives_ = 0;
69 | int unused_directives_ = 0;
70 | };
71 | } // namespace googlebot
72 | #endif // THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
73 |
--------------------------------------------------------------------------------
/reporting_robots_test.cc:
--------------------------------------------------------------------------------
1 | #include "reporting_robots.h"
2 |
3 | #include <ostream>
4 | #include <string>
5 | #include <vector>
6 |
7 | #include "gtest/gtest.h"
8 | #include "absl/strings/str_cat.h"
9 | #include "absl/strings/str_split.h"
10 | #include "absl/strings/string_view.h"
11 | #include "robots.h"
12 |
13 | using ::googlebot::RobotsParsedLine;
14 | using ::googlebot::RobotsParseHandler;
15 | using ::googlebot::RobotsParsingReporter;
16 |
17 | namespace {
18 | // Allows debugging the contents of the LineMetadata struct.
19 | std::string LineMetadataToString(const RobotsParseHandler::LineMetadata& line) {
20 | // clang-format off
21 | return absl::StrCat(
22 | "{ is_empty: ", line.is_empty,
23 | " has_directive: ", line.has_directive,
24 | " has_comment: ", line.has_comment,
25 | " is_comment: ", line.is_comment,
26 | " is_acceptable_typo: ", line.is_acceptable_typo,
27 | " is_line_too_long: ", line.is_line_too_long,
28 | " is_missing_colon_separator: ", line.is_missing_colon_separator, " }");
29 | // clang-format on
30 | }
31 |
32 | std::string TagNameToString(RobotsParsedLine::RobotsTagName tag_name) {
33 | switch (tag_name) {
34 | case RobotsParsedLine::RobotsTagName::kUnknown:
35 | return "Unknown";
36 | case RobotsParsedLine::RobotsTagName::kUserAgent:
37 | return "UserAgent";
38 | case RobotsParsedLine::RobotsTagName::kAllow:
39 | return "Allow";
40 | case RobotsParsedLine::RobotsTagName::kDisallow:
41 | return "Disallow";
42 | case RobotsParsedLine::RobotsTagName::kSitemap:
43 | return "Sitemap";
44 | case RobotsParsedLine::RobotsTagName::kUnused:
45 | return "Unused";
46 | }
47 | }
48 |
49 | // Allows debugging the contents of the RobotsParsedLine struct.
50 | std::string RobotsParsedLineToString(const RobotsParsedLine& line) {
51 | return absl::StrCat("{\n line_num: ", line.line_num,
52 | "\n tag_name: ", TagNameToString(line.tag_name),
53 | "\n is_typo: ", line.is_typo,
54 | "\n metadata: ", LineMetadataToString(line.metadata),
55 | "\n}");
56 | }
57 |
58 | void expectLineToParseTo(const std::vector<absl::string_view>& lines,
59 | const std::vector<RobotsParsedLine>& parse_results,
60 | const RobotsParsedLine& expected_result) {
61 | int line_num = expected_result.line_num;
62 | EXPECT_EQ(parse_results[line_num - 1], expected_result)
63 | << "For line " << line_num << ": '" << lines[line_num - 1] << "'";
64 | }
65 | } // namespace
66 |
67 | namespace googlebot {
68 | // This allows us to get a debug content of the object in the test. Without
69 | // this, it would say something like this when a test fails:
70 | // Expected equality of these values:
71 | // parse_results[line_num - 1]
72 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 01-00 00-00>
73 | // expected_result
74 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 00-25 00-00>
75 | std::ostream& operator<<(std::ostream& o, const RobotsParsedLine& line) {
76 | o << RobotsParsedLineToString(line);
77 | return o;
78 | }
79 |
80 | // These 2 `operator==` are needed for `EXPECT_EQ` to work.
81 | bool operator==(const RobotsParseHandler::LineMetadata& lhs,
82 | const RobotsParseHandler::LineMetadata& rhs) {
83 | return lhs.is_empty == rhs.is_empty &&
84 | lhs.has_directive == rhs.has_directive &&
85 | lhs.has_comment == rhs.has_comment &&
86 | lhs.is_comment == rhs.is_comment &&
87 | lhs.is_acceptable_typo == rhs.is_acceptable_typo &&
88 | lhs.is_line_too_long == rhs.is_line_too_long &&
89 | lhs.is_missing_colon_separator == rhs.is_missing_colon_separator;
90 | }
91 |
92 | bool operator==(const RobotsParsedLine& lhs, const RobotsParsedLine& rhs) {
93 | return lhs.line_num == rhs.line_num && lhs.tag_name == rhs.tag_name &&
94 | lhs.is_typo == rhs.is_typo && lhs.metadata == rhs.metadata;
95 | }
96 | } // namespace googlebot
97 |
98 | TEST(RobotsUnittest, LinesNumbersAreCountedCorrectly) {
99 | RobotsParsingReporter report;
100 | static const char kSimpleFile[] =
101 | "User-Agent: foo\n" // 1
102 | "Allow: /some/path\n" // 2
103 | "User-Agent bar # no\n" // 3
104 | "absolutely random line\n" // 4
105 | "#so comment, much wow\n" // 5
106 | "\n" // 6
107 | "unicorns: /extinct\n" // 7
108 | "noarchive: /some\n" // 8
109 | "Disallow: /\n" // 9
110 | "Error #and comment\n" // 10
111 | "useragent: baz\n" // 11
112 | "disallaw: /some\n" // 12
113 | "site-map: https://e/s.xml #comment\n" // 13
114 | "sitemap: https://e/t.xml\n" // 14
115 | "Noarchive: /someCapital\n"; // 15
116 | // 16 (from \n)
117 | googlebot::ParseRobotsTxt(kSimpleFile, &report);
118 | EXPECT_EQ(8, report.valid_directives());
119 | EXPECT_EQ(16, report.last_line_seen());
120 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen());
121 | std::vector<absl::string_view> lines = absl::StrSplit(kSimpleFile, '\n');
122 |
123 | // For line "User-Agent: foo\n" // 1
124 | expectLineToParseTo(
125 | lines, report.parse_results(),
126 | RobotsParsedLine{.line_num = 1,
127 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
128 | .is_typo = false,
129 | .metadata = RobotsParseHandler::LineMetadata{
130 | .is_empty = false,
131 | .has_comment = false,
132 | .is_comment = false,
133 | .has_directive = true,
134 | .is_missing_colon_separator = false,
135 | }});
136 | // For line "Allow: /some/path\n" // 2
137 | expectLineToParseTo(
138 | lines, report.parse_results(),
139 | RobotsParsedLine{.line_num = 2,
140 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow,
141 | .is_typo = false,
142 | .metadata = RobotsParseHandler::LineMetadata{
143 | .is_empty = false,
144 | .has_comment = false,
145 | .is_comment = false,
146 | .has_directive = true,
147 | }});
148 | // For line "User-Agent bar # no\n" // 3
149 | expectLineToParseTo(
150 | lines, report.parse_results(),
151 | RobotsParsedLine{.line_num = 3,
152 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
153 | .is_typo = false,
154 | .metadata = RobotsParseHandler::LineMetadata{
155 | .is_empty = false,
156 | .has_comment = true,
157 | .is_comment = false,
158 | .has_directive = true,
159 | .is_missing_colon_separator = true,
160 | }});
161 | // For line "absolutely random line\n" // 4
162 | expectLineToParseTo(
163 | lines, report.parse_results(),
164 | RobotsParsedLine{.line_num = 4,
165 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
166 | .is_typo = false,
167 | .metadata = RobotsParseHandler::LineMetadata{
168 | .is_empty = false,
169 | .has_comment = false,
170 | .is_comment = false,
171 | .has_directive = false,
172 | .is_missing_colon_separator = false,
173 | }});
174 | // For line "#so comment, much wow\n" // 5
175 | expectLineToParseTo(
176 | lines, report.parse_results(),
177 | RobotsParsedLine{.line_num = 5,
178 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
179 | .is_typo = false,
180 | .metadata = RobotsParseHandler::LineMetadata{
181 | .is_empty = false,
182 | .has_comment = true,
183 | .is_comment = true,
184 | .has_directive = false,
185 | }});
186 | // For line "\n" // 6
187 | expectLineToParseTo(
188 | lines, report.parse_results(),
189 | RobotsParsedLine{.line_num = 6,
190 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
191 | .is_typo = false,
192 | .metadata = RobotsParseHandler::LineMetadata{
193 | .is_empty = true,
194 | .has_comment = false,
195 | .is_comment = false,
196 | .has_directive = false,
197 | }});
198 | // For line "unicorns: /extinct\n" // 7
199 | expectLineToParseTo(
200 | lines, report.parse_results(),
201 | RobotsParsedLine{.line_num = 7,
202 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
203 | .is_typo = false,
204 | .metadata = RobotsParseHandler::LineMetadata{
205 | .is_empty = false,
206 | .has_comment = false,
207 | .is_comment = false,
208 | .has_directive = true,
209 | }});
210 | // For line "noarchive: /some\n" // 8
211 | expectLineToParseTo(
212 | lines, report.parse_results(),
213 | RobotsParsedLine{.line_num = 8,
214 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused,
215 | .is_typo = false,
216 | .metadata = RobotsParseHandler::LineMetadata{
217 | .is_empty = false,
218 | .has_comment = false,
219 | .is_comment = false,
220 | .has_directive = true,
221 | }});
222 | // For line "Disallow: /\n" // 9
223 | expectLineToParseTo(
224 | lines, report.parse_results(),
225 | RobotsParsedLine{.line_num = 9,
226 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
227 | .is_typo = false,
228 | .metadata = RobotsParseHandler::LineMetadata{
229 | .is_empty = false,
230 | .has_comment = false,
231 | .is_comment = false,
232 | .has_directive = true,
233 | }});
234 | // For line "Error #and comment\n"; // 10
235 | expectLineToParseTo(
236 | lines, report.parse_results(),
237 | RobotsParsedLine{.line_num = 10,
238 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
239 | .is_typo = false,
240 | .metadata = RobotsParseHandler::LineMetadata{
241 | .is_empty = false,
242 | .has_comment = true,
243 | .is_comment = false,
244 | .has_directive = false,
245 | .is_missing_colon_separator = false,
246 | }});
247 | // For line "useragent: baz\n"; // 11
248 | expectLineToParseTo(
249 | lines, report.parse_results(),
250 | RobotsParsedLine{.line_num = 11,
251 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
252 | .is_typo = true,
253 | .metadata = RobotsParseHandler::LineMetadata{
254 | .is_empty = false,
255 | .has_comment = false,
256 | .is_comment = false,
257 | .has_directive = true,
258 | .is_acceptable_typo = true,
259 | }});
260 | // For line "disallaw: /some\n" // 12
261 | expectLineToParseTo(
262 | lines, report.parse_results(),
263 | RobotsParsedLine{.line_num = 12,
264 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
265 | .is_typo = true,
266 | .metadata = RobotsParseHandler::LineMetadata{
267 | .is_empty = false,
268 | .has_comment = false,
269 | .is_comment = false,
270 | .has_directive = true,
271 | .is_acceptable_typo = true,
272 | }});
273 | // For line "site-map: https://e/s.xml #comment\n" // 13;
274 | expectLineToParseTo(
275 | lines, report.parse_results(),
276 | RobotsParsedLine{.line_num = 13,
277 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap,
278 | .is_typo = true,
279 | .metadata = RobotsParseHandler::LineMetadata{
280 | .is_empty = false,
281 | .has_comment = true,
282 | .is_comment = false,
283 | .has_directive = true,
284 | .is_acceptable_typo = true,
285 | }});
286 | // For line "sitemap: https://e/t.xml\n" // 14;
287 | expectLineToParseTo(
288 | lines, report.parse_results(),
289 | RobotsParsedLine{.line_num = 14,
290 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap,
291 | .is_typo = false,
292 | .metadata = RobotsParseHandler::LineMetadata{
293 | .is_empty = false,
294 | .has_comment = false,
295 | .is_comment = false,
296 | .has_directive = true,
297 | .is_acceptable_typo = false,
298 | }});
299 | // For line "Noarchive: /someCapital\n" // 15
300 | expectLineToParseTo(
301 | lines, report.parse_results(),
302 | RobotsParsedLine{.line_num = 15,
303 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused,
304 | .is_typo = false,
305 | .metadata = RobotsParseHandler::LineMetadata{
306 | .is_empty = false,
307 | .has_comment = false,
308 | .is_comment = false,
309 | .has_directive = true,
310 | }});
311 | // For line 16 (which is empty and comes from the last \n)
312 | expectLineToParseTo(
313 | lines, report.parse_results(),
314 | RobotsParsedLine{.line_num = 16,
315 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
316 | .is_typo = false,
317 | .metadata = RobotsParseHandler::LineMetadata{
318 | .is_empty = true,
319 | .has_comment = false,
320 | .is_comment = false,
321 | .has_directive = false,
322 | }});
323 |
324 | static const char kDosFile[] =
325 | "User-Agent: foo\r\n"
326 | "Allow: /some/path\r\n"
327 | "User-Agent: bar\r\n"
328 | "\r\n"
329 | "\r\n"
330 | "Disallow: /\r\n";
331 | googlebot::ParseRobotsTxt(kDosFile, &report);
332 | EXPECT_EQ(4, report.valid_directives());
333 | EXPECT_EQ(7, report.last_line_seen());
334 |
335 | static const char kMacFile[] =
336 | "User-Agent: foo\r"
337 | "Allow: /some/path\r"
338 | "User-Agent: bar\r"
339 | "\r"
340 | "\r"
341 | "Disallow: /\r";
342 | googlebot::ParseRobotsTxt(kMacFile, &report);
343 | EXPECT_EQ(4, report.valid_directives());
344 | EXPECT_EQ(7, report.last_line_seen());
345 | }
346 |
347 | TEST(RobotsUnittest, LinesTooLongReportedCorrectly) {
348 | RobotsParsingReporter report;
349 | const int kMaxLineLen = 2084 * 8;
350 | std::string allow = "allow: /\n";
351 | std::string disallow = "disallow: ";
352 | std::string robotstxt = "user-agent: foo\n";
353 | std::string longline = "/x/";
354 | while (longline.size() < kMaxLineLen) {
355 | absl::StrAppend(&longline, "a");
356 | }
357 | absl::StrAppend(&robotstxt, disallow, longline, "\n", allow);
358 |
359 | googlebot::ParseRobotsTxt(robotstxt, &report);
360 | EXPECT_EQ(3, report.valid_directives());
361 | EXPECT_EQ(4, report.last_line_seen());
362 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen());
363 | std::vector<absl::string_view> lines = absl::StrSplit(robotstxt, '\n');
364 |
365 | // For line "user-agent: foo\n" // 1
366 | expectLineToParseTo(
367 | lines, report.parse_results(),
368 | RobotsParsedLine{.line_num = 1,
369 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
370 | .is_typo = false,
371 | .metadata = RobotsParseHandler::LineMetadata{
372 | .is_empty = false,
373 | .has_comment = false,
374 | .is_comment = false,
375 | .has_directive = true,
376 | .is_line_too_long = false,
377 | }});
378 | // For line "disallow: /x/a[...]a\n" // 2
379 | expectLineToParseTo(
380 | lines, report.parse_results(),
381 | RobotsParsedLine{.line_num = 2,
382 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
383 | .is_typo = false,
384 | .metadata = RobotsParseHandler::LineMetadata{
385 | .is_empty = false,
386 | .has_comment = false,
387 | .is_comment = false,
388 | .has_directive = true,
389 | .is_line_too_long = true,
390 | }});
391 | // For line "allow: /\n" // 3
392 | expectLineToParseTo(
393 | lines, report.parse_results(),
394 | RobotsParsedLine{.line_num = 3,
395 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow,
396 | .is_typo = false,
397 | .metadata = RobotsParseHandler::LineMetadata{
398 | .is_empty = false,
399 | .has_comment = false,
400 | .is_comment = false,
401 | .has_directive = true,
402 | .is_line_too_long = false,
403 | }});
404 | }
405 |
--------------------------------------------------------------------------------
/robots.cc:
--------------------------------------------------------------------------------
1 | // Copyright 1999 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots.cc
17 | // -----------------------------------------------------------------------------
18 | //
19 | // Implements expired internet draft
20 | // http://www.robotstxt.org/norobots-rfc.txt
21 | // with Google-specific optimizations detailed at
22 | // https://developers.google.com/search/reference/robots_txt
23 |
24 | #include "robots.h"
25 |
26 | #include <stdlib.h>
27 |
28 | #include <cassert>
29 | #include <cctype>
30 | #include <cstddef>
31 | #include <cstring>
32 | #include <string>
33 | #include <vector>
34 |
35 | #include "absl/base/macros.h"
36 | #include "absl/container/fixed_array.h"
37 | #include "absl/strings/ascii.h"
38 | #include "absl/strings/match.h"
39 | #include "absl/strings/numbers.h"
40 | #include "absl/strings/string_view.h"
41 |
42 | // Allow for typos such as DISALOW in robots.txt.
43 | static bool kAllowFrequentTypos = true;
44 |
45 | namespace googlebot {
46 |
47 | // A RobotsMatchStrategy defines a strategy for matching individual lines in a
48 | // robots.txt file. Each Match* method should return a match priority, which is
49 | // interpreted as:
50 | //
51 | // match priority < 0:
52 | // No match.
53 | //
54 | // match priority == 0:
55 | // Match, but treat it as if matched an empty pattern.
56 | //
57 | // match priority > 0:
58 | // Match.
59 | class RobotsMatchStrategy {
60 | public:
61 | virtual ~RobotsMatchStrategy() {}
62 |
63 | virtual int MatchAllow(absl::string_view path,
64 | absl::string_view pattern) = 0;
65 | virtual int MatchDisallow(absl::string_view path,
66 | absl::string_view pattern) = 0;
67 |
68 | protected:
69 | // Implements robots.txt pattern matching.
70 | static bool Matches(absl::string_view path, absl::string_view pattern);
71 | };
72 |
73 | // Returns true if URI path matches the specified pattern. Pattern is anchored
74 | // at the beginning of path. '$' is special only at the end of pattern.
75 | //
76 | // Since 'path' and 'pattern' are both externally determined (by the webmaster),
77 | // we make sure to have acceptable worst-case performance.
78 | /* static */ bool RobotsMatchStrategy::Matches(
79 | absl::string_view path, absl::string_view pattern) {
80 | const size_t pathlen = path.length();
81 | absl::FixedArray<size_t> pos(pathlen + 1);
82 | int numpos;
83 |
84 | // The pos[] array holds a sorted list of indexes of 'path', with length
85 | // 'numpos'. At the start and end of each iteration of the main loop below,
86 | // the pos[] array will hold a list of the prefixes of the 'path' which can
87 | // match the current prefix of 'pattern'. If this list is ever empty,
88 | // return false. If we reach the end of 'pattern' with at least one element
89 | // in pos[], return true.
90 |
91 | pos[0] = 0;
92 | numpos = 1;
93 |
94 | for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
95 | if (*pat == '$' && pat + 1 == pattern.end()) {
96 | return (pos[numpos - 1] == pathlen);
97 | }
98 | if (*pat == '*') {
99 | numpos = pathlen - pos[0] + 1;
100 | for (int i = 1; i < numpos; i++) {
101 | pos[i] = pos[i-1] + 1;
102 | }
103 | } else {
104 | // Includes '$' when not at end of pattern.
105 | int newnumpos = 0;
106 | for (int i = 0; i < numpos; i++) {
107 | if (pos[i] < pathlen && path[pos[i]] == *pat) {
108 | pos[newnumpos++] = pos[i] + 1;
109 | }
110 | }
111 | numpos = newnumpos;
112 | if (numpos == 0) return false;
113 | }
114 | }
115 |
116 | return true;
117 | }
118 |
119 | static const char* kHexDigits = "0123456789ABCDEF";
120 |
121 | // GetPathParamsQuery is not in anonymous namespace to allow testing.
122 | //
123 | // Extracts path (with params) and query part from URL. Removes scheme,
124 | // authority, and fragment. Result always starts with "/".
125 | // Returns "/" if the url doesn't have a path or is not valid.
126 | std::string GetPathParamsQuery(const std::string& url) {
127 | std::string path;
128 |
129 | // Initial two slashes are ignored.
130 | size_t search_start = 0;
131 | if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2;
132 |
133 | size_t early_path = url.find_first_of("/?;", search_start);
134 | size_t protocol_end = url.find("://", search_start);
135 | if (early_path < protocol_end) {
136 | // If path, param or query starts before ://, :// doesn't indicate protocol.
137 | protocol_end = std::string::npos;
138 | }
139 | if (protocol_end == std::string::npos) {
140 | protocol_end = search_start;
141 | } else {
142 | protocol_end += 3;
143 | }
144 |
145 | size_t path_start = url.find_first_of("/?;", protocol_end);
146 | if (path_start != std::string::npos) {
147 | size_t hash_pos = url.find('#', search_start);
148 | if (hash_pos < path_start) return "/";
149 | size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos;
150 | if (url[path_start] != '/') {
151 | // Prepend a slash if the result would start e.g. with '?'.
152 | return "/" + url.substr(path_start, path_end - path_start);
153 | }
154 | return url.substr(path_start, path_end - path_start);
155 | }
156 |
157 | return "/";
158 | }
159 |
160 | // MaybeEscapePattern is not in anonymous namespace to allow testing.
161 | //
162 | // Canonicalize the allowed/disallowed paths. For example:
163 | // /SanJoséSellers ==> /Sanjos%C3%A9Sellers
164 | // %aa ==> %AA
165 | // When the function returns, (*dst) either points to src, or is newly
166 | // allocated.
167 | // Returns true if dst was newly allocated.
168 | bool MaybeEscapePattern(const char* src, char** dst) {
169 | int num_to_escape = 0;
170 | bool need_capitalize = false;
171 |
172 | // First, scan the buffer to see if changes are needed. Most don't.
173 | for (int i = 0; src[i] != 0; i++) {
174 | // (a) % escape sequence.
175 | if (src[i] == '%' &&
176 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
177 | if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) {
178 | need_capitalize = true;
179 | }
180 | i += 2;
181 | // (b) needs escaping.
182 | } else if (src[i] & 0x80) {
183 | num_to_escape++;
184 | }
185 | // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
186 | }
187 | // Return if no changes needed.
188 | if (!num_to_escape && !need_capitalize) {
189 | (*dst) = const_cast<char*>(src);
190 | return false;
191 | }
192 | (*dst) = new char[num_to_escape * 2 + strlen(src) + 1];
193 | int j = 0;
194 | for (int i = 0; src[i] != 0; i++) {
195 | // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
196 | if (src[i] == '%' &&
197 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
198 | (*dst)[j++] = src[i++];
199 | (*dst)[j++] = absl::ascii_toupper(src[i++]);
200 | (*dst)[j++] = absl::ascii_toupper(src[i]);
201 | // (b) %-escape octets whose highest bit is set. These are outside the
202 | // ASCII range.
203 | } else if (src[i] & 0x80) {
204 | (*dst)[j++] = '%';
205 | (*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf];
206 | (*dst)[j++] = kHexDigits[src[i] & 0xf];
207 | // (c) Normal character, no modification needed.
208 | } else {
209 | (*dst)[j++] = src[i];
210 | }
211 | }
212 | (*dst)[j] = 0;
213 | return true;
214 | }
215 |
216 | // Internal helper classes and functions.
217 | namespace {
218 |
219 | // A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
220 | // a key. This class can parse a text-representation (including common typos)
221 | // and represent them as an enumeration which allows for faster processing
222 | // afterwards.
223 | // For unparsable keys, the original string representation is kept.
224 | class ParsedRobotsKey {
225 | public:
226 | enum KeyType {
227 | // Generic highlevel fields.
228 | USER_AGENT,
229 | SITEMAP,
230 |
231 | // Fields within a user-agent.
232 | ALLOW,
233 | DISALLOW,
234 |
235 | // Unrecognized field; kept as-is. High number so that additions to the
236 | // enumeration above do not change the serialization.
237 | UNKNOWN = 128
238 | };
239 |
240 | ParsedRobotsKey() : type_(UNKNOWN) {}
241 |
242 | // Disallow copying and assignment.
243 | ParsedRobotsKey(const ParsedRobotsKey&) = delete;
244 | ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete;
245 |
246 | // Parse given key text and report in the is_acceptable_typo output parameter
247 | // whether the key is one of the accepted typo-variants of a supported key.
248 | // Does not copy the text, so the text_key must stay valid for the object's
249 | // life-time or the next Parse() call.
250 | void Parse(absl::string_view key, bool* is_acceptable_typo);
251 |
252 | // Returns the type of key.
253 | KeyType type() const { return type_; }
254 |
255 | // If this is an unknown key, get the text.
256 | absl::string_view GetUnknownText() const;
257 |
258 | private:
259 | // The KeyIs* methods return whether a passed-in key is one of the supported
260 | // keys, and report in the is_acceptable_typo output parameter whether the key
261 | // is one of the accepted typo-variants of a supported key.
262 | static bool KeyIsUserAgent(absl::string_view key, bool* is_acceptable_typo);
263 | static bool KeyIsAllow(absl::string_view key, bool* is_acceptable_typo);
264 | static bool KeyIsDisallow(absl::string_view key, bool* is_acceptable_typo);
265 | static bool KeyIsSitemap(absl::string_view key, bool* is_acceptable_typo);
266 |
267 | KeyType type_;
268 | absl::string_view key_text_;
269 | };
270 |
271 | void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key,
272 | absl::string_view value,
273 | RobotsParseHandler* handler) {
274 | typedef ParsedRobotsKey Key;
275 | switch (key.type()) {
276 | case Key::USER_AGENT: handler->HandleUserAgent(line, value); break;
277 | case Key::ALLOW: handler->HandleAllow(line, value); break;
278 | case Key::DISALLOW: handler->HandleDisallow(line, value); break;
279 | case Key::SITEMAP: handler->HandleSitemap(line, value); break;
280 | case Key::UNKNOWN:
281 | handler->HandleUnknownAction(line, key.GetUnknownText(), value);
282 | break;
283 | // No default case Key:: to have the compiler warn about new values.
284 | }
285 | }
286 |
287 | class RobotsTxtParser {
288 | public:
289 | typedef ParsedRobotsKey Key;
290 |
291 | RobotsTxtParser(absl::string_view robots_body,
292 | RobotsParseHandler* handler)
293 | : robots_body_(robots_body), handler_(handler) {
294 | }
295 |
296 | void Parse();
297 |
298 | private:
299 | // Note that `key` and `value` are only set when `metadata->has_directive
300 | // == true`.
301 | static void GetKeyAndValueFrom(char** key, char** value, char* line,
302 | RobotsParseHandler::LineMetadata* metadata);
303 | static void StripWhitespaceSlowly(char ** s);
304 |
305 | void ParseAndEmitLine(int current_line, char* line,
306 | bool* line_too_long_strict);
307 | bool NeedEscapeValueForKey(const Key& key);
308 |
309 | absl::string_view robots_body_;
310 | RobotsParseHandler* const handler_;
311 | };
312 |
313 | bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) {
314 | switch (key.type()) {
315 | case RobotsTxtParser::Key::USER_AGENT:
316 | case RobotsTxtParser::Key::SITEMAP:
317 | return false;
318 | default:
319 | return true;
320 | }
321 | }
322 |
323 | // Removes leading and trailing whitespace from null-terminated string s.
324 | /* static */ void RobotsTxtParser::StripWhitespaceSlowly(char ** s) {
325 | absl::string_view stripped = absl::StripAsciiWhitespace(*s);
326 | *s = const_cast<char*>(stripped.data());
327 | (*s)[stripped.size()] = '\0';
328 | }
329 |
330 | void RobotsTxtParser::GetKeyAndValueFrom(
331 | char** key, char** value, char* line,
332 | RobotsParseHandler::LineMetadata* metadata) {
333 | // Remove comments from the current robots.txt line.
334 | char* const comment = strchr(line, '#');
335 | if (nullptr != comment) {
336 | metadata->has_comment = true;
337 | *comment = '\0';
338 | }
339 | StripWhitespaceSlowly(&line);
340 | // If the line became empty after removing the comment, return.
341 | if (strlen(line) == 0) {
342 | if (metadata->has_comment) {
343 | metadata->is_comment = true;
344 | } else {
345 | metadata->is_empty = true;
346 | }
347 | return;
348 | }
349 |
350 | // Rules must match the following pattern:
351 | // <key>[ \t]*:[ \t]*<value>
352 | char* sep = strchr(line, ':');
353 | if (nullptr == sep) {
354 | // Google-specific optimization: some people forget the colon, so we need to
355 | // accept whitespace in its stead.
356 | static const char * const kWhite = " \t";
357 | sep = strpbrk(line, kWhite);
358 | if (nullptr != sep) {
359 | const char* const val = sep + strspn(sep, kWhite);
360 | assert(*val); // since we dropped trailing whitespace above.
361 | if (nullptr != strpbrk(val, kWhite)) {
362 | // We only accept whitespace as a separator if there are exactly two
363 | // sequences of non-whitespace characters. If we get here, there were
364 | // more than 2 such sequences since we stripped trailing whitespace
365 | // above.
366 | return;
367 | }
368 | metadata->is_missing_colon_separator = true;
369 | }
370 | }
371 | if (nullptr == sep) {
372 | return; // Couldn't find a separator.
373 | }
374 |
375 | *key = line; // Key starts at beginning of line.
376 | *sep = '\0'; // And stops at the separator.
377 | StripWhitespaceSlowly(key); // Get rid of any trailing whitespace.
378 |
379 | if (strlen(*key) > 0) {
380 | *value = 1 + sep; // Value starts after the separator.
381 | StripWhitespaceSlowly(value); // Get rid of any leading whitespace.
382 | metadata->has_directive = true;
383 | return;
384 | }
385 | }
386 |
387 | void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line,
388 | bool* line_too_long_strict) {
389 | char* string_key;
390 | char* value;
391 | RobotsParseHandler::LineMetadata line_metadata;
392 | // Note that `string_key` and `value` are only set when
393 | // `line_metadata->has_directive == true`.
394 | GetKeyAndValueFrom(&string_key, &value, line, &line_metadata);
395 | line_metadata.is_line_too_long = *line_too_long_strict;
396 | if (!line_metadata.has_directive) {
397 | handler_->ReportLineMetadata(current_line, line_metadata);
398 | return;
399 | }
400 | Key key;
401 | key.Parse(string_key, &line_metadata.is_acceptable_typo);
402 | if (NeedEscapeValueForKey(key)) {
403 | char* escaped_value = nullptr;
404 | const bool is_escaped = MaybeEscapePattern(value, &escaped_value);
405 | EmitKeyValueToHandler(current_line, key, escaped_value, handler_);
406 | if (is_escaped) delete[] escaped_value;
407 | } else {
408 | EmitKeyValueToHandler(current_line, key, value, handler_);
409 | }
410 | // Finish adding metadata to the line.
411 | handler_->ReportLineMetadata(current_line, line_metadata);
412 | }
413 |
414 | void RobotsTxtParser::Parse() {
415 | // UTF-8 byte order marks.
416 | static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF};
417 |
418 | // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
419 | // fairly safe to assume any valid line isn't going to be more than many times
420 | // that max url length of 2KB. We want some padding for
421 | // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
422 | // If so, we can ignore the chars on a line past that.
423 | const int kBrowserMaxLineLen = 2083;
424 | const int kMaxLineLen = kBrowserMaxLineLen * 8;
425 | // Allocate a buffer used to process the current line.
426 | char* const line_buffer = new char[kMaxLineLen];
427 | // line_buffer_end is the last writeable pos within the line array
428 | // (only a final '\0' may go here).
429 | const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
430 | bool line_too_long_strict = false;
431 | char* line_pos = line_buffer;
432 | int line_num = 0;
433 | size_t bom_pos = 0;
434 | bool last_was_carriage_return = false;
435 | handler_->HandleRobotsStart();
436 |
437 | {
438 | for (const unsigned char ch : robots_body_) {
439 | ABSL_ASSERT(line_pos <= line_buffer_end);
440 | // Google-specific optimization: UTF-8 byte order marks should never
441 | // appear in a robots.txt file, but they do nevertheless. Skipping
442 | // possible BOM-prefix in the first bytes of the input.
443 | if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) {
444 | continue;
445 | }
446 | bom_pos = sizeof(utf_bom);
447 | if (ch != 0x0A && ch != 0x0D) { // Non-line-ending char case.
448 | // Put in next spot on current line, as long as there's room.
449 | if (line_pos < line_buffer_end) {
450 | *(line_pos++) = ch;
451 | } else {
452 | line_too_long_strict = true;
453 | }
454 | } else { // Line-ending character char case.
455 | *line_pos = '\0';
456 | // Only emit an empty line if this was not due to the second character
457 | // of the DOS line-ending \r\n .
458 | const bool is_CRLF_continuation =
459 | (line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
460 | if (!is_CRLF_continuation) {
461 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
462 | line_too_long_strict = false;
463 | }
464 | line_pos = line_buffer;
465 | last_was_carriage_return = (ch == 0x0D);
466 | }
467 | }
468 | }
469 | *line_pos = '\0';
470 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
471 | handler_->HandleRobotsEnd();
472 | delete [] line_buffer;
473 | }
474 |
475 | // Implements the default robots.txt matching strategy. The maximum number of
476 | // characters matched by a pattern is returned as its match priority.
477 | class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
478 | public:
479 | LongestMatchRobotsMatchStrategy() { }
480 |
481 | // Disallow copying and assignment.
482 | LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
483 | delete;
484 | LongestMatchRobotsMatchStrategy& operator=(
485 | const LongestMatchRobotsMatchStrategy&) = delete;
486 |
487 | int MatchAllow(absl::string_view path, absl::string_view pattern) override;
488 | int MatchDisallow(absl::string_view path, absl::string_view pattern) override;
489 | };
490 | } // end anonymous namespace
491 |
492 | void ParseRobotsTxt(absl::string_view robots_body,
493 | RobotsParseHandler* parse_callback) {
494 | RobotsTxtParser parser(robots_body, parse_callback);
495 | parser.Parse();
496 | }
497 |
498 | RobotsMatcher::RobotsMatcher()
499 | : seen_global_agent_(false),
500 | seen_specific_agent_(false),
501 | ever_seen_specific_agent_(false),
502 | seen_separator_(false),
503 | path_(nullptr),
504 | user_agents_(nullptr) {
505 | match_strategy_ = new LongestMatchRobotsMatchStrategy();
506 | }
507 |
508 | RobotsMatcher::~RobotsMatcher() {
509 | delete match_strategy_;
510 | }
511 |
512 | bool RobotsMatcher::ever_seen_specific_agent() const {
513 | return ever_seen_specific_agent_;
514 | }
515 |
516 | void RobotsMatcher::InitUserAgentsAndPath(
517 | const std::vector<std::string>* user_agents, const char* path) {
518 | // The RobotsParser object doesn't own path_ or user_agents_, so overwriting
519 | // these pointers doesn't cause a memory leak.
520 | path_ = path;
521 | ABSL_ASSERT('/' == *path_);
522 | user_agents_ = user_agents;
523 | }
524 |
525 | bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
526 | const std::vector<std::string>* user_agents,
527 | const std::string& url) {
528 | // The url is not normalized (escaped, percent encoded) here because the user
529 | // is asked to provide it in escaped form already.
530 | std::string path = GetPathParamsQuery(url);
531 | InitUserAgentsAndPath(user_agents, path.c_str());
532 | ParseRobotsTxt(robots_body, this);
533 | return !disallow();
534 | }
535 |
536 | bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
537 | const std::string& user_agent,
538 | const std::string& url) {
539 | std::vector<std::string> v;
540 | v.push_back(user_agent);
541 | return AllowedByRobots(robots_txt, &v, url);
542 | }
543 |
544 | bool RobotsMatcher::disallow() const {
545 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
546 | return (disallow_.specific.priority() > allow_.specific.priority());
547 | }
548 |
549 | if (ever_seen_specific_agent_) {
550 | // Matching group for user-agent but either without disallow or empty one,
551 | // i.e. priority == 0.
552 | return false;
553 | }
554 |
555 | if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) {
556 | return disallow_.global.priority() > allow_.global.priority();
557 | }
558 | return false;
559 | }
560 |
561 | bool RobotsMatcher::disallow_ignore_global() const {
562 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
563 | return disallow_.specific.priority() > allow_.specific.priority();
564 | }
565 | return false;
566 | }
567 |
568 | int RobotsMatcher::matching_line() const {
569 | if (ever_seen_specific_agent_) {
570 | return Match::HigherPriorityMatch(disallow_.specific, allow_.specific)
571 | .line();
572 | }
573 | return Match::HigherPriorityMatch(disallow_.global, allow_.global).line();
574 | }
575 |
576 | void RobotsMatcher::HandleRobotsStart() {
577 | // This is a new robots.txt file, so we need to reset all the instance member
578 | // variables. We do it in the same order the instance member variables are
579 | // declared, so it's easier to keep track of which ones we have (or maybe
580 | // haven't!) done.
581 | allow_.Clear();
582 | disallow_.Clear();
583 |
584 | seen_global_agent_ = false;
585 | seen_specific_agent_ = false;
586 | ever_seen_specific_agent_ = false;
587 | seen_separator_ = false;
588 | }
589 |
590 | /*static*/ absl::string_view RobotsMatcher::ExtractUserAgent(
591 | absl::string_view user_agent) {
592 | // Allowed characters in user-agent are [a-zA-Z_-].
593 | const char* end = user_agent.data();
594 | while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') {
595 | ++end;
596 | }
597 | return user_agent.substr(0, end - user_agent.data());
598 | }
599 |
600 | /*static*/ bool RobotsMatcher::IsValidUserAgentToObey(
601 | absl::string_view user_agent) {
602 | return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent;
603 | }
604 |
605 | void RobotsMatcher::HandleUserAgent(int line_num,
606 | absl::string_view user_agent) {
607 | if (seen_separator_) {
608 | seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
609 | }
610 |
611 | // Google-specific optimization: a '*' followed by space and more characters
612 | // in a user-agent record is still regarded a global rule.
613 | if (user_agent.length() >= 1 && user_agent[0] == '*' &&
614 | (user_agent.length() == 1 || isspace(user_agent[1]))) {
615 | seen_global_agent_ = true;
616 | } else {
617 | user_agent = ExtractUserAgent(user_agent);
618 | for (const auto& agent : *user_agents_) {
619 | if (absl::EqualsIgnoreCase(user_agent, agent)) {
620 | ever_seen_specific_agent_ = seen_specific_agent_ = true;
621 | break;
622 | }
623 | }
624 | }
625 | }
626 |
627 | void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) {
628 | if (!seen_any_agent()) return;
629 | seen_separator_ = true;
630 | const int priority = match_strategy_->MatchAllow(path_, value);
631 | if (priority >= 0) {
632 | if (seen_specific_agent_) {
633 | if (allow_.specific.priority() < priority) {
634 | allow_.specific.Set(priority, line_num);
635 | }
636 | } else {
637 | assert(seen_global_agent_);
638 | if (allow_.global.priority() < priority) {
639 | allow_.global.Set(priority, line_num);
640 | }
641 | }
642 | } else {
643 | // Google-specific optimization: 'index.htm' and 'index.html' are normalized
644 | // to '/'.
645 | const size_t slash_pos = value.find_last_of('/');
646 |
647 | if (slash_pos != absl::string_view::npos &&
648 | absl::StartsWith(absl::ClippedSubstr(value, slash_pos),
649 | "/index.htm")) {
650 | const int len = slash_pos + 1;
651 | absl::FixedArray<char> newpattern(len + 1);
652 | strncpy(newpattern.data(), value.data(), len);
653 | newpattern[len] = '$';
654 | HandleAllow(line_num,
655 | absl::string_view(newpattern.data(), newpattern.size()));
656 | }
657 | }
658 | }
659 |
660 | void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) {
661 | if (!seen_any_agent()) return;
662 | seen_separator_ = true;
663 | const int priority = match_strategy_->MatchDisallow(path_, value);
664 | if (priority >= 0) {
665 | if (seen_specific_agent_) {
666 | if (disallow_.specific.priority() < priority) {
667 | disallow_.specific.Set(priority, line_num);
668 | }
669 | } else {
670 | assert(seen_global_agent_);
671 | if (disallow_.global.priority() < priority) {
672 | disallow_.global.Set(priority, line_num);
673 | }
674 | }
675 | }
676 | }
677 |
678 | int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path,
679 | absl::string_view pattern) {
680 | return Matches(path, pattern) ? pattern.length() : -1;
681 | }
682 |
683 | int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path,
684 | absl::string_view pattern) {
685 | return Matches(path, pattern) ? pattern.length() : -1;
686 | }
687 |
688 | void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {}
689 |
690 | void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action,
691 | absl::string_view value) {}
692 |
693 | void ParsedRobotsKey::Parse(absl::string_view key, bool* is_acceptable_typo) {
694 | key_text_ = absl::string_view();
695 | if (KeyIsUserAgent(key, is_acceptable_typo)) {
696 | type_ = USER_AGENT;
697 | } else if (KeyIsAllow(key, is_acceptable_typo)) {
698 | type_ = ALLOW;
699 | } else if (KeyIsDisallow(key, is_acceptable_typo)) {
700 | type_ = DISALLOW;
701 | } else if (KeyIsSitemap(key, is_acceptable_typo)) {
702 | type_ = SITEMAP;
703 | } else {
704 | type_ = UNKNOWN;
705 | key_text_ = key;
706 | }
707 | }
708 |
709 | absl::string_view ParsedRobotsKey::GetUnknownText() const {
710 | ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty());
711 | return key_text_;
712 | }
713 |
714 | bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key,
715 | bool* is_acceptable_typo) {
716 | *is_acceptable_typo =
717 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") ||
718 | absl::StartsWithIgnoreCase(key, "user agent")));
719 | return (absl::StartsWithIgnoreCase(key, "user-agent") || *is_acceptable_typo);
720 | }
721 |
722 | bool ParsedRobotsKey::KeyIsAllow(absl::string_view key,
723 | bool* is_acceptable_typo) {
724 | // We don't support typos for the "allow" key.
725 | *is_acceptable_typo = false;
726 | return absl::StartsWithIgnoreCase(key, "allow");
727 | }
728 |
729 | bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key,
730 | bool* is_acceptable_typo) {
731 | *is_acceptable_typo =
732 | (kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) ||
733 | (absl::StartsWithIgnoreCase(key, "dissalow")) ||
734 | (absl::StartsWithIgnoreCase(key, "disalow")) ||
735 | (absl::StartsWithIgnoreCase(key, "diasllow")) ||
736 | (absl::StartsWithIgnoreCase(key, "disallaw"))));
737 | return (absl::StartsWithIgnoreCase(key, "disallow") || *is_acceptable_typo);
738 | }
739 |
740 | bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key,
741 | bool* is_acceptable_typo) {
742 | *is_acceptable_typo =
743 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "site-map")));
744 | return absl::StartsWithIgnoreCase(key, "sitemap") || *is_acceptable_typo;
745 | }
746 |
747 | } // namespace googlebot
748 |
--------------------------------------------------------------------------------
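robots.cc above implements the pattern semantics in RobotsMatchStrategy::Matches() ('*' matches
any run of characters, '$' anchors a pattern at the end of the path) and resolves Allow/Disallow
conflicts by longest match. The following short sketch shows how that surfaces through the
public RobotsMatcher API, assuming only robots.h; the rules and URLs are invented for
illustration.

// wildcard_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  const std::string robotstxt =
      "User-Agent: *\n"
      "Disallow: /*.php$\n"  // matches any path ending in ".php"
      "Allow: /fish*\n";     // matches any path starting with "/fish"
  googlebot::RobotsMatcher matcher;
  // 0 (disallowed): both rules match "/fish.php", but the Disallow pattern
  // "/*.php$" is 7 characters long and outweighs the 6-character "/fish*".
  std::cout << matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                               "http://example.com/fish.php")
            << "\n";
  // 1 (allowed): only the Allow rule matches "/fish.html".
  std::cout << matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                               "http://example.com/fish.html")
            << "\n";
  return 0;
}
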
/robots.h:
--------------------------------------------------------------------------------
1 | // Copyright 1999 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots.h
17 | // -----------------------------------------------------------------------------
18 | //
19 | // This file implements the standard defined by the Robots Exclusion Protocol
20 | // (REP) internet draft (I-D).
21 | // https://www.rfc-editor.org/rfc/rfc9309.html
22 | //
23 | // Google doesn't follow the standard strictly, because there are a lot of
24 | // non-conforming robots.txt files out there, and we err on the side of
25 | // disallowing when this seems intended.
26 | //
27 | // A more user-friendly description of how Google handles robots.txt can be
28 | // found at:
29 | // https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
30 | //
31 | // This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
32 | // and a matcher for URLs against a robots.txt (class RobotsMatcher).
33 |
34 | #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
35 | #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
36 |
37 | #include <string>
38 | #include <vector>
39 |
40 | #include "absl/strings/string_view.h"
41 |
42 | namespace googlebot {
43 | // Handler for directives found in robots.txt. These callbacks are called by
44 | // ParseRobotsTxt() in the sequence they have been found in the file.
45 | class RobotsParseHandler {
46 | public:
47 | RobotsParseHandler() {}
48 | virtual ~RobotsParseHandler() {}
49 |
50 | // Disallow copying and assignment.
51 | RobotsParseHandler(const RobotsParseHandler&) = delete;
52 | RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
53 |
54 | virtual void HandleRobotsStart() = 0;
55 | virtual void HandleRobotsEnd() = 0;
56 |
57 | virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
58 | virtual void HandleAllow(int line_num, absl::string_view value) = 0;
59 | virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
60 |
61 | virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
62 |
63 | // Any other unrecognized name/value pairs.
64 | virtual void HandleUnknownAction(int line_num, absl::string_view action,
65 | absl::string_view value) = 0;
66 |
67 | struct LineMetadata {
68 | // Indicates if the line is totally empty.
69 | bool is_empty = false;
70 | // Indicates if the line has a comment (may have content before it).
71 | bool has_comment = false;
72 | // Indicates if the whole line is a comment.
73 | bool is_comment = false;
74 | // Indicates that the line has a valid robots.txt directive and one of the
75 | // `Handle*` methods will be called.
76 | bool has_directive = false;
77 | // Indicates that the found directive is one of the acceptable typo variants
78 | // of the directive. See the key functions in ParsedRobotsKey for accepted
79 | // typos.
80 | bool is_acceptable_typo = false;
81 | // Indicates that the line is too long, specifically over 2083 * 8 bytes.
82 | bool is_line_too_long = false;
83 | // Indicates that the key-value pair is missing the colon separator.
84 | bool is_missing_colon_separator = false;
85 | };
86 |
87 | virtual void ReportLineMetadata(int line_num, const LineMetadata& metadata) {}
88 | };
89 |
90 | // Parses body of a robots.txt and emits parse callbacks. This will accept
91 | // typical typos found in robots.txt, such as 'disalow'.
92 | //
93 | // Note, this function will accept all kinds of input but will skip
94 | // everything that does not look like a robots directive.
95 | void ParseRobotsTxt(absl::string_view robots_body,
96 | RobotsParseHandler* parse_callback);
97 |
98 | // RobotsMatcher - matches robots.txt against URLs.
99 | //
100 | // The Matcher uses a default match strategy for Allow/Disallow patterns which
101 | // is the official way of Google crawler to match robots.txt. It is also
102 | // possible to provide a custom match strategy.
103 | //
104 | // The entry point for the user is to call one of the *AllowedByRobots()
105 | // methods that return directly whether a URL is allowed according to the
106 | // robots.txt and the crawl agent.
107 | // The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
108 | class RobotsMatchStrategy;
109 | class RobotsMatcher : protected RobotsParseHandler {
110 | public:
111 | // Create a RobotsMatcher with the default matching strategy. The default
112 | // matching strategy is longest-match as opposed to the former internet draft
113 | // that provisioned first-match strategy. Analysis shows that longest-match,
114 | // while more restrictive for crawlers, is what webmasters assume when writing
115 | // directives. For example, in case of conflicting matches (both Allow and
116 | // Disallow), the longest match is the one the user wants. For example, in
117 | // case of a robots.txt file that has the following rules
118 | // Allow: /
119 | // Disallow: /cgi-bin
120 | // it's pretty obvious what the webmaster wants: they want to allow crawl of
121 | // every URI except /cgi-bin. However, according to the expired internet
122 | // standard, crawlers should be allowed to crawl everything with such a rule.
123 | RobotsMatcher();
124 |
125 | ~RobotsMatcher() override;
126 |
127 | // Disallow copying and assignment.
128 | RobotsMatcher(const RobotsMatcher&) = delete;
129 | RobotsMatcher& operator=(const RobotsMatcher&) = delete;
130 |
131 | // Verifies that the given user agent is valid to be matched against
132 | // robots.txt. Valid user agent strings only contain the characters
133 | // [a-zA-Z_-].
134 | static bool IsValidUserAgentToObey(absl::string_view user_agent);
135 |
136 | // Returns true iff 'url' is allowed to be fetched by any member of the
137 | // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
138 | bool AllowedByRobots(absl::string_view robots_body,
139 | const std::vector<std::string>* user_agents,
140 | const std::string& url);
141 |
142 | // Do robots check for 'url' when there is only one user agent. 'url' must
143 | // be %-encoded according to RFC3986.
144 | bool OneAgentAllowedByRobots(absl::string_view robots_txt,
145 | const std::string& user_agent,
146 | const std::string& url);
147 |
148 | // Returns true if we are disallowed from crawling a matching URI.
149 | bool disallow() const;
150 |
151 | // Returns true if we are disallowed from crawling a matching URI. Ignores any
152 | // rules specified for the default user agent, and bases its results only on
153 | // the specified user agents.
154 | bool disallow_ignore_global() const;
155 |
156 | // Returns true iff, when AllowedByRobots() was called, the robots file
157 | // referred explicitly to one of the specified user agents.
158 | bool ever_seen_specific_agent() const;
159 |
160 | // Returns the line that matched or 0 if none matched.
161 | int matching_line() const;
162 |
163 | protected:
164 | // Parse callbacks.
165 | // Protected because used in unittest. Never override RobotsMatcher, implement
166 | // googlebot::RobotsParseHandler instead.
167 | void HandleRobotsStart() override;
168 | void HandleRobotsEnd() override {}
169 |
170 | void HandleUserAgent(int line_num, absl::string_view user_agent) override;
171 | void HandleAllow(int line_num, absl::string_view value) override;
172 | void HandleDisallow(int line_num, absl::string_view value) override;
173 |
174 | void HandleSitemap(int line_num, absl::string_view value) override;
175 | void HandleUnknownAction(int line_num, absl::string_view action,
176 | absl::string_view value) override;
177 |
178 | protected:
179 | // Extract the matchable part of a user agent string, essentially stopping at
180 | // the first invalid character.
181 | // Example: 'Googlebot/2.1' becomes 'Googlebot'
182 | static absl::string_view ExtractUserAgent(absl::string_view user_agent);
183 |
184 | // Initialize next path and user-agents to check. Path must contain only the
185 | // path, params, and query (if any) of the url and must start with a '/'.
186 | void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
187 | const char* path);
188 |
189 | // Returns true if any user-agent was seen.
190 | bool seen_any_agent() const {
191 | return seen_global_agent_ || seen_specific_agent_;
192 | }
193 |
194 | // Instead of just maintaining a Boolean indicating whether a given line has
195 | // matched, we maintain a count of the maximum number of characters matched by
196 | // that pattern.
197 | //
198 | // This structure stores the information associated with a match (e.g. when a
199 | // Disallow is matched) as priority of the match and line matching.
200 | //
201 | // The priority is initialized with a negative value to make sure that a match
202 | // of priority 0 is higher priority than no match at all.
203 | class Match {
204 | private:
205 | static const int kNoMatchPriority = -1;
206 |
207 | public:
208 | Match(int priority, int line) : priority_(priority), line_(line) {}
209 | Match() : priority_(kNoMatchPriority), line_(0) {}
210 |
211 | void Set(int priority, int line) {
212 | priority_ = priority;
213 | line_ = line;
214 | }
215 |
216 | void Clear() { Set(kNoMatchPriority, 0); }
217 |
218 | int line() const { return line_; }
219 | int priority() const { return priority_; }
220 |
221 | static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
222 | if (a.priority() > b.priority()) {
223 | return a;
224 | } else {
225 | return b;
226 | }
227 | }
228 |
229 | private:
230 | int priority_;
231 | int line_;
232 | };
233 |
234 | // For each of the directives within user-agents, we keep global and specific
235 | // match scores.
236 | struct MatchHierarchy {
237 | Match global; // Match for '*'
238 | Match specific; // Match for queried agent.
239 | void Clear() {
240 | global.Clear();
241 | specific.Clear();
242 | }
243 | };
244 | MatchHierarchy allow_; // Characters of 'url' matching Allow.
245 | MatchHierarchy disallow_; // Characters of 'url' matching Disallow.
246 |
247 | bool seen_global_agent_; // True if processing global agent rules.
248 | bool seen_specific_agent_; // True if processing our specific agent.
249 | bool ever_seen_specific_agent_; // True if we ever saw a block for our agent.
250 | bool seen_separator_; // True if saw any key: value pair.
251 |
252 | // The path we want to pattern match. Not owned and only a valid pointer
253 | // during the lifetime of *AllowedByRobots calls.
254 | const char* path_;
255 | // The User-Agents we are interested in. Not owned and only a valid
256 | // pointer during the lifetime of *AllowedByRobots calls.
257 | const std::vector<std::string>* user_agents_;
258 |
259 | RobotsMatchStrategy* match_strategy_;
260 | };
261 |
262 | } // namespace googlebot
263 | #endif // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
264 |
--------------------------------------------------------------------------------
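A minimal usage sketch for the RobotsMatcher API declared above, assuming only robots.h; the
agent names, rules and URL are invented. It checks one URL against several user agents at once
and asks which robots.txt line decided the verdict.

// matcher_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  const std::string robotstxt =
      "User-Agent: BarBot\n"   // line 1
      "Disallow: /private/\n"  // line 2
      "User-Agent: *\n"        // line 3
      "Allow: /\n";            // line 4
  const std::vector<std::string> user_agents = {"FooBot", "BarBot"};
  googlebot::RobotsMatcher matcher;
  const bool allowed = matcher.AllowedByRobots(
      robotstxt, &user_agents, "http://example.com/private/index.html");
  // Prints "DISALLOWED (line 2)": the BarBot group is the most specific match
  // for the supplied agents, so its Disallow on line 2 wins over the global
  // Allow.
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << " (line "
            << matcher.matching_line() << ")\n";
  return 0;
}
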
/robots_main.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots_main.cc
17 | // -----------------------------------------------------------------------------
18 | //
19 | // Simple binary to assess whether a URL is accessible to a user-agent according
20 | // to records found in a local robots.txt file, based on Google's robots.txt
21 | // parsing and matching algorithms.
22 | // Usage:
23 | // robots_main <local_path_to_robotstxt> <user_agent> <url>
24 | // Arguments:
25 | // local_path_to_robotstxt: local path to a file containing robots.txt records.
26 | // For example: /home/users/username/robots.txt
27 | // user_agent: a token to be matched against records in the robots.txt.
28 | // For example: Googlebot
29 | // url: a url to be matched against records in the robots.txt. The URL must be
30 | // %-encoded according to RFC3986.
31 | // For example: https://example.com/accessible/url.html
32 | // Output: Prints a sentence with verdict about whether 'user_agent' is allowed
33 | // to access 'url' based on records in 'local_path_to_robotstxt'.
34 | // Return code:
35 | // 0 when the url is ALLOWED for the user_agent.
36 | // 1 when the url is DISALLOWED for the user_agent.
37 | // 2 when --help is requested or if there is something invalid in the flags
38 | // passed.
39 | //
40 | #include <fstream>
41 | #include <iostream>
42 |
43 | #include "robots.h"
44 |
45 | bool LoadFile(const std::string& filename, std::string* result) {
46 | std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
47 | if (file.is_open()) {
48 | size_t size = file.tellg();
49 | std::vector<char> buffer(size);
50 | file.seekg(0, std::ios::beg);
51 | file.read(buffer.data(), size);
52 | file.close();
53 | if (!file) return false; // file reading error (failbit or badbit).
54 | result->assign(buffer.begin(), buffer.end());
55 | return true;
56 | }
57 | return false;
58 | }
59 |
60 | void ShowHelp(int argc, char** argv) {
61 | std::cerr << "Shows whether the given user_agent and URI combination"
62 | << " is allowed or disallowed by the given robots.txt file. "
63 | << std::endl
64 | << std::endl;
65 | std::cerr << "Usage: " << std::endl
66 | << " " << argv[0] << " "
67 | << std::endl
68 | << std::endl;
69 | std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
70 | << std::endl;
71 | std::cerr << "Example: " << std::endl
72 | << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
73 | << std::endl;
74 | }
75 |
76 | int main(int argc, char** argv) {
77 | std::string filename = argc >= 2 ? argv[1] : "";
78 | if (filename == "-h" || filename == "-help" || filename == "--help") {
79 | ShowHelp(argc, argv);
80 | return 2;
81 | }
82 | if (argc != 4) {
83 | std::cerr << "Invalid amount of arguments. Showing help." << std::endl
84 | << std::endl;
85 | ShowHelp(argc, argv);
86 | return 2;
87 | }
88 | std::string robots_content;
89 | if (!(LoadFile(filename, &robots_content))) {
90 | std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
91 | return 2;
92 | }
93 |
94 | std::string user_agent = argv[2];
95 | std::vector<std::string> user_agents(1, user_agent);
96 | googlebot::RobotsMatcher matcher;
97 | std::string url = argv[3];
98 | bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
99 |
100 | std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
101 | << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
102 | if (robots_content.empty()) {
103 | std::cout << "notice: robots file is empty so all user-agents are allowed"
104 | << std::endl;
105 | }
106 |
107 | return allowed ? 0 : 1;
108 | }
109 |
--------------------------------------------------------------------------------
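robots_main.cc requires the URL argument to be %-encoded, and, as GetPathParamsQuery() in
robots.cc documents, only the path, params and query of a URL take part in matching. Below is a
small sketch of that reduction, assuming the program is linked against robots.cc (the function
is deliberately "not in anonymous namespace to allow testing"); the URLs are invented.

// path_extraction_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>

namespace googlebot {
// Defined in robots.cc; extracts path (with params) and query from a URL.
std::string GetPathParamsQuery(const std::string& url);
}  // namespace googlebot

int main() {
  // Scheme, authority and fragment are dropped; the result always starts
  // with "/".
  std::cout << googlebot::GetPathParamsQuery("http://example.com/a/b?c=d#frag")
            << "\n";  // "/a/b?c=d"
  std::cout << googlebot::GetPathParamsQuery("example.com") << "\n";  // "/"
  std::cout << googlebot::GetPathParamsQuery("") << "\n";             // "/"
  return 0;
}
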
/robots_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // This file tests the robots.txt parsing and matching code found in robots.cc
16 | // against the current Robots Exclusion Protocol (REP) RFC.
17 | // https://www.rfc-editor.org/rfc/rfc9309.html
18 | #include "robots.h"
19 |
20 | #include <string>
21 |
22 | #include "gtest/gtest.h"
23 | #include "absl/strings/str_cat.h"
24 | #include "absl/strings/string_view.h"
25 |
26 | namespace {
27 |
28 | using ::googlebot::RobotsMatcher;
29 |
30 | bool IsUserAgentAllowed(const absl::string_view robotstxt,
31 | const std::string& useragent, const std::string& url) {
32 | RobotsMatcher matcher;
33 | return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
34 | }
35 |
36 | // Google-specific: system test.
37 | TEST(RobotsUnittest, GoogleOnly_SystemTest) {
38 | const absl::string_view robotstxt =
39 | "user-agent: FooBot\n"
40 | "disallow: /\n";
41 | // Empty robots.txt: everything allowed.
42 | EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", ""));
43 |
44 | // Empty user-agent to be matched: everything allowed.
45 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", ""));
46 |
47 | // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
48 | // in robots.cc.
49 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", ""));
50 |
51 | // All params empty: same as robots.txt empty, everything allowed.
52 | EXPECT_TRUE(IsUserAgentAllowed("", "", ""));
53 | }
54 | // Rules are colon separated name-value pairs. The following names are
55 | // provisioned:
56 | // user-agent:
57 | // allow:
58 | // disallow:
59 | // See REP RFC section "Protocol Definition".
60 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
61 | //
62 | // Google specific: webmasters sometimes miss the colon separator, but it's
63 | // obvious what they mean by "disallow /", so we assume the colon if it's
64 | // missing.
65 | TEST(RobotsUnittest, ID_LineSyntax_Line) {
66 | const absl::string_view robotstxt_correct =
67 | "user-agent: FooBot\n"
68 | "disallow: /\n";
69 | const absl::string_view robotstxt_incorrect =
70 | "foo: FooBot\n"
71 | "bar: /\n";
72 | const absl::string_view robotstxt_incorrect_accepted =
73 | "user-agent FooBot\n"
74 | "disallow /\n";
75 | const std::string url = "http://foo.bar/x/y";
76 |
77 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
78 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
79 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
80 | }
81 |
82 | // A group is one or more user-agent lines followed by rules, and terminated
83 | // by another user-agent line. Rules for the same user-agents are combined
84 | // opaquely into one group. Rules outside groups are ignored.
85 | // See REP RFC section "Protocol Definition".
86 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
87 | TEST(RobotsUnittest, ID_LineSyntax_Groups) {
88 | const absl::string_view robotstxt =
89 | "allow: /foo/bar/\n"
90 | "\n"
91 | "user-agent: FooBot\n"
92 | "disallow: /\n"
93 | "allow: /x/\n"
94 | "user-agent: BarBot\n"
95 | "disallow: /\n"
96 | "allow: /y/\n"
97 | "\n"
98 | "\n"
99 | "allow: /w/\n"
100 | "user-agent: BazBot\n"
101 | "\n"
102 | "user-agent: FooBot\n"
103 | "allow: /z/\n"
104 | "disallow: /\n";
105 |
106 | const std::string url_w = "http://foo.bar/w/a";
107 | const std::string url_x = "http://foo.bar/x/b";
108 | const std::string url_y = "http://foo.bar/y/c";
109 | const std::string url_z = "http://foo.bar/z/d";
110 | const std::string url_foo = "http://foo.bar/foo/bar/";
111 |
112 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x));
113 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z));
114 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y));
115 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y));
116 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w));
117 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z));
118 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z));
119 |
120 | // Lines with rules outside groups are ignored.
121 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
122 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
123 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
124 | }
125 |
126 | // Group must not be closed by rules not explicitly defined in the REP RFC.
127 | // See REP RFC section "Protocol Definition".
128 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
129 | TEST(RobotsUnittest, ID_LineSyntax_Groups_OtherRules) {
130 | {
131 | const absl::string_view robotstxt =
132 | "User-agent: BarBot\n"
133 | "Sitemap: https://foo.bar/sitemap\n"
134 | "User-agent: *\n"
135 | "Disallow: /\n";
136 | std::string url = "http://foo.bar/";
137 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
138 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url));
139 | }
140 | {
141 | const absl::string_view robotstxt =
142 | "User-agent: FooBot\n"
143 | "Invalid-Unknown-Line: unknown\n"
144 | "User-agent: *\n"
145 | "Disallow: /\n";
146 | std::string url = "http://foo.bar/";
147 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
148 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url));
149 | }
150 | }
151 |
152 | // REP lines are case insensitive. See REP RFC section "Protocol Definition".
153 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
154 | TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
155 | const absl::string_view robotstxt_upper =
156 | "USER-AGENT: FooBot\n"
157 | "ALLOW: /x/\n"
158 | "DISALLOW: /\n";
159 | const absl::string_view robotstxt_lower =
160 | "user-agent: FooBot\n"
161 | "allow: /x/\n"
162 | "disallow: /\n";
163 | const absl::string_view robotstxt_camel =
164 | "uSeR-aGeNt: FooBot\n"
165 | "AlLoW: /x/\n"
166 | "dIsAlLoW: /\n";
167 | const std::string url_allowed = "http://foo.bar/x/y";
168 | const std::string url_disallowed = "http://foo.bar/a/b";
169 |
170 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
171 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
172 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
173 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
174 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
175 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
176 | }
177 |
178 | // A user-agent line is expected to contain only [a-zA-Z_-] characters and must
179 | // not be empty. See REP RFC section "The user-agent line".
180 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
181 | TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) {
182 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot"));
183 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar"));
184 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar"));
185 |
186 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view()));
187 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(""));
188 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ"));
189 |
190 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*"));
191 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot "));
192 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1"));
193 |
194 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
195 | }
196 |
197 | // User-agent line values are case insensitive. See REP RFC section "The
198 | // user-agent line".
199 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
200 | TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) {
201 | const absl::string_view robotstxt_upper =
202 | "User-Agent: FOO BAR\n"
203 | "Allow: /x/\n"
204 | "Disallow: /\n";
205 | const absl::string_view robotstxt_lower =
206 | "User-Agent: foo bar\n"
207 | "Allow: /x/\n"
208 | "Disallow: /\n";
209 | const absl::string_view robotstxt_camel =
210 | "User-Agent: FoO bAr\n"
211 | "Allow: /x/\n"
212 | "Disallow: /\n";
213 | const std::string url_allowed = "http://foo.bar/x/y";
214 | const std::string url_disallowed = "http://foo.bar/a/b";
215 |
216 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
217 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
218 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
219 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
220 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
221 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
222 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
223 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
224 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
225 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
226 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
227 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
228 | }
229 |
230 | // Google specific: accept user-agent value up to the first space. Space is not
231 | // allowed in user-agent values, but that doesn't stop webmasters from using
232 | // them. This is more restrictive than the RFC, since in case of the bad value
233 | // "Googlebot Images" we'd still obey the rules with "Googlebot".
234 | // Extends REP RFC section "The user-agent line"
235 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
236 | TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) {
237 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
238 | const absl::string_view robotstxt =
239 | "User-Agent: *\n"
240 | "Disallow: /\n"
241 | "User-Agent: Foo Bar\n"
242 | "Allow: /x/\n"
243 | "Disallow: /\n";
244 | const std::string url = "http://foo.bar/x/y";
245 |
246 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url));
247 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url));
248 | }
249 |
250 | // If no group matches the user-agent, crawlers must obey the first group with a
251 | // user-agent line with a "*" value, if present. If no group satisfies either
252 | // condition, or no groups are present at all, no rules apply.
253 | // See REP RFC section "The user-agent line".
254 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
255 | TEST(RobotsUnittest, ID_GlobalGroups_Secondary) {
256 | const absl::string_view robotstxt_empty = "";
257 | const absl::string_view robotstxt_global =
258 | "user-agent: *\n"
259 | "allow: /\n"
260 | "user-agent: FooBot\n"
261 | "disallow: /\n";
262 | const absl::string_view robotstxt_only_specific =
263 | "user-agent: FooBot\n"
264 | "allow: /\n"
265 | "user-agent: BarBot\n"
266 | "disallow: /\n"
267 | "user-agent: BazBot\n"
268 | "disallow: /\n";
269 | const std::string url = "http://foo.bar/x/y";
270 |
271 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
272 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url));
273 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url));
274 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
275 | }
276 |
277 | // Matching rules against URIs is case sensitive.
278 | // See REP RFC section "The Allow and Disallow lines".
279 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
280 | TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) {
281 | const absl::string_view robotstxt_lowercase_url =
282 | "user-agent: FooBot\n"
283 | "disallow: /x/\n";
284 | const absl::string_view robotstxt_uppercase_url =
285 | "user-agent: FooBot\n"
286 | "disallow: /X/\n";
287 | const std::string url = "http://foo.bar/x/y";
288 |
289 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
290 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
291 | }
292 |
293 | // The most specific match found MUST be used. The most specific match is the
294 | // match that has the most octets. In case of multiple rules with the same
295 | // length, the least strict rule must be used.
296 | // See REP RFC section "The Allow and Disallow lines".
297 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
298 | TEST(RobotsUnittest, ID_LongestMatch) {
299 | const std::string url = "http://foo.bar/x/page.html";
300 | {
301 | const absl::string_view robotstxt =
302 | "user-agent: FooBot\n"
303 | "disallow: /x/page.html\n"
304 | "allow: /x/\n";
305 |
306 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
307 | }
308 | {
309 | const absl::string_view robotstxt =
310 | "user-agent: FooBot\n"
311 | "allow: /x/page.html\n"
312 | "disallow: /x/\n";
313 |
314 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
315 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
316 | }
317 | {
318 | const absl::string_view robotstxt =
319 | "user-agent: FooBot\n"
320 | "disallow: \n"
321 | "allow: \n";
322 | // In case of equivalent disallow and allow patterns for the same
323 | // user-agent, allow is used.
324 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
325 | }
326 | {
327 | const absl::string_view robotstxt =
328 | "user-agent: FooBot\n"
329 | "disallow: /\n"
330 | "allow: /\n";
331 | // In case of equivalent disallow and allow patterns for the same
332 | // user-agent, allow is used.
333 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
334 | }
335 | {
336 | std::string url_a = "http://foo.bar/x";
337 | std::string url_b = "http://foo.bar/x/";
338 | const absl::string_view robotstxt =
339 | "user-agent: FooBot\n"
340 | "disallow: /x\n"
341 | "allow: /x/\n";
342 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a));
343 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b));
344 | }
345 |
346 | {
347 | const absl::string_view robotstxt =
348 | "user-agent: FooBot\n"
349 | "disallow: /x/page.html\n"
350 | "allow: /x/page.html\n";
351 | // In case of equivalent disallow and allow patterns for the same
352 | // user-agent, allow is used.
353 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
354 | }
355 | {
356 | const absl::string_view robotstxt =
357 | "user-agent: FooBot\n"
358 | "allow: /page\n"
359 | "disallow: /*.html\n";
360 | // Longest match wins.
361 | EXPECT_FALSE(
362 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
363 | EXPECT_TRUE(
364 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
365 | }
366 | {
367 | const absl::string_view robotstxt =
368 | "user-agent: FooBot\n"
369 | "allow: /x/page.\n"
370 | "disallow: /*.html\n";
371 | // Longest match wins.
372 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
373 | EXPECT_FALSE(
374 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
375 | }
376 | {
377 | const absl::string_view robotstxt =
378 | "User-agent: *\n"
379 | "Disallow: /x/\n"
380 | "User-agent: FooBot\n"
381 | "Disallow: /y/\n";
382 |     // The most specific group for FooBot implicitly allows /x/page.
383 | EXPECT_TRUE(
384 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
385 | EXPECT_FALSE(
386 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
387 | }
388 | }
389 |
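// Editor's sketch (not part of the upstream test file): the precedence
// behaviour verified above can also be checked directly through the public
// matcher API declared in robots.h. This assumes
// googlebot::RobotsMatcher::OneAgentAllowedByRobots(), which the
// IsUserAgentAllowed() helper used throughout this file is expected to wrap;
// treat the exact call as an assumption rather than a specification.
bool LongestMatchAllowsPage() {
  const std::string robotstxt =
      "user-agent: FooBot\n"
      "allow: /x/page.html\n"
      "disallow: /x/\n";
  googlebot::RobotsMatcher matcher;
  // "allow: /x/page.html" (12 octets) is more specific than "disallow: /x/"
  // (3 octets), so the allow rule wins and the URL is allowed.
  return matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                         "http://foo.bar/x/page.html");
}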
390 | // Octets in the URI and robots.txt paths outside the range of the US-ASCII
391 | // coded character set, and those in the reserved range defined by RFC3986,
392 | // MUST be percent-encoded as defined by RFC3986 prior to comparison.
393 | // See REP RFC section "The Allow and Disallow lines".
394 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
395 | //
396 | // NOTE: It's up to the caller to percent-encode a URL before passing it to the
397 | // parser. Percent-encoding URIs in the rules is unnecessary.
398 | TEST(RobotsUnittest, ID_Encoding) {
399 | // /foo/bar?baz=http://foo.bar stays unencoded.
400 | {
401 | const absl::string_view robotstxt =
402 | "User-agent: FooBot\n"
403 | "Disallow: /\n"
404 | "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
405 | EXPECT_TRUE(IsUserAgentAllowed(
406 | robotstxt, "FooBot",
407 | "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
408 | }
409 |
410 | // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
411 | {
412 | const absl::string_view robotstxt =
413 | "User-agent: FooBot\n"
414 | "Disallow: /\n"
415 | "Allow: /foo/bar/ツ\n";
416 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
417 | "http://foo.bar/foo/bar/%E3%83%84"));
418 | // The parser encodes the 3-byte character, but the URL is not %-encoded.
419 | EXPECT_FALSE(
420 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
421 | }
422 | // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
423 | {
424 | const absl::string_view robotstxt =
425 | "User-agent: FooBot\n"
426 | "Disallow: /\n"
427 | "Allow: /foo/bar/%E3%83%84\n";
428 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
429 | "http://foo.bar/foo/bar/%E3%83%84"));
430 | EXPECT_FALSE(
431 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
432 | }
433 | // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
434 | // This is illegal according to RFC3986 and while it may work here due to
435 | // simple string matching, it should not be relied on.
436 | {
437 | const absl::string_view robotstxt =
438 | "User-agent: FooBot\n"
439 | "Disallow: /\n"
440 | "Allow: /foo/bar/%62%61%7A\n";
441 | EXPECT_FALSE(
442 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
443 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
444 | "http://foo.bar/foo/bar/%62%61%7A"));
445 | }
446 | }
447 |
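// Editor's sketch (not part of the upstream test file): the NOTE above says
// the caller must percent-encode the URL before handing it to the matcher. A
// minimal caller-side encoder might look like the function below. The name
// PercentEncodeNonAscii is hypothetical (the library does not export such a
// helper), and it only escapes bytes outside printable US-ASCII, which is all
// the "ツ" case above needs.
std::string PercentEncodeNonAscii(absl::string_view url) {
  static const char kHex[] = "0123456789ABCDEF";
  std::string encoded;
  encoded.reserve(url.size());
  for (const unsigned char c : url) {
    if (c > 0x20 && c < 0x7F) {
      encoded.push_back(static_cast<char>(c));  // Printable US-ASCII: keep.
    } else {
      encoded.push_back('%');  // Everything else: %XX-encode the byte.
      encoded.push_back(kHex[c >> 4]);
      encoded.push_back(kHex[c & 0xF]);
    }
  }
  return encoded;
}
// With such a helper, "http://foo.bar/foo/bar/ツ" becomes
// "http://foo.bar/foo/bar/%E3%83%84", the form the matcher accepts above.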
448 | // The REP RFC defines the following characters that have special meaning in
449 | // robots.txt:
450 | // # - inline comment.
451 | // $ - end of pattern.
452 | // * - any number of characters.
453 | // See REP RFC section "Special Characters".
454 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.3
455 | TEST(RobotsUnittest, ID_SpecialCharacters) {
456 | {
457 | const absl::string_view robotstxt =
458 | "User-agent: FooBot\n"
459 | "Disallow: /foo/bar/quz\n"
460 | "Allow: /foo/*/qux\n";
461 | EXPECT_FALSE(
462 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
463 | EXPECT_TRUE(
464 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
465 | EXPECT_TRUE(
466 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
467 | EXPECT_TRUE(
468 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
469 | }
470 | {
471 | const absl::string_view robotstxt =
472 | "User-agent: FooBot\n"
473 | "Disallow: /foo/bar$\n"
474 | "Allow: /foo/bar/qux\n";
475 | EXPECT_FALSE(
476 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
477 | EXPECT_TRUE(
478 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
479 | EXPECT_TRUE(
480 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
481 | EXPECT_TRUE(
482 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
483 | }
484 | {
485 | const absl::string_view robotstxt =
486 | "User-agent: FooBot\n"
487 | "# Disallow: /\n"
488 | "Disallow: /foo/quz#qux\n"
489 | "Allow: /\n";
490 | EXPECT_TRUE(
491 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
492 | EXPECT_FALSE(
493 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
494 | }
495 | }
496 |
497 | // Google-specific: "index.html" (and only that) at the end of a pattern is
498 | // equivalent to "/".
499 | TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) {
500 | const absl::string_view robotstxt =
501 | "User-Agent: *\n"
502 | "Allow: /allowed-slash/index.html\n"
503 | "Disallow: /\n";
504 | // If index.html is allowed, we interpret this as / being allowed too.
505 | EXPECT_TRUE(
506 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
507 |   // Does not exactly match.
508 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot",
509 | "http://foo.com/allowed-slash/index.htm"));
510 | // Exact match.
511 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot",
512 | "http://foo.com/allowed-slash/index.html"));
513 | EXPECT_FALSE(
514 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
515 | }
516 |
517 | // Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
518 | // RobotsTxtParser::Parse().
519 | TEST(RobotsUnittest, GoogleOnly_LineTooLong) {
520 | size_t kEOLLen = std::string("\n").length();
521 | int kMaxLineLen = 2083 * 8;
522 | std::string allow = "allow: ";
523 | std::string disallow = "disallow: ";
524 |
525 | // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
526 | {
527 | std::string robotstxt = "user-agent: FooBot\n";
528 | std::string longline = "/x/";
529 | size_t max_length =
530 | kMaxLineLen - longline.length() - disallow.length() + kEOLLen;
531 | while (longline.size() < max_length) {
532 | absl::StrAppend(&longline, "a");
533 | }
534 | absl::StrAppend(&robotstxt, disallow, longline, "/qux\n");
535 |
536 | // Matches nothing, so URL is allowed.
537 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
538 |     // Matches the cut-off disallow rule.
539 | EXPECT_FALSE(IsUserAgentAllowed(
540 | robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux")));
541 | }
542 |
543 | {
544 | std::string robotstxt =
545 | "user-agent: FooBot\n"
546 | "disallow: /\n";
547 | std::string longline_a = "/x/";
548 | std::string longline_b = "/x/";
549 | size_t max_length =
550 | kMaxLineLen - longline_a.length() - allow.length() + kEOLLen;
551 | while (longline_a.size() < max_length) {
552 | absl::StrAppend(&longline_a, "a");
553 | absl::StrAppend(&longline_b, "b");
554 | }
555 | absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n");
556 | absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n");
557 |
558 | // URL matches the disallow rule.
559 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
560 | // Matches the allow rule exactly.
561 | EXPECT_TRUE(
562 | IsUserAgentAllowed(robotstxt, "FooBot",
563 | absl::StrCat("http://foo.bar", longline_a, "/qux")));
564 |     // Matches the cut-off allow rule.
565 | EXPECT_TRUE(
566 | IsUserAgentAllowed(robotstxt, "FooBot",
567 | absl::StrCat("http://foo.bar", longline_b, "/fux")));
568 | }
569 | }
570 |
571 | TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) {
572 | // Test documentation from
573 | // https://developers.google.com/search/reference/robots_txt
574 | // Section "URL matching based on path values".
575 | {
576 | std::string robotstxt =
577 | "user-agent: FooBot\n"
578 | "disallow: /\n"
579 | "allow: /fish\n";
580 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
581 |
582 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
583 | EXPECT_TRUE(
584 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
585 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
586 | "http://foo.bar/fish/salmon.html"));
587 | EXPECT_TRUE(
588 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
589 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
590 | "http://foo.bar/fishheads/yummy.html"));
591 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
592 | "http://foo.bar/fish.html?id=anything"));
593 |
594 | EXPECT_FALSE(
595 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
596 | EXPECT_FALSE(
597 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
598 | EXPECT_FALSE(
599 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
600 | }
601 | // "/fish*" equals "/fish"
602 | {
603 | std::string robotstxt =
604 | "user-agent: FooBot\n"
605 | "disallow: /\n"
606 | "allow: /fish*\n";
607 | EXPECT_FALSE(
608 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
609 |
610 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
611 | EXPECT_TRUE(
612 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
613 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
614 | "http://foo.bar/fish/salmon.html"));
615 | EXPECT_TRUE(
616 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
617 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
618 | "http://foo.bar/fishheads/yummy.html"));
619 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
620 | "http://foo.bar/fish.html?id=anything"));
621 |
622 | EXPECT_FALSE(
623 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
624 | EXPECT_FALSE(
625 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
626 | EXPECT_FALSE(
627 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
628 | }
629 | // "/fish/" does not equal "/fish"
630 | {
631 | std::string robotstxt =
632 | "user-agent: FooBot\n"
633 | "disallow: /\n"
634 | "allow: /fish/\n";
635 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
636 |
637 | EXPECT_TRUE(
638 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
639 | EXPECT_TRUE(
640 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
641 | EXPECT_TRUE(
642 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
643 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
644 | "http://foo.bar/fish/salmon.html"));
645 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
646 | "http://foo.bar/fish/?id=anything"));
647 |
648 | EXPECT_FALSE(
649 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
650 | EXPECT_FALSE(
651 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
652 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
653 | "http://foo.bar/Fish/Salmon.html"));
654 | }
655 | // "/*.php"
656 | {
657 | std::string robotstxt =
658 | "user-agent: FooBot\n"
659 | "disallow: /\n"
660 | "allow: /*.php\n";
661 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
662 |
663 | EXPECT_TRUE(
664 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
665 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
666 | "http://foo.bar/folder/filename.php"));
667 | EXPECT_TRUE(IsUserAgentAllowed(
668 | robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
669 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
670 | "http://foo.bar//folder/any.php.file.html"));
671 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
672 | "http://foo.bar/filename.php/"));
673 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
674 | "http://foo.bar/index?f=filename.php/"));
675 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
676 | "http://foo.bar/php/"));
677 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
678 | "http://foo.bar/index?php"));
679 |
680 | EXPECT_FALSE(
681 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
682 | }
683 | // "/*.php$"
684 | {
685 | std::string robotstxt =
686 | "user-agent: FooBot\n"
687 | "disallow: /\n"
688 | "allow: /*.php$\n";
689 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
690 |
691 | EXPECT_TRUE(
692 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
693 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
694 | "http://foo.bar/folder/filename.php"));
695 |
696 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
697 | "http://foo.bar/filename.php?parameters"));
698 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
699 | "http://foo.bar/filename.php/"));
700 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
701 | "http://foo.bar/filename.php5"));
702 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
703 | "http://foo.bar/php/"));
704 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
705 | "http://foo.bar/filename?php"));
706 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
707 | "http://foo.bar/aaaphpaaa"));
708 | EXPECT_FALSE(
709 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
710 | }
711 | // "/fish*.php"
712 | {
713 | std::string robotstxt =
714 | "user-agent: FooBot\n"
715 | "disallow: /\n"
716 | "allow: /fish*.php\n";
717 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
718 |
719 | EXPECT_TRUE(
720 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
721 | EXPECT_TRUE(
722 | IsUserAgentAllowed(robotstxt, "FooBot",
723 | "http://foo.bar/fishheads/catfish.php?parameters"));
724 |
725 | EXPECT_FALSE(
726 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
727 | }
728 | // Section "Order of precedence for group-member records".
729 | {
730 | std::string robotstxt =
731 | "user-agent: FooBot\n"
732 | "allow: /p\n"
733 | "disallow: /\n";
734 | std::string url = "http://example.com/page";
735 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
736 | }
737 | {
738 | std::string robotstxt =
739 | "user-agent: FooBot\n"
740 | "allow: /folder\n"
741 | "disallow: /folder\n";
742 | std::string url = "http://example.com/folder/page";
743 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
744 | }
745 | {
746 | std::string robotstxt =
747 | "user-agent: FooBot\n"
748 | "allow: /page\n"
749 | "disallow: /*.htm\n";
750 | std::string url = "http://example.com/page.htm";
751 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
752 | }
753 | {
754 | std::string robotstxt =
755 | "user-agent: FooBot\n"
756 | "allow: /$\n"
757 | "disallow: /\n";
758 | std::string url = "http://example.com/";
759 | std::string url_page = "http://example.com/page.html";
760 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
761 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page));
762 | }
763 | }
764 |
765 | class RobotsStatsReporter : public googlebot::RobotsParseHandler {
766 | public:
767 | void HandleRobotsStart() override {
768 | last_line_seen_ = 0;
769 | valid_directives_ = 0;
770 | unknown_directives_ = 0;
771 | sitemap_.clear();
772 | }
773 | void HandleRobotsEnd() override {}
774 |
775 | void HandleUserAgent(int line_num, absl::string_view value) override {
776 | Digest(line_num);
777 | }
778 | void HandleAllow(int line_num, absl::string_view value) override {
779 | Digest(line_num);
780 | }
781 | void HandleDisallow(int line_num, absl::string_view value) override {
782 | Digest(line_num);
783 | }
784 |
785 | void HandleSitemap(int line_num, absl::string_view value) override {
786 | Digest(line_num);
787 | sitemap_.append(value.data(), value.length());
788 | }
789 |
790 |   // Any other unrecognized name/value pairs.
791 | void HandleUnknownAction(int line_num, absl::string_view action,
792 | absl::string_view value) override {
793 | last_line_seen_ = line_num;
794 | unknown_directives_++;
795 | }
796 |
797 | int last_line_seen() const { return last_line_seen_; }
798 |
799 |   // Number of recognized directives found (unknown ones are counted separately).
800 | int valid_directives() const { return valid_directives_; }
801 |
802 | // Number of unknown directives.
803 | int unknown_directives() const { return unknown_directives_; }
804 |
805 | // Parsed sitemap line.
806 | std::string sitemap() const { return sitemap_; }
807 |
808 | private:
809 | void Digest(int line_num) {
810 | ASSERT_GE(line_num, last_line_seen_);
811 | last_line_seen_ = line_num;
812 | valid_directives_++;
813 | }
814 |
815 | int last_line_seen_ = 0;
816 | int valid_directives_ = 0;
817 | int unknown_directives_ = 0;
818 | std::string sitemap_;
819 | };
820 |
821 | // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
822 | TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) {
823 | RobotsStatsReporter report;
824 | static const char kUnixFile[] =
825 | "User-Agent: foo\n"
826 | "Allow: /some/path\n"
827 | "User-Agent: bar\n"
828 | "\n"
829 | "\n"
830 | "Disallow: /\n";
831 | googlebot::ParseRobotsTxt(kUnixFile, &report);
832 | EXPECT_EQ(4, report.valid_directives());
833 | EXPECT_EQ(6, report.last_line_seen());
834 |
835 | static const char kDosFile[] =
836 | "User-Agent: foo\r\n"
837 | "Allow: /some/path\r\n"
838 | "User-Agent: bar\r\n"
839 | "\r\n"
840 | "\r\n"
841 | "Disallow: /\r\n";
842 | googlebot::ParseRobotsTxt(kDosFile, &report);
843 | EXPECT_EQ(4, report.valid_directives());
844 | EXPECT_EQ(6, report.last_line_seen());
845 |
846 | static const char kMacFile[] =
847 | "User-Agent: foo\r"
848 | "Allow: /some/path\r"
849 | "User-Agent: bar\r"
850 | "\r"
851 | "\r"
852 | "Disallow: /\r";
853 | googlebot::ParseRobotsTxt(kMacFile, &report);
854 | EXPECT_EQ(4, report.valid_directives());
855 | EXPECT_EQ(6, report.last_line_seen());
856 |
857 | static const char kNoFinalNewline[] =
858 | "User-Agent: foo\n"
859 | "Allow: /some/path\n"
860 | "User-Agent: bar\n"
861 | "\n"
862 | "\n"
863 | "Disallow: /";
864 | googlebot::ParseRobotsTxt(kNoFinalNewline, &report);
865 | EXPECT_EQ(4, report.valid_directives());
866 | EXPECT_EQ(6, report.last_line_seen());
867 |
868 | static const char kMixedFile[] =
869 | "User-Agent: foo\n"
870 | "Allow: /some/path\r\n"
871 | "User-Agent: bar\n"
872 | "\r\n"
873 | "\n"
874 | "Disallow: /";
875 | googlebot::ParseRobotsTxt(kMixedFile, &report);
876 | EXPECT_EQ(4, report.valid_directives());
877 | EXPECT_EQ(6, report.last_line_seen());
878 | }
879 |
880 | // A UTF-8 byte order mark (BOM) at the start of the file is unparseable and
881 | // thus skipped; the rules that follow it are still applied.
882 | TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) {
883 | RobotsStatsReporter report;
884 | static const char kUtf8FileFullBOM[] =
885 | "\xEF\xBB\xBF"
886 | "User-Agent: foo\n"
887 | "Allow: /AnyValue\n";
888 | googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report);
889 | EXPECT_EQ(2, report.valid_directives());
890 | EXPECT_EQ(0, report.unknown_directives());
891 |
892 |   // We also allow partial byte order marks.
893 | static const char kUtf8FilePartial2BOM[] =
894 | "\xEF\xBB"
895 | "User-Agent: foo\n"
896 | "Allow: /AnyValue\n";
897 | googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report);
898 | EXPECT_EQ(2, report.valid_directives());
899 | EXPECT_EQ(0, report.unknown_directives());
900 |
901 | static const char kUtf8FilePartial1BOM[] =
902 | "\xEF"
903 | "User-Agent: foo\n"
904 | "Allow: /AnyValue\n";
905 | googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report);
906 | EXPECT_EQ(2, report.valid_directives());
907 | EXPECT_EQ(0, report.unknown_directives());
908 |
909 | // If the BOM is not the right sequence, the first line looks like garbage
910 | // that is skipped (we essentially see "\x11\xBFUser-Agent").
911 | static const char kUtf8FileBrokenBOM[] =
912 | "\xEF\x11\xBF"
913 | "User-Agent: foo\n"
914 | "Allow: /AnyValue\n";
915 | googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report);
916 | EXPECT_EQ(1, report.valid_directives());
917 | EXPECT_EQ(1, report.unknown_directives()); // We get one broken line.
918 |
919 |   // Another malformed file: a BOM is only valid at the beginning of the file.
920 | static const char kUtf8BOMSomewhereInMiddleOfFile[] =
921 | "User-Agent: foo\n"
922 | "\xEF\xBB\xBF"
923 | "Allow: /AnyValue\n";
924 | googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report);
925 | EXPECT_EQ(1, report.valid_directives());
926 | EXPECT_EQ(1, report.unknown_directives());
927 | }
928 |
929 | // Google-specific: the RFC allows additional records that crawlers might need,
930 | // such as Sitemap, which Google supports.
931 | // See REP RFC section "Other records".
932 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.4
933 | TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) {
934 | RobotsStatsReporter report;
935 | {
936 | std::string sitemap_loc = "http://foo.bar/sitemap.xml";
937 | std::string robotstxt =
938 | "User-Agent: foo\n"
939 | "Allow: /some/path\n"
940 | "User-Agent: bar\n"
941 | "\n"
942 | "\n";
943 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n");
944 |
945 | googlebot::ParseRobotsTxt(robotstxt, &report);
946 | EXPECT_EQ(sitemap_loc, report.sitemap());
947 | }
948 | // A sitemap line may appear anywhere in the file.
949 | {
950 | std::string robotstxt;
951 | std::string sitemap_loc = "http://foo.bar/sitemap.xml";
952 | std::string robotstxt_temp =
953 | "User-Agent: foo\n"
954 | "Allow: /some/path\n"
955 | "User-Agent: bar\n"
956 | "\n"
957 | "\n";
958 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp);
959 |
960 | googlebot::ParseRobotsTxt(robotstxt, &report);
961 | EXPECT_EQ(sitemap_loc, report.sitemap());
962 | }
963 | }
964 |
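// Editor's sketch (not part of the upstream test file): RobotsStatsReporter
// above concatenates Sitemap values into a single string, which is enough for
// these tests, but a robots.txt file may contain several Sitemap lines. A
// caller that wants all of them could collect them instead. The class name
// SitemapCollector is hypothetical; it overrides the same set of callbacks as
// RobotsStatsReporter and additionally needs <vector>.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override { sitemaps_.clear(); }
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleAllow(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleDisallow(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleSitemap(int /*line_num*/, absl::string_view value) override {
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int /*line_num*/, absl::string_view /*action*/,
                           absl::string_view /*value*/) override {}

  // Every Sitemap URL seen, in file order.
  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};
// Usage would mirror the tests above:
//   SitemapCollector collector;
//   googlebot::ParseRobotsTxt(robotstxt, &collector);
//   // collector.sitemaps() now holds every Sitemap URL in the file.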
965 | } // namespace
966 |
967 | // Integrity tests. These functions are available to the linker, but not in the
968 | // header, because they should only be used for testing.
969 | namespace googlebot {
970 | std::string GetPathParamsQuery(const std::string& url);
971 | bool MaybeEscapePattern(const char* src, char** dst);
972 | } // namespace googlebot
973 |
974 | void TestPath(const std::string& url, const std::string& expected_path) {
975 | EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url));
976 | }
977 |
978 | void TestEscape(const std::string& url, const std::string& expected) {
979 | char* escaped_value = nullptr;
980 | const bool is_escaped =
981 | googlebot::MaybeEscapePattern(url.c_str(), &escaped_value);
982 | const std::string escaped = escaped_value;
983 | if (is_escaped) delete[] escaped_value;
984 |
985 | EXPECT_EQ(expected, escaped);
986 | }
987 |
988 | TEST(RobotsUnittest, TestGetPathParamsQuery) {
989 | // Only testing URLs that are already correctly escaped here.
990 | TestPath("", "/");
991 | TestPath("http://www.example.com", "/");
992 | TestPath("http://www.example.com/", "/");
993 | TestPath("http://www.example.com/a", "/a");
994 | TestPath("http://www.example.com/a/", "/a/");
995 | TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
996 | TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
997 | TestPath("example.com", "/");
998 | TestPath("example.com/", "/");
999 | TestPath("example.com/a", "/a");
1000 | TestPath("example.com/a/", "/a/");
1001 | TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
1002 | TestPath("a", "/");
1003 | TestPath("a/", "/");
1004 | TestPath("/a", "/a");
1005 | TestPath("a/b", "/b");
1006 | TestPath("example.com?a", "/?a");
1007 | TestPath("example.com/a;b#c", "/a;b");
1008 | TestPath("//a/b/c", "/b/c");
1009 | }
1010 |
1011 | TEST(RobotsUnittest, TestMaybeEscapePattern) {
1012 | TestEscape("http://www.example.com", "http://www.example.com");
1013 | TestEscape("/a/b/c", "/a/b/c");
1014 | TestEscape("á", "%C3%A1");
1015 | TestEscape("%aa", "%AA");
1016 | }
1017 |
--------------------------------------------------------------------------------