├── .gitignore
├── BUILD
├── CMakeLists.txt
├── CMakeLists.txt.in
├── CONTRIBUTING.md
├── LICENSE
├── MODULE.bazel
├── README.md
├── protocol-draft
│   └── draft-illyes-repext-00.xml
├── reporting_robots.cc
├── reporting_robots.h
├── reporting_robots_test.cc
├── robots.cc
├── robots.h
├── robots_main.cc
└── robots_test.cc
/.gitignore:
--------------------------------------------------------------------------------
1 | c-build/*
2 |
--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
2 |
3 | package(default_visibility = ["//visibility:public"])
4 |
5 | licenses(["notice"])
6 |
7 | exports_files(["LICENSE"])
8 |
9 | cc_library(
10 | name = "robots",
11 | srcs = [
12 | "robots.cc",
13 | ],
14 | hdrs = [
15 | "robots.h",
16 | ],
17 | deps = [
18 | "@abseil-cpp//absl/base:core_headers",
19 | "@abseil-cpp//absl/container:fixed_array",
20 | "@abseil-cpp//absl/strings",
21 | ],
22 | )
23 |
24 | cc_library(
25 | name = "reporting_robots",
26 | srcs = ["reporting_robots.cc"],
27 | hdrs = ["reporting_robots.h"],
28 | deps = [
29 | ":robots",
30 | "@abseil-cpp//absl/container:btree",
31 | "@abseil-cpp//absl/strings",
32 | ],
33 | )
34 |
35 | cc_test(
36 | name = "robots_test",
37 | srcs = ["robots_test.cc"],
38 | deps = [
39 | ":robots",
40 | "@abseil-cpp//absl/strings",
41 | "@googletest//:gtest_main",
42 | ],
43 | )
44 |
45 | cc_test(
46 | name = "reporting_robots_test",
47 | srcs = ["reporting_robots_test.cc"],
48 | deps = [
49 | ":reporting_robots",
50 | ":robots",
51 | "@abseil-cpp//absl/strings",
52 | "@googletest//:gtest_main",
53 | ],
54 | )
55 |
56 | cc_binary(
57 | name = "robots_main",
58 | srcs = ["robots_main.cc"],
59 | deps = [
60 | ":robots",
61 | ],
62 | )
63 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | PROJECT(robots)
2 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
3 |
4 | SET(CMAKE_CXX_STANDARD 14)
5 | SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
6 |
7 | SET(VERSION "0.0.0")
8 |
9 | STRING(REGEX MATCHALL "([0-9]+)" VERSION_DIGITS "${VERSION}")
10 |
11 | LIST(GET VERSION_DIGITS 0 CPACK_PACKAGE_VERSION_MAJOR)
12 | LIST(GET VERSION_DIGITS 1 CPACK_PACKAGE_VERSION_MINOR)
13 | LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH)
14 |
15 | SET(CPACK_PACKAGE_NAME "robots")
16 | SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library")
17 | SET(CPACK_PACKAGE_VENDOR "Google Inc.")
18 | SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md")
19 | SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE")
20 |
21 | SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
22 | SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
23 |
24 | SET(base_with_ver "robots-[0-9]+\\\\.[0-9]+\\\\.[0-9]+")
25 | SET(CPACK_SOURCE_IGNORE_FILES
26 | "/_CPack_Packages/"
27 | "/CMakeFiles/"
28 | "/.deps/"
29 | "^${base_with_ver}(-Source|-Linux)?/"
30 | "${base_with_ver}.tar\\\\.(gz|bz2|Z|lzma|xz)$"
31 | "\\\\.o$"
32 | "~$"
33 | "/\\\\.svn/"
34 | "/CMakeCache\\\\.txt$"
35 | "/CTestTestfile\\\\.cmake$"
36 | "/cmake_install\\\\.cmake$"
37 | "/CPackConfig\\\\.cmake$"
38 | "/CPackSourceConfig\\\\.cmake$"
39 | "/tags$"
40 | "^config\\\\.h$"
41 | "/install_manifest\\\\.txt$"
42 | "/Testing/"
43 | "ids-whitelist\\\\.txt"
44 | "/_Inline/"
45 | "/(B|build|BUILD)/"
46 | "/autom4te.cache/"
47 | )
48 |
49 | ############ build options ##############
50 |
51 | OPTION(ROBOTS_BUILD_STATIC "If ON, robots will also build the static library" ON)
52 | OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF)
53 | OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON)
54 |
55 | ############ helper libs ############
56 |
57 | INCLUDE(CPack)
58 | INCLUDE(CheckCCompilerFlag)
59 | INCLUDE(ExternalProject)
60 |
61 | ############ dependencies ##############
62 |
63 | CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt)
64 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
65 |
66 | IF(result)
67 | MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
68 | ENDIF()
69 |
70 | EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
71 |
72 | IF(result)
73 | MESSAGE(FATAL_ERROR "Failed to build dependencies: ${result}")
74 | ENDIF()
75 |
76 | # abseil-cpp
77 | IF(MSVC)
78 | # /wd4005 macro-redefinition
79 | # /wd4068 unknown pragma
80 | # /wd4244 conversion from 'type1' to 'type2'
81 | # /wd4267 conversion from 'size_t' to 'type2'
82 | # /wd4800 force value to bool 'true' or 'false' (performance warning)
83 | ADD_COMPILE_OPTIONS(/wd4005 /wd4068 /wd4244 /wd4267 /wd4800)
84 | ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS)
85 | ENDIF(MSVC)
86 |
87 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src
88 | ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build
89 | EXCLUDE_FROM_ALL)
90 |
91 | IF(ROBOTS_BUILD_TESTS)
92 | INCLUDE(CTest)
93 |
94 | # googletest
95 | ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src
96 | ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build
97 | EXCLUDE_FROM_ALL)
98 |
99 | SET(INSTALL_GTEST 0)
100 | SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
101 |
102 | IF(CMAKE_VERSION VERSION_LESS 2.8.11)
103 | INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include)
104 | ENDIF()
105 | ENDIF(ROBOTS_BUILD_TESTS)
106 |
107 | ########### compiler flags ##############
108 |
109 |
110 | SET(COMPILER_FLAGS_TO_CHECK
111 | "-Wall" "-Werror=implicit-function-declaration"
112 | )
113 |
114 | IF(CPU_ARCH)
115 | LIST(APPEND COMPILER_FLAGS_TO_CHECK "-march=${CPU_ARCH}")
116 | ENDIF(CPU_ARCH)
117 |
118 | ########### project files ###############
119 |
120 | INCLUDE_DIRECTORIES(.)
121 |
122 | ######### targets ###########
123 |
124 | SET(LIBROBOTS_LIBS)
125 |
126 | SET(robots_SRCS ./robots.cc)
127 | SET(robots_LIBS absl::base absl::strings)
128 |
129 | ADD_LIBRARY(robots SHARED ${robots_SRCS})
130 | TARGET_LINK_LIBRARIES(robots ${robots_LIBS})
131 | LIST(APPEND LIBROBOTS_LIBS "robots")
132 |
133 | IF(ROBOTS_BUILD_STATIC)
134 | ADD_LIBRARY(robots-static STATIC ${robots_SRCS})
135 | TARGET_LINK_LIBRARIES(robots-static ${robots_LIBS})
136 |
137 | LIST(APPEND LIBROBOTS_LIBS "robots-static")
138 |
139 | SET_TARGET_PROPERTIES(robots-static PROPERTIES OUTPUT_NAME "robots")
140 |
141 | SET_TARGET_PROPERTIES(${LIBROBOTS_LIBS} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
142 | ENDIF(ROBOTS_BUILD_STATIC)
143 |
144 | IF(WIN32)
145 | SET_TARGET_PROPERTIES(robots PROPERTIES DEFINE_SYMBOL DLL_EXPORT)
146 | ENDIF(WIN32)
147 |
148 | ADD_EXECUTABLE(robots-main ./robots_main.cc)
149 | TARGET_LINK_LIBRARIES(robots-main ${LIBROBOTS_LIBS})
150 | SET_TARGET_PROPERTIES(robots-main PROPERTIES OUTPUT_NAME "robots")
151 |
152 | ############ installation ############
153 |
154 | IF(ROBOTS_INSTALL)
155 | INSTALL(TARGETS ${LIBROBOTS_LIBS}
156 | LIBRARY DESTINATION lib
157 | ARCHIVE DESTINATION lib
158 | )
159 |
160 | INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include)
161 |
162 | INSTALL(TARGETS robots-main DESTINATION bin)
163 | ENDIF(ROBOTS_INSTALL)
164 |
165 | ############ tests ##############
166 |
167 | IF(ROBOTS_BUILD_TESTS)
168 | ENABLE_TESTING()
169 |
170 | ADD_EXECUTABLE(robots-test ./robots_test.cc)
171 | TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main)
172 | ADD_TEST(NAME robots-test COMMAND robots-test)
173 | ENDIF(ROBOTS_BUILD_TESTS)
174 |
175 |
--------------------------------------------------------------------------------
/CMakeLists.txt.in:
--------------------------------------------------------------------------------
1 |
2 | PROJECT(dependency-downloader NONE)
3 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
4 |
5 | INCLUDE(ExternalProject)
6 |
7 | ExternalProject_Add(abseilcpp
8 | GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
9 | GIT_TAG master
10 | GIT_PROGRESS 1
11 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src"
12 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build"
13 | CONFIGURE_COMMAND ""
14 | BUILD_COMMAND ""
15 | INSTALL_COMMAND ""
16 | TEST_COMMAND ""
17 | )
18 |
19 | ExternalProject_Add(googletest
20 | GIT_REPOSITORY https://github.com/google/googletest.git
21 | GIT_TAG main
22 | GIT_PROGRESS 1
23 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src"
24 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build"
25 | CONFIGURE_COMMAND ""
26 | BUILD_COMMAND ""
27 | INSTALL_COMMAND ""
28 | TEST_COMMAND ""
29 | )
30 |
31 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Keep in mind that this library is
22 | used in Google's production systems, so we need to be very strict about what
23 | changes we accept. Consult
24 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
25 | information on using pull requests.
26 |
27 | ## Community Guidelines
28 |
29 | This project follows [Google's Open Source Community
30 | Guidelines](https://opensource.google.com/conduct/).
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | https://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | https://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
--------------------------------------------------------------------------------
/MODULE.bazel:
--------------------------------------------------------------------------------
1 | module(
2 | name = "com_google_robotstxt",
3 | version = "head",
4 | )
5 |
6 | # -- bazel_dep definitions -- #
7 | bazel_dep(
8 | name = "bazel_skylib",
9 | version = "1.5.0",
10 | )
11 |
12 | bazel_dep(
13 | name = "abseil-cpp",
14 | version = "20240116.2",
15 | )
16 |
17 | bazel_dep(
18 | name = "googletest",
19 | version = "1.14.0.bcr.1",
20 | )
21 |
22 | bazel_dep(
23 | name = "rules_cc",
24 | version = "0.0.9",
25 | )
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Google Robots.txt Parser and Matcher Library
3 |
4 | The repository contains Google's robots.txt parser and matcher as a C++ library
5 | (compliant with C++14).
6 |
7 | ## About the library
8 |
9 | The Robots Exclusion Protocol (REP) is a standard that enables website owners to
10 | control which URLs may be accessed by automated clients (i.e. crawlers) through
11 | a simple text file with a specific syntax. It's one of the basic building blocks
12 | of the internet as we know it and what allows search engines to operate.
13 |
14 | Because the REP was only a de-facto standard for the past 25 years, different
15 | implementers parse robots.txt slightly differently, leading to
16 | confusion. This project aims to fix that by releasing the parser that Google
17 | uses.
18 |
19 | The library is slightly modified production code (i.e. some internal headers were
20 | replaced with equivalent symbols) used by Googlebot, Google's crawler, to determine
21 | which URLs it may access based on rules provided by webmasters in robots.txt files.
22 | The library is released open-source to help developers build tools that better
23 | reflect Google's robots.txt parsing and matching.
24 |
25 | For webmasters, we included a small binary in the project that allows testing a
26 | single URL and user-agent against a robots.txt.
27 |
28 | ## Building the library
29 |
30 | ### Quickstart
31 |
32 | We included with the library a small binary to test a local robots.txt against a
33 | user-agent and URL. Running the included binary requires:
34 |
35 | * A compatible platform (e.g. Windows, macOS, Linux, etc.). Most platforms are
36 | fully supported.
37 | * A compatible C++ compiler supporting at least C++14. Most major compilers
38 | are supported.
39 | * [Git](https://git-scm.com/) for interacting with the source code repository.
40 | To install Git, consult the
41 | [Set Up Git](https://help.github.com/articles/set-up-git/) guide on
42 | [GitHub](https://github.com/).
43 | * Although you are free to use your own build system, most of the
44 | documentation within this guide will assume you are using
45 | [Bazel](https://bazel.build/). To download and install Bazel (and any of its
46 | dependencies), consult the
47 | [Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html).
48 |
49 | #### Building with Bazel
50 |
51 | [Bazel](https://bazel.build/) is the official build system for the library,
52 | which is supported on most major platforms (Linux, Windows, macOS, for example)
53 | and compilers.
54 |
55 | To build and run the binary:
56 |
57 | ```bash
58 | $ git clone https://github.com/google/robotstxt.git robotstxt
59 | Cloning into 'robotstxt'...
60 | ...
61 | $ cd robotstxt/
62 | bazel-robots$ bazel test :robots_test
63 | ...
64 | /:robots_test PASSED in 0.1s
65 |
66 | Executed 1 out of 1 test: 1 test passes.
67 | ...
68 | bazel-robots$ bazel build :robots_main
69 | ...
70 | Target //:robots_main up-to-date:
71 | bazel-bin/robots_main
72 | ...
73 | bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url
74 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
75 | ```
76 |
77 | #### Building with CMake
78 |
79 | [CMake](https://cmake.org) is the community-supported build system for the
80 | library.
81 |
82 | To build the library using CMake, just follow the steps below:
83 |
84 | ```bash
85 | $ git clone https://github.com/google/robotstxt.git robotstxt
86 | Cloning into 'robotstxt'...
87 | ...
88 | $ cd robotstxt/
89 | ...
90 | $ mkdir c-build && cd c-build
91 | ...
92 | $ cmake .. -DROBOTS_BUILD_TESTS=ON
93 | ...
94 | $ make
95 | ...
96 | $ make test
97 | Running tests...
98 | Test project robotstxt/c-build
99 | Start 1: robots-test
100 | 1/1 Test #1: robots-test ...................... Passed 0.02 sec
101 |
102 | 100% tests passed, 0 tests failed out of 1
103 |
104 | Total Test time (real) = 0.02 sec
105 | ...
106 | $ robots ~/local/path/to/robots.txt YourBot https://example.com/url
107 | user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
108 | ```
109 |
110 | ## Notes
111 |
112 | Parsing of robots.txt files themselves is done exactly as in the production
113 | version of Googlebot, including how percent codes and Unicode characters in
114 | patterns are handled. The user must ensure, however, that the URI passed to the
115 | AllowedByRobots and OneAgentAllowedByRobots functions, or to the URI parameter
116 | of the robots tool, follows the format specified by RFC 3986, since this library
117 | will not perform full normalization of those URI parameters. Only if the URI is
118 | in this format will the matching be done according to the REP specification.
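
For example, here is a minimal C++ sketch of checking one user-agent and URL
against a robots.txt body in memory; it assumes the `RobotsMatcher` class and its
`OneAgentAllowedByRobots` method as declared in `robots.h`:

```cpp
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  const std::string robots_content =
      "user-agent: YourBot\n"
      "disallow: /private/\n";
  const std::string user_agent = "YourBot";
  // The URL must already follow RFC 3986; the library does not normalize it.
  const std::string url = "https://example.com/private/page.html";

  googlebot::RobotsMatcher matcher;
  const bool allowed =
      matcher.OneAgentAllowedByRobots(robots_content, user_agent, url);
  std::cout << "allowed: " << (allowed ? "YES" : "NO") << std::endl;
  return 0;
}
```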
119 |
120 | Also note that the library, and the included binary, do not handle
121 | implementation logic that a crawler might apply outside of parsing and matching,
122 | for example: `Googlebot-Image` respecting the rules specified for `User-agent:
123 | Googlebot` if not explicitly defined in the robots.txt file being tested.
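
For instance, given a robots.txt such as the hypothetical one below, falling back
from `Googlebot-Image` to the `Googlebot` group is a crawler-side convention; the
parser and matcher only evaluate the groups as written, so callers that want the
fallback behavior must implement it themselves:

```
User-agent: Googlebot
Disallow: /private/
```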
124 |
125 | ## License
126 |
127 | The robots.txt parser and matcher C++ library is licensed under the terms of the
128 | Apache license. See LICENSE for more information.
129 |
130 | ## Links
131 |
132 | To learn more about this project:
133 |
134 | * check out the
135 | [Robots Exclusion Protocol standard](https://www.rfc-editor.org/rfc/rfc9309.html),
136 | * how
137 | [Google Handles robots.txt](https://developers.google.com/search/reference/robots_txt),
138 | * or for a high level overview, the
139 | [robots.txt page on Wikipedia](https://en.wikipedia.org/wiki/Robots_exclusion_standard).
140 |
--------------------------------------------------------------------------------
/protocol-draft/draft-illyes-repext-00.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 | ]>
9 |
10 |
11 |
12 |
13 | Robots Exclusion Protocol Extension for URI Level Control
14 |
15 |
16 | Google LLC.
17 |
18 |
19 | Brandschenkestrasse 110
20 | Zurich
21 | 8002
22 | Switzerland
23 |
24 | garyillyes@google.com
25 |
26 |
27 |
28 | Internet Engineering Task Force (IETF)
29 |
30 |
31 | This document extends RFC9309 by specifying additional URI level
32 | controls through application level header and HTML meta tags
33 | originally developed in 1996. Additionally it moves the response
34 | header out of the experimental header space (i.e. "X-") and defines
35 | the combinability of multiple headers, which was previously not
36 | possible.
37 |
38 |
39 | About this Document
40 |
41 | This note is to be removed before publishing as an RFC.
42 | TODO(illyes): add commentable reference on github robotstxt repo.
43 |
44 |
45 |
46 |
47 | Introduction
48 |
49 | While the Robots Exclusion Protocol enables service owners to control
50 | how, if at all, automated clients known as crawlers may access the
51 | URIs on their services as defined by [RFC8288], the protocol doesn't
52 | provide controls on how the data returned by their service may be
53 | used upon allowed access.
54 |
55 | Originally developed in 1996 and widely adopted since, the use-case
56 | control is left to URI level controls implemented in the response
57 | headers, or in case of HTML in the form of a meta tag. This document
58 | specifies these control tags, and in case of the response header
59 | field, brings it to standards compliance with [RFC9110].
60 |
61 | Application developers are requested to honor these tags. The tags
62 | are not a form of access authorization however.
63 |
64 | Requirements Language
65 |
66 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
67 | "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY",
68 | and "OPTIONAL" in this document are to be interpreted as described
69 | in BCP 14 [RFC8174] when,
70 | and only when, they appear in all capitals, as shown here.
71 |
72 |
73 |
74 | Specification
75 |
76 | Robots control
77 |
78 | The URI level crawler controls are a key-value pair that
79 | can be specified two ways:
80 |
81 | - an application level response header.
82 | - in case of HTML, one or more meta tags as defined by the
83 | HTML specification.
84 |
85 |
86 | Application Layer Response Header
87 |
88 | The application level response header field name is "robots-tag"
89 | and contains rules applicable to either all accessors or
90 | specifically named ones in the value. For historical reasons,
91 | implementors should support the experimental field name also
92 | — "x-robots-tag".
93 |
94 | The value is a semicolon (";", 0x3B, 0x20) separated list of
95 | key-value pairs that represent a comma separated list of rules.
96 | The rules are specific to a single product token as defined by
97 | [RFC9309] or a global identifier — "*". The global identifier
98 | may be omitted. The product token is separated by a "=" from its
99 | rules.
100 |
101 | Duplicate product tokens must be merged and the rules deduplicated.
102 |
112 |
113 | For example, the following response header field specifies
114 | "noindex" and "nosnippet" rules for all accessors, however
115 | specifies no rules for the product token "ExampleBot":
116 |
119 |
120 | The global product identifier "*" in the value may be omitted; for
121 | example, this field is equivalent to the previous example:
122 |
124 |
125 | Implementors should impose a parsing limit on the field value to
126 | protect their systems. The parsing limit MUST be at least 8
127 | kibibytes [KiB].
128 |
129 |
130 | HTML meta element
131 |
132 | For historical reasons the robots-tag header may be specified by
133 | service owners as an HTML meta tag. In case of the meta tag, the
134 | name attribute is used to specify the product token, and the
135 | content attribute to specify the comma separated robots-tag rules.
136 |
137 | As with the header, the product token may be a global token,
138 | "robots", which signifies that the rules apply to all requestors,
139 | or a specific product token applicable to a single requestor. For
140 | example:
141 |
144 |
145 | Multiple robots meta elements may appear in a single HTML document.
146 | Requestors must obey the sum of negative rules specific to their
147 | product token and the global product token.
148 |
149 |
150 |
151 | Robots control rules
152 | The possible values of the rules are:
153 |
154 | -
155 | noindex - instructs the parser to not store the served
156 | data in its publicly accessible index.
157 |
158 | -
159 | nosnippet - instructs the parser to not reproduce any
160 | stored data as an excerpt snippet.
161 |
162 |
163 |
164 | The values are case insensitive. Unsupported rules must be ignored.
165 |
166 | Implementors may support other rules as specified in Section 2.2.4
167 | of [RFC9309].
168 |
169 |
170 | Caching of values
171 |
172 | The rules specified for a specific product token must be obeyed
173 | until the rules have changed. Implementors MAY use standard cache
174 | control as defined in [RFC9110] for caching robots-tag rules.
175 | Implementors SHOULD refresh their caches within a reasonable time
176 | frame.
177 |
178 |
179 |
180 | IANA considerations
181 |
185 |
186 |
187 | Security considerations
188 |
189 | The robots-tag is not a substitute for valid content security
190 | measures. To control access to the URI paths in a robots.txt file,
191 | users of the protocol should employ a valid security measure relevant
192 | to the application layer on which the robots.txt file is served —
193 | for example, in the case of HTTP, HTTP Authentication as defined in
194 | [RFC9110].
195 |
196 | The content of the robots-tag header field is not secure, private or
197 | integrity-guaranteed, and due caution should be exercised when using
198 | it. Use of Transport Layer Security (TLS) with HTTP ([RFC9110] and
199 | [RFC2817]) is currently the only end-to-end way to provide such
200 | protection.
201 |
202 | In case of a robots-tag specified in a HTML meta element, implementors
203 | should consider only the meta elements specified in the head element
204 | of the HTML document, which is generally only accessible to the
205 | service owner.
206 |
207 | To protect against memory overflow attacks, implementers should
208 | enforce a limit on how much data they will parse; see section N
209 | for the lower limit.
210 |
211 |
212 |
213 |
214 |
215 | References
216 |
217 | Normative References
218 |
219 |
220 | Key words for use in RFCs to Indicate Requirement Levels
221 |
222 |
223 |
224 | In many standards track documents several words are used to signify the requirements in the specification. These words are often capitalized. This document defines these words as they should be interpreted in IETF documents. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 | Upgrading to TLS Within HTTP/1.1
234 |
235 |
236 |
237 |
238 | This memo explains how to use the Upgrade mechanism in HTTP/1.1 to initiate Transport Layer Security (TLS) over an existing TCP connection. [STANDARDS-TRACK]
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 | Informative References
247 |
248 |
249 | Kibibyte - Simple English Wikipedia, the free encyclopedia
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
/reporting_robots.cc:
--------------------------------------------------------------------------------
1 | #include "reporting_robots.h"
2 |
3 | #include <algorithm>
4 | #include <string>
5 | #include <vector>
6 |
7 | #include "absl/strings/ascii.h"
8 | #include "absl/strings/string_view.h"
9 |
10 | namespace googlebot {
11 | // The kUnsupportedTags tags are popular tags in robots.txt files, but Google
12 | // doesn't use them for anything. Other search engines may, however, so we
13 | // parse them out so users of the library can highlight them for their own
14 | // users if they so wish.
15 | // These are different from the "unknown" tags, since we know that these may
16 | // have some use cases; to the best of our knowledge, other tags we find don't.
17 | // (for example, "unicorn" from "unicorn: /value")
18 | static const std::vector<absl::string_view> kUnsupportedTags = {
19 | "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"};
20 |
21 | void RobotsParsingReporter::Digest(int line_num,
22 | RobotsParsedLine::RobotsTagName parsed_tag) {
23 | if (line_num > last_line_seen_) {
24 | last_line_seen_ = line_num;
25 | }
26 | if (parsed_tag != RobotsParsedLine::kUnknown &&
27 | parsed_tag != RobotsParsedLine::kUnused) {
28 | ++valid_directives_;
29 | }
30 |
31 | RobotsParsedLine& line = robots_parse_results_[line_num];
32 | line.line_num = line_num;
33 | line.tag_name = parsed_tag;
34 | }
35 |
36 | void RobotsParsingReporter::ReportLineMetadata(int line_num,
37 | const LineMetadata& metadata) {
38 | if (line_num > last_line_seen_) {
39 | last_line_seen_ = line_num;
40 | }
41 | RobotsParsedLine& line = robots_parse_results_[line_num];
42 | line.line_num = line_num;
43 | line.is_typo = metadata.is_acceptable_typo;
44 | line.metadata = metadata;
45 | }
46 |
47 | void RobotsParsingReporter::HandleRobotsStart() {
48 | last_line_seen_ = 0;
49 | valid_directives_ = 0;
50 | unused_directives_ = 0;
51 | }
52 | void RobotsParsingReporter::HandleRobotsEnd() {}
53 | void RobotsParsingReporter::HandleUserAgent(int line_num,
54 | absl::string_view line_value) {
55 | Digest(line_num, RobotsParsedLine::kUserAgent);
56 | }
57 | void RobotsParsingReporter::HandleAllow(int line_num,
58 | absl::string_view line_value) {
59 | Digest(line_num, RobotsParsedLine::kAllow);
60 | }
61 | void RobotsParsingReporter::HandleDisallow(int line_num,
62 | absl::string_view line_value) {
63 | Digest(line_num, RobotsParsedLine::kDisallow);
64 | }
65 | void RobotsParsingReporter::HandleSitemap(int line_num,
66 | absl::string_view line_value) {
67 | Digest(line_num, RobotsParsedLine::kSitemap);
68 | }
69 | void RobotsParsingReporter::HandleUnknownAction(int line_num,
70 | absl::string_view action,
71 | absl::string_view line_value) {
72 | RobotsParsedLine::RobotsTagName rtn =
73 | std::count(kUnsupportedTags.begin(), kUnsupportedTags.end(),
74 | absl::AsciiStrToLower(action)) > 0
75 | ? RobotsParsedLine::kUnused
76 | : RobotsParsedLine::kUnknown;
77 | unused_directives_++;
78 | Digest(line_num, rtn);
79 | }
80 |
81 | } // namespace googlebot
82 |
--------------------------------------------------------------------------------
/reporting_robots.h:
--------------------------------------------------------------------------------
1 | #ifndef THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
2 | #define THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
3 |
4 | #include <vector>
5 |
6 | #include "absl/container/btree_map.h"
7 | #include "absl/strings/string_view.h"
8 | #include "robots.h"
9 |
10 | namespace googlebot {
11 | struct RobotsParsedLine {
12 | enum RobotsTagName {
13 | // Identifier for skipped lines. A line may be skipped because it's
14 | // unparseable, or because it contains no recognizable key. Note that
15 | // comment lines are also skipped; they're a no-op for parsing. For example:
16 | // random characters
17 | // unicorn:
18 | // # comment line
19 | // Same thing for empty lines.
20 | kUnknown = 0,
21 | kUserAgent = 1,
22 | kAllow = 2,
23 | kDisallow = 3,
24 | kSitemap = 4,
25 | // Identifier for parseable lines whose key is recognized, but unused.
26 | // See kUnsupportedTags in reporting_robots.cc for a list of recognized,
27 | // but unused keys. For example:
28 | // noindex:
29 | // noarchive:
30 | kUnused = 5,
31 | };
32 |
33 | int line_num = 0;
34 | RobotsTagName tag_name = kUnknown;
35 | bool is_typo = false;
36 | RobotsParseHandler::LineMetadata metadata;
37 | };
38 |
39 | class RobotsParsingReporter : public googlebot::RobotsParseHandler {
40 | public:
41 | void HandleRobotsStart() override;
42 | void HandleRobotsEnd() override;
43 | void HandleUserAgent(int line_num, absl::string_view line_value) override;
44 | void HandleAllow(int line_num, absl::string_view line_value) override;
45 | void HandleDisallow(int line_num, absl::string_view line_value) override;
46 | void HandleSitemap(int line_num, absl::string_view line_value) override;
47 | void HandleUnknownAction(int line_num, absl::string_view action,
48 | absl::string_view line_value) override;
49 | void ReportLineMetadata(int line_num, const LineMetadata& metadata) override;
50 |
51 | int last_line_seen() const { return last_line_seen_; }
52 | int valid_directives() const { return valid_directives_; }
53 | int unused_directives() const { return unused_directives_; }
54 | std::vector<RobotsParsedLine> parse_results() const {
55 | std::vector<RobotsParsedLine> vec;
56 | for (const auto& entry : robots_parse_results_) {
57 | vec.push_back(entry.second);
58 | }
59 | return vec;
60 | }
61 |
62 | private:
63 | void Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag);
64 |
65 | // Indexed and sorted by line number.
66 | absl::btree_map<int, RobotsParsedLine> robots_parse_results_;
67 | int last_line_seen_ = 0;
68 | int valid_directives_ = 0;
69 | int unused_directives_ = 0;
70 | };
71 | } // namespace googlebot
72 | #endif // THIRD_PARTY_ROBOTSTXT_REPORTING_ROBOTS_H_
73 |
--------------------------------------------------------------------------------
/reporting_robots_test.cc:
--------------------------------------------------------------------------------
1 | #include "reporting_robots.h"
2 |
3 | #include <ostream>
4 | #include <string>
5 | #include <vector>
6 |
7 | #include "gtest/gtest.h"
8 | #include "absl/strings/str_cat.h"
9 | #include "absl/strings/str_split.h"
10 | #include "absl/strings/string_view.h"
11 | #include "robots.h"
12 |
13 | using ::googlebot::RobotsParsedLine;
14 | using ::googlebot::RobotsParseHandler;
15 | using ::googlebot::RobotsParsingReporter;
16 |
17 | namespace {
18 | // Allows debugging the contents of the LineMetadata struct.
19 | std::string LineMetadataToString(const RobotsParseHandler::LineMetadata& line) {
20 | // clang-format off
21 | return absl::StrCat(
22 | "{ is_empty: ", line.is_empty,
23 | " has_directive: ", line.has_directive,
24 | " has_comment: ", line.has_comment,
25 | " is_comment: ", line.is_comment,
26 | " is_acceptable_typo: ", line.is_acceptable_typo,
27 | " is_line_too_long: ", line.is_line_too_long,
28 | " is_missing_colon_separator: ", line.is_missing_colon_separator, " }");
29 | // clang-format on
30 | }
31 |
32 | std::string TagNameToString(RobotsParsedLine::RobotsTagName tag_name) {
33 | switch (tag_name) {
34 | case RobotsParsedLine::RobotsTagName::kUnknown:
35 | return "Unknown";
36 | case RobotsParsedLine::RobotsTagName::kUserAgent:
37 | return "UserAgent";
38 | case RobotsParsedLine::RobotsTagName::kAllow:
39 | return "Allow";
40 | case RobotsParsedLine::RobotsTagName::kDisallow:
41 | return "Disallow";
42 | case RobotsParsedLine::RobotsTagName::kSitemap:
43 | return "Sitemap";
44 | case RobotsParsedLine::RobotsTagName::kUnused:
45 | return "Unused";
46 | }
47 | }
48 |
49 | // Allows debugging the contents of the RobotsParsedLine struct.
50 | std::string RobotsParsedLineToString(const RobotsParsedLine& line) {
51 | return absl::StrCat("{\n line_num: ", line.line_num,
52 | "\n tag_name: ", TagNameToString(line.tag_name),
53 | "\n is_typo: ", line.is_typo,
54 | "\n metadata: ", LineMetadataToString(line.metadata),
55 | "\n}");
56 | }
57 |
58 | void expectLineToParseTo(const std::vector<absl::string_view>& lines,
59 | const std::vector<RobotsParsedLine>& parse_results,
60 | const RobotsParsedLine& expected_result) {
61 | int line_num = expected_result.line_num;
62 | EXPECT_EQ(parse_results[line_num - 1], expected_result)
63 | << "For line " << line_num << ": '" << lines[line_num - 1] << "'";
64 | }
65 | } // namespace
66 |
67 | namespace googlebot {
68 | // This allows us to get a debug content of the object in the test. Without
69 | // this, it would say something like this when a test fails:
70 | // Expected equality of these values:
71 | // parse_results[line_num - 1]
72 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 01-00 00-00>
73 | // expected_result
74 | // Which is: 16-byte object <01-00 00-00 01-00 00-00 00-00 00-00 00-25 00-00>
75 | std::ostream& operator<<(std::ostream& o, const RobotsParsedLine& line) {
76 | o << RobotsParsedLineToString(line);
77 | return o;
78 | }
79 |
80 | // These 2 `operator==` are needed for `EXPECT_EQ` to work.
81 | bool operator==(const RobotsParseHandler::LineMetadata& lhs,
82 | const RobotsParseHandler::LineMetadata& rhs) {
83 | return lhs.is_empty == rhs.is_empty &&
84 | lhs.has_directive == rhs.has_directive &&
85 | lhs.has_comment == rhs.has_comment &&
86 | lhs.is_comment == rhs.is_comment &&
87 | lhs.is_acceptable_typo == rhs.is_acceptable_typo &&
88 | lhs.is_line_too_long == rhs.is_line_too_long &&
89 | lhs.is_missing_colon_separator == rhs.is_missing_colon_separator;
90 | }
91 |
92 | bool operator==(const RobotsParsedLine& lhs, const RobotsParsedLine& rhs) {
93 | return lhs.line_num == rhs.line_num && lhs.tag_name == rhs.tag_name &&
94 | lhs.is_typo == rhs.is_typo && lhs.metadata == rhs.metadata;
95 | }
96 | } // namespace googlebot
97 |
98 | TEST(RobotsUnittest, LinesNumbersAreCountedCorrectly) {
99 | RobotsParsingReporter report;
100 | static const char kSimpleFile[] =
101 | "User-Agent: foo\n" // 1
102 | "Allow: /some/path\n" // 2
103 | "User-Agent bar # no\n" // 3
104 | "absolutely random line\n" // 4
105 | "#so comment, much wow\n" // 5
106 | "\n" // 6
107 | "unicorns: /extinct\n" // 7
108 | "noarchive: /some\n" // 8
109 | "Disallow: /\n" // 9
110 | "Error #and comment\n" // 10
111 | "useragent: baz\n" // 11
112 | "disallaw: /some\n" // 12
113 | "site-map: https://e/s.xml #comment\n" // 13
114 | "sitemap: https://e/t.xml\n" // 14
115 | "Noarchive: /someCapital\n"; // 15
116 | // 16 (from \n)
117 | googlebot::ParseRobotsTxt(kSimpleFile, &report);
118 | EXPECT_EQ(8, report.valid_directives());
119 | EXPECT_EQ(16, report.last_line_seen());
120 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen());
121 | std::vector<absl::string_view> lines = absl::StrSplit(kSimpleFile, '\n');
122 |
123 | // For line "User-Agent: foo\n" // 1
124 | expectLineToParseTo(
125 | lines, report.parse_results(),
126 | RobotsParsedLine{.line_num = 1,
127 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
128 | .is_typo = false,
129 | .metadata = RobotsParseHandler::LineMetadata{
130 | .is_empty = false,
131 | .has_comment = false,
132 | .is_comment = false,
133 | .has_directive = true,
134 | .is_missing_colon_separator = false,
135 | }});
136 | // For line "Allow: /some/path\n" // 2
137 | expectLineToParseTo(
138 | lines, report.parse_results(),
139 | RobotsParsedLine{.line_num = 2,
140 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow,
141 | .is_typo = false,
142 | .metadata = RobotsParseHandler::LineMetadata{
143 | .is_empty = false,
144 | .has_comment = false,
145 | .is_comment = false,
146 | .has_directive = true,
147 | }});
148 | // For line "User-Agent bar # no\n" // 3
149 | expectLineToParseTo(
150 | lines, report.parse_results(),
151 | RobotsParsedLine{.line_num = 3,
152 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
153 | .is_typo = false,
154 | .metadata = RobotsParseHandler::LineMetadata{
155 | .is_empty = false,
156 | .has_comment = true,
157 | .is_comment = false,
158 | .has_directive = true,
159 | .is_missing_colon_separator = true,
160 | }});
161 | // For line "absolutely random line\n" // 4
162 | expectLineToParseTo(
163 | lines, report.parse_results(),
164 | RobotsParsedLine{.line_num = 4,
165 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
166 | .is_typo = false,
167 | .metadata = RobotsParseHandler::LineMetadata{
168 | .is_empty = false,
169 | .has_comment = false,
170 | .is_comment = false,
171 | .has_directive = false,
172 | .is_missing_colon_separator = false,
173 | }});
174 | // For line "#so comment, much wow\n" // 5
175 | expectLineToParseTo(
176 | lines, report.parse_results(),
177 | RobotsParsedLine{.line_num = 5,
178 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
179 | .is_typo = false,
180 | .metadata = RobotsParseHandler::LineMetadata{
181 | .is_empty = false,
182 | .has_comment = true,
183 | .is_comment = true,
184 | .has_directive = false,
185 | }});
186 | // For line "\n" // 6
187 | expectLineToParseTo(
188 | lines, report.parse_results(),
189 | RobotsParsedLine{.line_num = 6,
190 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
191 | .is_typo = false,
192 | .metadata = RobotsParseHandler::LineMetadata{
193 | .is_empty = true,
194 | .has_comment = false,
195 | .is_comment = false,
196 | .has_directive = false,
197 | }});
198 | // For line "unicorns: /extinct\n" // 7
199 | expectLineToParseTo(
200 | lines, report.parse_results(),
201 | RobotsParsedLine{.line_num = 7,
202 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
203 | .is_typo = false,
204 | .metadata = RobotsParseHandler::LineMetadata{
205 | .is_empty = false,
206 | .has_comment = false,
207 | .is_comment = false,
208 | .has_directive = true,
209 | }});
210 | // For line "noarchive: /some\n" // 8
211 | expectLineToParseTo(
212 | lines, report.parse_results(),
213 | RobotsParsedLine{.line_num = 8,
214 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused,
215 | .is_typo = false,
216 | .metadata = RobotsParseHandler::LineMetadata{
217 | .is_empty = false,
218 | .has_comment = false,
219 | .is_comment = false,
220 | .has_directive = true,
221 | }});
222 | // For line "Disallow: /\n" // 9
223 | expectLineToParseTo(
224 | lines, report.parse_results(),
225 | RobotsParsedLine{.line_num = 9,
226 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
227 | .is_typo = false,
228 | .metadata = RobotsParseHandler::LineMetadata{
229 | .is_empty = false,
230 | .has_comment = false,
231 | .is_comment = false,
232 | .has_directive = true,
233 | }});
234 | // For line "Error #and comment\n"; // 10
235 | expectLineToParseTo(
236 | lines, report.parse_results(),
237 | RobotsParsedLine{.line_num = 10,
238 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
239 | .is_typo = false,
240 | .metadata = RobotsParseHandler::LineMetadata{
241 | .is_empty = false,
242 | .has_comment = true,
243 | .is_comment = false,
244 | .has_directive = false,
245 | .is_missing_colon_separator = false,
246 | }});
247 | // For line "useragent: baz\n"; // 11
248 | expectLineToParseTo(
249 | lines, report.parse_results(),
250 | RobotsParsedLine{.line_num = 11,
251 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
252 | .is_typo = true,
253 | .metadata = RobotsParseHandler::LineMetadata{
254 | .is_empty = false,
255 | .has_comment = false,
256 | .is_comment = false,
257 | .has_directive = true,
258 | .is_acceptable_typo = true,
259 | }});
260 | // For line "disallaw: /some\n" // 12
261 | expectLineToParseTo(
262 | lines, report.parse_results(),
263 | RobotsParsedLine{.line_num = 12,
264 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
265 | .is_typo = true,
266 | .metadata = RobotsParseHandler::LineMetadata{
267 | .is_empty = false,
268 | .has_comment = false,
269 | .is_comment = false,
270 | .has_directive = true,
271 | .is_acceptable_typo = true,
272 | }});
273 | // For line "site-map: https://e/s.xml #comment\n" // 13;
274 | expectLineToParseTo(
275 | lines, report.parse_results(),
276 | RobotsParsedLine{.line_num = 13,
277 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap,
278 | .is_typo = true,
279 | .metadata = RobotsParseHandler::LineMetadata{
280 | .is_empty = false,
281 | .has_comment = true,
282 | .is_comment = false,
283 | .has_directive = true,
284 | .is_acceptable_typo = true,
285 | }});
286 | // For line "sitemap: https://e/t.xml\n" // 14;
287 | expectLineToParseTo(
288 | lines, report.parse_results(),
289 | RobotsParsedLine{.line_num = 14,
290 | .tag_name = RobotsParsedLine::RobotsTagName::kSitemap,
291 | .is_typo = false,
292 | .metadata = RobotsParseHandler::LineMetadata{
293 | .is_empty = false,
294 | .has_comment = false,
295 | .is_comment = false,
296 | .has_directive = true,
297 | .is_acceptable_typo = false,
298 | }});
299 | // For line "Noarchive: /someCapital\n" // 15
300 | expectLineToParseTo(
301 | lines, report.parse_results(),
302 | RobotsParsedLine{.line_num = 15,
303 | .tag_name = RobotsParsedLine::RobotsTagName::kUnused,
304 | .is_typo = false,
305 | .metadata = RobotsParseHandler::LineMetadata{
306 | .is_empty = false,
307 | .has_comment = false,
308 | .is_comment = false,
309 | .has_directive = true,
310 | }});
311 | // For line 16 (which is empty and comes from the last \n)
312 | expectLineToParseTo(
313 | lines, report.parse_results(),
314 | RobotsParsedLine{.line_num = 16,
315 | .tag_name = RobotsParsedLine::RobotsTagName::kUnknown,
316 | .is_typo = false,
317 | .metadata = RobotsParseHandler::LineMetadata{
318 | .is_empty = true,
319 | .has_comment = false,
320 | .is_comment = false,
321 | .has_directive = false,
322 | }});
323 |
324 | static const char kDosFile[] =
325 | "User-Agent: foo\r\n"
326 | "Allow: /some/path\r\n"
327 | "User-Agent: bar\r\n"
328 | "\r\n"
329 | "\r\n"
330 | "Disallow: /\r\n";
331 | googlebot::ParseRobotsTxt(kDosFile, &report);
332 | EXPECT_EQ(4, report.valid_directives());
333 | EXPECT_EQ(7, report.last_line_seen());
334 |
335 | static const char kMacFile[] =
336 | "User-Agent: foo\r"
337 | "Allow: /some/path\r"
338 | "User-Agent: bar\r"
339 | "\r"
340 | "\r"
341 | "Disallow: /\r";
342 | googlebot::ParseRobotsTxt(kMacFile, &report);
343 | EXPECT_EQ(4, report.valid_directives());
344 | EXPECT_EQ(7, report.last_line_seen());
345 | }
346 |
347 | TEST(RobotsUnittest, LinesTooLongReportedCorrectly) {
348 | RobotsParsingReporter report;
349 | const int kMaxLineLen = 2084 * 8;
350 | std::string allow = "allow: /\n";
351 | std::string disallow = "disallow: ";
352 | std::string robotstxt = "user-agent: foo\n";
353 | std::string longline = "/x/";
354 | while (longline.size() < kMaxLineLen) {
355 | absl::StrAppend(&longline, "a");
356 | }
357 | absl::StrAppend(&robotstxt, disallow, longline, "\n", allow);
358 |
359 | googlebot::ParseRobotsTxt(robotstxt, &report);
360 | EXPECT_EQ(3, report.valid_directives());
361 | EXPECT_EQ(4, report.last_line_seen());
362 | EXPECT_EQ(report.parse_results().size(), report.last_line_seen());
363 | std::vector<absl::string_view> lines = absl::StrSplit(robotstxt, '\n');
364 |
365 | // For line "user-agent: foo\n" // 1
366 | expectLineToParseTo(
367 | lines, report.parse_results(),
368 | RobotsParsedLine{.line_num = 1,
369 | .tag_name = RobotsParsedLine::RobotsTagName::kUserAgent,
370 | .is_typo = false,
371 | .metadata = RobotsParseHandler::LineMetadata{
372 | .is_empty = false,
373 | .has_comment = false,
374 | .is_comment = false,
375 | .has_directive = true,
376 | .is_line_too_long = false,
377 | }});
378 | // For line "disallow: /x/a[...]a\n" // 2
379 | expectLineToParseTo(
380 | lines, report.parse_results(),
381 | RobotsParsedLine{.line_num = 2,
382 | .tag_name = RobotsParsedLine::RobotsTagName::kDisallow,
383 | .is_typo = false,
384 | .metadata = RobotsParseHandler::LineMetadata{
385 | .is_empty = false,
386 | .has_comment = false,
387 | .is_comment = false,
388 | .has_directive = true,
389 | .is_line_too_long = true,
390 | }});
391 | // For line "allow: /\n" // 3
392 | expectLineToParseTo(
393 | lines, report.parse_results(),
394 | RobotsParsedLine{.line_num = 3,
395 | .tag_name = RobotsParsedLine::RobotsTagName::kAllow,
396 | .is_typo = false,
397 | .metadata = RobotsParseHandler::LineMetadata{
398 | .is_empty = false,
399 | .has_comment = false,
400 | .is_comment = false,
401 | .has_directive = true,
402 | .is_line_too_long = false,
403 | }});
404 | }
405 |
--------------------------------------------------------------------------------
/robots.cc:
--------------------------------------------------------------------------------
1 | // Copyright 1999 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots.cc
17 | // -----------------------------------------------------------------------------
18 | //
19 | // Implements expired internet draft
20 | // http://www.robotstxt.org/norobots-rfc.txt
21 | // with Google-specific optimizations detailed at
22 | // https://developers.google.com/search/reference/robots_txt
23 |
24 | #include "robots.h"
25 |
26 | #include <stdlib.h>
27 |
28 | #include <cassert>
29 | #include <cctype>
30 | #include <cstddef>
31 | #include <cstring>
32 | #include <string>
33 | #include <vector>
34 |
35 | #include "absl/base/macros.h"
36 | #include "absl/container/fixed_array.h"
37 | #include "absl/strings/ascii.h"
38 | #include "absl/strings/match.h"
39 | #include "absl/strings/numbers.h"
40 | #include "absl/strings/string_view.h"
41 |
42 | // Allow for typos such as DISALOW in robots.txt.
43 | static bool kAllowFrequentTypos = true;
44 |
45 | namespace googlebot {
46 |
47 | // A RobotsMatchStrategy defines a strategy for matching individual lines in a
48 | // robots.txt file. Each Match* method should return a match priority, which is
49 | // interpreted as:
50 | //
51 | // match priority < 0:
52 | // No match.
53 | //
54 | // match priority == 0:
55 | // Match, but treat it as if matched an empty pattern.
56 | //
57 | // match priority > 0:
58 | // Match.
59 | class RobotsMatchStrategy {
60 | public:
61 | virtual ~RobotsMatchStrategy() {}
62 |
63 | virtual int MatchAllow(absl::string_view path,
64 | absl::string_view pattern) = 0;
65 | virtual int MatchDisallow(absl::string_view path,
66 | absl::string_view pattern) = 0;
67 |
68 | protected:
69 | // Implements robots.txt pattern matching.
70 | static bool Matches(absl::string_view path, absl::string_view pattern);
71 | };
72 |
73 | // Returns true if URI path matches the specified pattern. Pattern is anchored
74 | // at the beginning of path. '$' is special only at the end of pattern.
75 | //
76 | // Since 'path' and 'pattern' are both externally determined (by the webmaster),
77 | // we make sure to have acceptable worst-case performance.
78 | /* static */ bool RobotsMatchStrategy::Matches(
79 | absl::string_view path, absl::string_view pattern) {
80 | const size_t pathlen = path.length();
81 | absl::FixedArray<size_t> pos(pathlen + 1);
82 | int numpos;
83 |
84 | // The pos[] array holds a sorted list of indexes of 'path', with length
85 | // 'numpos'. At the start and end of each iteration of the main loop below,
86 | // the pos[] array will hold a list of the prefixes of the 'path' which can
87 | // match the current prefix of 'pattern'. If this list is ever empty,
88 | // return false. If we reach the end of 'pattern' with at least one element
89 | // in pos[], return true.
90 |
91 | pos[0] = 0;
92 | numpos = 1;
93 |
94 | for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
95 | if (*pat == '$' && pat + 1 == pattern.end()) {
96 | return (pos[numpos - 1] == pathlen);
97 | }
98 | if (*pat == '*') {
99 | numpos = pathlen - pos[0] + 1;
100 | for (int i = 1; i < numpos; i++) {
101 | pos[i] = pos[i-1] + 1;
102 | }
103 | } else {
104 | // Includes '$' when not at end of pattern.
105 | int newnumpos = 0;
106 | for (int i = 0; i < numpos; i++) {
107 | if (pos[i] < pathlen && path[pos[i]] == *pat) {
108 | pos[newnumpos++] = pos[i] + 1;
109 | }
110 | }
111 | numpos = newnumpos;
112 | if (numpos == 0) return false;
113 | }
114 | }
115 |
116 | return true;
117 | }
118 |
119 | static const char* kHexDigits = "0123456789ABCDEF";
120 |
121 | // GetPathParamsQuery is not in anonymous namespace to allow testing.
122 | //
123 | // Extracts path (with params) and query part from URL. Removes scheme,
124 | // authority, and fragment. Result always starts with "/".
125 | // Returns "/" if the url doesn't have a path or is not valid.
126 | std::string GetPathParamsQuery(const std::string& url) {
127 | std::string path;
128 |
129 | // Initial two slashes are ignored.
130 | size_t search_start = 0;
131 | if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2;
132 |
133 | size_t early_path = url.find_first_of("/?;", search_start);
134 | size_t protocol_end = url.find("://", search_start);
135 | if (early_path < protocol_end) {
136 | // If path, param or query starts before ://, :// doesn't indicate protocol.
137 | protocol_end = std::string::npos;
138 | }
139 | if (protocol_end == std::string::npos) {
140 | protocol_end = search_start;
141 | } else {
142 | protocol_end += 3;
143 | }
144 |
145 | size_t path_start = url.find_first_of("/?;", protocol_end);
146 | if (path_start != std::string::npos) {
147 | size_t hash_pos = url.find('#', search_start);
148 | if (hash_pos < path_start) return "/";
149 | size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos;
150 | if (url[path_start] != '/') {
151 | // Prepend a slash if the result would start e.g. with '?'.
152 | return "/" + url.substr(path_start, path_end - path_start);
153 | }
154 | return url.substr(path_start, path_end - path_start);
155 | }
156 |
157 | return "/";
158 | }
159 |
160 | // MaybeEscapePattern is not in anonymous namespace to allow testing.
161 | //
162 | // Canonicalize the allowed/disallowed paths. For example:
163 | // /SanJoséSellers ==> /Sanjos%C3%A9Sellers
164 | // %aa ==> %AA
165 | // When the function returns, (*dst) either points to src, or is newly
166 | // allocated.
167 | // Returns true if dst was newly allocated.
168 | bool MaybeEscapePattern(const char* src, char** dst) {
169 | int num_to_escape = 0;
170 | bool need_capitalize = false;
171 |
172 | // First, scan the buffer to see if changes are needed. Most don't.
173 | for (int i = 0; src[i] != 0; i++) {
174 | // (a) % escape sequence.
175 | if (src[i] == '%' &&
176 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
177 | if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) {
178 | need_capitalize = true;
179 | }
180 | i += 2;
181 | // (b) needs escaping.
182 | } else if (src[i] & 0x80) {
183 | num_to_escape++;
184 | }
185 | // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
186 | }
187 | // Return if no changes needed.
188 | if (!num_to_escape && !need_capitalize) {
189 | (*dst) = const_cast<char*>(src);
190 | return false;
191 | }
192 | (*dst) = new char[num_to_escape * 2 + strlen(src) + 1];
193 | int j = 0;
194 | for (int i = 0; src[i] != 0; i++) {
195 | // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
196 | if (src[i] == '%' &&
197 | absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
198 | (*dst)[j++] = src[i++];
199 | (*dst)[j++] = absl::ascii_toupper(src[i++]);
200 | (*dst)[j++] = absl::ascii_toupper(src[i]);
201 | // (b) %-escape octets whose highest bit is set. These are outside the
202 | // ASCII range.
203 | } else if (src[i] & 0x80) {
204 | (*dst)[j++] = '%';
205 | (*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf];
206 | (*dst)[j++] = kHexDigits[src[i] & 0xf];
207 | // (c) Normal character, no modification needed.
208 | } else {
209 | (*dst)[j++] = src[i];
210 | }
211 | }
212 | (*dst)[j] = 0;
213 | return true;
214 | }
215 |
216 | // Internal helper classes and functions.
217 | namespace {
218 |
219 | // A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
220 | // a key. This class can parse a text-representation (including common typos)
221 | // and represent them as an enumeration which allows for faster processing
222 | // afterwards.
223 | // For unparsable keys, the original string representation is kept.
224 | class ParsedRobotsKey {
225 | public:
226 | enum KeyType {
227 | // Generic highlevel fields.
228 | USER_AGENT,
229 | SITEMAP,
230 |
231 | // Fields within a user-agent.
232 | ALLOW,
233 | DISALLOW,
234 |
235 | // Unrecognized field; kept as-is. High number so that additions to the
236 | // enumeration above do not change the serialization.
237 | UNKNOWN = 128
238 | };
239 |
240 | ParsedRobotsKey() : type_(UNKNOWN) {}
241 |
242 | // Disallow copying and assignment.
243 | ParsedRobotsKey(const ParsedRobotsKey&) = delete;
244 | ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete;
245 |
246 | // Parse given key text and report in the is_acceptable_typo output parameter
247 | // whether the key is one of the accepted typo-variants of a supported key.
248 | // Does not copy the text, so the text_key must stay valid for the object's
249 | // life-time or the next Parse() call.
250 | void Parse(absl::string_view key, bool* is_acceptable_typo);
251 |
252 | // Returns the type of key.
253 | KeyType type() const { return type_; }
254 |
255 | // If this is an unknown key, get the text.
256 | absl::string_view GetUnknownText() const;
257 |
258 | private:
259 | // The KeyIs* methods return whether a passed-in key is one of the supported
260 | // keys, and report in the is_acceptable_typo output parameter whether the key
261 | // is one of the accepted typo-variants of a supported key.
262 | static bool KeyIsUserAgent(absl::string_view key, bool* is_acceptable_typo);
263 | static bool KeyIsAllow(absl::string_view key, bool* is_acceptable_typo);
264 | static bool KeyIsDisallow(absl::string_view key, bool* is_acceptable_typo);
265 | static bool KeyIsSitemap(absl::string_view key, bool* is_acceptable_typo);
266 |
267 | KeyType type_;
268 | absl::string_view key_text_;
269 | };
270 |
271 | void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key,
272 | absl::string_view value,
273 | RobotsParseHandler* handler) {
274 | typedef ParsedRobotsKey Key;
275 | switch (key.type()) {
276 | case Key::USER_AGENT: handler->HandleUserAgent(line, value); break;
277 | case Key::ALLOW: handler->HandleAllow(line, value); break;
278 | case Key::DISALLOW: handler->HandleDisallow(line, value); break;
279 | case Key::SITEMAP: handler->HandleSitemap(line, value); break;
280 | case Key::UNKNOWN:
281 | handler->HandleUnknownAction(line, key.GetUnknownText(), value);
282 | break;
283 | // No default case Key:: to have the compiler warn about new values.
284 | }
285 | }
286 |
287 | class RobotsTxtParser {
288 | public:
289 | typedef ParsedRobotsKey Key;
290 |
291 | RobotsTxtParser(absl::string_view robots_body,
292 | RobotsParseHandler* handler)
293 | : robots_body_(robots_body), handler_(handler) {
294 | }
295 |
296 | void Parse();
297 |
298 | private:
299 | // Note that `key` and `value` are only set when `metadata->has_directive
300 | // == true`.
301 | static void GetKeyAndValueFrom(char** key, char** value, char* line,
302 | RobotsParseHandler::LineMetadata* metadata);
303 | static void StripWhitespaceSlowly(char ** s);
304 |
305 | void ParseAndEmitLine(int current_line, char* line,
306 | bool* line_too_long_strict);
307 | bool NeedEscapeValueForKey(const Key& key);
308 |
309 | absl::string_view robots_body_;
310 | RobotsParseHandler* const handler_;
311 | };
312 |
313 | bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) {
314 | switch (key.type()) {
315 | case RobotsTxtParser::Key::USER_AGENT:
316 | case RobotsTxtParser::Key::SITEMAP:
317 | return false;
318 | default:
319 | return true;
320 | }
321 | }
322 |
323 | // Removes leading and trailing whitespace from null-terminated string s.
324 | /* static */ void RobotsTxtParser::StripWhitespaceSlowly(char ** s) {
325 | absl::string_view stripped = absl::StripAsciiWhitespace(*s);
326 | *s = const_cast<char*>(stripped.data());
327 | (*s)[stripped.size()] = '\0';
328 | }
329 |
330 | void RobotsTxtParser::GetKeyAndValueFrom(
331 | char** key, char** value, char* line,
332 | RobotsParseHandler::LineMetadata* metadata) {
333 | // Remove comments from the current robots.txt line.
334 | char* const comment = strchr(line, '#');
335 | if (nullptr != comment) {
336 | metadata->has_comment = true;
337 | *comment = '\0';
338 | }
339 | StripWhitespaceSlowly(&line);
340 | // If the line became empty after removing the comment, return.
341 | if (strlen(line) == 0) {
342 | if (metadata->has_comment) {
343 | metadata->is_comment = true;
344 | } else {
345 | metadata->is_empty = true;
346 | }
347 | return;
348 | }
349 |
350 | // Rules must match the following pattern:
351 | // <key>[ \t]*:[ \t]*<value>
352 | char* sep = strchr(line, ':');
353 | if (nullptr == sep) {
354 | // Google-specific optimization: some people forget the colon, so we need to
355 | // accept whitespace in its stead.
356 | static const char * const kWhite = " \t";
357 | sep = strpbrk(line, kWhite);
358 | if (nullptr != sep) {
359 | const char* const val = sep + strspn(sep, kWhite);
360 | assert(*val); // since we dropped trailing whitespace above.
361 | if (nullptr != strpbrk(val, kWhite)) {
362 | // We only accept whitespace as a separator if there are exactly two
363 | // sequences of non-whitespace characters. If we get here, there were
364 | // more than 2 such sequences since we stripped trailing whitespace
365 | // above.
366 | return;
367 | }
368 | metadata->is_missing_colon_separator = true;
369 | }
370 | }
371 | if (nullptr == sep) {
372 | return; // Couldn't find a separator.
373 | }
374 |
375 | *key = line; // Key starts at beginning of line.
376 | *sep = '\0'; // And stops at the separator.
377 | StripWhitespaceSlowly(key); // Get rid of any trailing whitespace.
378 |
379 | if (strlen(*key) > 0) {
380 | *value = 1 + sep; // Value starts after the separator.
381 | StripWhitespaceSlowly(value); // Get rid of any leading whitespace.
382 | metadata->has_directive = true;
383 | return;
384 | }
385 | }
386 |
387 | void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line,
388 | bool* line_too_long_strict) {
389 | char* string_key;
390 | char* value;
391 | RobotsParseHandler::LineMetadata line_metadata;
392 | // Note that `string_key` and `value` are only set when
393 | // `line_metadata->has_directive == true`.
394 | GetKeyAndValueFrom(&string_key, &value, line, &line_metadata);
395 | line_metadata.is_line_too_long = *line_too_long_strict;
396 | if (!line_metadata.has_directive) {
397 | handler_->ReportLineMetadata(current_line, line_metadata);
398 | return;
399 | }
400 | Key key;
401 | key.Parse(string_key, &line_metadata.is_acceptable_typo);
402 | if (NeedEscapeValueForKey(key)) {
403 | char* escaped_value = nullptr;
404 | const bool is_escaped = MaybeEscapePattern(value, &escaped_value);
405 | EmitKeyValueToHandler(current_line, key, escaped_value, handler_);
406 | if (is_escaped) delete[] escaped_value;
407 | } else {
408 | EmitKeyValueToHandler(current_line, key, value, handler_);
409 | }
410 | // Finish adding metadata to the line.
411 | handler_->ReportLineMetadata(current_line, line_metadata);
412 | }
413 |
414 | void RobotsTxtParser::Parse() {
415 | // UTF-8 byte order marks.
416 | static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF};
417 |
418 | // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
419 | // fairly safe to assume any valid line isn't going to be more than many times
420 | // that max url length of 2KB. We want some padding for
421 | // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
422 | // If so, we can ignore the chars on a line past that.
423 | const int kBrowserMaxLineLen = 2083;
424 | const int kMaxLineLen = kBrowserMaxLineLen * 8;
425 | // Allocate a buffer used to process the current line.
426 | char* const line_buffer = new char[kMaxLineLen];
427 | // line_buffer_end is the last writeable pos within the line array
428 | // (only a final '\0' may go here).
429 | const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
430 | bool line_too_long_strict = false;
431 | char* line_pos = line_buffer;
432 | int line_num = 0;
433 | size_t bom_pos = 0;
434 | bool last_was_carriage_return = false;
435 | handler_->HandleRobotsStart();
436 |
437 | {
438 | for (const unsigned char ch : robots_body_) {
439 | ABSL_ASSERT(line_pos <= line_buffer_end);
440 | // Google-specific optimization: UTF-8 byte order marks should never
441 | // appear in a robots.txt file, but they do nevertheless. Skipping
442 | // possible BOM-prefix in the first bytes of the input.
443 | if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) {
444 | continue;
445 | }
446 | bom_pos = sizeof(utf_bom);
447 | if (ch != 0x0A && ch != 0x0D) { // Non-line-ending char case.
448 | // Put in next spot on current line, as long as there's room.
449 | if (line_pos < line_buffer_end) {
450 | *(line_pos++) = ch;
451 | } else {
452 | line_too_long_strict = true;
453 | }
454 | } else { // Line-ending character char case.
455 | *line_pos = '\0';
456 | // Only emit an empty line if this was not due to the second character
457 | // of the DOS line-ending \r\n .
458 | const bool is_CRLF_continuation =
459 | (line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
460 | if (!is_CRLF_continuation) {
461 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
462 | line_too_long_strict = false;
463 | }
464 | line_pos = line_buffer;
465 | last_was_carriage_return = (ch == 0x0D);
466 | }
467 | }
468 | }
469 | *line_pos = '\0';
470 | ParseAndEmitLine(++line_num, line_buffer, &line_too_long_strict);
471 | handler_->HandleRobotsEnd();
472 | delete [] line_buffer;
473 | }
474 |
475 | // Implements the default robots.txt matching strategy. The maximum number of
476 | // characters matched by a pattern is returned as its match priority.
477 | class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
478 | public:
479 | LongestMatchRobotsMatchStrategy() { }
480 |
481 | // Disallow copying and assignment.
482 | LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
483 | delete;
484 | LongestMatchRobotsMatchStrategy& operator=(
485 | const LongestMatchRobotsMatchStrategy&) = delete;
486 |
487 | int MatchAllow(absl::string_view path, absl::string_view pattern) override;
488 | int MatchDisallow(absl::string_view path, absl::string_view pattern) override;
489 | };
490 | } // end anonymous namespace
491 |
492 | void ParseRobotsTxt(absl::string_view robots_body,
493 | RobotsParseHandler* parse_callback) {
494 | RobotsTxtParser parser(robots_body, parse_callback);
495 | parser.Parse();
496 | }
497 |
498 | RobotsMatcher::RobotsMatcher()
499 | : seen_global_agent_(false),
500 | seen_specific_agent_(false),
501 | ever_seen_specific_agent_(false),
502 | seen_separator_(false),
503 | path_(nullptr),
504 | user_agents_(nullptr) {
505 | match_strategy_ = new LongestMatchRobotsMatchStrategy();
506 | }
507 |
508 | RobotsMatcher::~RobotsMatcher() {
509 | delete match_strategy_;
510 | }
511 |
512 | bool RobotsMatcher::ever_seen_specific_agent() const {
513 | return ever_seen_specific_agent_;
514 | }
515 |
516 | void RobotsMatcher::InitUserAgentsAndPath(
517 | const std::vector<std::string>* user_agents, const char* path) {
518 | // The RobotsParser object doesn't own path_ or user_agents_, so overwriting
519 | // these pointers doesn't cause a memory leak.
520 | path_ = path;
521 | ABSL_ASSERT('/' == *path_);
522 | user_agents_ = user_agents;
523 | }
524 |
525 | bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
526 | const std::vector<std::string>* user_agents,
527 | const std::string& url) {
528 | // The url is not normalized (escaped, percent encoded) here because the user
529 | // is asked to provide it in escaped form already.
530 | std::string path = GetPathParamsQuery(url);
531 | InitUserAgentsAndPath(user_agents, path.c_str());
532 | ParseRobotsTxt(robots_body, this);
533 | return !disallow();
534 | }
535 |
536 | bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
537 | const std::string& user_agent,
538 | const std::string& url) {
539 | std::vector<std::string> v;
540 | v.push_back(user_agent);
541 | return AllowedByRobots(robots_txt, &v, url);
542 | }
543 |
544 | bool RobotsMatcher::disallow() const {
545 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
546 | return (disallow_.specific.priority() > allow_.specific.priority());
547 | }
548 |
549 | if (ever_seen_specific_agent_) {
550 | // Matching group for user-agent but either without disallow or empty one,
551 | // i.e. priority == 0.
552 | return false;
553 | }
554 |
555 | if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) {
556 | return disallow_.global.priority() > allow_.global.priority();
557 | }
558 | return false;
559 | }
560 |
561 | bool RobotsMatcher::disallow_ignore_global() const {
562 | if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
563 | return disallow_.specific.priority() > allow_.specific.priority();
564 | }
565 | return false;
566 | }
567 |
568 | int RobotsMatcher::matching_line() const {
569 | if (ever_seen_specific_agent_) {
570 | return Match::HigherPriorityMatch(disallow_.specific, allow_.specific)
571 | .line();
572 | }
573 | return Match::HigherPriorityMatch(disallow_.global, allow_.global).line();
574 | }
575 |
576 | void RobotsMatcher::HandleRobotsStart() {
577 | // This is a new robots.txt file, so we need to reset all the instance member
578 | // variables. We do it in the same order the instance member variables are
579 | // declared, so it's easier to keep track of which ones we have (or maybe
580 | // haven't!) done.
581 | allow_.Clear();
582 | disallow_.Clear();
583 |
584 | seen_global_agent_ = false;
585 | seen_specific_agent_ = false;
586 | ever_seen_specific_agent_ = false;
587 | seen_separator_ = false;
588 | }
589 |
590 | /*static*/ absl::string_view RobotsMatcher::ExtractUserAgent(
591 | absl::string_view user_agent) {
592 | // Allowed characters in user-agent are [a-zA-Z_-].
593 | const char* end = user_agent.data();
594 | while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') {
595 | ++end;
596 | }
597 | return user_agent.substr(0, end - user_agent.data());
598 | }
599 |
600 | /*static*/ bool RobotsMatcher::IsValidUserAgentToObey(
601 | absl::string_view user_agent) {
602 | return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent;
603 | }
604 |
605 | void RobotsMatcher::HandleUserAgent(int line_num,
606 | absl::string_view user_agent) {
607 | if (seen_separator_) {
608 | seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
609 | }
610 |
611 | // Google-specific optimization: a '*' followed by space and more characters
612 | // in a user-agent record is still regarded a global rule.
613 | if (user_agent.length() >= 1 && user_agent[0] == '*' &&
614 | (user_agent.length() == 1 || isspace(user_agent[1]))) {
615 | seen_global_agent_ = true;
616 | } else {
617 | user_agent = ExtractUserAgent(user_agent);
618 | for (const auto& agent : *user_agents_) {
619 | if (absl::EqualsIgnoreCase(user_agent, agent)) {
620 | ever_seen_specific_agent_ = seen_specific_agent_ = true;
621 | break;
622 | }
623 | }
624 | }
625 | }
626 |
627 | void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) {
628 | if (!seen_any_agent()) return;
629 | seen_separator_ = true;
630 | const int priority = match_strategy_->MatchAllow(path_, value);
631 | if (priority >= 0) {
632 | if (seen_specific_agent_) {
633 | if (allow_.specific.priority() < priority) {
634 | allow_.specific.Set(priority, line_num);
635 | }
636 | } else {
637 | assert(seen_global_agent_);
638 | if (allow_.global.priority() < priority) {
639 | allow_.global.Set(priority, line_num);
640 | }
641 | }
642 | } else {
643 | // Google-specific optimization: 'index.htm' and 'index.html' are normalized
644 | // to '/'.
645 | const size_t slash_pos = value.find_last_of('/');
646 |
647 | if (slash_pos != absl::string_view::npos &&
648 | absl::StartsWith(absl::ClippedSubstr(value, slash_pos),
649 | "/index.htm")) {
650 | const int len = slash_pos + 1;
651 | absl::FixedArray<char> newpattern(len + 1);
652 | strncpy(newpattern.data(), value.data(), len);
653 | newpattern[len] = '$';
654 | HandleAllow(line_num,
655 | absl::string_view(newpattern.data(), newpattern.size()));
656 | }
657 | }
658 | }
659 |
660 | void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) {
661 | if (!seen_any_agent()) return;
662 | seen_separator_ = true;
663 | const int priority = match_strategy_->MatchDisallow(path_, value);
664 | if (priority >= 0) {
665 | if (seen_specific_agent_) {
666 | if (disallow_.specific.priority() < priority) {
667 | disallow_.specific.Set(priority, line_num);
668 | }
669 | } else {
670 | assert(seen_global_agent_);
671 | if (disallow_.global.priority() < priority) {
672 | disallow_.global.Set(priority, line_num);
673 | }
674 | }
675 | }
676 | }
677 |
678 | int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path,
679 | absl::string_view pattern) {
680 | return Matches(path, pattern) ? pattern.length() : -1;
681 | }
682 |
683 | int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path,
684 | absl::string_view pattern) {
685 | return Matches(path, pattern) ? pattern.length() : -1;
686 | }
687 |
688 | void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {}
689 |
690 | void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action,
691 | absl::string_view value) {}
692 |
693 | void ParsedRobotsKey::Parse(absl::string_view key, bool* is_acceptable_typo) {
694 | key_text_ = absl::string_view();
695 | if (KeyIsUserAgent(key, is_acceptable_typo)) {
696 | type_ = USER_AGENT;
697 | } else if (KeyIsAllow(key, is_acceptable_typo)) {
698 | type_ = ALLOW;
699 | } else if (KeyIsDisallow(key, is_acceptable_typo)) {
700 | type_ = DISALLOW;
701 | } else if (KeyIsSitemap(key, is_acceptable_typo)) {
702 | type_ = SITEMAP;
703 | } else {
704 | type_ = UNKNOWN;
705 | key_text_ = key;
706 | }
707 | }
708 |
709 | absl::string_view ParsedRobotsKey::GetUnknownText() const {
710 | ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty());
711 | return key_text_;
712 | }
713 |
714 | bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key,
715 | bool* is_acceptable_typo) {
716 | *is_acceptable_typo =
717 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") ||
718 | absl::StartsWithIgnoreCase(key, "user agent")));
719 | return (absl::StartsWithIgnoreCase(key, "user-agent") || *is_acceptable_typo);
720 | }
721 |
722 | bool ParsedRobotsKey::KeyIsAllow(absl::string_view key,
723 | bool* is_acceptable_typo) {
724 | // We don't support typos for the "allow" key.
725 | *is_acceptable_typo = false;
726 | return absl::StartsWithIgnoreCase(key, "allow");
727 | }
728 |
729 | bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key,
730 | bool* is_acceptable_typo) {
731 | *is_acceptable_typo =
732 | (kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) ||
733 | (absl::StartsWithIgnoreCase(key, "dissalow")) ||
734 | (absl::StartsWithIgnoreCase(key, "disalow")) ||
735 | (absl::StartsWithIgnoreCase(key, "diasllow")) ||
736 | (absl::StartsWithIgnoreCase(key, "disallaw"))));
737 | return (absl::StartsWithIgnoreCase(key, "disallow") || *is_acceptable_typo);
738 | }
739 |
740 | bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key,
741 | bool* is_acceptable_typo) {
742 | *is_acceptable_typo =
743 | (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "site-map")));
744 | return absl::StartsWithIgnoreCase(key, "sitemap") || *is_acceptable_typo;
745 | }
746 |
747 | } // namespace googlebot
748 |
--------------------------------------------------------------------------------
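robots.cc above implements the pattern semantics in RobotsMatchStrategy::Matches() ('*' matches
any run of characters, '$' anchors a pattern at the end of the path) and resolves Allow/Disallow
conflicts by longest match. The following short sketch shows how that surfaces through the
public RobotsMatcher API, assuming only robots.h; the rules and URLs are invented for
illustration.

// wildcard_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  const std::string robotstxt =
      "User-Agent: *\n"
      "Disallow: /*.php$\n"  // matches any path ending in ".php"
      "Allow: /fish*\n";     // matches any path starting with "/fish"
  googlebot::RobotsMatcher matcher;
  // 0 (disallowed): both rules match "/fish.php", but the Disallow pattern
  // "/*.php$" is 7 characters long and outweighs the 6-character "/fish*".
  std::cout << matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                               "http://example.com/fish.php")
            << "\n";
  // 1 (allowed): only the Allow rule matches "/fish.html".
  std::cout << matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                               "http://example.com/fish.html")
            << "\n";
  return 0;
}
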
/robots.h:
--------------------------------------------------------------------------------
1 | // Copyright 1999 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots.h
17 | // -----------------------------------------------------------------------------
18 | //
19 | // This file implements the standard defined by the Robots Exclusion Protocol
20 | // (REP) internet draft (I-D).
21 | // https://www.rfc-editor.org/rfc/rfc9309.html
22 | //
23 | // Google doesn't follow the standard strictly, because there are a lot of
24 | // non-conforming robots.txt files out there, and we err on the side of
25 | // disallowing when this seems intended.
26 | //
27 | // A more user-friendly description of how Google handles robots.txt can be
28 | // found at:
29 | // https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
30 | //
31 | // This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
32 | // and a matcher for URLs against a robots.txt (class RobotsMatcher).
33 |
34 | #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
35 | #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
36 |
37 | #include <string>
38 | #include <vector>
39 |
40 | #include "absl/strings/string_view.h"
41 |
42 | namespace googlebot {
43 | // Handler for directives found in robots.txt. These callbacks are called by
44 | // ParseRobotsTxt() in the sequence they have been found in the file.
45 | class RobotsParseHandler {
46 | public:
47 | RobotsParseHandler() {}
48 | virtual ~RobotsParseHandler() {}
49 |
50 | // Disallow copying and assignment.
51 | RobotsParseHandler(const RobotsParseHandler&) = delete;
52 | RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
53 |
54 | virtual void HandleRobotsStart() = 0;
55 | virtual void HandleRobotsEnd() = 0;
56 |
57 | virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
58 | virtual void HandleAllow(int line_num, absl::string_view value) = 0;
59 | virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
60 |
61 | virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
62 |
63 | // Any other unrecognized name/value pairs.
64 | virtual void HandleUnknownAction(int line_num, absl::string_view action,
65 | absl::string_view value) = 0;
66 |
67 | struct LineMetadata {
68 | // Indicates if the line is totally empty.
69 | bool is_empty = false;
70 | // Indicates if the line has a comment (may have content before it).
71 | bool has_comment = false;
72 | // Indicates if the whole line is a comment.
73 | bool is_comment = false;
74 | // Indicates that the line has a valid robots.txt directive and one of the
75 | // `Handle*` methods will be called.
76 | bool has_directive = false;
77 | // Indicates that the found directive is one of the acceptable typo variants
78 | // of the directive. See the key functions in ParsedRobotsKey for accepted
79 | // typos.
80 | bool is_acceptable_typo = false;
81 | // Indicates that the line is too long, specifically over 2083 * 8 bytes.
82 | bool is_line_too_long = false;
83 | // Indicates that the key-value pair is missing the colon separator.
84 | bool is_missing_colon_separator = false;
85 | };
86 |
87 | virtual void ReportLineMetadata(int line_num, const LineMetadata& metadata) {}
88 | };
89 |
90 | // Parses body of a robots.txt and emits parse callbacks. This will accept
91 | // typical typos found in robots.txt, such as 'disalow'.
92 | //
93 | // Note, this function will accept all kinds of input but will skip
94 | // everything that does not look like a robots directive.
95 | void ParseRobotsTxt(absl::string_view robots_body,
96 | RobotsParseHandler* parse_callback);
97 |
98 | // RobotsMatcher - matches robots.txt against URLs.
99 | //
100 | // The Matcher uses a default match strategy for Allow/Disallow patterns which
101 | // is the official way of Google crawler to match robots.txt. It is also
102 | // possible to provide a custom match strategy.
103 | //
104 | // The entry point for the user is to call one of the *AllowedByRobots()
105 | // methods that return directly whether a URL is allowed according to the
106 | // robots.txt and the crawl agent.
107 | // The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
108 | class RobotsMatchStrategy;
109 | class RobotsMatcher : protected RobotsParseHandler {
110 | public:
111 | // Create a RobotsMatcher with the default matching strategy. The default
112 | // matching strategy is longest-match as opposed to the former internet draft
113 | // that provisioned first-match strategy. Analysis shows that longest-match,
114 | // while more restrictive for crawlers, is what webmasters assume when writing
115 | // directives. For example, in case of conflicting matches (both Allow and
116 | // Disallow), the longest match is the one the user wants. For example, in
117 | // case of a robots.txt file that has the following rules
118 | // Allow: /
119 | // Disallow: /cgi-bin
120 | // it's pretty obvious what the webmaster wants: they want to allow crawl of
121 | // every URI except /cgi-bin. However, according to the expired internet
122 | // standard, crawlers should be allowed to crawl everything with such a rule.
123 | RobotsMatcher();
124 |
125 | ~RobotsMatcher() override;
126 |
127 | // Disallow copying and assignment.
128 | RobotsMatcher(const RobotsMatcher&) = delete;
129 | RobotsMatcher& operator=(const RobotsMatcher&) = delete;
130 |
131 | // Verifies that the given user agent is valid to be matched against
132 | // robots.txt. Valid user agent strings only contain the characters
133 | // [a-zA-Z_-].
134 | static bool IsValidUserAgentToObey(absl::string_view user_agent);
135 |
136 | // Returns true iff 'url' is allowed to be fetched by any member of the
137 | // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
138 | bool AllowedByRobots(absl::string_view robots_body,
139 | const std::vector<std::string>* user_agents,
140 | const std::string& url);
141 |
142 | // Do robots check for 'url' when there is only one user agent. 'url' must
143 | // be %-encoded according to RFC3986.
144 | bool OneAgentAllowedByRobots(absl::string_view robots_txt,
145 | const std::string& user_agent,
146 | const std::string& url);
147 |
148 | // Returns true if we are disallowed from crawling a matching URI.
149 | bool disallow() const;
150 |
151 | // Returns true if we are disallowed from crawling a matching URI. Ignores any
152 | // rules specified for the default user agent, and bases its results only on
153 | // the specified user agents.
154 | bool disallow_ignore_global() const;
155 |
156 | // Returns true iff, when AllowedByRobots() was called, the robots file
157 | // referred explicitly to one of the specified user agents.
158 | bool ever_seen_specific_agent() const;
159 |
160 | // Returns the line that matched or 0 if none matched.
161 | int matching_line() const;
162 |
163 | protected:
164 | // Parse callbacks.
165 | // Protected because used in unittest. Never override RobotsMatcher, implement
166 | // googlebot::RobotsParseHandler instead.
167 | void HandleRobotsStart() override;
168 | void HandleRobotsEnd() override {}
169 |
170 | void HandleUserAgent(int line_num, absl::string_view user_agent) override;
171 | void HandleAllow(int line_num, absl::string_view value) override;
172 | void HandleDisallow(int line_num, absl::string_view value) override;
173 |
174 | void HandleSitemap(int line_num, absl::string_view value) override;
175 | void HandleUnknownAction(int line_num, absl::string_view action,
176 | absl::string_view value) override;
177 |
178 | protected:
179 | // Extract the matchable part of a user agent string, essentially stopping at
180 | // the first invalid character.
181 | // Example: 'Googlebot/2.1' becomes 'Googlebot'
182 | static absl::string_view ExtractUserAgent(absl::string_view user_agent);
183 |
184 | // Initialize next path and user-agents to check. Path must contain only the
185 | // path, params, and query (if any) of the url and must start with a '/'.
186 | void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
187 | const char* path);
188 |
189 | // Returns true if any user-agent was seen.
190 | bool seen_any_agent() const {
191 | return seen_global_agent_ || seen_specific_agent_;
192 | }
193 |
194 | // Instead of just maintaining a Boolean indicating whether a given line has
195 | // matched, we maintain a count of the maximum number of characters matched by
196 | // that pattern.
197 | //
198 | // This structure stores the information associated with a match (e.g. when a
199 | // Disallow is matched) as priority of the match and line matching.
200 | //
201 | // The priority is initialized with a negative value to make sure that a match
202 | // of priority 0 is higher priority than no match at all.
203 | class Match {
204 | private:
205 | static const int kNoMatchPriority = -1;
206 |
207 | public:
208 | Match(int priority, int line) : priority_(priority), line_(line) {}
209 | Match() : priority_(kNoMatchPriority), line_(0) {}
210 |
211 | void Set(int priority, int line) {
212 | priority_ = priority;
213 | line_ = line;
214 | }
215 |
216 | void Clear() { Set(kNoMatchPriority, 0); }
217 |
218 | int line() const { return line_; }
219 | int priority() const { return priority_; }
220 |
221 | static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
222 | if (a.priority() > b.priority()) {
223 | return a;
224 | } else {
225 | return b;
226 | }
227 | }
228 |
229 | private:
230 | int priority_;
231 | int line_;
232 | };
233 |
234 | // For each of the directives within user-agents, we keep global and specific
235 | // match scores.
236 | struct MatchHierarchy {
237 | Match global; // Match for '*'
238 | Match specific; // Match for queried agent.
239 | void Clear() {
240 | global.Clear();
241 | specific.Clear();
242 | }
243 | };
244 | MatchHierarchy allow_; // Characters of 'url' matching Allow.
245 | MatchHierarchy disallow_; // Characters of 'url' matching Disallow.
246 |
247 | bool seen_global_agent_; // True if processing global agent rules.
248 | bool seen_specific_agent_; // True if processing our specific agent.
249 | bool ever_seen_specific_agent_; // True if we ever saw a block for our agent.
250 | bool seen_separator_; // True if saw any key: value pair.
251 |
252 | // The path we want to pattern match. Not owned and only a valid pointer
253 | // during the lifetime of *AllowedByRobots calls.
254 | const char* path_;
255 | // The User-Agents we are interested in. Not owned and only a valid
256 | // pointer during the lifetime of *AllowedByRobots calls.
257 | const std::vector<std::string>* user_agents_;
258 |
259 | RobotsMatchStrategy* match_strategy_;
260 | };
261 |
262 | } // namespace googlebot
263 | #endif // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
264 |
--------------------------------------------------------------------------------
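A minimal usage sketch for the RobotsMatcher API declared above, assuming only robots.h; the
agent names, rules and URL are invented. It checks one URL against several user agents at once
and asks which robots.txt line decided the verdict.

// matcher_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  const std::string robotstxt =
      "User-Agent: BarBot\n"   // line 1
      "Disallow: /private/\n"  // line 2
      "User-Agent: *\n"        // line 3
      "Allow: /\n";            // line 4
  const std::vector<std::string> user_agents = {"FooBot", "BarBot"};
  googlebot::RobotsMatcher matcher;
  const bool allowed = matcher.AllowedByRobots(
      robotstxt, &user_agents, "http://example.com/private/index.html");
  // Prints "DISALLOWED (line 2)": the BarBot group is the most specific match
  // for the supplied agents, so its Disallow on line 2 wins over the global
  // Allow.
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << " (line "
            << matcher.matching_line() << ")\n";
  return 0;
}
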
/robots_main.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // -----------------------------------------------------------------------------
16 | // File: robots_main.cc
17 | // -----------------------------------------------------------------------------
18 | //
19 | // Simple binary to assess whether a URL is accessible to a user-agent according
20 | // to records found in a local robots.txt file, based on Google's robots.txt
21 | // parsing and matching algorithms.
22 | // Usage:
23 | // robots_main <local_path_to_robotstxt> <user_agent> <url>
24 | // Arguments:
25 | // local_path_to_robotstxt: local path to a file containing robots.txt records.
26 | // For example: /home/users/username/robots.txt
27 | // user_agent: a token to be matched against records in the robots.txt.
28 | // For example: Googlebot
29 | // url: a url to be matched against records in the robots.txt. The URL must be
30 | // %-encoded according to RFC3986.
31 | // For example: https://example.com/accessible/url.html
32 | // Output: Prints a sentence with verdict about whether 'user_agent' is allowed
33 | // to access 'url' based on records in 'local_path_to_robotstxt'.
34 | // Return code:
35 | // 0 when the url is ALLOWED for the user_agent.
36 | // 1 when the url is DISALLOWED for the user_agent.
37 | // 2 when --help is requested or if there is something invalid in the flags
38 | // passed.
39 | //
40 | #include <fstream>
41 | #include <iostream>
42 |
43 | #include "robots.h"
44 |
45 | bool LoadFile(const std::string& filename, std::string* result) {
46 | std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
47 | if (file.is_open()) {
48 | size_t size = file.tellg();
49 | std::vector<char> buffer(size);
50 | file.seekg(0, std::ios::beg);
51 | file.read(buffer.data(), size);
52 | file.close();
53 | if (!file) return false; // file reading error (failbit or badbit).
54 | result->assign(buffer.begin(), buffer.end());
55 | return true;
56 | }
57 | return false;
58 | }
59 |
60 | void ShowHelp(int argc, char** argv) {
61 | std::cerr << "Shows whether the given user_agent and URI combination"
62 | << " is allowed or disallowed by the given robots.txt file. "
63 | << std::endl
64 | << std::endl;
65 | std::cerr << "Usage: " << std::endl
66 | << " " << argv[0] << " "
67 | << std::endl
68 | << std::endl;
69 | std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
70 | << std::endl;
71 | std::cerr << "Example: " << std::endl
72 | << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
73 | << std::endl;
74 | }
75 |
76 | int main(int argc, char** argv) {
77 | std::string filename = argc >= 2 ? argv[1] : "";
78 | if (filename == "-h" || filename == "-help" || filename == "--help") {
79 | ShowHelp(argc, argv);
80 | return 2;
81 | }
82 | if (argc != 4) {
83 | std::cerr << "Invalid amount of arguments. Showing help." << std::endl
84 | << std::endl;
85 | ShowHelp(argc, argv);
86 | return 2;
87 | }
88 | std::string robots_content;
89 | if (!(LoadFile(filename, &robots_content))) {
90 | std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
91 | return 2;
92 | }
93 |
94 | std::string user_agent = argv[2];
95 | std::vector<std::string> user_agents(1, user_agent);
96 | googlebot::RobotsMatcher matcher;
97 | std::string url = argv[3];
98 | bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
99 |
100 | std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
101 | << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
102 | if (robots_content.empty()) {
103 | std::cout << "notice: robots file is empty so all user-agents are allowed"
104 | << std::endl;
105 | }
106 |
107 | return allowed ? 0 : 1;
108 | }
109 |
--------------------------------------------------------------------------------
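robots_main.cc requires the URL argument to be %-encoded, and, as GetPathParamsQuery() in
robots.cc documents, only the path, params and query of a URL take part in matching. Below is a
small sketch of that reduction, assuming the program is linked against robots.cc (the function
is deliberately "not in anonymous namespace to allow testing"); the URLs are invented.

// path_extraction_example.cc (illustrative sketch, not a repository file).
#include <iostream>
#include <string>

namespace googlebot {
// Defined in robots.cc; extracts path (with params) and query from a URL.
std::string GetPathParamsQuery(const std::string& url);
}  // namespace googlebot

int main() {
  // Scheme, authority and fragment are dropped; the result always starts
  // with "/".
  std::cout << googlebot::GetPathParamsQuery("http://example.com/a/b?c=d#frag")
            << "\n";  // "/a/b?c=d"
  std::cout << googlebot::GetPathParamsQuery("example.com") << "\n";  // "/"
  std::cout << googlebot::GetPathParamsQuery("") << "\n";             // "/"
  return 0;
}
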
/robots_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // This file tests the robots.txt parsing and matching code found in robots.cc
16 | // against the current Robots Exclusion Protocol (REP) RFC.
17 | // https://www.rfc-editor.org/rfc/rfc9309.html
18 | #include "robots.h"
19 |
20 | #include <string>
21 |
22 | #include "gtest/gtest.h"
23 | #include "absl/strings/str_cat.h"
24 | #include "absl/strings/string_view.h"
25 |
26 | namespace {
27 |
28 | using ::googlebot::RobotsMatcher;
29 |
30 | bool IsUserAgentAllowed(const absl::string_view robotstxt,
31 | const std::string& useragent, const std::string& url) {
32 | RobotsMatcher matcher;
33 | return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
34 | }
35 |
36 | // Google-specific: system test.
37 | TEST(RobotsUnittest, GoogleOnly_SystemTest) {
38 | const absl::string_view robotstxt =
39 | "user-agent: FooBot\n"
40 | "disallow: /\n";
41 | // Empty robots.txt: everything allowed.
42 | EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", ""));
43 |
44 | // Empty user-agent to be matched: everything allowed.
45 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", ""));
46 |
47 | // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
48 | // in robots.cc.
49 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", ""));
50 |
51 | // All params empty: same as robots.txt empty, everything allowed.
52 | EXPECT_TRUE(IsUserAgentAllowed("", "", ""));
53 | }
54 | // Rules are colon separated name-value pairs. The following names are
55 | // provisioned:
56 | // user-agent:
57 | // allow:
58 | // disallow:
59 | // See REP RFC section "Protocol Definition".
60 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
61 | //
62 | // Google specific: webmasters sometimes miss the colon separator, but it's
63 | // obvious what they mean by "disallow /", so we assume the colon if it's
64 | // missing.
65 | TEST(RobotsUnittest, ID_LineSyntax_Line) {
66 | const absl::string_view robotstxt_correct =
67 | "user-agent: FooBot\n"
68 | "disallow: /\n";
69 | const absl::string_view robotstxt_incorrect =
70 | "foo: FooBot\n"
71 | "bar: /\n";
72 | const absl::string_view robotstxt_incorrect_accepted =
73 | "user-agent FooBot\n"
74 | "disallow /\n";
75 | const std::string url = "http://foo.bar/x/y";
76 |
77 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
78 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
79 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
80 | }
81 |
82 | // A group is one or more user-agent lines followed by rules, and terminated
83 | // by another user-agent line. Rules for the same user-agents are combined
84 | // opaquely into one group. Rules outside groups are ignored.
85 | // See REP RFC section "Protocol Definition".
86 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
87 | TEST(RobotsUnittest, ID_LineSyntax_Groups) {
88 | const absl::string_view robotstxt =
89 | "allow: /foo/bar/\n"
90 | "\n"
91 | "user-agent: FooBot\n"
92 | "disallow: /\n"
93 | "allow: /x/\n"
94 | "user-agent: BarBot\n"
95 | "disallow: /\n"
96 | "allow: /y/\n"
97 | "\n"
98 | "\n"
99 | "allow: /w/\n"
100 | "user-agent: BazBot\n"
101 | "\n"
102 | "user-agent: FooBot\n"
103 | "allow: /z/\n"
104 | "disallow: /\n";
105 |
106 | const std::string url_w = "http://foo.bar/w/a";
107 | const std::string url_x = "http://foo.bar/x/b";
108 | const std::string url_y = "http://foo.bar/y/c";
109 | const std::string url_z = "http://foo.bar/z/d";
110 | const std::string url_foo = "http://foo.bar/foo/bar/";
111 |
112 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x));
113 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z));
114 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y));
115 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y));
116 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w));
117 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z));
118 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z));
119 |
120 | // Lines with rules outside groups are ignored.
121 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
122 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
123 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
124 | }
125 |
126 | // Group must not be closed by rules not explicitly defined in the REP RFC.
127 | // See REP RFC section "Protocol Definition".
128 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
129 | TEST(RobotsUnittest, ID_LineSyntax_Groups_OtherRules) {
130 | {
131 | const absl::string_view robotstxt =
132 | "User-agent: BarBot\n"
133 | "Sitemap: https://foo.bar/sitemap\n"
134 | "User-agent: *\n"
135 | "Disallow: /\n";
136 | std::string url = "http://foo.bar/";
137 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
138 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url));
139 | }
140 | {
141 | const absl::string_view robotstxt =
142 | "User-agent: FooBot\n"
143 | "Invalid-Unknown-Line: unknown\n"
144 | "User-agent: *\n"
145 | "Disallow: /\n";
146 | std::string url = "http://foo.bar/";
147 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
148 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url));
149 | }
150 | }
151 |
152 | // REP lines are case insensitive. See REP RFC section "Protocol Definition".
153 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.1
154 | TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
155 | const absl::string_view robotstxt_upper =
156 | "USER-AGENT: FooBot\n"
157 | "ALLOW: /x/\n"
158 | "DISALLOW: /\n";
159 | const absl::string_view robotstxt_lower =
160 | "user-agent: FooBot\n"
161 | "allow: /x/\n"
162 | "disallow: /\n";
163 | const absl::string_view robotstxt_camel =
164 | "uSeR-aGeNt: FooBot\n"
165 | "AlLoW: /x/\n"
166 | "dIsAlLoW: /\n";
167 | const std::string url_allowed = "http://foo.bar/x/y";
168 | const std::string url_disallowed = "http://foo.bar/a/b";
169 |
170 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
171 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
172 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
173 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
174 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
175 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
176 | }
177 |
178 | // A user-agent line is expected to contain only [a-zA-Z_-] characters and must
179 | // not be empty. See REP RFC section "The user-agent line".
180 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
181 | TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) {
182 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot"));
183 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar"));
184 | EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar"));
185 |
186 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view()));
187 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(""));
188 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ"));
189 |
190 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*"));
191 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot "));
192 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1"));
193 |
194 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
195 | }
196 |
197 | // User-agent line values are case insensitive. See REP RFC section "The
198 | // user-agent line".
199 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
200 | TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) {
201 | const absl::string_view robotstxt_upper =
202 | "User-Agent: FOO BAR\n"
203 | "Allow: /x/\n"
204 | "Disallow: /\n";
205 | const absl::string_view robotstxt_lower =
206 | "User-Agent: foo bar\n"
207 | "Allow: /x/\n"
208 | "Disallow: /\n";
209 | const absl::string_view robotstxt_camel =
210 | "User-Agent: FoO bAr\n"
211 | "Allow: /x/\n"
212 | "Disallow: /\n";
213 | const std::string url_allowed = "http://foo.bar/x/y";
214 | const std::string url_disallowed = "http://foo.bar/a/b";
215 |
216 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
217 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
218 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
219 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
220 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
221 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
222 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
223 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
224 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
225 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
226 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
227 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
228 | }
229 |
230 | // Google specific: accept user-agent value up to the first space. Space is not
231 | // allowed in user-agent values, but that doesn't stop webmasters from using
232 | // them. This is more restrictive than the RFC, since in case of the bad value
233 | // "Googlebot Images" we'd still obey the rules with "Googlebot".
234 | // Extends REP RFC section "The user-agent line"
235 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
236 | TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) {
237 | EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
238 | const absl::string_view robotstxt =
239 | "User-Agent: *\n"
240 | "Disallow: /\n"
241 | "User-Agent: Foo Bar\n"
242 | "Allow: /x/\n"
243 | "Disallow: /\n";
244 | const std::string url = "http://foo.bar/x/y";
245 |
246 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url));
247 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url));
248 | }
249 |
250 | // If no group matches the user-agent, crawlers must obey the first group with a
251 | // user-agent line with a "*" value, if present. If no group satisfies either
252 | // condition, or no groups are present at all, no rules apply.
253 | // See REP RFC section "The user-agent line".
254 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
255 | TEST(RobotsUnittest, ID_GlobalGroups_Secondary) {
256 | const absl::string_view robotstxt_empty = "";
257 | const absl::string_view robotstxt_global =
258 | "user-agent: *\n"
259 | "allow: /\n"
260 | "user-agent: FooBot\n"
261 | "disallow: /\n";
262 | const absl::string_view robotstxt_only_specific =
263 | "user-agent: FooBot\n"
264 | "allow: /\n"
265 | "user-agent: BarBot\n"
266 | "disallow: /\n"
267 | "user-agent: BazBot\n"
268 | "disallow: /\n";
269 | const std::string url = "http://foo.bar/x/y";
270 |
271 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
272 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url));
273 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url));
274 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
275 | }
276 |
277 | // Matching rules against URIs is case sensitive.
278 | // See REP RFC section "The Allow and Disallow lines".
279 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
280 | TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) {
281 | const absl::string_view robotstxt_lowercase_url =
282 | "user-agent: FooBot\n"
283 | "disallow: /x/\n";
284 | const absl::string_view robotstxt_uppercase_url =
285 | "user-agent: FooBot\n"
286 | "disallow: /X/\n";
287 | const std::string url = "http://foo.bar/x/y";
288 |
289 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
290 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
291 | }
292 |
293 | // The most specific match found MUST be used. The most specific match is the
294 | // match that has the most octets. In case of multiple rules with the same
295 | // length, the least strict rule must be used.
296 | // See REP RFC section "The Allow and Disallow lines".
297 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
298 | TEST(RobotsUnittest, ID_LongestMatch) {
299 | const std::string url = "http://foo.bar/x/page.html";
300 | {
301 | const absl::string_view robotstxt =
302 | "user-agent: FooBot\n"
303 | "disallow: /x/page.html\n"
304 | "allow: /x/\n";
305 |
306 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
307 | }
308 | {
309 | const absl::string_view robotstxt =
310 | "user-agent: FooBot\n"
311 | "allow: /x/page.html\n"
312 | "disallow: /x/\n";
313 |
314 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
315 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
316 | }
317 | {
318 | const absl::string_view robotstxt =
319 | "user-agent: FooBot\n"
320 | "disallow: \n"
321 | "allow: \n";
322 | // In case of equivalent disallow and allow patterns for the same
323 | // user-agent, allow is used.
324 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
325 | }
326 | {
327 | const absl::string_view robotstxt =
328 | "user-agent: FooBot\n"
329 | "disallow: /\n"
330 | "allow: /\n";
331 | // In case of equivalent disallow and allow patterns for the same
332 | // user-agent, allow is used.
333 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
334 | }
335 | {
336 | std::string url_a = "http://foo.bar/x";
337 | std::string url_b = "http://foo.bar/x/";
338 | const absl::string_view robotstxt =
339 | "user-agent: FooBot\n"
340 | "disallow: /x\n"
341 | "allow: /x/\n";
342 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a));
343 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b));
344 | }
345 |
346 | {
347 | const absl::string_view robotstxt =
348 | "user-agent: FooBot\n"
349 | "disallow: /x/page.html\n"
350 | "allow: /x/page.html\n";
351 | // In case of equivalent disallow and allow patterns for the same
352 | // user-agent, allow is used.
353 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
354 | }
355 | {
356 | const absl::string_view robotstxt =
357 | "user-agent: FooBot\n"
358 | "allow: /page\n"
359 | "disallow: /*.html\n";
360 | // Longest match wins.
361 | EXPECT_FALSE(
362 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
363 | EXPECT_TRUE(
364 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
365 | }
366 | {
367 | const absl::string_view robotstxt =
368 | "user-agent: FooBot\n"
369 | "allow: /x/page.\n"
370 | "disallow: /*.html\n";
371 | // Longest match wins.
372 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
373 | EXPECT_FALSE(
374 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
375 | }
376 | {
377 | const absl::string_view robotstxt =
378 | "User-agent: *\n"
379 | "Disallow: /x/\n"
380 | "User-agent: FooBot\n"
381 | "Disallow: /y/\n";
382 |     // The most specific group for FooBot implicitly allows /x/page.
383 | EXPECT_TRUE(
384 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
385 | EXPECT_FALSE(
386 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
387 | }
388 | }
389 |
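// Editor's sketch (not part of the upstream test file): the precedence
// behaviour verified above can also be checked directly through the public
// matcher API declared in robots.h. This assumes
// googlebot::RobotsMatcher::OneAgentAllowedByRobots(), which the
// IsUserAgentAllowed() helper used throughout this file is expected to wrap;
// treat the exact call as an assumption rather than a specification.
bool LongestMatchAllowsPage() {
  const std::string robotstxt =
      "user-agent: FooBot\n"
      "allow: /x/page.html\n"
      "disallow: /x/\n";
  googlebot::RobotsMatcher matcher;
  // "allow: /x/page.html" (12 octets) is more specific than "disallow: /x/"
  // (3 octets), so the allow rule wins and the URL is allowed.
  return matcher.OneAgentAllowedByRobots(robotstxt, "FooBot",
                                         "http://foo.bar/x/page.html");
}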
390 | // Octets in the URI and robots.txt paths outside the range of the US-ASCII
391 | // coded character set, and those in the reserved range defined by RFC3986,
392 | // MUST be percent-encoded as defined by RFC3986 prior to comparison.
393 | // See REP RFC section "The Allow and Disallow lines".
394 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2
395 | //
396 | // NOTE: It's up to the caller to percent-encode a URL before passing it to the
397 | // parser. Percent-encoding URIs in the rules is unnecessary.
398 | TEST(RobotsUnittest, ID_Encoding) {
399 | // /foo/bar?baz=http://foo.bar stays unencoded.
400 | {
401 | const absl::string_view robotstxt =
402 | "User-agent: FooBot\n"
403 | "Disallow: /\n"
404 | "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
405 | EXPECT_TRUE(IsUserAgentAllowed(
406 | robotstxt, "FooBot",
407 | "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
408 | }
409 |
410 | // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
411 | {
412 | const absl::string_view robotstxt =
413 | "User-agent: FooBot\n"
414 | "Disallow: /\n"
415 | "Allow: /foo/bar/ツ\n";
416 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
417 | "http://foo.bar/foo/bar/%E3%83%84"));
418 | // The parser encodes the 3-byte character, but the URL is not %-encoded.
419 | EXPECT_FALSE(
420 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
421 | }
422 | // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
423 | {
424 | const absl::string_view robotstxt =
425 | "User-agent: FooBot\n"
426 | "Disallow: /\n"
427 | "Allow: /foo/bar/%E3%83%84\n";
428 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
429 | "http://foo.bar/foo/bar/%E3%83%84"));
430 | EXPECT_FALSE(
431 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
432 | }
433 | // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
434 | // This is illegal according to RFC3986 and while it may work here due to
435 | // simple string matching, it should not be relied on.
436 | {
437 | const absl::string_view robotstxt =
438 | "User-agent: FooBot\n"
439 | "Disallow: /\n"
440 | "Allow: /foo/bar/%62%61%7A\n";
441 | EXPECT_FALSE(
442 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
443 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
444 | "http://foo.bar/foo/bar/%62%61%7A"));
445 | }
446 | }
447 |
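// Editor's sketch (not part of the upstream test file): the NOTE above says
// the caller must percent-encode the URL before handing it to the matcher. A
// minimal caller-side encoder might look like the function below. The name
// PercentEncodeNonAscii is hypothetical (the library does not export such a
// helper), and it only escapes bytes outside printable US-ASCII, which is all
// the "ツ" case above needs.
std::string PercentEncodeNonAscii(absl::string_view url) {
  static const char kHex[] = "0123456789ABCDEF";
  std::string encoded;
  encoded.reserve(url.size());
  for (const unsigned char c : url) {
    if (c > 0x20 && c < 0x7F) {
      encoded.push_back(static_cast<char>(c));  // Printable US-ASCII: keep.
    } else {
      encoded.push_back('%');  // Everything else: %XX-encode the byte.
      encoded.push_back(kHex[c >> 4]);
      encoded.push_back(kHex[c & 0xF]);
    }
  }
  return encoded;
}
// With such a helper, "http://foo.bar/foo/bar/ツ" becomes
// "http://foo.bar/foo/bar/%E3%83%84", the form the matcher accepts above.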
448 | // The REP RFC defines the following characters that have special meaning in
449 | // robots.txt:
450 | // # - inline comment.
451 | // $ - end of pattern.
452 | // * - any number of characters.
453 | // See REP RFC section "Special Characters".
454 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.3
455 | TEST(RobotsUnittest, ID_SpecialCharacters) {
456 | {
457 | const absl::string_view robotstxt =
458 | "User-agent: FooBot\n"
459 | "Disallow: /foo/bar/quz\n"
460 | "Allow: /foo/*/qux\n";
461 | EXPECT_FALSE(
462 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
463 | EXPECT_TRUE(
464 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
465 | EXPECT_TRUE(
466 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
467 | EXPECT_TRUE(
468 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
469 | }
470 | {
471 | const absl::string_view robotstxt =
472 | "User-agent: FooBot\n"
473 | "Disallow: /foo/bar$\n"
474 | "Allow: /foo/bar/qux\n";
475 | EXPECT_FALSE(
476 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
477 | EXPECT_TRUE(
478 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
479 | EXPECT_TRUE(
480 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
481 | EXPECT_TRUE(
482 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
483 | }
484 | {
485 | const absl::string_view robotstxt =
486 | "User-agent: FooBot\n"
487 | "# Disallow: /\n"
488 | "Disallow: /foo/quz#qux\n"
489 | "Allow: /\n";
490 | EXPECT_TRUE(
491 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
492 | EXPECT_FALSE(
493 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
494 | }
495 | }
496 |
497 | // Google-specific: "index.html" (and only that) at the end of a pattern is
498 | // equivalent to "/".
499 | TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) {
500 | const absl::string_view robotstxt =
501 | "User-Agent: *\n"
502 | "Allow: /allowed-slash/index.html\n"
503 | "Disallow: /\n";
504 | // If index.html is allowed, we interpret this as / being allowed too.
505 | EXPECT_TRUE(
506 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
507 |   // Does not exactly match.
508 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot",
509 | "http://foo.com/allowed-slash/index.htm"));
510 | // Exact match.
511 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot",
512 | "http://foo.com/allowed-slash/index.html"));
513 | EXPECT_FALSE(
514 | IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
515 | }
516 |
517 | // Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
518 | // RobotsTxtParser::Parse().
519 | TEST(RobotsUnittest, GoogleOnly_LineTooLong) {
520 | size_t kEOLLen = std::string("\n").length();
521 | int kMaxLineLen = 2083 * 8;
522 | std::string allow = "allow: ";
523 | std::string disallow = "disallow: ";
524 |
525 | // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
526 | {
527 | std::string robotstxt = "user-agent: FooBot\n";
528 | std::string longline = "/x/";
529 | size_t max_length =
530 | kMaxLineLen - longline.length() - disallow.length() + kEOLLen;
531 | while (longline.size() < max_length) {
532 | absl::StrAppend(&longline, "a");
533 | }
534 | absl::StrAppend(&robotstxt, disallow, longline, "/qux\n");
535 |
536 | // Matches nothing, so URL is allowed.
537 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
538 |     // Matches the cut-off disallow rule.
539 | EXPECT_FALSE(IsUserAgentAllowed(
540 | robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux")));
541 | }
542 |
543 | {
544 | std::string robotstxt =
545 | "user-agent: FooBot\n"
546 | "disallow: /\n";
547 | std::string longline_a = "/x/";
548 | std::string longline_b = "/x/";
549 | size_t max_length =
550 | kMaxLineLen - longline_a.length() - allow.length() + kEOLLen;
551 | while (longline_a.size() < max_length) {
552 | absl::StrAppend(&longline_a, "a");
553 | absl::StrAppend(&longline_b, "b");
554 | }
555 | absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n");
556 | absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n");
557 |
558 | // URL matches the disallow rule.
559 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
560 | // Matches the allow rule exactly.
561 | EXPECT_TRUE(
562 | IsUserAgentAllowed(robotstxt, "FooBot",
563 | absl::StrCat("http://foo.bar", longline_a, "/qux")));
564 |     // Matches the cut-off allow rule.
565 | EXPECT_TRUE(
566 | IsUserAgentAllowed(robotstxt, "FooBot",
567 | absl::StrCat("http://foo.bar", longline_b, "/fux")));
568 | }
569 | }
570 |
571 | TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) {
572 | // Test documentation from
573 | // https://developers.google.com/search/reference/robots_txt
574 | // Section "URL matching based on path values".
575 | {
576 | std::string robotstxt =
577 | "user-agent: FooBot\n"
578 | "disallow: /\n"
579 | "allow: /fish\n";
580 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
581 |
582 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
583 | EXPECT_TRUE(
584 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
585 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
586 | "http://foo.bar/fish/salmon.html"));
587 | EXPECT_TRUE(
588 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
589 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
590 | "http://foo.bar/fishheads/yummy.html"));
591 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
592 | "http://foo.bar/fish.html?id=anything"));
593 |
594 | EXPECT_FALSE(
595 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
596 | EXPECT_FALSE(
597 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
598 | EXPECT_FALSE(
599 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
600 | }
601 | // "/fish*" equals "/fish"
602 | {
603 | std::string robotstxt =
604 | "user-agent: FooBot\n"
605 | "disallow: /\n"
606 | "allow: /fish*\n";
607 | EXPECT_FALSE(
608 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
609 |
610 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
611 | EXPECT_TRUE(
612 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
613 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
614 | "http://foo.bar/fish/salmon.html"));
615 | EXPECT_TRUE(
616 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
617 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
618 | "http://foo.bar/fishheads/yummy.html"));
619 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
620 | "http://foo.bar/fish.html?id=anything"));
621 |
622 | EXPECT_FALSE(
623 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
624 | EXPECT_FALSE(
625 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
626 | EXPECT_FALSE(
627 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
628 | }
629 | // "/fish/" does not equal "/fish"
630 | {
631 | std::string robotstxt =
632 | "user-agent: FooBot\n"
633 | "disallow: /\n"
634 | "allow: /fish/\n";
635 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
636 |
637 | EXPECT_TRUE(
638 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
639 | EXPECT_TRUE(
640 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
641 | EXPECT_TRUE(
642 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
643 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
644 | "http://foo.bar/fish/salmon.html"));
645 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
646 | "http://foo.bar/fish/?id=anything"));
647 |
648 | EXPECT_FALSE(
649 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
650 | EXPECT_FALSE(
651 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
652 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
653 | "http://foo.bar/Fish/Salmon.html"));
654 | }
655 | // "/*.php"
656 | {
657 | std::string robotstxt =
658 | "user-agent: FooBot\n"
659 | "disallow: /\n"
660 | "allow: /*.php\n";
661 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
662 |
663 | EXPECT_TRUE(
664 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
665 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
666 | "http://foo.bar/folder/filename.php"));
667 | EXPECT_TRUE(IsUserAgentAllowed(
668 | robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
669 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
670 | "http://foo.bar//folder/any.php.file.html"));
671 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
672 | "http://foo.bar/filename.php/"));
673 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
674 | "http://foo.bar/index?f=filename.php/"));
675 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
676 | "http://foo.bar/php/"));
677 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
678 | "http://foo.bar/index?php"));
679 |
680 | EXPECT_FALSE(
681 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
682 | }
683 | // "/*.php$"
684 | {
685 | std::string robotstxt =
686 | "user-agent: FooBot\n"
687 | "disallow: /\n"
688 | "allow: /*.php$\n";
689 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
690 |
691 | EXPECT_TRUE(
692 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
693 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
694 | "http://foo.bar/folder/filename.php"));
695 |
696 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
697 | "http://foo.bar/filename.php?parameters"));
698 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
699 | "http://foo.bar/filename.php/"));
700 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
701 | "http://foo.bar/filename.php5"));
702 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
703 | "http://foo.bar/php/"));
704 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
705 | "http://foo.bar/filename?php"));
706 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
707 | "http://foo.bar/aaaphpaaa"));
708 | EXPECT_FALSE(
709 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
710 | }
711 | // "/fish*.php"
712 | {
713 | std::string robotstxt =
714 | "user-agent: FooBot\n"
715 | "disallow: /\n"
716 | "allow: /fish*.php\n";
717 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
718 |
719 | EXPECT_TRUE(
720 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
721 | EXPECT_TRUE(
722 | IsUserAgentAllowed(robotstxt, "FooBot",
723 | "http://foo.bar/fishheads/catfish.php?parameters"));
724 |
725 | EXPECT_FALSE(
726 | IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
727 | }
728 | // Section "Order of precedence for group-member records".
729 | {
730 | std::string robotstxt =
731 | "user-agent: FooBot\n"
732 | "allow: /p\n"
733 | "disallow: /\n";
734 | std::string url = "http://example.com/page";
735 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
736 | }
737 | {
738 | std::string robotstxt =
739 | "user-agent: FooBot\n"
740 | "allow: /folder\n"
741 | "disallow: /folder\n";
742 | std::string url = "http://example.com/folder/page";
743 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
744 | }
745 | {
746 | std::string robotstxt =
747 | "user-agent: FooBot\n"
748 | "allow: /page\n"
749 | "disallow: /*.htm\n";
750 | std::string url = "http://example.com/page.htm";
751 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
752 | }
753 | {
754 | std::string robotstxt =
755 | "user-agent: FooBot\n"
756 | "allow: /$\n"
757 | "disallow: /\n";
758 | std::string url = "http://example.com/";
759 | std::string url_page = "http://example.com/page.html";
760 | EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
761 | EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page));
762 | }
763 | }
764 |
765 | class RobotsStatsReporter : public googlebot::RobotsParseHandler {
766 | public:
767 | void HandleRobotsStart() override {
768 | last_line_seen_ = 0;
769 | valid_directives_ = 0;
770 | unknown_directives_ = 0;
771 | sitemap_.clear();
772 | }
773 | void HandleRobotsEnd() override {}
774 |
775 | void HandleUserAgent(int line_num, absl::string_view value) override {
776 | Digest(line_num);
777 | }
778 | void HandleAllow(int line_num, absl::string_view value) override {
779 | Digest(line_num);
780 | }
781 | void HandleDisallow(int line_num, absl::string_view value) override {
782 | Digest(line_num);
783 | }
784 |
785 | void HandleSitemap(int line_num, absl::string_view value) override {
786 | Digest(line_num);
787 | sitemap_.append(value.data(), value.length());
788 | }
789 |
790 |   // Any other unrecognized name/value pairs.
791 | void HandleUnknownAction(int line_num, absl::string_view action,
792 | absl::string_view value) override {
793 | last_line_seen_ = line_num;
794 | unknown_directives_++;
795 | }
796 |
797 | int last_line_seen() const { return last_line_seen_; }
798 |
799 |   // Number of recognized directives found (unknown ones are counted separately).
800 | int valid_directives() const { return valid_directives_; }
801 |
802 | // Number of unknown directives.
803 | int unknown_directives() const { return unknown_directives_; }
804 |
805 | // Parsed sitemap line.
806 | std::string sitemap() const { return sitemap_; }
807 |
808 | private:
809 | void Digest(int line_num) {
810 | ASSERT_GE(line_num, last_line_seen_);
811 | last_line_seen_ = line_num;
812 | valid_directives_++;
813 | }
814 |
815 | int last_line_seen_ = 0;
816 | int valid_directives_ = 0;
817 | int unknown_directives_ = 0;
818 | std::string sitemap_;
819 | };
820 |
821 | // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
822 | TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) {
823 | RobotsStatsReporter report;
824 | static const char kUnixFile[] =
825 | "User-Agent: foo\n"
826 | "Allow: /some/path\n"
827 | "User-Agent: bar\n"
828 | "\n"
829 | "\n"
830 | "Disallow: /\n";
831 | googlebot::ParseRobotsTxt(kUnixFile, &report);
832 | EXPECT_EQ(4, report.valid_directives());
833 | EXPECT_EQ(6, report.last_line_seen());
834 |
835 | static const char kDosFile[] =
836 | "User-Agent: foo\r\n"
837 | "Allow: /some/path\r\n"
838 | "User-Agent: bar\r\n"
839 | "\r\n"
840 | "\r\n"
841 | "Disallow: /\r\n";
842 | googlebot::ParseRobotsTxt(kDosFile, &report);
843 | EXPECT_EQ(4, report.valid_directives());
844 | EXPECT_EQ(6, report.last_line_seen());
845 |
846 | static const char kMacFile[] =
847 | "User-Agent: foo\r"
848 | "Allow: /some/path\r"
849 | "User-Agent: bar\r"
850 | "\r"
851 | "\r"
852 | "Disallow: /\r";
853 | googlebot::ParseRobotsTxt(kMacFile, &report);
854 | EXPECT_EQ(4, report.valid_directives());
855 | EXPECT_EQ(6, report.last_line_seen());
856 |
857 | static const char kNoFinalNewline[] =
858 | "User-Agent: foo\n"
859 | "Allow: /some/path\n"
860 | "User-Agent: bar\n"
861 | "\n"
862 | "\n"
863 | "Disallow: /";
864 | googlebot::ParseRobotsTxt(kNoFinalNewline, &report);
865 | EXPECT_EQ(4, report.valid_directives());
866 | EXPECT_EQ(6, report.last_line_seen());
867 |
868 | static const char kMixedFile[] =
869 | "User-Agent: foo\n"
870 | "Allow: /some/path\r\n"
871 | "User-Agent: bar\n"
872 | "\r\n"
873 | "\n"
874 | "Disallow: /";
875 | googlebot::ParseRobotsTxt(kMixedFile, &report);
876 | EXPECT_EQ(4, report.valid_directives());
877 | EXPECT_EQ(6, report.last_line_seen());
878 | }
879 |
880 | // A UTF-8 byte order mark (BOM) at the start of the file is unparseable and
881 | // thus skipped; the rules that follow it are still applied.
882 | TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) {
883 | RobotsStatsReporter report;
884 | static const char kUtf8FileFullBOM[] =
885 | "\xEF\xBB\xBF"
886 | "User-Agent: foo\n"
887 | "Allow: /AnyValue\n";
888 | googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report);
889 | EXPECT_EQ(2, report.valid_directives());
890 | EXPECT_EQ(0, report.unknown_directives());
891 |
892 |   // We also allow partial byte order marks.
893 | static const char kUtf8FilePartial2BOM[] =
894 | "\xEF\xBB"
895 | "User-Agent: foo\n"
896 | "Allow: /AnyValue\n";
897 | googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report);
898 | EXPECT_EQ(2, report.valid_directives());
899 | EXPECT_EQ(0, report.unknown_directives());
900 |
901 | static const char kUtf8FilePartial1BOM[] =
902 | "\xEF"
903 | "User-Agent: foo\n"
904 | "Allow: /AnyValue\n";
905 | googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report);
906 | EXPECT_EQ(2, report.valid_directives());
907 | EXPECT_EQ(0, report.unknown_directives());
908 |
909 | // If the BOM is not the right sequence, the first line looks like garbage
910 | // that is skipped (we essentially see "\x11\xBFUser-Agent").
911 | static const char kUtf8FileBrokenBOM[] =
912 | "\xEF\x11\xBF"
913 | "User-Agent: foo\n"
914 | "Allow: /AnyValue\n";
915 | googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report);
916 | EXPECT_EQ(1, report.valid_directives());
917 | EXPECT_EQ(1, report.unknown_directives()); // We get one broken line.
918 |
919 |   // Another malformed file: a BOM is only valid at the beginning of the file.
920 | static const char kUtf8BOMSomewhereInMiddleOfFile[] =
921 | "User-Agent: foo\n"
922 | "\xEF\xBB\xBF"
923 | "Allow: /AnyValue\n";
924 | googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report);
925 | EXPECT_EQ(1, report.valid_directives());
926 | EXPECT_EQ(1, report.unknown_directives());
927 | }
928 |
929 | // Google-specific: the RFC allows additional records that crawlers might need,
930 | // such as Sitemap, which Google supports.
931 | // See REP RFC section "Other records".
932 | // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.4
933 | TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) {
934 | RobotsStatsReporter report;
935 | {
936 | std::string sitemap_loc = "http://foo.bar/sitemap.xml";
937 | std::string robotstxt =
938 | "User-Agent: foo\n"
939 | "Allow: /some/path\n"
940 | "User-Agent: bar\n"
941 | "\n"
942 | "\n";
943 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n");
944 |
945 | googlebot::ParseRobotsTxt(robotstxt, &report);
946 | EXPECT_EQ(sitemap_loc, report.sitemap());
947 | }
948 | // A sitemap line may appear anywhere in the file.
949 | {
950 | std::string robotstxt;
951 | std::string sitemap_loc = "http://foo.bar/sitemap.xml";
952 | std::string robotstxt_temp =
953 | "User-Agent: foo\n"
954 | "Allow: /some/path\n"
955 | "User-Agent: bar\n"
956 | "\n"
957 | "\n";
958 | absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp);
959 |
960 | googlebot::ParseRobotsTxt(robotstxt, &report);
961 | EXPECT_EQ(sitemap_loc, report.sitemap());
962 | }
963 | }
964 |
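// Editor's sketch (not part of the upstream test file): RobotsStatsReporter
// above concatenates Sitemap values into a single string, which is enough for
// these tests, but a robots.txt file may contain several Sitemap lines. A
// caller that wants all of them could collect them instead. The class name
// SitemapCollector is hypothetical; it overrides the same set of callbacks as
// RobotsStatsReporter and additionally needs <vector>.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override { sitemaps_.clear(); }
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleAllow(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleDisallow(int /*line_num*/, absl::string_view /*value*/) override {}
  void HandleSitemap(int /*line_num*/, absl::string_view value) override {
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int /*line_num*/, absl::string_view /*action*/,
                           absl::string_view /*value*/) override {}

  // Every Sitemap URL seen, in file order.
  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};
// Usage would mirror the tests above:
//   SitemapCollector collector;
//   googlebot::ParseRobotsTxt(robotstxt, &collector);
//   // collector.sitemaps() now holds every Sitemap URL in the file.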
965 | } // namespace
966 |
967 | // Integrity tests. These functions are available to the linker, but not in the
968 | // header, because they should only be used for testing.
969 | namespace googlebot {
970 | std::string GetPathParamsQuery(const std::string& url);
971 | bool MaybeEscapePattern(const char* src, char** dst);
972 | } // namespace googlebot
973 |
974 | void TestPath(const std::string& url, const std::string& expected_path) {
975 | EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url));
976 | }
977 |
978 | void TestEscape(const std::string& url, const std::string& expected) {
979 | char* escaped_value = nullptr;
980 | const bool is_escaped =
981 | googlebot::MaybeEscapePattern(url.c_str(), &escaped_value);
982 | const std::string escaped = escaped_value;
983 | if (is_escaped) delete[] escaped_value;
984 |
985 | EXPECT_EQ(expected, escaped);
986 | }
987 |
988 | TEST(RobotsUnittest, TestGetPathParamsQuery) {
989 | // Only testing URLs that are already correctly escaped here.
990 | TestPath("", "/");
991 | TestPath("http://www.example.com", "/");
992 | TestPath("http://www.example.com/", "/");
993 | TestPath("http://www.example.com/a", "/a");
994 | TestPath("http://www.example.com/a/", "/a/");
995 | TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
996 | TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
997 | TestPath("example.com", "/");
998 | TestPath("example.com/", "/");
999 | TestPath("example.com/a", "/a");
1000 | TestPath("example.com/a/", "/a/");
1001 | TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
1002 | TestPath("a", "/");
1003 | TestPath("a/", "/");
1004 | TestPath("/a", "/a");
1005 | TestPath("a/b", "/b");
1006 | TestPath("example.com?a", "/?a");
1007 | TestPath("example.com/a;b#c", "/a;b");
1008 | TestPath("//a/b/c", "/b/c");
1009 | }
1010 |
1011 | TEST(RobotsUnittest, TestMaybeEscapePattern) {
1012 | TestEscape("http://www.example.com", "http://www.example.com");
1013 | TestEscape("/a/b/c", "/a/b/c");
1014 | TestEscape("á", "%C3%A1");
1015 | TestEscape("%aa", "%AA");
1016 | }
1017 |
--------------------------------------------------------------------------------