├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── code-of-conduct.md └── contributing.md ├── pom.xml └── src ├── main └── java │ └── com │ └── google │ └── search │ └── robotstxt │ ├── Matcher.java │ ├── MatchingStrategy.java │ ├── ParseException.java │ ├── ParseHandler.java │ ├── Parser.java │ ├── RobotsContents.java │ ├── RobotsLongestMatchStrategy.java │ ├── RobotsMatcher.java │ ├── RobotsParseHandler.java │ ├── RobotsParser.java │ └── RobotsParserApp.java └── test └── java └── com └── google └── search └── robotstxt ├── RobotsMatcherTest.java └── RobotsParserTest.java /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | > It's a good idea to open an issue first for discussion. 4 | 5 | - [ ] Tests pass 6 | - [ ] Appropriate changes to README are included in PR -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | target/ 26 | pom.xml.tag 27 | pom.xml.releaseBackup 28 | pom.xml.versionsBackup 29 | pom.xml.next 30 | release.properties 31 | dependency-reduced-pom.xml 32 | buildNumber.properties 33 | .mvn/timing.properties 34 | # https://github.com/takari/maven-wrapper#usage-without-binary-jar 35 | .mvn/wrapper/maven-wrapper.jar 36 | 37 | .metadata 38 | bin/ 39 | tmp/ 40 | *.tmp 41 | *.bak 42 | *.swp 43 | *~.nib 44 | local.properties 45 | .settings/ 46 | .loadpath 47 | .recommenders 48 | 49 | # Eclipse Core 50 | .project 51 | 52 | # External tool builders 53 | .externalToolBuilders/ 54 | 55 | # Locally stored "Eclipse launch configurations" 56 | *.launch 57 | 58 | # PyDev specific (Python IDE for Eclipse) 59 | *.pydevproject 60 | 61 | # CDT-specific (C/C++ Development Tooling) 62 | .cproject 63 | 64 | # CDT- autotools 65 | .autotools 66 | 67 | # Java annotation processor (APT) 68 | .factorypath 69 | 70 | # JDT-specific (Eclipse Java Development Tools) 71 | .classpath 72 | 73 | # PDT-specific (PHP Development Tools) 74 | .buildpath 75 | 76 | # sbteclipse plugin 77 | .target 78 | 79 | # Tern plugin 80 | .tern-project 81 | 82 | # TeXlipse plugin 83 | .texlipse 84 | 85 | # STS (Spring Tool Suite) 86 | .springBeans 87 | 88 | # Code Recommenders 89 | .recommenders/ 90 | 91 | # Annotation Processing 92 | .apt_generated/ 93 | .apt_generated_test/ 94 | 95 | # Scala IDE specific (Scala & Java development for Eclipse) 96 | .cache-main 97 | .scala_dependencies 98 | .worksheet 99 | 100 | # User-specific stuff 101 | .idea/**/workspace.xml 102 | .idea/**/tasks.xml 103 | .idea/**/usage.statistics.xml 104 | .idea/**/dictionaries 105 | .idea/**/shelf 106 | 107 | # Generated files 108 
| .idea/**/contentModel.xml 109 | 110 | # Sensitive or high-churn files 111 | .idea/**/dataSources/ 112 | .idea/**/dataSources.ids 113 | .idea/**/dataSources.local.xml 114 | .idea/**/sqlDataSources.xml 115 | .idea/**/dynamic.xml 116 | .idea/**/uiDesigner.xml 117 | .idea/**/dbnavigator.xml 118 | 119 | # Gradle 120 | .idea/**/gradle.xml 121 | .idea/**/libraries 122 | 123 | # Gradle and Maven with auto-import 124 | # When using Gradle or Maven with auto-import, you should exclude module files, 125 | # since they will be recreated, and may cause churn. 126 | .idea/artifacts 127 | .idea/compiler.xml 128 | .idea/jarRepositories.xml 129 | .idea/modules.xml 130 | .idea/*.iml 131 | .idea/modules 132 | *.iml 133 | *.ipr 134 | 135 | # CMake 136 | cmake-build-*/ 137 | 138 | # Mongo Explorer plugin 139 | .idea/**/mongoSettings.xml 140 | 141 | # File-based project format 142 | *.iws 143 | 144 | # IntelliJ 145 | out/ 146 | .idea/encodings.xml 147 | .idea/misc.xml 148 | .idea/vcs.xml 149 | .idea/codeStyles/codeStyleConfig.xml 150 | 151 | # mpeltonen/sbt-idea plugin 152 | .idea_modules/ 153 | 154 | # JIRA plugin 155 | atlassian-ide-plugin.xml 156 | 157 | # Cursive Clojure plugin 158 | .idea/replstate.xml 159 | 160 | # Crashlytics plugin (for Android Studio and IntelliJ) 161 | com_crashlytics_export_strings.xml 162 | crashlytics.properties 163 | crashlytics-build.properties 164 | fabric.properties 165 | 166 | # Editor-based Rest Client 167 | .idea/httpRequests 168 | 169 | # Android studio 3.1+ serialized cache file 170 | .idea/caches/build_file_checksums.ser 171 | 172 | # Visual Studio Code 173 | .vscode/ 174 | *.code-workspace 175 | .history/ 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Robots.txt Parser and Matcher Library in Java 2 | 3 | This project aims to implement the robots.txt parser and matcher in Java. It is 4 | based on the [C++ implementation](https://github.com/google/robotstxt). 
5 | 6 | ## About the library 7 | 8 | The Robots Exclusion Protocol (REP) is a standard that enables website owners 9 | to control which URLs may be accessed by automated clients (i.e. crawlers) 10 | through a simple text file with a specific syntax. It's one of the basic 11 | building blocks of the internet as we know it and what allows search engines 12 | to operate. 13 | 14 | Because the REP was only a de-facto standard for the past 25 years, different 15 | implementers parse robots.txt slightly differently, leading to 16 | confusion. This project aims to fix that by releasing the parser that Google 17 | uses. 18 | 19 | The library is a Java port of the 20 | [C++ parser and matcher](https://github.com/google/robotstxt), which is 21 | slightly modified production code used by Googlebot, Google's crawler. The 22 | library is released open-source to help developers build tools that better 23 | reflect Google's robots.txt parsing and matching. 24 | 25 | For webmasters, we included a runnable class `RobotsParserApp`, which is a small 26 | application that allows testing a single URL and several user-agents against a 27 | robots.txt file. 28 | 29 | ## Development 30 | 31 | ### Prerequisites 32 | 33 | You need Maven to build this project. 34 | [Download](https://maven.apache.org/download.html) and 35 | [install](https://maven.apache.org/install.html) it from the official website. 36 | 37 | You can also install it like this if your Linux distribution supports it: 38 | 39 | ``` 40 | $ sudo apt-get install maven 41 | ``` 42 | 43 | ### Build it 44 | 45 | #### Using Maven 46 | 47 | Standard Maven commands work here. 48 | 49 | ``` 50 | $ mvn install 51 | ``` 52 | 53 | Or if you want a build from scratch: 54 | 55 | ``` 56 | $ mvn clean install 57 | ``` 58 | 59 | #### Using Maven Assembly Plugin 60 | 61 | Alternatively, you can compile the entire project into a single JAR using the 62 | following command: 63 | 64 | ``` 65 | $ mvn clean compile assembly:single 66 | ``` 67 | 68 | You can find the result in the `target` directory. 69 | 70 | ### Run it 71 | 72 | #### Using Maven 73 | 74 | The following commands will run an application that parses a given robots.txt file 75 | and prints a matching verdict: `ALLOWED` or `DISALLOWED` (exit codes are `0` 76 | and `1` respectively). 77 | 78 | You should provide a target URL using the `-u` (`--url`) flag. At least one agent 79 | must be specified using the `-a` (`--agent`) flag (the verdict `DISALLOWED` is printed 80 | iff none of the user-agents are allowed to crawl the given URL).
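You can also call the matcher from Java code instead of going through the CLI. Below is a minimal, illustrative sketch (the class name `Example` and the inline robots.txt content are ours; the class is placed in the `com.google.search.robotstxt` package because `Parser#parse` is package-visible):

```
package com.google.search.robotstxt;

import java.nio.charset.StandardCharsets;

public class Example {
  public static void main(String[] args) {
    final String robotsTxt = "User-agent: *\nDisallow: /private/\n";
    // Wire the default parse handler into the parser and build a matcher.
    final Parser parser = new RobotsParser(new RobotsParseHandler());
    final Matcher matcher = parser.parse(robotsTxt.getBytes(StandardCharsets.UTF_8));
    // Expected to print "true": no rule matches /bar.
    System.out.println(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.com/bar"));
    // Expected to print "false": the global group disallows /private/.
    System.out.println(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.com/private/page"));
  }
}
```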
81 | 82 | When the `-f` (`--file`) flag is omitted, robots.txt contents are expected to be 83 | received via standard input: 84 | 85 | ``` 86 | $ mvn exec:java -Dexec.mainClass=com.google.search.robotstxt.RobotsParserApp -Dexec.args="--agent FooBot --url http://foo.com/bar" 87 | ``` 88 | 89 | If you want the application to read an existing robots.txt file, use the `-f` 90 | (`--file`) flag: 91 | 92 | ``` 93 | $ mvn exec:java -Dexec.mainClass=com.google.search.robotstxt.RobotsParserApp -Dexec.args="--agent FooBot --url http://foo.com/bar --file path/to/robots.txt" 94 | ``` 95 | 96 | #### From JAR 97 | 98 | If you have built the project into a JAR, you can run it from there (reading 99 | robots.txt from standard input): 100 | 101 | ``` 102 | $ java -jar target/robotstxt-java-1.0-SNAPSHOT-jar-with-dependencies.jar --agent FooBot --url http://foo.com/bar 103 | ``` 104 | 105 | Or (reading from a file): 106 | 107 | ``` 108 | $ java -jar target/robotstxt-java-1.0-SNAPSHOT-jar-with-dependencies.jar --agent FooBot --url http://foo.com/bar --file path/to/robots.txt 109 | ``` 110 | 111 | ## Notes 112 | 113 | Parsing of robots.txt files themselves is done exactly as in the production 114 | version of Googlebot, including how percent codes and Unicode characters in 115 | patterns are handled. The user must ensure, however, that the URI passed to the 116 | `Matcher` methods, or to the `--url` parameter of the application, follows the 117 | format specified by RFC 3986, since this library will not perform full 118 | normalization of those URI parameters. Only if the URI is in this format will 119 | the matching be done according to the REP specification. 120 | 121 | ## License 122 | 123 | The robots.txt parser and matcher Java library is licensed under the terms of 124 | the Apache license. See LICENSE for more information. 125 | 126 | ## Source Code Headers 127 | 128 | Every file containing source code must include copyright and license 129 | information. This includes any JS/CSS files that you might be serving out to 130 | browsers. (This is to help well-intentioned people avoid accidental copying 131 | that doesn't comply with the license.) 132 | 133 | Apache header: 134 | 135 | Copyright 2020 Google LLC 136 | 137 | Licensed under the Apache License, Version 2.0 (the "License"); 138 | you may not use this file except in compliance with the License. 139 | You may obtain a copy of the License at 140 | 141 | https://www.apache.org/licenses/LICENSE-2.0 142 | 143 | Unless required by applicable law or agreed to in writing, software 144 | distributed under the License is distributed on an "AS IS" BASIS, 145 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 146 | See the License for the specific language governing permissions and 147 | limitations under the License. 148 | 149 | This can be done easily by using the 150 | [addlicense](https://github.com/google/addlicense) tool. 151 | 152 | Install it: 153 | 154 | ``` 155 | $ go get -u github.com/google/addlicense 156 | ``` 157 | 158 | Use it like this to make sure all files have the license: 159 | 160 | ``` 161 | $ ~/go/bin/addlicense -c "Google LLC" -l apache .
162 | ``` 163 | -------------------------------------------------------------------------------- /docs/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Google Open Source Community Guidelines 2 | 3 | At Google, we recognize and celebrate the creativity and collaboration of open 4 | source contributors and the diversity of skills, experiences, cultures, and 5 | opinions they bring to the projects and communities they participate in. 6 | 7 | Every one of Google's open source projects and communities is an inclusive 8 | environment, based on treating all individuals respectfully, regardless of 9 | gender identity and expression, sexual orientation, disabilities, 10 | neurodiversity, physical appearance, body size, ethnicity, nationality, race, 11 | age, religion, or similar personal characteristic. 12 | 13 | We value diverse opinions, but we value respectful behavior more. 14 | 15 | Respectful behavior includes: 16 | 17 | * Being considerate, kind, constructive, and helpful. 18 | * Not engaging in demeaning, discriminatory, harassing, hateful, sexualized, or 19 | physically threatening behavior, speech, and imagery. 20 | * Not engaging in unwanted physical contact. 21 | 22 | Some Google open source projects [may adopt][] an explicit project code of 23 | conduct, which may have additional detailed expectations for participants. Most 24 | of those projects will use our [modified Contributor Covenant][]. 25 | 26 | [may adopt]: https://opensource.google/docs/releasing/preparing/#conduct 27 | [modified Contributor Covenant]: https://opensource.google/docs/releasing/template/CODE_OF_CONDUCT/ 28 | 29 | ## Resolve peacefully 30 | 31 | We do not believe that all conflict is necessarily bad; healthy debate and 32 | disagreement often yield positive results. However, it is never okay to be 33 | disrespectful. 34 | 35 | If you see someone behaving disrespectfully, you are encouraged to address the 36 | behavior directly with those involved. Many issues can be resolved quickly and 37 | easily, and this gives people more control over the outcome of their dispute. 38 | If you are unable to resolve the matter for any reason, or if the behavior is 39 | threatening or harassing, report it. We are dedicated to providing an 40 | environment where participants feel welcome and safe. 41 | 42 | ## Reporting problems 43 | 44 | Some Google open source projects may adopt a project-specific code of conduct. 45 | In those cases, a Google employee will be identified as the Project Steward, 46 | who will receive and handle reports of code of conduct violations. In the event 47 | that a project hasn’t identified a Project Steward, you can report problems by 48 | emailing opensource@google.com. 49 | 50 | We will investigate every complaint, but you may not receive a direct response. 51 | We will use our discretion in determining when and how to follow up on reported 52 | incidents, which may range from not taking action to permanent expulsion from 53 | the project and project-sponsored spaces. We will notify the accused of the 54 | report and provide them an opportunity to discuss it before any action is 55 | taken. The identity of the reporter will be omitted from the details of the 56 | report supplied to the accused. In potentially harmful situations, such as 57 | ongoing harassment or threats to anyone's safety, we may take action without 58 | notice.
59 | 60 | *This document was adapted from the [IndieWeb Code of Conduct][] and can also 61 | be found at .* 62 | 63 | [IndieWeb Code of Conduct]: https://indieweb.org/code-of-conduct 64 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 29 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 21 | 4.0.0 22 | 23 | com.google.search.robotstxt 24 | robotstxt-java 25 | 1.0-SNAPSHOT 26 | 27 | robotstxt-java 28 | https://github.com/google/robotstxt-java 29 | 30 | 31 | UTF-8 32 | 11 33 | 11 34 | 0.7.4 35 | 36 | 37 | 38 | 39 | com.google.protobuf 40 | protobuf-java 41 | 3.19.2 42 | 43 | 44 | 45 | junit 46 | junit 47 | 4.13.2 48 | test 49 | 50 | 51 | 52 | com.google.flogger 53 | flogger-system-backend 54 | ${flogger.version} 55 | 56 | 57 | 58 | com.google.flogger 59 | flogger 60 | ${flogger.version} 61 | 62 | 63 | 64 | com.google.truth 65 | truth 66 | 1.1.3 67 | test 68 | 69 | 70 | 71 | info.picocli 72 | picocli 73 | 4.6.2 74 | 75 | 76 | 77 | com.google.guava 78 | guava 79 | 31.0.1-jre 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | maven-clean-plugin 91 | 3.1.0 92 | 93 | 94 | 95 | maven-resources-plugin 96 | 3.0.2 97 | 98 | 99 | maven-compiler-plugin 100 | 3.8.0 101 | 102 | 103 | maven-surefire-plugin 104 | 2.22.1 105 | 106 | 107 | maven-jar-plugin 108 | 3.0.2 109 | 110 | 111 | maven-install-plugin 112 | 2.5.2 113 | 114 | 115 | maven-deploy-plugin 116 | 2.8.2 117 | 118 | 119 | 120 | maven-site-plugin 121 | 3.7.1 122 | 123 | 124 | maven-project-info-reports-plugin 125 | 3.0.0 126 | 127 | 128 | com.coveo 129 | fmt-maven-plugin 130 | 2.10 131 | 132 | 133 | org.xolstice.maven.plugins 134 | protobuf-maven-plugin 135 | 0.6.1 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | com.coveo 144 | fmt-maven-plugin 145 | 146 | 147 | 148 | format 149 | 150 | 151 | 152 | 153 | 154 | 155 | org.apache.maven.plugins 156 | maven-compiler-plugin 157 | 158 | 159 | -XDcompilePolicy=simple 160 | -Xplugin:ErrorProne 161 | 162 | 163 | 164 | com.google.errorprone 165 | error_prone_core 166 | 2.4.0 167 | 168 | 169 | 170 | 171 | 172 | 173 | org.xolstice.maven.plugins 174 | 
protobuf-maven-plugin 175 | 181 | 184 | 185 | 186 | 187 | compile 188 | test-compile 189 | 190 | 191 | 192 | 193 | 194 | 195 | maven-assembly-plugin 196 | 197 | 198 | 199 | com.google.search.robotstxt.RobotsParserApp 200 | 201 | 202 | 203 | jar-with-dependencies 204 | 205 | 206 | 207 | 208 | 209 | 210 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/Matcher.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import java.util.List; 18 | 19 | /** Interface of a matcher class. */ 20 | public interface Matcher { 21 | /** 22 | * Check whether at least one of the given user agents is allowed to visit the given URL based on 23 | * the robots.txt which this matcher represents. 24 | * 25 | * @param userAgents interested user agents 26 | * @param url target URL 27 | * @return {@code true} iff verdict is ALLOWED 28 | */ 29 | boolean allowedByRobots(final List<String> userAgents, final String url); 30 | 31 | /** 32 | * Check whether the given user agent is allowed to visit the given URL based on the robots.txt 33 | * which this matcher represents. 34 | * 35 | * @param userAgent interested user agent 36 | * @param url target URL 37 | * @return {@code true} iff verdict is ALLOWED 38 | */ 39 | boolean singleAgentAllowedByRobots(final String userAgent, final String url); 40 | 41 | /** 42 | * Check whether at least one of the given user agents is allowed to visit the given URL based on 43 | * the robots.txt which this matcher represents. All global rule groups are ignored. 44 | * 45 | * @param userAgents interested user agents 46 | * @param url target URL 47 | * @return {@code true} iff verdict is ALLOWED 48 | */ 49 | boolean ignoreGlobalAllowedByRobots(final List<String> userAgents, final String url); 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/MatchingStrategy.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Provides methods to calculate match priority for given directives against a given path.
It is 19 | * required to compute the match verdict in {@link RobotsMatcher}. 20 | */ 21 | public interface MatchingStrategy { 22 | /** 23 | * Calculates the priority of an ALLOW verdict based on the given directive. 24 | * 25 | * @param path path to calculate ALLOW match priority against 26 | * @param pattern ALLOW directive value 27 | * @return match priority (higher value means higher chance of ALLOW verdict) 28 | */ 29 | int matchAllowPriority(final String path, final String pattern); 30 | 31 | /** 32 | * Calculates the priority of a DISALLOW verdict based on the given directive. 33 | * 34 | * @param path path to calculate DISALLOW match priority against 35 | * @param pattern DISALLOW directive value 36 | * @return match priority (higher value means higher chance of DISALLOW verdict) 37 | */ 38 | int matchDisallowPriority(final String path, final String pattern); 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/ParseException.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** Used in the parsing process. */ 18 | public class ParseException extends Exception { 19 | public ParseException() { 20 | super(); 21 | } 22 | 23 | public ParseException(String message, Throwable cause) { 24 | super(message, cause); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/ParseHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * This interface provides parsing logic for the {@link Parser} class. Its implementation is expected to 19 | * accumulate parsed robots.txt lines and be able to compute a {@link RobotsMatcher} instance as soon 20 | * as all robots.txt lines have been received. 21 | */ 22 | public interface ParseHandler { 23 | /** 24 | * Handler for the beginning of the parsing process. This method will be called a single time, before any 25 | * other method of this class. 26 | */ 27 | void handleStart(); 28 | 29 | /** 30 | * Directive receiver. Each directive consists of a type and a value.
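For example, the robots.txt line {@code Disallow: /private} is delivered as directive type {@code DISALLOW} with the trimmed value {@code "/private"}.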
This method will be called after 31 | * {@link this#handleStart()} and will not be called after {@link this#handleEnd()}. May be called 32 | * multiple times. 33 | * 34 | * @param directiveType type of received directive 35 | * @param directiveValue value of received directive 36 | */ 37 | void handleDirective(final Parser.DirectiveType directiveType, final String directiveValue); 38 | 39 | /** 40 | * Handler for the end of the parsing process. This method will be called a single time, after {@link 41 | * this#handleStart()} or {@link this#handleDirective(Parser.DirectiveType, String)}. 42 | */ 43 | void handleEnd(); 44 | 45 | /** 46 | * Calling this method produces a matcher based on all information received earlier via the {@link 47 | * this#handleDirective(Parser.DirectiveType, String)} method. Thus, it returns a serialized view of 48 | * the robots.txt file with matching functionality. This method will be called after {@link 49 | * this#handleEnd()}. May be called multiple times. 50 | * 51 | * @return matcher representing the original robots.txt file 52 | */ 53 | Matcher compute(); 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/Parser.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Abstract parser. All parser implementations must extend it. Extensions of this class are expected to 19 | * provide tokenizer logic, while parsing logic is delegated to a {@link ParseHandler} class. 20 | */ 21 | public abstract class Parser { 22 | enum DirectiveType { 23 | USER_AGENT, 24 | ALLOW, 25 | DISALLOW, 26 | SITEMAP, 27 | UNKNOWN 28 | } 29 | 30 | protected ParseHandler parseHandler; 31 | 32 | /** 33 | * A parser must follow the rules of a specific {@link ParseHandler} in order to parse. Thus, it requires an 34 | * instance of one upon creation. 35 | * 36 | * @param parseHandler handler to follow during the parsing process. 37 | */ 38 | protected Parser(ParseHandler parseHandler) { 39 | this.parseHandler = parseHandler; 40 | } 41 | 42 | /** 43 | * Method to parse a robots.txt file into a matcher. 44 | * 45 | * @param robotsTxtBodyBytes body of the robots.txt file to parse 46 | * @return matcher representing the given robots.txt file 47 | */ 48 | abstract Matcher parse(final byte[] robotsTxtBodyBytes); 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsContents.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.util.ArrayList; 19 | import java.util.HashSet; 20 | import java.util.List; 21 | import java.util.Objects; 22 | import java.util.Set; 23 | 24 | /** Representation of robots.txt contents: multiple groups of rules. */ 25 | public class RobotsContents { 26 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 27 | /** 28 | * Representation of robots.txt group of rules: multiple user-agents to which multiple rules are 29 | * applied. 30 | */ 31 | static class Group { 32 | /** Representation of robots.txt rule: pair of directive and value. */ 33 | static class Rule { 34 | private final Parser.DirectiveType directiveType; 35 | private final String directiveValue; 36 | 37 | Rule(final Parser.DirectiveType directiveType, final String directiveValue) { 38 | this.directiveType = directiveType; 39 | this.directiveValue = directiveValue; 40 | } 41 | 42 | public Parser.DirectiveType getDirectiveType() { 43 | return directiveType; 44 | } 45 | 46 | public String getDirectiveValue() { 47 | return directiveValue; 48 | } 49 | 50 | @Override 51 | public boolean equals(Object obj) { 52 | if (this == obj) return true; 53 | if (obj == null || getClass() != obj.getClass()) return false; 54 | Rule other = (Rule) obj; 55 | return Objects.equals(directiveType, other.directiveType) 56 | && Objects.equals(directiveValue, other.directiveValue); 57 | } 58 | 59 | @Override 60 | public int hashCode() { 61 | return Objects.hash(directiveType, directiveValue); 62 | } 63 | } 64 | 65 | private final Set<String> userAgents; 66 | private final Set<Rule> rules; 67 | private boolean global = false; 68 | 69 | Group() { 70 | userAgents = new HashSet<>(); 71 | rules = new HashSet<>(); 72 | } 73 | 74 | // Intended to be used from tests only. 75 | Group(final List<String> userAgents, final List<Rule> rules) { 76 | this(userAgents, rules, false); 77 | } 78 | 79 | // Intended to be used from tests only. 80 | Group(final List<String> userAgents, final List<Rule> rules, final boolean global) { 81 | this.userAgents = new HashSet<>(userAgents); 82 | this.rules = new HashSet<>(rules); 83 | this.global = global; 84 | } 85 | 86 | void addUserAgent(final String userAgent) { 87 | // Google-specific optimization: a '*' followed by space and more characters 88 | // in a user-agent record is still regarded as a global rule.
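// For example, "User-agent: * SomeBot" still sets the global flag for this group,
// while a product token such as "FooBot/1.2" is truncated below to "FooBot".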
89 | if (userAgent.length() >= 1 90 | && userAgent.charAt(0) == '*' 91 | && (userAgent.length() == 1 || Character.isWhitespace(userAgent.charAt(1)))) { 92 | 93 | if (userAgent.length() > 1 && Character.isWhitespace(userAgent.charAt(1))) { 94 | logger.atInfo().log("Assuming \"%s\" user-agent as \"*\"", userAgent); 95 | } 96 | 97 | global = true; 98 | } else { 99 | int end = 0; 100 | for (; end < userAgent.length(); end++) { 101 | final char ch = userAgent.charAt(end); 102 | if (!Character.isAlphabetic(ch) && ch != '-' && ch != '_') { 103 | break; 104 | } 105 | } 106 | userAgents.add(userAgent.substring(0, end)); 107 | } 108 | } 109 | 110 | void addRule(final Parser.DirectiveType directiveType, final String directiveValue) { 111 | rules.add(new Rule(directiveType, directiveValue)); 112 | } 113 | 114 | boolean hasRule(final Parser.DirectiveType directiveType, final String directiveValue) { 115 | return rules.contains(new Rule(directiveType, directiveValue)); 116 | } 117 | 118 | public Set<String> getUserAgents() { 119 | return userAgents; 120 | } 121 | 122 | public Set<Rule> getRules() { 123 | return rules; 124 | } 125 | 126 | public boolean isGlobal() { 127 | return global; 128 | } 129 | 130 | @Override 131 | public boolean equals(Object obj) { 132 | if (this == obj) return true; 133 | if (obj == null || getClass() != obj.getClass()) return false; 134 | Group other = (Group) obj; 135 | return Objects.equals(userAgents, other.userAgents) 136 | && Objects.equals(rules, other.rules) 137 | && Objects.equals(global, other.global); 138 | } 139 | 140 | @Override 141 | public int hashCode() { 142 | return Objects.hash(userAgents, rules); 143 | } 144 | } 145 | 146 | private final List<Group> groups; 147 | 148 | RobotsContents() { 149 | groups = new ArrayList<>(); 150 | } 151 | 152 | public RobotsContents(final List<Group> groups) { 153 | this.groups = groups; 154 | } 155 | 156 | void addGroup(Group group) { 157 | groups.add(group); 158 | } 159 | 160 | public List<Group> getGroups() { 161 | return groups; 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsLongestMatchStrategy.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Implementation of matching strategy used in robots.txt matching. Implements longest-match 19 | * strategy. 20 | */ 21 | public class RobotsLongestMatchStrategy implements MatchingStrategy { 22 | /** 23 | * Checks whether the given path may be matched to the given pattern. Treats '*' as a wildcard and 24 | * '$' as a termination symbol iff it is at the end of the pattern.
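* <p>For example, the path "/foo/bar" matches the patterns "/foo", "/*/bar" and "/foo/bar$",
* but not "/bar": matching is anchored at the beginning of the path.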
25 | * 26 | * @param path path to match 27 | * @param pattern pattern to match to 28 | * @return {@code true} iff given path matches given pattern 29 | */ 30 | private static boolean matches(final String path, final String pattern) { 31 | // "Prefixes" array stores "path" prefixes that match specific prefix of "pattern". 32 | // Prefixes of "pattern" are iterated over in ascending order in the loop below. 33 | // Each prefix is represented by its end index (exclusive); the array stores them in ascending 34 | // order. 35 | final int[] prefixes = new int[path.length() + 1]; 36 | prefixes[0] = 0; 37 | int prefixesCount = 1; 38 | 39 | for (int i = 0; i < pattern.length(); i++) { 40 | final char ch = pattern.charAt(i); 41 | 42 | // A '$' at the end of the pattern indicates its termination. 43 | if (ch == '$' && i + 1 == pattern.length()) { 44 | return prefixes[prefixesCount - 1] == path.length(); 45 | } 46 | 47 | // When a '*' occurs, all path prefixes starting from the shortest one may be matched. 48 | if (ch == '*') { 49 | prefixesCount = path.length() - prefixes[0] + 1; 50 | for (int j = 1; j < prefixesCount; j++) { 51 | prefixes[j] = prefixes[j - 1] + 1; 52 | } 53 | } else { 54 | // Iterate over each previous prefix and try to extend it by one character. 55 | int newPrefixesCount = 0; 56 | for (int j = 0; j < prefixesCount; j++) { 57 | if (prefixes[j] < path.length() && path.charAt(prefixes[j]) == ch) { 58 | prefixes[newPrefixesCount++] = prefixes[j] + 1; 59 | } 60 | } 61 | if (newPrefixesCount == 0) { 62 | return false; 63 | } 64 | prefixesCount = newPrefixesCount; 65 | } 66 | } 67 | 68 | return true; 69 | } 70 | 71 | @Override 72 | public int matchAllowPriority(String path, String pattern) { 73 | return matches(path, pattern) ? pattern.length() : -1; 74 | } 75 | 76 | @Override 77 | public int matchDisallowPriority(String path, String pattern) { 78 | return matches(path, pattern) ? pattern.length() : -1; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsMatcher.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.net.MalformedURLException; 19 | import java.net.URL; 20 | import java.util.Collections; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | /** 25 | * Class implementing matching logic based on directive priorities, the calculation of which is delegated 26 | * to a {@link MatchingStrategy} class.
27 | */ 28 | public class RobotsMatcher implements Matcher { 29 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 30 | 31 | /** Class containing current match priorities */ 32 | private static class Match { 33 | /** Priority based on agent-specific rules */ 34 | private int prioritySpecific = 0; 35 | /** Priority based on global wildcard (*) rules */ 36 | private int priorityGlobal = 0; 37 | 38 | void updateSpecific(final int priority) { 39 | prioritySpecific = Math.max(prioritySpecific, priority); 40 | } 41 | 42 | void updateGlobal(final int priority) { 43 | priorityGlobal = Math.max(priorityGlobal, priority); 44 | } 45 | 46 | public int getPrioritySpecific() { 47 | return prioritySpecific; 48 | } 49 | 50 | public int getPriorityGlobal() { 51 | return priorityGlobal; 52 | } 53 | 54 | public void resetGlobal() { 55 | priorityGlobal = 0; 56 | } 57 | } 58 | 59 | private final RobotsContents robotsContents; 60 | private final MatchingStrategy matchingStrategy = new RobotsLongestMatchStrategy(); 61 | 62 | public RobotsMatcher(final RobotsContents robotsContents) { 63 | this.robotsContents = robotsContents; 64 | } 65 | 66 | /** Used to extract contents for testing purposes. */ 67 | RobotsContents getRobotsContents() { 68 | return robotsContents; 69 | } 70 | 71 | private static String getPath(final String url) { 72 | final URL parsedUrl; 73 | try { 74 | parsedUrl = new URL(url); 75 | } catch (final MalformedURLException e) { 76 | logger.atWarning().log("Malformed URL: \"%s\", replaced with \"/\"", url); 77 | return "/"; 78 | } 79 | String path = parsedUrl.getPath(); 80 | final String args = parsedUrl.getQuery(); 81 | if (args != null) { 82 | path += "?" + args; 83 | } 84 | 85 | return path; 86 | } 87 | 88 | /** 89 | * Computes {@link Match} priorities for ALLOW and DISALLOW verdicts. Rules are considered 90 | * effective if at least one user agent is listed in "user-agent" directives, or if they apply 91 | * globally (if global rules are not ignored).
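* <p>For example, if robots.txt contains groups for both "FooBot" and "*", a query for user agent
* "FooBot" is answered by the "FooBot" group alone and the global group's priorities are reset.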
92 | * 93 | * @param userAgents list of interested user agents 94 | * @param path target path 95 | * @param ignoreGlobal global rules will not be considered if set to {@code true} 96 | * @return pair of {@link Match} representing ALLOW and DISALLOW priorities respectively 97 | */ 98 | private Map.Entry<Match, Match> computeMatchPriorities( 99 | final List<String> userAgents, final String path, final boolean ignoreGlobal) { 100 | final Match allow = new Match(); 101 | final Match disallow = new Match(); 102 | boolean foundSpecificGroup = false; 103 | 104 | for (RobotsContents.Group group : robotsContents.getGroups()) { 105 | final boolean isSpecificGroup = 106 | userAgents.stream() 107 | .anyMatch( 108 | userAgent -> 109 | group.getUserAgents().stream().anyMatch(userAgent::equalsIgnoreCase)); 110 | foundSpecificGroup |= isSpecificGroup; 111 | if (!isSpecificGroup && (ignoreGlobal || !group.isGlobal())) { 112 | continue; 113 | } 114 | 115 | for (RobotsContents.Group.Rule rule : group.getRules()) { 116 | switch (rule.getDirectiveType()) { 117 | case ALLOW: 118 | { 119 | final int priority = 120 | matchingStrategy.matchAllowPriority(path, rule.getDirectiveValue()); 121 | if (isSpecificGroup) { 122 | allow.updateSpecific(priority); 123 | } 124 | if (!ignoreGlobal && group.isGlobal()) { 125 | allow.updateGlobal(priority); 126 | } 127 | break; 128 | } 129 | case DISALLOW: 130 | { 131 | final int priority = 132 | matchingStrategy.matchDisallowPriority(path, rule.getDirectiveValue()); 133 | if (isSpecificGroup) { 134 | disallow.updateSpecific(priority); 135 | } 136 | if (!ignoreGlobal && group.isGlobal()) { 137 | disallow.updateGlobal(priority); 138 | } 139 | break; 140 | } 141 | case SITEMAP: 142 | case UNKNOWN: 143 | case USER_AGENT: 144 | break; 145 | } 146 | } 147 | } 148 | 149 | // If there is at least one group specific for current agents, global groups should be 150 | // disregarded. 151 | if (foundSpecificGroup) { 152 | allow.resetGlobal(); 153 | disallow.resetGlobal(); 154 | } 155 | 156 | return Map.entry(allow, disallow); 157 | } 158 | 159 | private Map.Entry<Match, Match> computeMatchPriorities( 160 | final List<String> userAgents, final String path) { 161 | return computeMatchPriorities(userAgents, path, false); 162 | } 163 | 164 | /** 165 | * Return {@code true} iff verdict must be ALLOW based on ALLOW and DISALLOW priorities.
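* <p>Agent-specific priorities take precedence over global ones, ties are resolved in favor of
* ALLOW (note the {@code >=} comparisons below), and if no rule matches at all the verdict
* defaults to ALLOW.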
166 | * 167 | * @param allow ALLOW priorities 168 | * @param disallow DISALLOW priorities 169 | * @return match verdict 170 | */ 171 | private static boolean allowVerdict(final Match allow, final Match disallow) { 172 | if (allow.getPrioritySpecific() > 0 || disallow.getPrioritySpecific() > 0) { 173 | return allow.getPrioritySpecific() >= disallow.getPrioritySpecific(); 174 | } 175 | 176 | if (allow.getPriorityGlobal() > 0 || disallow.getPriorityGlobal() > 0) { 177 | return allow.getPriorityGlobal() >= disallow.getPriorityGlobal(); 178 | } 179 | 180 | return true; 181 | } 182 | 183 | @Override 184 | public boolean allowedByRobots(final List<String> userAgents, final String url) { 185 | final String path = getPath(url); 186 | Map.Entry<Match, Match> matches = computeMatchPriorities(userAgents, path); 187 | return allowVerdict(matches.getKey(), matches.getValue()); 188 | } 189 | 190 | @Override 191 | public boolean singleAgentAllowedByRobots(final String userAgent, final String url) { 192 | return allowedByRobots(Collections.singletonList(userAgent), url); 193 | } 194 | 195 | @Override 196 | public boolean ignoreGlobalAllowedByRobots(final List<String> userAgents, final String url) { 197 | final String path = getPath(url); 198 | Map.Entry<Match, Match> matches = computeMatchPriorities(userAgents, path, true); 199 | return allowVerdict(matches.getKey(), matches.getValue()); 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsParseHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.nio.charset.StandardCharsets; 19 | 20 | /** Implementation of parsing strategy used in robots.txt parsing.
*/ 21 | public class RobotsParseHandler implements ParseHandler { 22 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 23 | 24 | protected RobotsContents robotsContents; 25 | private RobotsContents.Group currentGroup; 26 | private boolean foundContent; 27 | 28 | @Override 29 | public void handleStart() { 30 | robotsContents = new RobotsContents(); 31 | currentGroup = new RobotsContents.Group(); 32 | foundContent = false; 33 | } 34 | 35 | private void flushCompleteGroup(boolean createNew) { 36 | robotsContents.addGroup(currentGroup); 37 | if (createNew) { 38 | currentGroup = new RobotsContents.Group(); 39 | } 40 | } 41 | 42 | @Override 43 | public void handleEnd() { 44 | flushCompleteGroup(false); 45 | } 46 | 47 | private void handleUserAgent(final String value) { 48 | if (foundContent) { 49 | flushCompleteGroup(true); 50 | foundContent = false; 51 | } 52 | currentGroup.addUserAgent(value); 53 | } 54 | 55 | private static boolean isHexChar(final byte b) { 56 | return Character.isDigit(b) || ('a' <= b && b <= 'f') || ('A' <= b && b <= 'F'); 57 | } 58 | 59 | /** 60 | * Canonicalize paths: escape characters outside of US-ASCII charset (e.g. /SanJoséSellers ==> 61 | * /SanJos%C3%A9Sellers) and normalize escape-characters (e.g. %aa ==> %AA) 62 | * 63 | * @param path Path to canonicalize. 64 | * @return escaped and normalized path 65 | */ 66 | private static String maybeEscapePattern(final String path) { 67 | final byte[] bytes = path.getBytes(StandardCharsets.UTF_8); 68 | 69 | int unescapedCount = 0; 70 | boolean notCapitalized = false; 71 | 72 | // Check if any changes required 73 | for (int i = 0; i < bytes.length; i++) { 74 | if (i < bytes.length - 2 75 | && bytes[i] == '%' 76 | && isHexChar(bytes[i + 1]) 77 | && isHexChar(bytes[i + 2])) { 78 | if (Character.isLowerCase(bytes[i + 1]) || Character.isLowerCase(bytes[i + 2])) { 79 | notCapitalized = true; 80 | } 81 | i += 2; 82 | } else if ((bytes[i] & 0x80) != 0) { 83 | unescapedCount++; 84 | } 85 | } 86 | 87 | // Return if no changes needed 88 | if (unescapedCount == 0 && !notCapitalized) { 89 | return path; 90 | } 91 | 92 | final StringBuilder stringBuilder = new StringBuilder(); 93 | for (int i = 0; i < bytes.length; i++) { 94 | if (i < bytes.length - 2 95 | && bytes[i] == '%' 96 | && isHexChar(bytes[i + 1]) 97 | && isHexChar(bytes[i + 2])) { 98 | stringBuilder.append((char) bytes[i++]); 99 | stringBuilder.append((char) Character.toUpperCase(bytes[i++])); 100 | stringBuilder.append((char) Character.toUpperCase(bytes[i])); 101 | } else if ((bytes[i] & 0x80) != 0) { 102 | stringBuilder.append('%'); 103 | stringBuilder.append(Integer.toHexString((bytes[i] >> 4) & 0xf).toUpperCase()); 104 | stringBuilder.append(Integer.toHexString(bytes[i] & 0xf).toUpperCase()); 105 | } else { 106 | stringBuilder.append((char) bytes[i]); 107 | } 108 | } 109 | return stringBuilder.toString(); 110 | } 111 | 112 | @Override 113 | public void handleDirective( 114 | final Parser.DirectiveType directiveType, final String directiveValue) { 115 | switch (directiveType) { 116 | case USER_AGENT: 117 | { 118 | handleUserAgent(directiveValue); 119 | break; 120 | } 121 | case ALLOW: 122 | case DISALLOW: 123 | { 124 | foundContent = true; 125 | if (currentGroup.isGlobal() || currentGroup.getUserAgents().size() > 0) { 126 | final String path = maybeEscapePattern(directiveValue); 127 | currentGroup.addRule(directiveType, path); 128 | 129 | if (directiveType == Parser.DirectiveType.ALLOW) { 130 | // Google-specific optimization: 'index.htm' and
'index.html' are normalized to '/'. 131 | final int slashPos = path.lastIndexOf('/'); 132 | 133 | if (slashPos != -1) { 134 | final String fileName = path.substring(slashPos + 1); 135 | if ("index.htm".equals(fileName) || "index.html".equals(fileName)) { 136 | final String normalizedPath = path.substring(0, slashPos + 1) + '$'; 137 | 138 | if (!currentGroup.hasRule(Parser.DirectiveType.ALLOW, normalizedPath)) { 139 | logger.atInfo().log( 140 | "Allowing normalized path: \"%s\" -> \"%s\"", 141 | directiveValue, normalizedPath); 142 | currentGroup.addRule(Parser.DirectiveType.ALLOW, normalizedPath); 143 | } 144 | } 145 | } 146 | } 147 | } 148 | break; 149 | } 150 | case SITEMAP: 151 | case UNKNOWN: 152 | { 153 | foundContent = true; 154 | break; 155 | } 156 | } 157 | } 158 | 159 | @Override 160 | public Matcher compute() { 161 | return new RobotsMatcher(robotsContents); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsParser.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.nio.charset.StandardCharsets; 19 | import java.util.Arrays; 20 | import java.util.logging.Level; 21 | import java.util.stream.Stream; 22 | 23 | /** Robots.txt parser implementation. */ 24 | public class RobotsParser extends Parser { 25 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 26 | private final int valueMaxLengthBytes; 27 | 28 | public RobotsParser(final ParseHandler parseHandler) { 29 | super(parseHandler); 30 | this.valueMaxLengthBytes = 2083; 31 | } 32 | 33 | RobotsParser(final ParseHandler parseHandler, final int valueMaxLengthBytes) { 34 | super(parseHandler); 35 | this.valueMaxLengthBytes = valueMaxLengthBytes; 36 | } 37 | 38 | private static boolean isWhitespace(final char ch) { 39 | return ch == ' ' || ch == '\t'; 40 | } 41 | 42 | /** 43 | * Extracts substring between given indexes and trims preceding and succeeding whitespace 44 | * characters. 
45 | *
46 | * @param bytes data to extract from
47 | * @param beginIndex the beginning index, inclusive
48 | * @param endIndex the ending index, exclusive
49 | * @return extracted substring with trimmed whitespaces
50 | * @throws ParseException if there are only whitespace characters between given indexes
51 | */
52 | private static String trimBounded(final byte[] bytes, final int beginIndex, final int endIndex)
53 | throws ParseException {
54 | int begin = beginIndex;
55 | int end = endIndex;
56 | while (begin < endIndex && isWhitespace((char) bytes[begin])) {
57 | begin++;
58 | }
59 | while (end > beginIndex && isWhitespace((char) bytes[end - 1])) {
60 | end--;
61 | }
62 | if (begin >= end) {
63 | throw new ParseException();
64 | } else {
65 | return new String(Arrays.copyOfRange(bytes, begin, end), StandardCharsets.UTF_8);
66 | }
67 | }
68 |
69 | private static DirectiveType parseDirective(final String key) {
70 | if (key.equalsIgnoreCase("user-agent")) {
71 | return DirectiveType.USER_AGENT;
72 | } else {
73 | try {
74 | return DirectiveType.valueOf(key.toUpperCase());
75 | } catch (final IllegalArgumentException e) {
76 | final boolean disallowTypoDetected =
77 | Stream.of("dissallow", "dissalow", "disalow", "diasllow", "disallaw")
78 | .anyMatch(s -> key.compareToIgnoreCase(s) == 0);
79 | if (disallowTypoDetected) {
80 | logger.atInfo().log("Fixed typo: \"%s\" -> \"%s\"", key, "disallow");
81 | return DirectiveType.DISALLOW;
82 | }
83 |
84 | return DirectiveType.UNKNOWN;
85 | }
86 | }
87 | }
88 |
89 | private static void log(
90 | final Level level,
91 | final String message,
92 | final byte[] robotsTxtBodyBytes,
93 | final int lineBegin,
94 | final int lineEnd,
95 | final int lineNumber) {
96 | logger.at(level).log(
97 | "%s%nAt line %d:%n%s\t",
98 | message,
99 | lineNumber,
100 | new String(Arrays.copyOfRange(robotsTxtBodyBytes, lineBegin, lineEnd)));
101 | }
102 |
103 | /**
104 | * Extracts value from robots.txt body and trims it to {@link #valueMaxLengthBytes} bytes if
105 | * necessary. Most of the parameters are used for logging.
106 | *
107 | * @param robotsTxtBodyBytes contents of robots.txt file
108 | * @param separator index of separator between key and value
109 | * @param limit index of key and value ending
110 | * @param lineBegin index of line beginning
111 | * @param lineEnd index of line ending
112 | * @param lineNumber number of line in robots.txt file
113 | * @return parsed value within given line of robots.txt
114 | * @throws ParseException if line limits are invalid
115 | */
116 | private String getValue(
117 | final byte[] robotsTxtBodyBytes,
118 | final int separator,
119 | final int limit,
120 | final int lineBegin,
121 | final int lineEnd,
122 | final int lineNumber)
123 | throws ParseException {
124 | String value = trimBounded(robotsTxtBodyBytes, separator + 1, limit);
125 |
126 | // Google-specific optimization: since no search engine will process more than 2083 bytes
127 | // per URL, all values are trimmed to fit this size.
128 | final byte[] valueBytes = value.getBytes(StandardCharsets.UTF_8);
129 |
130 | // We decrease max size by two bytes. It is done to fit a replacement character (\uFFFD)
131 | // if the last character is trimmed to an invalid one.
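// (Added illustrative note, not from the original sources: truncating mid-sequence can leave a
// dangling UTF-8 lead byte; decoding maps it to U+FFFD, which re-encodes as three bytes, a net
// growth of up to two bytes, so the two-byte headroom below keeps the result within
// valueMaxLengthBytes.)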
132 | final int maxLengthBytes = valueMaxLengthBytes - 2;
133 |
134 | if (valueBytes.length > maxLengthBytes) {
135 | log(
136 | Level.INFO,
137 | "Value truncated to " + valueMaxLengthBytes + " bytes.",
138 | robotsTxtBodyBytes,
139 | lineBegin,
140 | lineEnd,
141 | lineNumber);
142 |
143 | value =
144 | new String(
145 | valueBytes, 0, Math.min(valueBytes.length, maxLengthBytes), StandardCharsets.UTF_8);
146 | }
147 |
148 | return value;
149 | }
150 |
151 | private void parseLine(
152 | final byte[] robotsTxtBodyBytes,
153 | final int lineBegin,
154 | final int lineEnd,
155 | final int lineNumber) {
156 | int limit = lineEnd;
157 | int separator = lineEnd;
158 | int whitespaceSeparator = lineEnd;
159 | boolean hasContents = false;
160 |
161 | for (int i = lineBegin; i < lineEnd; i++) {
162 | final byte b = robotsTxtBodyBytes[i];
163 | if (b == '#') {
164 | limit = i;
165 | break;
166 | }
167 | if (!isWhitespace((char) b)) {
168 | hasContents = true;
169 | }
170 | if (isWhitespace((char) b) && hasContents && whitespaceSeparator == lineEnd) {
171 | whitespaceSeparator = i;
172 | }
173 | if (separator == lineEnd && b == ':') {
174 | separator = i;
175 | }
176 | }
177 |
178 | if (separator == lineEnd) {
179 | // Google-specific optimization: some people forget the colon, so we need to
180 | // accept whitespace instead.
181 | if (whitespaceSeparator != lineEnd) {
182 | log(
183 | Level.INFO,
184 | "Assuming whitespace as a separator.",
185 | robotsTxtBodyBytes,
186 | lineBegin,
187 | lineEnd,
188 | lineNumber);
189 | separator = whitespaceSeparator;
190 | } else {
191 | if (hasContents) {
192 | log(
193 | Level.WARNING,
194 | "No separator found.",
195 | robotsTxtBodyBytes,
196 | lineBegin,
197 | lineEnd,
198 | lineNumber);
199 | }
200 | return;
201 | }
202 | }
203 |
204 | final String key;
205 | try {
206 | key = trimBounded(robotsTxtBodyBytes, lineBegin, separator);
207 | } catch (ParseException e) {
208 | log(Level.WARNING, "No key found.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
209 | return;
210 | }
211 |
212 | DirectiveType directiveType = parseDirective(key);
213 | if (directiveType == DirectiveType.UNKNOWN) {
214 | log(Level.WARNING, "Unknown key.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
215 | }
216 |
217 | String value;
218 | try {
219 | value = getValue(robotsTxtBodyBytes, separator, limit, lineBegin, lineEnd, lineNumber);
220 | } catch (final ParseException e) {
221 | log(Level.WARNING, "No value found.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
222 | value = "";
223 | directiveType = DirectiveType.UNKNOWN;
224 | }
225 | parseHandler.handleDirective(directiveType, value);
226 | }
227 |
228 | @Override
229 | Matcher parse(byte[] robotsTxtBodyBytes) {
230 | final byte[] bomUtf8 = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
231 | int bomPos = 0;
232 |
233 | int posBegin = 0;
234 | int posEnd = 0;
235 | int lineNumber = 0;
236 | boolean previousWasCarriageReturn = false;
237 |
238 | parseHandler.handleStart();
239 |
240 | // Iteration over characters is preferred over utilities that split text into lines to avoid
241 | // having to create additional Strings and to comply with the line breaking defined in the standard.
242 | for (int i = 0; i <= robotsTxtBodyBytes.length; i++) {
243 | final byte b = (i == robotsTxtBodyBytes.length) ? (byte) '\0' : robotsTxtBodyBytes[i];
244 |
245 | // Google-specific optimization: UTF-8 byte order marks should never
246 | // appear in a robots.txt file, but they do nevertheless. Skipping
247 | // possible BOM-prefix in the first bytes of the input.
248 | if (bomPos < bomUtf8.length && b == bomUtf8[bomPos++]) {
249 | posBegin++;
250 | posEnd++;
251 | continue;
252 | }
253 | bomPos = bomUtf8.length;
254 |
255 | if (b != '\n' && b != '\r' && b != '\0') {
256 | posEnd++;
257 | } else {
258 | if (posBegin != posEnd || !previousWasCarriageReturn || b != '\n') {
259 | parseLine(robotsTxtBodyBytes, posBegin, posEnd, ++lineNumber);
260 | }
261 | posBegin = posEnd = i + 1;
262 | previousWasCarriageReturn = b == '\r';
263 | }
264 | }
265 |
266 | parseHandler.handleEnd();
267 |
268 | return parseHandler.compute();
269 | }
270 | }
271 |
--------------------------------------------------------------------------------
/src/main/java/com/google/search/robotstxt/RobotsParserApp.java:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.search.robotstxt;
16 |
17 | import com.google.common.flogger.FluentLogger;
18 | import com.google.common.io.ByteStreams;
19 | import java.io.IOException;
20 | import java.io.UncheckedIOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.InvalidPathException;
23 | import java.nio.file.Path;
24 | import java.util.List;
25 | import java.util.Objects;
26 | import java.util.concurrent.Callable;
27 | import picocli.CommandLine;
28 |
29 | /**
30 | * Console application for parsing robots.txt and matching URLs against it.
31 | *
32 | * @see Parser
33 | * @see Matcher
34 | */
35 | @CommandLine.Command(
36 | name = "robotsParser",
37 | description =
38 | "Parses and matches given agents against given robots.txt to determine "
39 | + "whether any agent is allowed to visit given URL.",
40 | exitCodeOnExecutionException = 2,
41 | exitCodeOnInvalidInput = 3)
42 | public class RobotsParserApp implements Callable<Integer> {
43 | private static final FluentLogger logger = FluentLogger.forEnclosingClass();
44 |
45 | public RobotsParserApp() {}
46 |
47 | public static void main(final String[] args) {
48 | final int exitCode = new CommandLine(new RobotsParserApp()).execute(args);
49 | System.exit(exitCode);
50 | }
51 |
52 | /** robots.txt file path. */
53 | @CommandLine.Option(names = {"-f", "--file"})
54 | private String robotsTxtPath;
55 |
56 | /** User-agents of interest. */
57 | @CommandLine.Option(
58 | names = {"-a", "--agent"},
59 | required = true)
60 | private List<String> agents;
61 |
62 | /** Target URL to match.
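*
* <p>Illustrative invocation (the jar name is an assumption, not from the original sources):
* <pre>{@code
* $ java -jar robotstxt-java.jar -f robots.txt -a FooBot -u http://foo.bar/x/y
* ALLOWED
* }</pre>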
*/
63 | @CommandLine.Option(
64 | names = {"-u", "--url"},
65 | required = true)
66 | private String url;
67 |
68 | private byte[] readRobotsTxt() throws ParseException {
69 | try {
70 | if (Objects.isNull(robotsTxtPath)) {
71 | // Reading from stdin
72 | return ByteStreams.toByteArray(System.in);
73 | } else {
74 | // Reading from file
75 | return Files.readAllBytes(Path.of(robotsTxtPath));
76 | }
77 | } catch (final UncheckedIOException | IOException | InvalidPathException e) {
78 | throw new ParseException("Failed to read robots.txt file.", e);
79 | }
80 | }
81 |
82 | private static void logError(final Exception e) {
83 | System.out.println("ERROR: " + e.getMessage());
84 | logger.atInfo().withCause(e).log("Stack trace:");
85 | }
86 |
87 | /**
88 | * Parses the given robots.txt file and performs the matching process.
89 | *
90 | * @return {@code 0} if any of the user-agents is allowed to crawl the given URL, {@code 1} otherwise.
91 | */
92 | @Override
93 | public Integer call() {
94 | final byte[] robotsTxtContents;
95 | try {
96 | robotsTxtContents = readRobotsTxt();
97 | } catch (final ParseException e) {
98 | logError(e);
99 | return 2;
100 | }
101 |
102 | final Parser parser = new RobotsParser(new RobotsParseHandler());
103 | final RobotsMatcher matcher = (RobotsMatcher) parser.parse(robotsTxtContents);
104 |
105 | final boolean parseResult = matcher.allowedByRobots(agents, url);
106 |
107 |
108 | if (parseResult) {
109 | System.out.println("ALLOWED");
110 | return 0;
111 | } else {
112 | System.out.println("DISALLOWED");
113 | return 1;
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/test/java/com/google/search/robotstxt/RobotsMatcherTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.search.robotstxt;
16 |
17 | import static org.junit.Assert.assertFalse;
18 | import static org.junit.Assert.assertTrue;
19 |
20 | import java.nio.charset.StandardCharsets;
21 | import org.junit.Test;
22 |
23 | /** Unit tests validating matching behavior.
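*
* <p>Each test follows the same sketch: build a robots.txt body, parse it through the {@code
* parse} helper defined below, and assert per-agent verdicts, e.g.:
* <pre>{@code
* Matcher matcher = parse("user-agent: FooBot\ndisallow: /\n");
* assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/y"));
* }</pre>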
*/
24 | public class RobotsMatcherTest {
25 | private static Matcher parse(final String robotsTxtBody) {
26 | final Parser parser = new RobotsParser(new RobotsParseHandler());
27 | return parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8));
28 | }
29 |
30 | /** Verifies: parsing and matching a robots.txt containing a single group */
31 | @Test
32 | public void testSingleGroup() {
33 | final String robotsTxtBodyCorrect = "user-agent: FooBot\n" + "disallow: /\n";
34 | final String robotsTxtBodyIncorrect = "foo: FooBot\n" + "bar: /\n";
35 | final String robotsTxtMissingSeparator = "user-agent FooBot\n" + "disallow /\n";
36 |
37 | final String url = "http://foo.bar/x/y";
38 |
39 | final Matcher matcherCorrect = parse(robotsTxtBodyCorrect);
40 | assertFalse(matcherCorrect.singleAgentAllowedByRobots("FooBot", url));
41 |
42 | final Matcher matcherIncorrect = parse(robotsTxtBodyIncorrect);
43 | assertTrue(matcherIncorrect.singleAgentAllowedByRobots("FooBot", url));
44 |
45 | final Matcher matcherMissingSeparator = parse(robotsTxtMissingSeparator);
46 | assertFalse(matcherMissingSeparator.singleAgentAllowedByRobots("FooBot", url));
47 | }
48 |
49 | /**
50 | * Verifies: parsing and matching a robots.txt containing multiple groups and ignoring invalid
51 | * directives.
52 | */
53 | @Test
54 | public void testMultipleGroups() {
55 | final String robotsTxtBody =
56 | "allow: /foo/bar/\n"
57 | + "\n"
58 | + "user-agent: FooBot\n"
59 | + "disallow: /\n"
60 | + "allow: /x/\n"
61 | + "user-agent: BarBot\n"
62 | + "disallow: /\n"
63 | + "allow: /y/\n"
64 | + "\n"
65 | + "\n"
66 | + "allow: /w/\n"
67 | + "user-agent: BazBot\n"
68 | + "\n"
69 | + "user-agent: FooBot\n"
70 | + "allow: /z/\n"
71 | + "disallow: /\n";
72 |
73 | final String urlWa = "http://foo.bar/w/a";
74 | final String urlXb = "http://foo.bar/x/b";
75 | final String urlYc = "http://foo.bar/y/c";
76 | final String urlZd = "http://foo.bar/z/d";
77 | final String urlFooBar = "http://foo.bar/foo/bar/";
78 |
79 | final Matcher matcher = parse(robotsTxtBody);
80 |
81 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urlXb));
82 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urlZd));
83 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urlYc));
84 | assertTrue(matcher.singleAgentAllowedByRobots("BarBot", urlYc));
85 | assertTrue(matcher.singleAgentAllowedByRobots("BarBot", urlWa));
86 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", urlZd));
87 | assertTrue(matcher.singleAgentAllowedByRobots("BazBot", urlZd));
88 |
89 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urlFooBar));
90 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", urlFooBar));
91 | assertFalse(matcher.singleAgentAllowedByRobots("BazBot", urlFooBar));
92 | }
93 |
94 | /** Verifies: directives case insensitivity.
*/ 95 | @Test 96 | public void testDirectiveCaseInsensitivity() { 97 | final String robotsTxtBodyUpper = "USER-AGENT: FooBot\n" + "ALLOW: /x/\n" + "DISALLOW: /\n"; 98 | final String robotsTxtBodyLower = "user-agent: FooBot\n" + "allow: /x/\n" + "disallow: /\n"; 99 | final String robotsTxtBodyRandom = "uSeR-aGeNt: FooBot\n" + "AlLoW: /x/\n" + "dIsAlLoW: /\n"; 100 | 101 | final String urlAllowed = "http://foo.bar/x/y"; 102 | final String urlDisallowed = "http://foo.bar/a/b"; 103 | 104 | final Matcher matcherUpper = parse(robotsTxtBodyUpper); 105 | assertTrue(matcherUpper.singleAgentAllowedByRobots("FooBot", urlAllowed)); 106 | assertFalse(matcherUpper.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 107 | 108 | final Matcher matcherLower = parse(robotsTxtBodyLower); 109 | assertTrue(matcherLower.singleAgentAllowedByRobots("FooBot", urlAllowed)); 110 | assertFalse(matcherLower.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 111 | 112 | final Matcher matcherRandom = parse(robotsTxtBodyRandom); 113 | assertTrue(matcherRandom.singleAgentAllowedByRobots("FooBot", urlAllowed)); 114 | assertFalse(matcherRandom.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 115 | } 116 | 117 | /** Verifies: user agent case insensitivity, user agent names convention compliance. */ 118 | @Test 119 | public void testUserAgentCaseInsensitivity() { 120 | final String robotsTxtBodyUpper = "user-agent: FOO BAR\n" + "allow: /x/\n" + "disallow: /\n"; 121 | final String robotsTxtBodyLower = "user-agent: foo bar\n" + "allow: /x/\n" + "disallow: /\n"; 122 | final String robotsTxtBodyRandom = "user-agent: FoO bAr\n" + "allow: /x/\n" + "disallow: /\n"; 123 | 124 | final String urlAllowed = "http://foo.bar/x/y"; 125 | final String urlDisallowed = "http://foo.bar/a/b"; 126 | 127 | final Matcher matcherUpper = parse(robotsTxtBodyUpper); 128 | assertTrue(matcherUpper.singleAgentAllowedByRobots("Foo", urlAllowed)); 129 | assertTrue(matcherUpper.singleAgentAllowedByRobots("foo", urlAllowed)); 130 | assertFalse(matcherUpper.singleAgentAllowedByRobots("Foo", urlDisallowed)); 131 | assertFalse(matcherUpper.singleAgentAllowedByRobots("foo", urlDisallowed)); 132 | 133 | final Matcher matcherLower = parse(robotsTxtBodyLower); 134 | assertTrue(matcherLower.singleAgentAllowedByRobots("Foo", urlAllowed)); 135 | assertTrue(matcherLower.singleAgentAllowedByRobots("foo", urlAllowed)); 136 | assertFalse(matcherLower.singleAgentAllowedByRobots("Foo", urlDisallowed)); 137 | assertFalse(matcherLower.singleAgentAllowedByRobots("foo", urlDisallowed)); 138 | 139 | final Matcher matcherRandom = parse(robotsTxtBodyRandom); 140 | assertTrue(matcherRandom.singleAgentAllowedByRobots("Foo", urlAllowed)); 141 | assertTrue(matcherRandom.singleAgentAllowedByRobots("foo", urlAllowed)); 142 | assertFalse(matcherRandom.singleAgentAllowedByRobots("Foo", urlDisallowed)); 143 | assertFalse(matcherRandom.singleAgentAllowedByRobots("foo", urlDisallowed)); 144 | } 145 | 146 | /** [Google-specific] Verifies: accepting user-agent value up to the first space. 
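*
* <p>I.e. the line {@code User-Agent: Foo Bar} defines a group effectively named {@code Foo}:
* querying agent "Foo" selects it, while "Foo Bar" (a space violates the product-token
* convention) falls back to the global {@code *} group, as the assertions below show.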
*/
147 | @Test
148 | public void testAcceptUserAgentUpToFirstSpace() {
149 | final String robotsTxtBody =
150 | "User-Agent: *\n"
151 | + "Disallow: /\n"
152 | + "User-Agent: Foo Bar\n"
153 | + "Allow: /x/\n"
154 | + "Disallow: /\n";
155 |
156 | final String url = "http://foo.bar/x/y";
157 |
158 | final Matcher matcher = parse(robotsTxtBody);
159 | assertTrue(matcher.singleAgentAllowedByRobots("Foo", url));
160 | assertFalse(matcher.singleAgentAllowedByRobots("Foo Bar", url));
161 | }
162 |
163 | /** Verifies: global rules. */
164 | @Test
165 | public void testGlobalGroups() {
166 | final String robotsTxtBodyEmpty = "";
167 | final String robotsTxtBodyGlobal =
168 | "user-agent: *\n" + "disallow: /x\n" + "user-agent: FooBot\n" + "allow: /x/y\n";
169 | final String robotsTxtBodySpecific =
170 | "user-agent: FooBot\n"
171 | + "allow: /\n"
172 | + "user-agent: BarBot\n"
173 | + "disallow: /\n"
174 | + "user-agent: BazBot\n"
175 | + "disallow: /\n";
176 |
177 | final String url = "http://foo.bar/x/y";
178 |
179 | final Matcher matcherEmpty = parse(robotsTxtBodyEmpty);
180 | assertTrue(matcherEmpty.singleAgentAllowedByRobots("FooBot", url));
181 |
182 | final Matcher matcherGlobal = parse(robotsTxtBodyGlobal);
183 | assertTrue(matcherGlobal.singleAgentAllowedByRobots("FooBot", url));
184 | assertFalse(matcherGlobal.singleAgentAllowedByRobots("BarBot", url));
185 |
186 | final Matcher matcherSpecific = parse(robotsTxtBodySpecific);
187 | assertTrue(matcherSpecific.singleAgentAllowedByRobots("QuxBot", url));
188 | }
189 |
190 | /**
191 | * [Google-specific] Verifies: any user-agent with prefix "* " is considered a global wildcard.
192 | */
193 | @Test
194 | public void testGlobalGroupsPrefix() {
195 | final String robotsTxtBody =
196 | "user-agent: * baz\n" + "disallow: /x\n" + "user-agent: FooBot\n" + "allow: /x/y\n";
197 |
198 | final String url = "http://foo.bar/x/y";
199 |
200 | final Matcher matcher = parse(robotsTxtBody);
201 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url));
202 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", url));
203 | }
204 |
205 | /** Verifies: case sensitivity of URIs. */
206 | @Test
207 | public void testUriCaseSensitivity() {
208 | final String robotsTxtBodyUpper = "user-agent: FooBot\n" + "disallow: /X/\n";
209 | final String robotsTxtBodyLower = "user-agent: FooBot\n" + "disallow: /x/\n";
210 |
211 | final String url = "http://foo.bar/x/y";
212 |
213 | final Matcher matcherUpper = parse(robotsTxtBodyUpper);
214 | assertTrue(matcherUpper.singleAgentAllowedByRobots("FooBot", url));
215 |
216 | final Matcher matcherLower = parse(robotsTxtBodyLower);
217 | assertFalse(matcherLower.singleAgentAllowedByRobots("FooBot", url));
218 | }
219 |
220 | /** Verifies: longest match strategy.
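*
* <p>Worked example: for the path {@code /x/page.html}, the rule {@code allow: /x/page.html}
* matches 12 characters while {@code disallow: /x/} matches only 3, so the longer (more
* specific) rule wins; on an exact tie the verdict goes to ALLOW, as asserted below.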
*/ 221 | @Test 222 | public void testLongestMatch() { 223 | final String url = "http://foo.bar/x/page.html"; 224 | 225 | { 226 | final String robotsTxtBody = 227 | "user-agent: FooBot\n" + "disallow: /x/page.html\n" + "allow: /x/\n"; 228 | 229 | final Matcher matcher = parse(robotsTxtBody); 230 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", url)); 231 | } 232 | { 233 | final String robotsTxtBody = 234 | "user-agent: FooBot\n" + "allow: /x/page.html\n" + "disallow: /x/\n"; 235 | 236 | final Matcher matcher = parse(robotsTxtBody); 237 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 238 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/")); 239 | } 240 | { 241 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: \n" + "allow: \n"; 242 | 243 | final Matcher matcher = parse(robotsTxtBody); 244 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 245 | } 246 | { 247 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /\n"; 248 | 249 | final Matcher matcher = parse(robotsTxtBody); 250 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 251 | } 252 | { 253 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /x\n" + "allow: /x/\n"; 254 | 255 | final String url0 = "http://foo.bar/x"; 256 | final String url1 = "http://foo.bar/x/"; 257 | 258 | final Matcher matcher = parse(robotsTxtBody); 259 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", url0)); 260 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url1)); 261 | } 262 | { 263 | final String robotsTxtBody = 264 | "user-agent: FooBot\n" + "disallow: /x/page.html\n" + "allow: /x/page.html\n"; 265 | 266 | final Matcher matcher = parse(robotsTxtBody); 267 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 268 | } 269 | { 270 | final String robotsTxtBody = 271 | "user-agent: FooBot\n" + "allow: /page\n" + "disallow: /*.html\n"; 272 | 273 | final Matcher matcher = parse(robotsTxtBody); 274 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/page.html")); 275 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/page")); 276 | } 277 | { 278 | final String robotsTxtBody = 279 | "user-agent: FooBot\n" + "allow: /x/page.\n" + "disallow: /*.html\n"; 280 | 281 | final Matcher matcher = parse(robotsTxtBody); 282 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 283 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/y.html")); 284 | } 285 | { 286 | final String robotsTxtBody = 287 | "User-agent: *\n" + "Disallow: /x/\n" + "User-agent: FooBot\n" + "Disallow: /y/\n"; 288 | 289 | final Matcher matcher = parse(robotsTxtBody); 290 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/page")); 291 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/y/page")); 292 | } 293 | } 294 | 295 | /** Verifies: percent-encoding of characters outside the range of the US-ASCII. 
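*
* <p>E.g. the pattern {@code /foo/bar/ツ} is canonicalised at parse time to
* {@code /foo/bar/%E3%83%84} (UTF-8 percent-encoding), so only the already-encoded URL form
* matches; raw non-ASCII URLs are not re-encoded at match time, as the assertions below show.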
*/ 296 | @Test 297 | public void testPercentEncoding() { 298 | { 299 | final String robotsTxtBody = 300 | "User-agent: FooBot\n" 301 | + "Disallow: /\n" 302 | + "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n"; 303 | 304 | final Matcher matcher = parse(robotsTxtBody); 305 | assertTrue( 306 | matcher.singleAgentAllowedByRobots( 307 | "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par")); 308 | } 309 | { 310 | final String robotsTxtBody = "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/ツ\n"; 311 | 312 | final Matcher matcher = parse(robotsTxtBody); 313 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%E3%83%84")); 314 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/ツ")); 315 | } 316 | { 317 | final String robotsTxtBody = 318 | "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/%E3%83%84\n"; 319 | 320 | final Matcher matcher = parse(robotsTxtBody); 321 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%E3%83%84")); 322 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/ツ")); 323 | } 324 | { 325 | final String robotsTxtBody = 326 | "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/%62%61%7A\n"; 327 | 328 | final Matcher matcher = parse(robotsTxtBody); 329 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/baz")); 330 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%62%61%7A")); 331 | } 332 | } 333 | 334 | /** Verifies: valid parsing of special characters ('*', '$', '#') */ 335 | @Test 336 | public void testSpecialCharacters() { 337 | { 338 | final String robotsTxtBody = 339 | "User-agent: FooBot\n" + "Disallow: /foo/bar/quz\n" + "Allow: /foo/*/qux\n"; 340 | 341 | final Matcher matcher = parse(robotsTxtBody); 342 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/quz")); 343 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/quz")); 344 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo//quz")); 345 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bax/quz")); 346 | } 347 | { 348 | final String robotsTxtBody = 349 | "User-agent: FooBot\n" + "Disallow: /foo/bar$\n" + "Allow: /foo/bar/qux\n"; 350 | 351 | final Matcher matcher = parse(robotsTxtBody); 352 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar")); 353 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/qux")); 354 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/")); 355 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/baz")); 356 | } 357 | { 358 | final String robotsTxtBody = 359 | "User-agent: FooBot\n" + "# Disallow: /\n" + "Disallow: /foo/quz#qux\n" + "Allow: /\n"; 360 | 361 | final Matcher matcher = parse(robotsTxtBody); 362 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar")); 363 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/quz")); 364 | } 365 | } 366 | 367 | /** 368 | * [Google-specific] Verifies: {@code /index.htm} or {@code /index.html} should be normalised to 369 | * {@code /}. 
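*
* <p>Concretely, {@code allow: /x/index.html} makes the parse handler add the extra rule
* {@code allow: /x/$}, so the bare directory URL {@code http://foo.bar/x/} stays allowed even
* under a broad {@code disallow} rule.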
370 | */ 371 | @Test 372 | public void testIndexNormalisation() { 373 | final String robotsTxtBody = 374 | "user-agent: FooBot\n" 375 | + "disallow: /\n" 376 | + "allow: /index.htm\n" 377 | + "allow: /index.html\n" 378 | + "allow: /x\n" 379 | + "disallow: /x/index.htm\n" 380 | + "disallow: /x/index.html\n"; 381 | 382 | final String[] urls = { 383 | "http://foo.bar/", 384 | "http://foo.bar/index.htm", 385 | "http://foo.bar/index.html", 386 | "http://foo.bar/x/", 387 | "http://foo.bar/x/index.htm", 388 | "http://foo.bar/x/index.html" 389 | }; 390 | 391 | final Matcher matcher = parse(robotsTxtBody); 392 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[0])); 393 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[1])); 394 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[2])); 395 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[3])); 396 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urls[4])); 397 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urls[5])); 398 | } 399 | 400 | /** [Google-specific] Verifies: Empty arguments corner cases. */ 401 | @Test 402 | public void testEmptyArgs() { 403 | { 404 | final String robotsTxtBody = ""; 405 | 406 | final Matcher matcher = parse(robotsTxtBody); 407 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "")); 408 | assertTrue(matcher.singleAgentAllowedByRobots("", "")); 409 | } 410 | { 411 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n"; 412 | 413 | final Matcher matcher = parse(robotsTxtBody); 414 | assertTrue(matcher.singleAgentAllowedByRobots("", "")); 415 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "")); 416 | } 417 | } 418 | 419 | /** [Google-specific] Verifies: Long lines should be ignored after 8 * 2083 bytes. 
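*
* <p>(Added note: the parser truncates an over-long value rather than dropping the whole line,
* so a pattern cut at the byte limit still matches every URL that shares the surviving prefix;
* both branches below rely on that.)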
*/ 420 | @Test 421 | public void testLongLines() { 422 | final int eolLength = "\n".length(); 423 | final int maxLength = 2083 * 8; 424 | final String allow = "allow: "; 425 | final String disallow = "disallow: "; 426 | 427 | { 428 | String robotsTxtBody = "user-agent: FooBot\n"; 429 | final StringBuilder longValueBuilder = new StringBuilder("/x/"); 430 | final int maxValueLength = 431 | maxLength - longValueBuilder.length() - disallow.length() + eolLength; 432 | while (longValueBuilder.length() < maxValueLength) { 433 | longValueBuilder.append('a'); 434 | } 435 | robotsTxtBody += disallow + longValueBuilder.append("/qux\n").toString(); 436 | 437 | final Matcher matcher = parse(robotsTxtBody); 438 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fux")); 439 | assertFalse( 440 | matcher.singleAgentAllowedByRobots( 441 | "FooBot", "http://foo.bar" + longValueBuilder.toString() + "/fux")); 442 | } 443 | { 444 | String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n"; 445 | final StringBuilder longValueBuilderA = new StringBuilder("/x/"); 446 | final StringBuilder longValueBuilderB = new StringBuilder("/x/"); 447 | final int maxValueLength = 448 | maxLength - longValueBuilderA.length() - disallow.length() + eolLength; 449 | while (longValueBuilderA.length() < maxValueLength) { 450 | longValueBuilderA.append('a'); 451 | longValueBuilderB.append('b'); 452 | } 453 | robotsTxtBody += allow + longValueBuilderA.toString() + "/qux\n"; 454 | robotsTxtBody += allow + longValueBuilderB.toString() + "/qux\n"; 455 | 456 | final Matcher matcher = parse(robotsTxtBody); 457 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/")); 458 | assertTrue( 459 | matcher.singleAgentAllowedByRobots( 460 | "FooBot", "http://foo.bar" + longValueBuilderA.toString() + "/qux")); 461 | assertTrue( 462 | matcher.singleAgentAllowedByRobots( 463 | "FooBot", "http://foo.bar" + longValueBuilderB.toString() + "/fux")); 464 | } 465 | } 466 | 467 | /** [Google-specific] Verifies: Google-only documentation compliance. 
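*
* <p>Covers the published wildcard semantics: {@code *} matches any run of characters and
* {@code $} anchors the pattern at the end of the path, e.g. {@code allow: /*.php$} matches
* {@code /filename.php} but not {@code /filename.php?parameters}.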
*/ 468 | @Test 469 | public void testGoogleOnlyDocumentationCompliance() { 470 | { 471 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish\n"; 472 | 473 | final Matcher matcher = parse(robotsTxtBody); 474 | 475 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 476 | 477 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 478 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 479 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 480 | 481 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads")); 482 | assertTrue( 483 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads/yummy.html")); 484 | assertTrue( 485 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html?id=anything")); 486 | 487 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.asp")); 488 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/catfish")); 489 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/?id=fish")); 490 | } 491 | { 492 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish*\n"; 493 | 494 | final Matcher matcher = parse(robotsTxtBody); 495 | 496 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 497 | 498 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 499 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 500 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 501 | 502 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads")); 503 | assertTrue( 504 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads/yummy.html")); 505 | assertTrue( 506 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html?id=anything")); 507 | 508 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.bar")); 509 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/catfish")); 510 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/?id=fish")); 511 | } 512 | { 513 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish/\n"; 514 | 515 | final Matcher matcher = parse(robotsTxtBody); 516 | 517 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 518 | 519 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/")); 520 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon")); 521 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/?salmon")); 522 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 523 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/?id=anything")); 524 | 525 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 526 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 527 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish/Salmon.html")); 528 | } 529 | { 530 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /*.php\n"; 531 | 532 | final Matcher matcher = 
parse(robotsTxtBody); 533 | 534 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 535 | 536 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php")); 537 | assertTrue( 538 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/folder/filename.php")); 539 | assertTrue( 540 | matcher.singleAgentAllowedByRobots( 541 | "FooBot", "http://foo.bar/folder/filename.php?parameters")); 542 | assertTrue( 543 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar//folder/any.php.file.html")); 544 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php/")); 545 | assertTrue( 546 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index?f=filename.php/")); 547 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/php/")); 548 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index?php")); 549 | 550 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/windows.PHP")); 551 | } 552 | { 553 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /*.php$\n"; 554 | 555 | final Matcher matcher = parse(robotsTxtBody); 556 | 557 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 558 | 559 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php")); 560 | assertTrue( 561 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/folder/filename.php")); 562 | 563 | assertFalse( 564 | matcher.singleAgentAllowedByRobots( 565 | "FooBot", "http://foo.bar/folder/filename.php?parameters")); 566 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php/")); 567 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php5/")); 568 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/php/")); 569 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename?php")); 570 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/aaaphpaaa")); 571 | 572 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/windows.PHP")); 573 | } 574 | { 575 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish*.php\n"; 576 | 577 | final Matcher matcher = parse(robotsTxtBody); 578 | 579 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 580 | 581 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.php")); 582 | assertTrue( 583 | matcher.singleAgentAllowedByRobots( 584 | "FooBot", "http://foo.bar/fishheads/catfish.php?parameters")); 585 | 586 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.PHP")); 587 | } 588 | { 589 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /p\n" + "disallow: /\n"; 590 | 591 | final Matcher matcher = parse(robotsTxtBody); 592 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page")); 593 | } 594 | { 595 | final String robotsTxtBody = 596 | "user-agent: FooBot\n" + "allow: /folder\n" + "disallow: /folder\n"; 597 | 598 | final Matcher matcher = parse(robotsTxtBody); 599 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/folder/page")); 600 | } 601 | { 602 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /page\n" + "disallow: /*.htm\n"; 603 | 604 | final Matcher matcher = parse(robotsTxtBody); 605 | 
assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page.htm")); 606 | } 607 | { 608 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /$\n" + "disallow: /\n"; 609 | 610 | final Matcher matcher = parse(robotsTxtBody); 611 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/")); 612 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page.html")); 613 | } 614 | } 615 | 616 | /** [Google-specific] Verifies: common typos in {@code DISALLOW} key should be fixed. */ 617 | @Test 618 | public void testTyposFixes() { 619 | final String robotsTxtBody = 620 | "user-agent: FooBot\n" 621 | + "disallow: /a/\n" 622 | + "dissallow: /b/\n" 623 | + "dissalow: /c/\n" 624 | + "disalow: /d/\n" 625 | + "diasllow: /e/\n" 626 | + "disallaw: /f/\n"; 627 | 628 | final Matcher matcher = parse(robotsTxtBody); 629 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index.html")); 630 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/a/")); 631 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/b/")); 632 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/c/")); 633 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/d/")); 634 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/e/")); 635 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/f/")); 636 | } 637 | } 638 | -------------------------------------------------------------------------------- /src/test/java/com/google/search/robotstxt/RobotsParserTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import static com.google.common.truth.Truth.assertThat; 18 | 19 | import java.nio.charset.StandardCharsets; 20 | import java.util.Arrays; 21 | import java.util.Collections; 22 | import org.junit.Test; 23 | 24 | /** 25 | * Unit tests validating parsing behavior. 26 | * 27 | * @see RobotsParser 28 | */ 29 | public class RobotsParserTest { 30 | /** 31 | * Parses given robots.txt contents via {@link RobotsParser} and compares the result with an 32 | * expected one. 
33 | *
34 | * @param robotsTxtBody Contents of robots.txt file
35 | * @param expectedContents Expected contents
36 | */
37 | private static void parseAndValidate(
38 | final String robotsTxtBody, final RobotsContents expectedContents) {
39 | final Parser parser = new RobotsParser(new RobotsParseHandler());
40 | final Matcher matcher = parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8));
41 | final RobotsContents actualContents = ((RobotsMatcher) matcher).getRobotsContents();
42 |
43 | expectedContents
44 | .getGroups()
45 | .forEach(expectedGroup -> assertThat(expectedGroup).isIn(actualContents.getGroups()));
46 | }
47 |
48 | /** Verifies: rules grouping, rules parsing, and ignoring invalid directives. */
49 | @Test
50 | public void testMultipleGroups() {
51 | final String robotsTxtBody =
52 | "allow: /foo/bar/\n"
53 | + "\n"
54 | + "user-agent: FooBot\n"
55 | + "disallow: /\n"
56 | + "allow: /x/\n"
57 | + "user-agent: BarBot\n"
58 | + "disallow: /\n"
59 | + "allow: /y/\n"
60 | + "\n"
61 | + "\n"
62 | + "allow: /w/\n"
63 | + "user-agent: BazBot\n"
64 | + "\n"
65 | + "user-agent: FooBot\n"
66 | + "allow: /z/\n"
67 | + "disallow: /\n";
68 |
69 | final RobotsContents expectedContents =
70 | new RobotsContents(
71 | Arrays.asList(
72 | new RobotsContents.Group(
73 | Collections.singletonList("FooBot"),
74 | Arrays.asList(
75 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
76 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
77 | new RobotsContents.Group(
78 | Collections.singletonList("BarBot"),
79 | Arrays.asList(
80 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
81 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/"),
82 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/w/"))),
83 | new RobotsContents.Group(
84 | Arrays.asList("BazBot", "FooBot"),
85 | Arrays.asList(
86 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/z/"),
87 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/")))));
88 |
89 | parseAndValidate(robotsTxtBody, expectedContents);
90 | }
91 |
92 | /** Verifies: the CR character must be treated as EOL; invalid directives are ignored. */
93 | @Test
94 | public void testCrParsing() {
95 | final String robotsTxtBody =
96 | "user-agent: FooBot\n"
97 | + "disallow: /\n"
98 | + "allow: /x/\rallow: /y/\n"
99 | + "al\r\r\r\r\rdisallow: /z/\n";
100 |
101 | final RobotsContents expectedContents =
102 | new RobotsContents(
103 | Collections.singletonList(
104 | new RobotsContents.Group(
105 | Collections.singletonList("FooBot"),
106 | Arrays.asList(
107 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
108 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"),
109 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/"),
110 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/z/")))));
111 |
112 | parseAndValidate(robotsTxtBody, expectedContents);
113 | }
114 |
115 | /** Verifies: CR LF must be treated as EOL.
*/
116 | @Test
117 | public void testCrLfParsing() {
118 | final String robotsTxtBody =
119 | "allow: /foo/bar/\r\n"
120 | + "\r\n"
121 | + "user-agent: FooBot\r\n"
122 | + "disallow: /\r\n"
123 | + "allow: /x/\r\n"
124 | + "user-agent: BarBot\r\n"
125 | + "disallow: /\r\n"
126 | + "allow: /y/\r\n"
127 | + "\r\n";
128 |
129 | final RobotsContents expectedContents =
130 | new RobotsContents(
131 | Arrays.asList(
132 | new RobotsContents.Group(
133 | Collections.singletonList("FooBot"),
134 | Arrays.asList(
135 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
136 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
137 | new RobotsContents.Group(
138 | Collections.singletonList("BarBot"),
139 | Arrays.asList(
140 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
141 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/")))));
142 |
143 | parseAndValidate(robotsTxtBody, expectedContents);
144 | }
145 |
146 | /** Verifies: the last line need not end with an EOL. */
147 | @Test
148 | public void testNoFinalNewline() {
149 | final String robotsTxtBody =
150 | "User-Agent: foo\n"
151 | + "Allow: /some/path\n"
152 | + "User-Agent: bar\n"
153 | + "\n"
154 | + "\n"
155 | + "Disallow: /";
156 |
157 | final RobotsContents expectedContents =
158 | new RobotsContents(
159 | Arrays.asList(
160 | new RobotsContents.Group(
161 | Collections.singletonList("foo"),
162 | Collections.singletonList(
163 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/some/path"))),
164 | new RobotsContents.Group(
165 | Collections.singletonList("bar"),
166 | Collections.singletonList(
167 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/")))));
168 |
169 | parseAndValidate(robotsTxtBody, expectedContents);
170 | }
171 |
172 | /** Verifies: ignoring surrounding whitespace characters (spaces, tabs). */
173 | @Test
174 | public void testWhitespacesParsing() {
175 | final String robotsTxtBody =
176 | "user-agent \t: \tFooBot\n"
177 | + "disallow : / \n"
178 | + " allow: /x/\n"
179 | + " \n"
180 | + " \t \t \n"
181 | + "user-agent:BarBot\n"
182 | + "\t \t disallow\t \t :\t \t /\t \t \n"
183 | + "\t\tallow\t\t:\t\t/y/\t\t\n"
184 | + "\n";
185 |
186 | final RobotsContents expectedContents =
187 | new RobotsContents(
188 | Arrays.asList(
189 | new RobotsContents.Group(
190 | Collections.singletonList("FooBot"),
191 | Arrays.asList(
192 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
193 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
194 | new RobotsContents.Group(
195 | Collections.singletonList("BarBot"),
196 | Arrays.asList(
197 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
198 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/")))));
199 |
200 | parseAndValidate(robotsTxtBody, expectedContents);
201 | }
202 |
203 | /** Verifies: global rules parsing.
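*
* <p>A {@code user-agent: *} line yields a group with an empty agent list and the global flag
* set, which the expected contents below encode via the three-argument {@code Group}
* constructor.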
*/ 204 | @Test 205 | public void testGlobalGroup() { 206 | final String robotsTxtBody = 207 | "User-agent: *\n" + "Disallow: /x/\n" + "User-agent: FooBot\n" + "Disallow: /y/\n"; 208 | 209 | final RobotsContents expectedContents = 210 | new RobotsContents( 211 | Arrays.asList( 212 | new RobotsContents.Group( 213 | Collections.emptyList(), 214 | Collections.singletonList( 215 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/x/")), 216 | true), 217 | new RobotsContents.Group( 218 | Collections.singletonList("FooBot"), 219 | Collections.singletonList( 220 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/y/"))))); 221 | 222 | parseAndValidate(robotsTxtBody, expectedContents); 223 | } 224 | 225 | /** [Google-specific] Verifies: assuming colon if it's missing. */ 226 | @Test 227 | public void testMissingSeparator() { 228 | final String robotsTxtBody = "user-agent FooBot\n" + "disallow /\n" + "allow foo bar\n"; 229 | 230 | final RobotsContents expectedContents = 231 | new RobotsContents( 232 | Collections.singletonList( 233 | new RobotsContents.Group( 234 | Collections.singletonList("FooBot"), 235 | Arrays.asList( 236 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"), 237 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "foo bar"))))); 238 | 239 | parseAndValidate(robotsTxtBody, expectedContents); 240 | } 241 | 242 | /** [Google-specific] Verifies: trimming values to specific number of bytes. */ 243 | @Test 244 | public void testTrimmingToBytes() { 245 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /foo/bar/baz/qux\n"; 246 | 247 | final RobotsContents expectedContents = 248 | new RobotsContents( 249 | Collections.singletonList( 250 | new RobotsContents.Group( 251 | Collections.singletonList("FooBot"), 252 | Collections.singletonList( 253 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/foo/b"))))); 254 | 255 | final Parser parser = new RobotsParser(new RobotsParseHandler(), 8); 256 | final Matcher matcher = parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8)); 257 | final RobotsContents actualContents = ((RobotsMatcher) matcher).getRobotsContents(); 258 | 259 | expectedContents 260 | .getGroups() 261 | .forEach(expectedGroup -> assertThat(expectedGroup).isIn(actualContents.getGroups())); 262 | } 263 | 264 | /** Verifies: Path normalisation corner case. */ 265 | @Test 266 | public void testPathNormalisationCornerCase() { 267 | final String robotsTxtBody = 268 | "user-agent: FooBot\n" + "disallow: /foo?bar%aa%\n" + "disallow: /foo?bar%aa%a\n"; 269 | 270 | final RobotsContents expectedContents = 271 | new RobotsContents( 272 | Collections.singletonList( 273 | new RobotsContents.Group( 274 | Collections.singletonList("FooBot"), 275 | Arrays.asList( 276 | new RobotsContents.Group.Rule( 277 | Parser.DirectiveType.DISALLOW, "/foo?bar%AA%"), 278 | new RobotsContents.Group.Rule( 279 | Parser.DirectiveType.DISALLOW, "/foo?bar%AA%a"))))); 280 | 281 | parseAndValidate(robotsTxtBody, expectedContents); 282 | } 283 | } 284 | --------------------------------------------------------------------------------