├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── code-of-conduct.md └── contributing.md ├── pom.xml └── src ├── main └── java │ └── com │ └── google │ └── search │ └── robotstxt │ ├── Matcher.java │ ├── MatchingStrategy.java │ ├── ParseException.java │ ├── ParseHandler.java │ ├── Parser.java │ ├── RobotsContents.java │ ├── RobotsLongestMatchStrategy.java │ ├── RobotsMatcher.java │ ├── RobotsParseHandler.java │ ├── RobotsParser.java │ └── RobotsParserApp.java └── test └── java └── com └── google └── search └── robotstxt ├── RobotsMatcherTest.java └── RobotsParserTest.java /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | > It's a good idea to open an issue first for discussion. 4 | 5 | - [ ] Tests pass 6 | - [ ] Appropriate changes to README are included in PR -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | target/ 26 | pom.xml.tag 27 | pom.xml.releaseBackup 28 | pom.xml.versionsBackup 29 | pom.xml.next 30 | release.properties 31 | dependency-reduced-pom.xml 32 | buildNumber.properties 33 | .mvn/timing.properties 34 | # https://github.com/takari/maven-wrapper#usage-without-binary-jar 35 | .mvn/wrapper/maven-wrapper.jar 36 | 37 | .metadata 38 | bin/ 39 | tmp/ 40 | *.tmp 41 | *.bak 42 | *.swp 43 | *~.nib 44 | local.properties 45 | .settings/ 46 | .loadpath 47 | .recommenders 48 | 49 | # Eclipse Core 50 | .project 51 | 52 | # External tool builders 53 | .externalToolBuilders/ 54 | 55 | # Locally stored "Eclipse launch configurations" 56 | *.launch 57 | 58 | # PyDev specific (Python IDE for Eclipse) 59 | *.pydevproject 60 | 61 | # CDT-specific (C/C++ Development Tooling) 62 | .cproject 63 | 64 | # CDT- autotools 65 | .autotools 66 | 67 | # Java annotation processor (APT) 68 | .factorypath 69 | 70 | # JDT-specific (Eclipse Java Development Tools) 71 | .classpath 72 | 73 | # PDT-specific (PHP Development Tools) 74 | .buildpath 75 | 76 | # sbteclipse plugin 77 | .target 78 | 79 | # Tern plugin 80 | .tern-project 81 | 82 | # TeXlipse plugin 83 | .texlipse 84 | 85 | # STS (Spring Tool Suite) 86 | .springBeans 87 | 88 | # Code Recommenders 89 | .recommenders/ 90 | 91 | # Annotation Processing 92 | .apt_generated/ 93 | .apt_generated_test/ 94 | 95 | # Scala IDE specific (Scala & Java development for Eclipse) 96 | .cache-main 97 | .scala_dependencies 98 | .worksheet 99 | 100 | # User-specific stuff 101 | .idea/**/workspace.xml 102 | .idea/**/tasks.xml 103 | .idea/**/usage.statistics.xml 104 | .idea/**/dictionaries 105 | .idea/**/shelf 106 | 107 | # Generated files 108 
| .idea/**/contentModel.xml 109 | 110 | # Sensitive or high-churn files 111 | .idea/**/dataSources/ 112 | .idea/**/dataSources.ids 113 | .idea/**/dataSources.local.xml 114 | .idea/**/sqlDataSources.xml 115 | .idea/**/dynamic.xml 116 | .idea/**/uiDesigner.xml 117 | .idea/**/dbnavigator.xml 118 | 119 | # Gradle 120 | .idea/**/gradle.xml 121 | .idea/**/libraries 122 | 123 | # Gradle and Maven with auto-import 124 | # When using Gradle or Maven with auto-import, you should exclude module files, 125 | # since they will be recreated, and may cause churn. 126 | .idea/artifacts 127 | .idea/compiler.xml 128 | .idea/jarRepositories.xml 129 | .idea/modules.xml 130 | .idea/*.iml 131 | .idea/modules 132 | *.iml 133 | *.ipr 134 | 135 | # CMake 136 | cmake-build-*/ 137 | 138 | # Mongo Explorer plugin 139 | .idea/**/mongoSettings.xml 140 | 141 | # File-based project format 142 | *.iws 143 | 144 | # IntelliJ 145 | out/ 146 | .idea/encodings.xml 147 | .idea/misc.xml 148 | .idea/vcs.xml 149 | .idea/codeStyles/codeStyleConfig.xml 150 | 151 | # mpeltonen/sbt-idea plugin 152 | .idea_modules/ 153 | 154 | # JIRA plugin 155 | atlassian-ide-plugin.xml 156 | 157 | # Cursive Clojure plugin 158 | .idea/replstate.xml 159 | 160 | # Crashlytics plugin (for Android Studio and IntelliJ) 161 | com_crashlytics_export_strings.xml 162 | crashlytics.properties 163 | crashlytics-build.properties 164 | fabric.properties 165 | 166 | # Editor-based Rest Client 167 | .idea/httpRequests 168 | 169 | # Android studio 3.1+ serialized cache file 170 | .idea/caches/build_file_checksums.ser 171 | 172 | # Visual Studio Code 173 | .vscode/ 174 | *.code-workspace 175 | .history/ 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Robots.txt Parser and Matcher Library in Java 2 | 3 | This project aims to implement the robots.txt parser and matcher in Java. It is 4 | based on the [C++ implementation](https://github.com/google/robotstxt). 
5 | 6 | ## About the library 7 | 8 | The Robots Exclusion Protocol (REP) is a standard that enables website owners 9 | to control which URLs may be accessed by automated clients (i.e. crawlers) 10 | through a simple text file with a specific syntax. It's one of the basic 11 | building blocks of the internet as we know it and what allows search engines 12 | to operate. 13 | 14 | Because the REP was only a de-facto standard for the past 25 years, different 15 | implementers parse robots.txt slightly differently, leading to 16 | confusion. This project aims to fix that by releasing the parser that Google 17 | uses. 18 | 19 | The library is a Java port of the 20 | [C++ parser and matcher](https://github.com/google/robotstxt), which is 21 | slightly modified production code used by Googlebot, Google's crawler. The 22 | library is released open-source to help developers build tools that better 23 | reflect Google's robots.txt parsing and matching. 24 | 25 | For webmasters, we included a runnable class `RobotsParserApp`, which is a small 26 | application that allows testing a single URL and several user-agents against a 27 | robots.txt file. 28 | 29 | ## Development 30 | 31 | ### Prerequisites 32 | 33 | You need Maven to build this project. 34 | [Download](https://maven.apache.org/download.html) and 35 | [install](https://maven.apache.org/install.html) it from the official website. 36 | 37 | You can also install it like this if your Linux distribution supports it: 38 | 39 | ``` 40 | $ sudo apt-get install maven 41 | ``` 42 | 43 | ### Build it 44 | 45 | #### Using Maven 46 | 47 | Standard Maven commands work here. 48 | 49 | ``` 50 | $ mvn install 51 | ``` 52 | 53 | Or if you want a build from scratch: 54 | 55 | ``` 56 | $ mvn clean install 57 | ``` 58 | 59 | #### Using Maven Assembly Plugin 60 | 61 | Alternatively, you can compile the entire project into a single JAR using the 62 | following command: 63 | 64 | ``` 65 | $ mvn clean compile assembly:single 66 | ``` 67 | 68 | You can find the result in the `target` directory. 69 | 70 | ### Run it 71 | 72 | #### Using Maven 73 | 74 | The following commands will run an application that parses a given robots.txt file 75 | and prints a matching verdict: `ALLOWED` or `DISALLOWED` (exit codes are `0` 76 | and `1` respectively). 77 | 78 | You should provide a target URL using the `-u` (`--url`) flag. At least one agent 79 | must be specified using the `-a` (`--agent`) flag (the verdict `DISALLOWED` is printed 80 | iff none of the user-agents are allowed to crawl the given URL).
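You can also call the matcher from Java code instead of going through the CLI. Below is a minimal, illustrative sketch (the class name `Example` and the inline robots.txt content are ours; the class is placed in the `com.google.search.robotstxt` package because `Parser#parse` is package-visible):

```
package com.google.search.robotstxt;

import java.nio.charset.StandardCharsets;

public class Example {
  public static void main(String[] args) {
    final String robotsTxt = "User-agent: *\nDisallow: /private/\n";
    // Wire the default parse handler into the parser and build a matcher.
    final Parser parser = new RobotsParser(new RobotsParseHandler());
    final Matcher matcher = parser.parse(robotsTxt.getBytes(StandardCharsets.UTF_8));
    // Expected to print "true": no rule matches /bar.
    System.out.println(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.com/bar"));
    // Expected to print "false": the global group disallows /private/.
    System.out.println(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.com/private/page"));
  }
}
```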
81 | 82 | When the `-f` (`--file`) flag is omitted, robots.txt contents are expected to be 83 | received via standard input: 84 | 85 | ``` 86 | $ mvn exec:java -Dexec.mainClass=com.google.search.robotstxt.RobotsParserApp -Dexec.args="--agent FooBot --url http://foo.com/bar" 87 | ``` 88 | 89 | If you want the application to read an existing robots.txt file, use the `-f` 90 | (`--file`) flag: 91 | 92 | ``` 93 | $ mvn exec:java -Dexec.mainClass=com.google.search.robotstxt.RobotsParserApp -Dexec.args="--agent FooBot --url http://foo.com/bar --file path/to/robots.txt" 94 | ``` 95 | 96 | #### From JAR 97 | 98 | If you have built the project into a JAR, you can run it from there (reading 99 | robots.txt from standard input): 100 | 101 | ``` 102 | $ java -jar target/robotstxt-java-1.0-SNAPSHOT-jar-with-dependencies.jar --agent FooBot --url http://foo.com/bar 103 | ``` 104 | 105 | Or (reading from a file): 106 | 107 | ``` 108 | $ java -jar target/robotstxt-java-1.0-SNAPSHOT-jar-with-dependencies.jar --agent FooBot --url http://foo.com/bar --file path/to/robots.txt 109 | ``` 110 | 111 | ## Notes 112 | 113 | Parsing of robots.txt files themselves is done exactly as in the production 114 | version of Googlebot, including how percent codes and Unicode characters in 115 | patterns are handled. The user must ensure, however, that the URI passed to the 116 | `Matcher` methods, or to the `--url` parameter of the application, follows the 117 | format specified by RFC 3986, since this library will not perform full 118 | normalization of those URI parameters. Only if the URI is in this format will 119 | the matching be done according to the REP specification. 120 | 121 | ## License 122 | 123 | The robots.txt parser and matcher Java library is licensed under the terms of 124 | the Apache license. See LICENSE for more information. 125 | 126 | ## Source Code Headers 127 | 128 | Every file containing source code must include copyright and license 129 | information. This includes any JS/CSS files that you might be serving out to 130 | browsers. (This is to help well-intentioned people avoid accidental copying 131 | that doesn't comply with the license.) 132 | 133 | Apache header: 134 | 135 | Copyright 2020 Google LLC 136 | 137 | Licensed under the Apache License, Version 2.0 (the "License"); 138 | you may not use this file except in compliance with the License. 139 | You may obtain a copy of the License at 140 | 141 | https://www.apache.org/licenses/LICENSE-2.0 142 | 143 | Unless required by applicable law or agreed to in writing, software 144 | distributed under the License is distributed on an "AS IS" BASIS, 145 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 146 | See the License for the specific language governing permissions and 147 | limitations under the License. 148 | 149 | This can be done easily by using the 150 | [addlicense](https://github.com/google/addlicense) tool. 151 | 152 | Install it: 153 | 154 | ``` 155 | $ go get -u github.com/google/addlicense 156 | ``` 157 | 158 | Use it like this to make sure all files have the license: 159 | 160 | ``` 161 | $ ~/go/bin/addlicense -c "Google LLC" -l apache .
162 | ``` 163 | -------------------------------------------------------------------------------- /docs/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Google Open Source Community Guidelines 2 | 3 | At Google, we recognize and celebrate the creativity and collaboration of open 4 | source contributors and the diversity of skills, experiences, cultures, and 5 | opinions they bring to the projects and communities they participate in. 6 | 7 | Every one of Google's open source projects and communities is an inclusive 8 | environment, based on treating all individuals respectfully, regardless of 9 | gender identity and expression, sexual orientation, disabilities, 10 | neurodiversity, physical appearance, body size, ethnicity, nationality, race, 11 | age, religion, or similar personal characteristic. 12 | 13 | We value diverse opinions, but we value respectful behavior more. 14 | 15 | Respectful behavior includes: 16 | 17 | * Being considerate, kind, constructive, and helpful. 18 | * Not engaging in demeaning, discriminatory, harassing, hateful, sexualized, or 19 | physically threatening behavior, speech, and imagery. 20 | * Not engaging in unwanted physical contact. 21 | 22 | Some Google open source projects [may adopt][] an explicit project code of 23 | conduct, which may have additional detailed expectations for participants. Most 24 | of those projects will use our [modified Contributor Covenant][]. 25 | 26 | [may adopt]: https://opensource.google/docs/releasing/preparing/#conduct 27 | [modified Contributor Covenant]: https://opensource.google/docs/releasing/template/CODE_OF_CONDUCT/ 28 | 29 | ## Resolve peacefully 30 | 31 | We do not believe that all conflict is necessarily bad; healthy debate and 32 | disagreement often yield positive results. However, it is never okay to be 33 | disrespectful. 34 | 35 | If you see someone behaving disrespectfully, you are encouraged to address the 36 | behavior directly with those involved. Many issues can be resolved quickly and 37 | easily, and this gives people more control over the outcome of their dispute. 38 | If you are unable to resolve the matter for any reason, or if the behavior is 39 | threatening or harassing, report it. We are dedicated to providing an 40 | environment where participants feel welcome and safe. 41 | 42 | ## Reporting problems 43 | 44 | Some Google open source projects may adopt a project-specific code of conduct. 45 | In those cases, a Google employee will be identified as the Project Steward, 46 | who will receive and handle reports of code of conduct violations. In the event 47 | that a project hasn’t identified a Project Steward, you can report problems by 48 | emailing opensource@google.com. 49 | 50 | We will investigate every complaint, but you may not receive a direct response. 51 | We will use our discretion in determining when and how to follow up on reported 52 | incidents, which may range from not taking action to permanent expulsion from 53 | the project and project-sponsored spaces. We will notify the accused of the 54 | report and provide them an opportunity to discuss it before any action is 55 | taken. The identity of the reporter will be omitted from the details of the 56 | report supplied to the accused. In potentially harmful situations, such as 57 | ongoing harassment or threats to anyone's safety, we may take action without 58 | notice.
59 | 60 | *This document was adapted from the [IndieWeb Code of Conduct][] and can also 61 | be found at .* 62 | 63 | [IndieWeb Code of Conduct]: https://indieweb.org/code-of-conduct 64 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 29 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 21 | 4.0.0 22 | 23 | com.google.search.robotstxt 24 | robotstxt-java 25 | 1.0-SNAPSHOT 26 | 27 | robotstxt-java 28 | https://github.com/google/robotstxt-java 29 | 30 | 31 | UTF-8 32 | 11 33 | 11 34 | 0.7.4 35 | 36 | 37 | 38 | 39 | com.google.protobuf 40 | protobuf-java 41 | 3.19.2 42 | 43 | 44 | 45 | junit 46 | junit 47 | 4.13.2 48 | test 49 | 50 | 51 | 52 | com.google.flogger 53 | flogger-system-backend 54 | ${flogger.version} 55 | 56 | 57 | 58 | com.google.flogger 59 | flogger 60 | ${flogger.version} 61 | 62 | 63 | 64 | com.google.truth 65 | truth 66 | 1.1.3 67 | test 68 | 69 | 70 | 71 | info.picocli 72 | picocli 73 | 4.6.2 74 | 75 | 76 | 77 | com.google.guava 78 | guava 79 | 31.0.1-jre 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | maven-clean-plugin 91 | 3.1.0 92 | 93 | 94 | 95 | maven-resources-plugin 96 | 3.0.2 97 | 98 | 99 | maven-compiler-plugin 100 | 3.8.0 101 | 102 | 103 | maven-surefire-plugin 104 | 2.22.1 105 | 106 | 107 | maven-jar-plugin 108 | 3.0.2 109 | 110 | 111 | maven-install-plugin 112 | 2.5.2 113 | 114 | 115 | maven-deploy-plugin 116 | 2.8.2 117 | 118 | 119 | 120 | maven-site-plugin 121 | 3.7.1 122 | 123 | 124 | maven-project-info-reports-plugin 125 | 3.0.0 126 | 127 | 128 | com.coveo 129 | fmt-maven-plugin 130 | 2.10 131 | 132 | 133 | org.xolstice.maven.plugins 134 | protobuf-maven-plugin 135 | 0.6.1 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | com.coveo 144 | fmt-maven-plugin 145 | 146 | 147 | 148 | format 149 | 150 | 151 | 152 | 153 | 154 | 155 | org.apache.maven.plugins 156 | maven-compiler-plugin 157 | 158 | 159 | -XDcompilePolicy=simple 160 | -Xplugin:ErrorProne 161 | 162 | 163 | 164 | com.google.errorprone 165 | error_prone_core 166 | 2.4.0 167 | 168 | 169 | 170 | 171 | 172 | 173 | org.xolstice.maven.plugins 174 | 
protobuf-maven-plugin 175 | 181 | 184 | 185 | 186 | 187 | compile 188 | test-compile 189 | 190 | 191 | 192 | 193 | 194 | 195 | maven-assembly-plugin 196 | 197 | 198 | 199 | com.google.search.robotstxt.RobotsParserApp 200 | 201 | 202 | 203 | jar-with-dependencies 204 | 205 | 206 | 207 | 208 | 209 | 210 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/Matcher.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import java.util.List; 18 | 19 | /** Interface of a matcher class. */ 20 | public interface Matcher { 21 | /** 22 | * Check whether at least one of the given user agents is allowed to visit the given URL based on 23 | * the robots.txt which this matcher represents. 24 | * 25 | * @param userAgents interested user agents 26 | * @param url target URL 27 | * @return {@code true} iff verdict is ALLOWED 28 | */ 29 | boolean allowedByRobots(final List<String> userAgents, final String url); 30 | 31 | /** 32 | * Check whether the given user agent is allowed to visit the given URL based on the robots.txt 33 | * which this matcher represents. 34 | * 35 | * @param userAgent interested user agent 36 | * @param url target URL 37 | * @return {@code true} iff verdict is ALLOWED 38 | */ 39 | boolean singleAgentAllowedByRobots(final String userAgent, final String url); 40 | 41 | /** 42 | * Check whether at least one of the given user agents is allowed to visit the given URL based on 43 | * the robots.txt which this matcher represents. All global rule groups are ignored. 44 | * 45 | * @param userAgents interested user agents 46 | * @param url target URL 47 | * @return {@code true} iff verdict is ALLOWED 48 | */ 49 | boolean ignoreGlobalAllowedByRobots(final List<String> userAgents, final String url); 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/MatchingStrategy.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Provides methods to calculate match priority for given directives against a given path.
It is 19 | * required to compute the match verdict in {@link RobotsMatcher}. 20 | */ 21 | public interface MatchingStrategy { 22 | /** 23 | * Calculates the priority of an ALLOW verdict based on the given directive. 24 | * 25 | * @param path path to calculate ALLOW match priority against 26 | * @param pattern ALLOW directive value 27 | * @return match priority (higher value means higher chance of ALLOW verdict) 28 | */ 29 | int matchAllowPriority(final String path, final String pattern); 30 | 31 | /** 32 | * Calculates the priority of a DISALLOW verdict based on the given directive. 33 | * 34 | * @param path path to calculate DISALLOW match priority against 35 | * @param pattern DISALLOW directive value 36 | * @return match priority (higher value means higher chance of DISALLOW verdict) 37 | */ 38 | int matchDisallowPriority(final String path, final String pattern); 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/ParseException.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** Used in the parsing process. */ 18 | public class ParseException extends Exception { 19 | public ParseException() { 20 | super(); 21 | } 22 | 23 | public ParseException(String message, Throwable cause) { 24 | super(message, cause); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/ParseHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * This interface provides parsing logic for the {@link Parser} class. Its implementation is expected to 19 | * accumulate parsed robots.txt lines and be able to compute a {@link RobotsMatcher} instance as soon 20 | * as all robots.txt lines have been received. 21 | */ 22 | public interface ParseHandler { 23 | /** 24 | * Handler for the beginning of the parsing process. This method will be called a single time, before any 25 | * other method of this class. 26 | */ 27 | void handleStart(); 28 | 29 | /** 30 | * Directive receiver. Each directive consists of a type and a value.
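For example, the robots.txt line {@code Disallow: /private} is delivered as directive type {@code DISALLOW} with the trimmed value {@code "/private"}.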
This method will be called after 31 | * {@link this#handleStart()} and will not be called after {@link this#handleEnd()}. May be called 32 | * multiple times. 33 | * 34 | * @param directiveType type of received directive 35 | * @param directiveValue value of received directive 36 | */ 37 | void handleDirective(final Parser.DirectiveType directiveType, final String directiveValue); 38 | 39 | /** 40 | * Handler for the end of the parsing process. This method will be called a single time, after {@link 41 | * this#handleStart()} or {@link this#handleDirective(Parser.DirectiveType, String)}. 42 | */ 43 | void handleEnd(); 44 | 45 | /** 46 | * Calling this method produces a matcher based on all information received earlier via the {@link 47 | * this#handleDirective(Parser.DirectiveType, String)} method. Thus, it returns a serialized view of 48 | * the robots.txt file with matching functionality. This method will be called after {@link 49 | * this#handleEnd()}. May be called multiple times. 50 | * 51 | * @return matcher representing the original robots.txt file 52 | */ 53 | Matcher compute(); 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/Parser.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Abstract parser. All parser implementations must extend it. Extensions of this class are expected to 19 | * provide tokenizer logic, while parsing logic is delegated to a {@link ParseHandler} class. 20 | */ 21 | public abstract class Parser { 22 | enum DirectiveType { 23 | USER_AGENT, 24 | ALLOW, 25 | DISALLOW, 26 | SITEMAP, 27 | UNKNOWN 28 | } 29 | 30 | protected ParseHandler parseHandler; 31 | 32 | /** 33 | * A parser must follow the rules of a specific {@link ParseHandler} in order to parse. Thus, it requires an 34 | * instance of one upon creation. 35 | * 36 | * @param parseHandler handler to follow during the parsing process. 37 | */ 38 | protected Parser(ParseHandler parseHandler) { 39 | this.parseHandler = parseHandler; 40 | } 41 | 42 | /** 43 | * Method to parse a robots.txt file into a matcher. 44 | * 45 | * @param robotsTxtBodyBytes body of the robots.txt file to parse 46 | * @return matcher representing the given robots.txt file 47 | */ 48 | abstract Matcher parse(final byte[] robotsTxtBodyBytes); 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsContents.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.util.ArrayList; 19 | import java.util.HashSet; 20 | import java.util.List; 21 | import java.util.Objects; 22 | import java.util.Set; 23 | 24 | /** Representation of robots.txt contents: multiple groups of rules. */ 25 | public class RobotsContents { 26 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 27 | /** 28 | * Representation of robots.txt group of rules: multiple user-agents to which multiple rules are 29 | * applied. 30 | */ 31 | static class Group { 32 | /** Representation of robots.txt rule: pair of directive and value. */ 33 | static class Rule { 34 | private final Parser.DirectiveType directiveType; 35 | private final String directiveValue; 36 | 37 | Rule(final Parser.DirectiveType directiveType, final String directiveValue) { 38 | this.directiveType = directiveType; 39 | this.directiveValue = directiveValue; 40 | } 41 | 42 | public Parser.DirectiveType getDirectiveType() { 43 | return directiveType; 44 | } 45 | 46 | public String getDirectiveValue() { 47 | return directiveValue; 48 | } 49 | 50 | @Override 51 | public boolean equals(Object obj) { 52 | if (this == obj) return true; 53 | if (obj == null || getClass() != obj.getClass()) return false; 54 | Rule other = (Rule) obj; 55 | return Objects.equals(directiveType, other.directiveType) 56 | && Objects.equals(directiveValue, other.directiveValue); 57 | } 58 | 59 | @Override 60 | public int hashCode() { 61 | return Objects.hash(directiveType, directiveValue); 62 | } 63 | } 64 | 65 | private final Set<String> userAgents; 66 | private final Set<Rule> rules; 67 | private boolean global = false; 68 | 69 | Group() { 70 | userAgents = new HashSet<>(); 71 | rules = new HashSet<>(); 72 | } 73 | 74 | // Intended to be used from tests only. 75 | Group(final List<String> userAgents, final List<Rule> rules) { 76 | this(userAgents, rules, false); 77 | } 78 | 79 | // Intended to be used from tests only. 80 | Group(final List<String> userAgents, final List<Rule> rules, final boolean global) { 81 | this.userAgents = new HashSet<>(userAgents); 82 | this.rules = new HashSet<>(rules); 83 | this.global = global; 84 | } 85 | 86 | void addUserAgent(final String userAgent) { 87 | // Google-specific optimization: a '*' followed by space and more characters 88 | // in a user-agent record is still regarded as a global rule.
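// For example, "User-agent: * SomeBot" still sets the global flag for this group,
// while a product token such as "FooBot/1.2" is truncated below to "FooBot".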
89 | if (userAgent.length() >= 1 90 | && userAgent.charAt(0) == '*' 91 | && (userAgent.length() == 1 || Character.isWhitespace(userAgent.charAt(1)))) { 92 | 93 | if (userAgent.length() > 1 && Character.isWhitespace(userAgent.charAt(1))) { 94 | logger.atInfo().log("Assuming \"%s\" user-agent as \"*\"", userAgent); 95 | } 96 | 97 | global = true; 98 | } else { 99 | int end = 0; 100 | for (; end < userAgent.length(); end++) { 101 | final char ch = userAgent.charAt(end); 102 | if (!Character.isAlphabetic(ch) && ch != '-' && ch != '_') { 103 | break; 104 | } 105 | } 106 | userAgents.add(userAgent.substring(0, end)); 107 | } 108 | } 109 | 110 | void addRule(final Parser.DirectiveType directiveType, final String directiveValue) { 111 | rules.add(new Rule(directiveType, directiveValue)); 112 | } 113 | 114 | boolean hasRule(final Parser.DirectiveType directiveType, final String directiveValue) { 115 | return rules.contains(new Rule(directiveType, directiveValue)); 116 | } 117 | 118 | public Set<String> getUserAgents() { 119 | return userAgents; 120 | } 121 | 122 | public Set<Rule> getRules() { 123 | return rules; 124 | } 125 | 126 | public boolean isGlobal() { 127 | return global; 128 | } 129 | 130 | @Override 131 | public boolean equals(Object obj) { 132 | if (this == obj) return true; 133 | if (obj == null || getClass() != obj.getClass()) return false; 134 | Group other = (Group) obj; 135 | return Objects.equals(userAgents, other.userAgents) 136 | && Objects.equals(rules, other.rules) 137 | && Objects.equals(global, other.global); 138 | } 139 | 140 | @Override 141 | public int hashCode() { 142 | return Objects.hash(userAgents, rules); 143 | } 144 | } 145 | 146 | private final List<Group> groups; 147 | 148 | RobotsContents() { 149 | groups = new ArrayList<>(); 150 | } 151 | 152 | public RobotsContents(final List<Group> groups) { 153 | this.groups = groups; 154 | } 155 | 156 | void addGroup(Group group) { 157 | groups.add(group); 158 | } 159 | 160 | public List<Group> getGroups() { 161 | return groups; 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsLongestMatchStrategy.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | /** 18 | * Implementation of matching strategy used in robots.txt matching. Implements longest-match 19 | * strategy. 20 | */ 21 | public class RobotsLongestMatchStrategy implements MatchingStrategy { 22 | /** 23 | * Checks whether the given path may be matched to the given pattern. Treats '*' as a wildcard and 24 | * '$' as a termination symbol iff it is at the end of the pattern.
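* <p>For example, the path "/foo/bar" matches the patterns "/foo", "/*/bar" and "/foo/bar$",
* but not "/bar": matching is anchored at the beginning of the path.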
25 | * 26 | * @param path path to match 27 | * @param pattern pattern to match to 28 | * @return {@code true} iff given path matches given pattern 29 | */ 30 | private static boolean matches(final String path, final String pattern) { 31 | // "Prefixes" array stores "path" prefixes that match specific prefix of "pattern". 32 | // Prefixes of "pattern" are iterated over in ascending order in the loop below. 33 | // Each prefix is represented by its end index (exclusive); the array stores them in ascending 34 | // order. 35 | final int[] prefixes = new int[path.length() + 1]; 36 | prefixes[0] = 0; 37 | int prefixesCount = 1; 38 | 39 | for (int i = 0; i < pattern.length(); i++) { 40 | final char ch = pattern.charAt(i); 41 | 42 | // A '$' at the end of the pattern indicates its termination. 43 | if (ch == '$' && i + 1 == pattern.length()) { 44 | return prefixes[prefixesCount - 1] == path.length(); 45 | } 46 | 47 | // When a '*' occurs, all path prefixes starting from the shortest one may be matched. 48 | if (ch == '*') { 49 | prefixesCount = path.length() - prefixes[0] + 1; 50 | for (int j = 1; j < prefixesCount; j++) { 51 | prefixes[j] = prefixes[j - 1] + 1; 52 | } 53 | } else { 54 | // Iterate over each previous prefix and try to extend it by one character. 55 | int newPrefixesCount = 0; 56 | for (int j = 0; j < prefixesCount; j++) { 57 | if (prefixes[j] < path.length() && path.charAt(prefixes[j]) == ch) { 58 | prefixes[newPrefixesCount++] = prefixes[j] + 1; 59 | } 60 | } 61 | if (newPrefixesCount == 0) { 62 | return false; 63 | } 64 | prefixesCount = newPrefixesCount; 65 | } 66 | } 67 | 68 | return true; 69 | } 70 | 71 | @Override 72 | public int matchAllowPriority(String path, String pattern) { 73 | return matches(path, pattern) ? pattern.length() : -1; 74 | } 75 | 76 | @Override 77 | public int matchDisallowPriority(String path, String pattern) { 78 | return matches(path, pattern) ? pattern.length() : -1; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsMatcher.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.net.MalformedURLException; 19 | import java.net.URL; 20 | import java.util.Collections; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | /** 25 | * Class implementing matching logic based on directive priorities, the calculation of which is delegated 26 | * to a {@link MatchingStrategy} class.
27 | */ 28 | public class RobotsMatcher implements Matcher { 29 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 30 | 31 | /** Class containing current match priorities */ 32 | private static class Match { 33 | /** Priority based on agent-specific rules */ 34 | private int prioritySpecific = 0; 35 | /** Priority based on global wildcard (*) rules */ 36 | private int priorityGlobal = 0; 37 | 38 | void updateSpecific(final int priority) { 39 | prioritySpecific = Math.max(prioritySpecific, priority); 40 | } 41 | 42 | void updateGlobal(final int priority) { 43 | priorityGlobal = Math.max(priorityGlobal, priority); 44 | } 45 | 46 | public int getPrioritySpecific() { 47 | return prioritySpecific; 48 | } 49 | 50 | public int getPriorityGlobal() { 51 | return priorityGlobal; 52 | } 53 | 54 | public void resetGlobal() { 55 | priorityGlobal = 0; 56 | } 57 | } 58 | 59 | private final RobotsContents robotsContents; 60 | private final MatchingStrategy matchingStrategy = new RobotsLongestMatchStrategy(); 61 | 62 | public RobotsMatcher(final RobotsContents robotsContents) { 63 | this.robotsContents = robotsContents; 64 | } 65 | 66 | /** Used to extract contents for testing purposes. */ 67 | RobotsContents getRobotsContents() { 68 | return robotsContents; 69 | } 70 | 71 | private static String getPath(final String url) { 72 | final URL parsedUrl; 73 | try { 74 | parsedUrl = new URL(url); 75 | } catch (final MalformedURLException e) { 76 | logger.atWarning().log("Malformed URL: \"%s\", replaced with \"/\"", url); 77 | return "/"; 78 | } 79 | String path = parsedUrl.getPath(); 80 | final String args = parsedUrl.getQuery(); 81 | if (args != null) { 82 | path += "?" + args; 83 | } 84 | 85 | return path; 86 | } 87 | 88 | /** 89 | * Computes {@link Match} priorities for ALLOW and DISALLOW verdicts. Rules are considered 90 | * effective if at least one user agent is listed in "user-agent" directives, or if they apply 91 | * globally (if global rules are not ignored).
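* <p>For example, if robots.txt contains groups for both "FooBot" and "*", a query for user agent
* "FooBot" is answered by the "FooBot" group alone and the global group's priorities are reset.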
92 | * 93 | * @param userAgents list of interested user agents 94 | * @param path target path 95 | * @param ignoreGlobal global rules will not be considered if set to {@code true} 96 | * @return pair of {@link Match} representing ALLOW and DISALLOW priorities respectively 97 | */ 98 | private Map.Entry<Match, Match> computeMatchPriorities( 99 | final List<String> userAgents, final String path, final boolean ignoreGlobal) { 100 | final Match allow = new Match(); 101 | final Match disallow = new Match(); 102 | boolean foundSpecificGroup = false; 103 | 104 | for (RobotsContents.Group group : robotsContents.getGroups()) { 105 | final boolean isSpecificGroup = 106 | userAgents.stream() 107 | .anyMatch( 108 | userAgent -> 109 | group.getUserAgents().stream().anyMatch(userAgent::equalsIgnoreCase)); 110 | foundSpecificGroup |= isSpecificGroup; 111 | if (!isSpecificGroup && (ignoreGlobal || !group.isGlobal())) { 112 | continue; 113 | } 114 | 115 | for (RobotsContents.Group.Rule rule : group.getRules()) { 116 | switch (rule.getDirectiveType()) { 117 | case ALLOW: 118 | { 119 | final int priority = 120 | matchingStrategy.matchAllowPriority(path, rule.getDirectiveValue()); 121 | if (isSpecificGroup) { 122 | allow.updateSpecific(priority); 123 | } 124 | if (!ignoreGlobal && group.isGlobal()) { 125 | allow.updateGlobal(priority); 126 | } 127 | break; 128 | } 129 | case DISALLOW: 130 | { 131 | final int priority = 132 | matchingStrategy.matchDisallowPriority(path, rule.getDirectiveValue()); 133 | if (isSpecificGroup) { 134 | disallow.updateSpecific(priority); 135 | } 136 | if (!ignoreGlobal && group.isGlobal()) { 137 | disallow.updateGlobal(priority); 138 | } 139 | break; 140 | } 141 | case SITEMAP: 142 | case UNKNOWN: 143 | case USER_AGENT: 144 | break; 145 | } 146 | } 147 | } 148 | 149 | // If there is at least one group specific for current agents, global groups should be 150 | // disregarded. 151 | if (foundSpecificGroup) { 152 | allow.resetGlobal(); 153 | disallow.resetGlobal(); 154 | } 155 | 156 | return Map.entry(allow, disallow); 157 | } 158 | 159 | private Map.Entry<Match, Match> computeMatchPriorities( 160 | final List<String> userAgents, final String path) { 161 | return computeMatchPriorities(userAgents, path, false); 162 | } 163 | 164 | /** 165 | * Return {@code true} iff verdict must be ALLOW based on ALLOW and DISALLOW priorities.
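* <p>Agent-specific priorities take precedence over global ones, ties are resolved in favor of
* ALLOW (note the {@code >=} comparisons below), and if no rule matches at all the verdict
* defaults to ALLOW.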
166 | * 167 | * @param allow ALLOW priorities 168 | * @param disallow DISALLOW priorities 169 | * @return match verdict 170 | */ 171 | private static boolean allowVerdict(final Match allow, final Match disallow) { 172 | if (allow.getPrioritySpecific() > 0 || disallow.getPrioritySpecific() > 0) { 173 | return allow.getPrioritySpecific() >= disallow.getPrioritySpecific(); 174 | } 175 | 176 | if (allow.getPriorityGlobal() > 0 || disallow.getPriorityGlobal() > 0) { 177 | return allow.getPriorityGlobal() >= disallow.getPriorityGlobal(); 178 | } 179 | 180 | return true; 181 | } 182 | 183 | @Override 184 | public boolean allowedByRobots(final List<String> userAgents, final String url) { 185 | final String path = getPath(url); 186 | Map.Entry<Match, Match> matches = computeMatchPriorities(userAgents, path); 187 | return allowVerdict(matches.getKey(), matches.getValue()); 188 | } 189 | 190 | @Override 191 | public boolean singleAgentAllowedByRobots(final String userAgent, final String url) { 192 | return allowedByRobots(Collections.singletonList(userAgent), url); 193 | } 194 | 195 | @Override 196 | public boolean ignoreGlobalAllowedByRobots(final List<String> userAgents, final String url) { 197 | final String path = getPath(url); 198 | Map.Entry<Match, Match> matches = computeMatchPriorities(userAgents, path, true); 199 | return allowVerdict(matches.getKey(), matches.getValue()); 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsParseHandler.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.nio.charset.StandardCharsets; 19 | 20 | /** Implementation of parsing strategy used in robots.txt parsing.
*/ 21 | public class RobotsParseHandler implements ParseHandler { 22 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 23 | 24 | protected RobotsContents robotsContents; 25 | private RobotsContents.Group currentGroup; 26 | private boolean foundContent; 27 | 28 | @Override 29 | public void handleStart() { 30 | robotsContents = new RobotsContents(); 31 | currentGroup = new RobotsContents.Group(); 32 | foundContent = false; 33 | } 34 | 35 | private void flushCompleteGroup(boolean createNew) { 36 | robotsContents.addGroup(currentGroup); 37 | if (createNew) { 38 | currentGroup = new RobotsContents.Group(); 39 | } 40 | } 41 | 42 | @Override 43 | public void handleEnd() { 44 | flushCompleteGroup(false); 45 | } 46 | 47 | private void handleUserAgent(final String value) { 48 | if (foundContent) { 49 | flushCompleteGroup(true); 50 | foundContent = false; 51 | } 52 | currentGroup.addUserAgent(value); 53 | } 54 | 55 | private static boolean isHexChar(final byte b) { 56 | return Character.isDigit(b) || ('a' <= b && b <= 'f') || ('A' <= b && b <= 'F'); 57 | } 58 | 59 | /** 60 | * Canonicalize paths: escape characters outside of US-ASCII charset (e.g. /SanJoséSellers ==> 61 | * /SanJos%C3%A9Sellers) and normalize escape-characters (e.g. %aa ==> %AA) 62 | * 63 | * @param path Path to canonicalize. 64 | * @return escaped and normalized path 65 | */ 66 | private static String maybeEscapePattern(final String path) { 67 | final byte[] bytes = path.getBytes(StandardCharsets.UTF_8); 68 | 69 | int unescapedCount = 0; 70 | boolean notCapitalized = false; 71 | 72 | // Check if any changes required 73 | for (int i = 0; i < bytes.length; i++) { 74 | if (i < bytes.length - 2 75 | && bytes[i] == '%' 76 | && isHexChar(bytes[i + 1]) 77 | && isHexChar(bytes[i + 2])) { 78 | if (Character.isLowerCase(bytes[i + 1]) || Character.isLowerCase(bytes[i + 2])) { 79 | notCapitalized = true; 80 | } 81 | i += 2; 82 | } else if ((bytes[i] & 0x80) != 0) { 83 | unescapedCount++; 84 | } 85 | } 86 | 87 | // Return if no changes needed 88 | if (unescapedCount == 0 && !notCapitalized) { 89 | return path; 90 | } 91 | 92 | final StringBuilder stringBuilder = new StringBuilder(); 93 | for (int i = 0; i < bytes.length; i++) { 94 | if (i < bytes.length - 2 95 | && bytes[i] == '%' 96 | && isHexChar(bytes[i + 1]) 97 | && isHexChar(bytes[i + 2])) { 98 | stringBuilder.append((char) bytes[i++]); 99 | stringBuilder.append((char) Character.toUpperCase(bytes[i++])); 100 | stringBuilder.append((char) Character.toUpperCase(bytes[i])); 101 | } else if ((bytes[i] & 0x80) != 0) { 102 | stringBuilder.append('%'); 103 | stringBuilder.append(Integer.toHexString((bytes[i] >> 4) & 0xf).toUpperCase()); 104 | stringBuilder.append(Integer.toHexString(bytes[i] & 0xf).toUpperCase()); 105 | } else { 106 | stringBuilder.append((char) bytes[i]); 107 | } 108 | } 109 | return stringBuilder.toString(); 110 | } 111 | 112 | @Override 113 | public void handleDirective( 114 | final Parser.DirectiveType directiveType, final String directiveValue) { 115 | switch (directiveType) { 116 | case USER_AGENT: 117 | { 118 | handleUserAgent(directiveValue); 119 | break; 120 | } 121 | case ALLOW: 122 | case DISALLOW: 123 | { 124 | foundContent = true; 125 | if (currentGroup.isGlobal() || currentGroup.getUserAgents().size() > 0) { 126 | final String path = maybeEscapePattern(directiveValue); 127 | currentGroup.addRule(directiveType, path); 128 | 129 | if (directiveType == Parser.DirectiveType.ALLOW) { 130 | // Google-specific optimization: 'index.htm' and
'index.html' are normalized to '/'. 131 | final int slashPos = path.lastIndexOf('/'); 132 | 133 | if (slashPos != -1) { 134 | final String fileName = path.substring(slashPos + 1); 135 | if ("index.htm".equals(fileName) || "index.html".equals(fileName)) { 136 | final String normalizedPath = path.substring(0, slashPos + 1) + '$'; 137 | 138 | if (!currentGroup.hasRule(Parser.DirectiveType.ALLOW, normalizedPath)) { 139 | logger.atInfo().log( 140 | "Allowing normalized path: \"%s\" -> \"%s\"", 141 | directiveValue, normalizedPath); 142 | currentGroup.addRule(Parser.DirectiveType.ALLOW, normalizedPath); 143 | } 144 | } 145 | } 146 | } 147 | } 148 | break; 149 | } 150 | case SITEMAP: 151 | case UNKNOWN: 152 | { 153 | foundContent = true; 154 | break; 155 | } 156 | } 157 | } 158 | 159 | @Override 160 | public Matcher compute() { 161 | return new RobotsMatcher(robotsContents); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/com/google/search/robotstxt/RobotsParser.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import com.google.common.flogger.FluentLogger; 18 | import java.nio.charset.StandardCharsets; 19 | import java.util.Arrays; 20 | import java.util.logging.Level; 21 | import java.util.stream.Stream; 22 | 23 | /** Robots.txt parser implementation. */ 24 | public class RobotsParser extends Parser { 25 | private static final FluentLogger logger = FluentLogger.forEnclosingClass(); 26 | private final int valueMaxLengthBytes; 27 | 28 | public RobotsParser(final ParseHandler parseHandler) { 29 | super(parseHandler); 30 | this.valueMaxLengthBytes = 2083; 31 | } 32 | 33 | RobotsParser(final ParseHandler parseHandler, final int valueMaxLengthBytes) { 34 | super(parseHandler); 35 | this.valueMaxLengthBytes = valueMaxLengthBytes; 36 | } 37 | 38 | private static boolean isWhitespace(final char ch) { 39 | return ch == ' ' || ch == '\t'; 40 | } 41 | 42 | /** 43 | * Extracts substring between given indexes and trims preceding and succeeding whitespace 44 | * characters. 
45 | *
46 | * @param bytes data to extract from
47 | * @param beginIndex the beginning index, inclusive
48 | * @param endIndex the ending index, exclusive
49 | * @return extracted substring with trimmed whitespaces
50 | * @throws ParseException if there are only whitespace characters between given indexes
51 | */
52 | private static String trimBounded(final byte[] bytes, final int beginIndex, final int endIndex)
53 | throws ParseException {
54 | int begin = beginIndex;
55 | int end = endIndex;
56 | while (begin < endIndex && isWhitespace((char) bytes[begin])) {
57 | begin++;
58 | }
59 | while (end > beginIndex && isWhitespace((char) bytes[end - 1])) {
60 | end--;
61 | }
62 | if (begin >= end) {
63 | throw new ParseException();
64 | } else {
65 | return new String(Arrays.copyOfRange(bytes, begin, end), StandardCharsets.UTF_8);
66 | }
67 | }
68 |
69 | private static DirectiveType parseDirective(final String key) {
70 | if (key.equalsIgnoreCase("user-agent")) {
71 | return DirectiveType.USER_AGENT;
72 | } else {
73 | try {
74 | return DirectiveType.valueOf(key.toUpperCase());
75 | } catch (final IllegalArgumentException e) {
76 | final boolean disallowTypoDetected =
77 | Stream.of("dissallow", "dissalow", "disalow", "diasllow", "disallaw")
78 | .anyMatch(s -> key.compareToIgnoreCase(s) == 0);
79 | if (disallowTypoDetected) {
80 | logger.atInfo().log("Fixed typo: \"%s\" -> \"%s\"", key, "disallow");
81 | return DirectiveType.DISALLOW;
82 | }
83 |
84 | return DirectiveType.UNKNOWN;
85 | }
86 | }
87 | }
88 |
89 | private static void log(
90 | final Level level,
91 | final String message,
92 | final byte[] robotsTxtBodyBytes,
93 | final int lineBegin,
94 | final int lineEnd,
95 | final int lineNumber) {
96 | logger.at(level).log(
97 | "%s%nAt line %d:%n%s\t",
98 | message,
99 | lineNumber,
100 | new String(Arrays.copyOfRange(robotsTxtBodyBytes, lineBegin, lineEnd)));
101 | }
102 |
103 | /**
104 | * Extracts value from robots.txt body and trims it to {@link #valueMaxLengthBytes} bytes if
105 | * necessary. Most of the parameters are used for logging.
106 | *
107 | * @param robotsTxtBodyBytes contents of robots.txt file
108 | * @param separator index of separator between key and value
109 | * @param limit index of key and value ending
110 | * @param lineBegin index of line beginning
111 | * @param lineEnd index of line ending
112 | * @param lineNumber number of line in robots.txt file
113 | * @return parsed value within given line of robots.txt
114 | * @throws ParseException if line limits are invalid
115 | */
116 | private String getValue(
117 | final byte[] robotsTxtBodyBytes,
118 | final int separator,
119 | final int limit,
120 | final int lineBegin,
121 | final int lineEnd,
122 | final int lineNumber)
123 | throws ParseException {
124 | String value = trimBounded(robotsTxtBodyBytes, separator + 1, limit);
125 |
126 | // Google-specific optimization: since no search engine will process more than 2083 bytes
127 | // per URL, all values are trimmed to fit this size.
128 | final byte[] valueBytes = value.getBytes(StandardCharsets.UTF_8);
129 |
130 | // We decrease max size by two bytes. It is done to fit a replacement character (\uFFFD)
131 | // if the last character is trimmed to an invalid one.
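// (Added illustrative note, not from the original sources: truncating mid-sequence can leave a
// dangling UTF-8 lead byte; decoding maps it to U+FFFD, which re-encodes as three bytes, a net
// growth of up to two bytes, so the two-byte headroom below keeps the result within
// valueMaxLengthBytes.)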
132 | final int maxLengthBytes = valueMaxLengthBytes - 2;
133 |
134 | if (valueBytes.length > maxLengthBytes) {
135 | log(
136 | Level.INFO,
137 | "Value truncated to " + valueMaxLengthBytes + " bytes.",
138 | robotsTxtBodyBytes,
139 | lineBegin,
140 | lineEnd,
141 | lineNumber);
142 |
143 | value =
144 | new String(
145 | valueBytes, 0, Math.min(valueBytes.length, maxLengthBytes), StandardCharsets.UTF_8);
146 | }
147 |
148 | return value;
149 | }
150 |
151 | private void parseLine(
152 | final byte[] robotsTxtBodyBytes,
153 | final int lineBegin,
154 | final int lineEnd,
155 | final int lineNumber) {
156 | int limit = lineEnd;
157 | int separator = lineEnd;
158 | int whitespaceSeparator = lineEnd;
159 | boolean hasContents = false;
160 |
161 | for (int i = lineBegin; i < lineEnd; i++) {
162 | final byte b = robotsTxtBodyBytes[i];
163 | if (b == '#') {
164 | limit = i;
165 | break;
166 | }
167 | if (!isWhitespace((char) b)) {
168 | hasContents = true;
169 | }
170 | if (isWhitespace((char) b) && hasContents && whitespaceSeparator == lineEnd) {
171 | whitespaceSeparator = i;
172 | }
173 | if (separator == lineEnd && b == ':') {
174 | separator = i;
175 | }
176 | }
177 |
178 | if (separator == lineEnd) {
179 | // Google-specific optimization: some people forget the colon, so we need to
180 | // accept whitespace instead.
181 | if (whitespaceSeparator != lineEnd) {
182 | log(
183 | Level.INFO,
184 | "Assuming whitespace as a separator.",
185 | robotsTxtBodyBytes,
186 | lineBegin,
187 | lineEnd,
188 | lineNumber);
189 | separator = whitespaceSeparator;
190 | } else {
191 | if (hasContents) {
192 | log(
193 | Level.WARNING,
194 | "No separator found.",
195 | robotsTxtBodyBytes,
196 | lineBegin,
197 | lineEnd,
198 | lineNumber);
199 | }
200 | return;
201 | }
202 | }
203 |
204 | final String key;
205 | try {
206 | key = trimBounded(robotsTxtBodyBytes, lineBegin, separator);
207 | } catch (ParseException e) {
208 | log(Level.WARNING, "No key found.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
209 | return;
210 | }
211 |
212 | DirectiveType directiveType = parseDirective(key);
213 | if (directiveType == DirectiveType.UNKNOWN) {
214 | log(Level.WARNING, "Unknown key.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
215 | }
216 |
217 | String value;
218 | try {
219 | value = getValue(robotsTxtBodyBytes, separator, limit, lineBegin, lineEnd, lineNumber);
220 | } catch (final ParseException e) {
221 | log(Level.WARNING, "No value found.", robotsTxtBodyBytes, lineBegin, lineEnd, lineNumber);
222 | value = "";
223 | directiveType = DirectiveType.UNKNOWN;
224 | }
225 | parseHandler.handleDirective(directiveType, value);
226 | }
227 |
228 | @Override
229 | Matcher parse(byte[] robotsTxtBodyBytes) {
230 | final byte[] bomUtf8 = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
231 | int bomPos = 0;
232 |
233 | int posBegin = 0;
234 | int posEnd = 0;
235 | int lineNumber = 0;
236 | boolean previousWasCarriageReturn = false;
237 |
238 | parseHandler.handleStart();
239 |
240 | // Iteration over characters is preferred over utilities that split text into lines to avoid
241 | // having to create additional Strings and to comply with the line breaking defined in the standard.
242 | for (int i = 0; i <= robotsTxtBodyBytes.length; i++) {
243 | final byte b = (i == robotsTxtBodyBytes.length) ? (byte) '\0' : robotsTxtBodyBytes[i];
244 |
245 | // Google-specific optimization: UTF-8 byte order marks should never
246 | // appear in a robots.txt file, but they do nevertheless. Skipping
247 | // possible BOM-prefix in the first bytes of the input.
248 | if (bomPos < bomUtf8.length && b == bomUtf8[bomPos++]) {
249 | posBegin++;
250 | posEnd++;
251 | continue;
252 | }
253 | bomPos = bomUtf8.length;
254 |
255 | if (b != '\n' && b != '\r' && b != '\0') {
256 | posEnd++;
257 | } else {
258 | if (posBegin != posEnd || !previousWasCarriageReturn || b != '\n') {
259 | parseLine(robotsTxtBodyBytes, posBegin, posEnd, ++lineNumber);
260 | }
261 | posBegin = posEnd = i + 1;
262 | previousWasCarriageReturn = b == '\r';
263 | }
264 | }
265 |
266 | parseHandler.handleEnd();
267 |
268 | return parseHandler.compute();
269 | }
270 | }
271 |
--------------------------------------------------------------------------------
/src/main/java/com/google/search/robotstxt/RobotsParserApp.java:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.search.robotstxt;
16 |
17 | import com.google.common.flogger.FluentLogger;
18 | import com.google.common.io.ByteStreams;
19 | import java.io.IOException;
20 | import java.io.UncheckedIOException;
21 | import java.nio.file.Files;
22 | import java.nio.file.InvalidPathException;
23 | import java.nio.file.Path;
24 | import java.util.List;
25 | import java.util.Objects;
26 | import java.util.concurrent.Callable;
27 | import picocli.CommandLine;
28 |
29 | /**
30 | * Console application for parsing robots.txt and matching URLs against it.
31 | *
32 | * @see Parser
33 | * @see Matcher
34 | */
35 | @CommandLine.Command(
36 | name = "robotsParser",
37 | description =
38 | "Parses and matches given agents against given robots.txt to determine "
39 | + "whether any agent is allowed to visit given URL.",
40 | exitCodeOnExecutionException = 2,
41 | exitCodeOnInvalidInput = 3)
42 | public class RobotsParserApp implements Callable<Integer> {
43 | private static final FluentLogger logger = FluentLogger.forEnclosingClass();
44 |
45 | public RobotsParserApp() {}
46 |
47 | public static void main(final String[] args) {
48 | final int exitCode = new CommandLine(new RobotsParserApp()).execute(args);
49 | System.exit(exitCode);
50 | }
51 |
52 | /** robots.txt file path. */
53 | @CommandLine.Option(names = {"-f", "--file"})
54 | private String robotsTxtPath;
55 |
56 | /** User-agents of interest. */
57 | @CommandLine.Option(
58 | names = {"-a", "--agent"},
59 | required = true)
60 | private List<String> agents;
61 |
62 | /** Target URL to match.
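*
* <p>Illustrative invocation (the jar name is an assumption, not from the original sources):
* <pre>{@code
* $ java -jar robotstxt-java.jar -f robots.txt -a FooBot -u http://foo.bar/x/y
* ALLOWED
* }</pre>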
*/
63 | @CommandLine.Option(
64 | names = {"-u", "--url"},
65 | required = true)
66 | private String url;
67 |
68 | private byte[] readRobotsTxt() throws ParseException {
69 | try {
70 | if (Objects.isNull(robotsTxtPath)) {
71 | // Reading from stdin
72 | return ByteStreams.toByteArray(System.in);
73 | } else {
74 | // Reading from file
75 | return Files.readAllBytes(Path.of(robotsTxtPath));
76 | }
77 | } catch (final UncheckedIOException | IOException | InvalidPathException e) {
78 | throw new ParseException("Failed to read robots.txt file.", e);
79 | }
80 | }
81 |
82 | private static void logError(final Exception e) {
83 | System.out.println("ERROR: " + e.getMessage());
84 | logger.atInfo().withCause(e).log("Stack trace:");
85 | }
86 |
87 | /**
88 | * Parses the given robots.txt file and performs the matching process.
89 | *
90 | * @return {@code 0} if any of the user-agents is allowed to crawl the given URL, {@code 1} otherwise.
91 | */
92 | @Override
93 | public Integer call() {
94 | final byte[] robotsTxtContents;
95 | try {
96 | robotsTxtContents = readRobotsTxt();
97 | } catch (final ParseException e) {
98 | logError(e);
99 | return 2;
100 | }
101 |
102 | final Parser parser = new RobotsParser(new RobotsParseHandler());
103 | final RobotsMatcher matcher = (RobotsMatcher) parser.parse(robotsTxtContents);
104 |
105 | final boolean parseResult = matcher.allowedByRobots(agents, url);
106 |
107 |
108 | if (parseResult) {
109 | System.out.println("ALLOWED");
110 | return 0;
111 | } else {
112 | System.out.println("DISALLOWED");
113 | return 1;
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/test/java/com/google/search/robotstxt/RobotsMatcherTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.search.robotstxt;
16 |
17 | import static org.junit.Assert.assertFalse;
18 | import static org.junit.Assert.assertTrue;
19 |
20 | import java.nio.charset.StandardCharsets;
21 | import org.junit.Test;
22 |
23 | /** Unit tests validating matching behavior.
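*
* <p>Each test follows the same sketch: build a robots.txt body, parse it through the {@code
* parse} helper defined below, and assert per-agent verdicts, e.g.:
* <pre>{@code
* Matcher matcher = parse("user-agent: FooBot\ndisallow: /\n");
* assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/y"));
* }</pre>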
*/
24 | public class RobotsMatcherTest {
25 | private static Matcher parse(final String robotsTxtBody) {
26 | final Parser parser = new RobotsParser(new RobotsParseHandler());
27 | return parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8));
28 | }
29 |
30 | /** Verifies: parsing and matching a robots.txt containing a single group */
31 | @Test
32 | public void testSingleGroup() {
33 | final String robotsTxtBodyCorrect = "user-agent: FooBot\n" + "disallow: /\n";
34 | final String robotsTxtBodyIncorrect = "foo: FooBot\n" + "bar: /\n";
35 | final String robotsTxtMissingSeparator = "user-agent FooBot\n" + "disallow /\n";
36 |
37 | final String url = "http://foo.bar/x/y";
38 |
39 | final Matcher matcherCorrect = parse(robotsTxtBodyCorrect);
40 | assertFalse(matcherCorrect.singleAgentAllowedByRobots("FooBot", url));
41 |
42 | final Matcher matcherIncorrect = parse(robotsTxtBodyIncorrect);
43 | assertTrue(matcherIncorrect.singleAgentAllowedByRobots("FooBot", url));
44 |
45 | final Matcher matcherMissingSeparator = parse(robotsTxtMissingSeparator);
46 | assertFalse(matcherMissingSeparator.singleAgentAllowedByRobots("FooBot", url));
47 | }
48 |
49 | /**
50 | * Verifies: parsing and matching a robots.txt containing multiple groups and ignoring invalid
51 | * directives.
52 | */
53 | @Test
54 | public void testMultipleGroups() {
55 | final String robotsTxtBody =
56 | "allow: /foo/bar/\n"
57 | + "\n"
58 | + "user-agent: FooBot\n"
59 | + "disallow: /\n"
60 | + "allow: /x/\n"
61 | + "user-agent: BarBot\n"
62 | + "disallow: /\n"
63 | + "allow: /y/\n"
64 | + "\n"
65 | + "\n"
66 | + "allow: /w/\n"
67 | + "user-agent: BazBot\n"
68 | + "\n"
69 | + "user-agent: FooBot\n"
70 | + "allow: /z/\n"
71 | + "disallow: /\n";
72 |
73 | final String urlWa = "http://foo.bar/w/a";
74 | final String urlXb = "http://foo.bar/x/b";
75 | final String urlYc = "http://foo.bar/y/c";
76 | final String urlZd = "http://foo.bar/z/d";
77 | final String urlFooBar = "http://foo.bar/foo/bar/";
78 |
79 | final Matcher matcher = parse(robotsTxtBody);
80 |
81 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urlXb));
82 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urlZd));
83 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urlYc));
84 | assertTrue(matcher.singleAgentAllowedByRobots("BarBot", urlYc));
85 | assertTrue(matcher.singleAgentAllowedByRobots("BarBot", urlWa));
86 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", urlZd));
87 | assertTrue(matcher.singleAgentAllowedByRobots("BazBot", urlZd));
88 |
89 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urlFooBar));
90 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", urlFooBar));
91 | assertFalse(matcher.singleAgentAllowedByRobots("BazBot", urlFooBar));
92 | }
93 |
94 | /** Verifies: directives case insensitivity.
*/ 95 | @Test 96 | public void testDirectiveCaseInsensitivity() { 97 | final String robotsTxtBodyUpper = "USER-AGENT: FooBot\n" + "ALLOW: /x/\n" + "DISALLOW: /\n"; 98 | final String robotsTxtBodyLower = "user-agent: FooBot\n" + "allow: /x/\n" + "disallow: /\n"; 99 | final String robotsTxtBodyRandom = "uSeR-aGeNt: FooBot\n" + "AlLoW: /x/\n" + "dIsAlLoW: /\n"; 100 | 101 | final String urlAllowed = "http://foo.bar/x/y"; 102 | final String urlDisallowed = "http://foo.bar/a/b"; 103 | 104 | final Matcher matcherUpper = parse(robotsTxtBodyUpper); 105 | assertTrue(matcherUpper.singleAgentAllowedByRobots("FooBot", urlAllowed)); 106 | assertFalse(matcherUpper.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 107 | 108 | final Matcher matcherLower = parse(robotsTxtBodyLower); 109 | assertTrue(matcherLower.singleAgentAllowedByRobots("FooBot", urlAllowed)); 110 | assertFalse(matcherLower.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 111 | 112 | final Matcher matcherRandom = parse(robotsTxtBodyRandom); 113 | assertTrue(matcherRandom.singleAgentAllowedByRobots("FooBot", urlAllowed)); 114 | assertFalse(matcherRandom.singleAgentAllowedByRobots("FooBot", urlDisallowed)); 115 | } 116 | 117 | /** Verifies: user agent case insensitivity, user agent names convention compliance. */ 118 | @Test 119 | public void testUserAgentCaseInsensitivity() { 120 | final String robotsTxtBodyUpper = "user-agent: FOO BAR\n" + "allow: /x/\n" + "disallow: /\n"; 121 | final String robotsTxtBodyLower = "user-agent: foo bar\n" + "allow: /x/\n" + "disallow: /\n"; 122 | final String robotsTxtBodyRandom = "user-agent: FoO bAr\n" + "allow: /x/\n" + "disallow: /\n"; 123 | 124 | final String urlAllowed = "http://foo.bar/x/y"; 125 | final String urlDisallowed = "http://foo.bar/a/b"; 126 | 127 | final Matcher matcherUpper = parse(robotsTxtBodyUpper); 128 | assertTrue(matcherUpper.singleAgentAllowedByRobots("Foo", urlAllowed)); 129 | assertTrue(matcherUpper.singleAgentAllowedByRobots("foo", urlAllowed)); 130 | assertFalse(matcherUpper.singleAgentAllowedByRobots("Foo", urlDisallowed)); 131 | assertFalse(matcherUpper.singleAgentAllowedByRobots("foo", urlDisallowed)); 132 | 133 | final Matcher matcherLower = parse(robotsTxtBodyLower); 134 | assertTrue(matcherLower.singleAgentAllowedByRobots("Foo", urlAllowed)); 135 | assertTrue(matcherLower.singleAgentAllowedByRobots("foo", urlAllowed)); 136 | assertFalse(matcherLower.singleAgentAllowedByRobots("Foo", urlDisallowed)); 137 | assertFalse(matcherLower.singleAgentAllowedByRobots("foo", urlDisallowed)); 138 | 139 | final Matcher matcherRandom = parse(robotsTxtBodyRandom); 140 | assertTrue(matcherRandom.singleAgentAllowedByRobots("Foo", urlAllowed)); 141 | assertTrue(matcherRandom.singleAgentAllowedByRobots("foo", urlAllowed)); 142 | assertFalse(matcherRandom.singleAgentAllowedByRobots("Foo", urlDisallowed)); 143 | assertFalse(matcherRandom.singleAgentAllowedByRobots("foo", urlDisallowed)); 144 | } 145 | 146 | /** [Google-specific] Verifies: accepting user-agent value up to the first space. 
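*
* <p>I.e. the line {@code User-Agent: Foo Bar} defines a group effectively named {@code Foo}:
* querying agent "Foo" selects it, while "Foo Bar" (a space violates the product-token
* convention) falls back to the global {@code *} group, as the assertions below show.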
*/
147 | @Test
148 | public void testAcceptUserAgentUpToFirstSpace() {
149 | final String robotsTxtBody =
150 | "User-Agent: *\n"
151 | + "Disallow: /\n"
152 | + "User-Agent: Foo Bar\n"
153 | + "Allow: /x/\n"
154 | + "Disallow: /\n";
155 |
156 | final String url = "http://foo.bar/x/y";
157 |
158 | final Matcher matcher = parse(robotsTxtBody);
159 | assertTrue(matcher.singleAgentAllowedByRobots("Foo", url));
160 | assertFalse(matcher.singleAgentAllowedByRobots("Foo Bar", url));
161 | }
162 |
163 | /** Verifies: global rules. */
164 | @Test
165 | public void testGlobalGroups() {
166 | final String robotsTxtBodyEmpty = "";
167 | final String robotsTxtBodyGlobal =
168 | "user-agent: *\n" + "disallow: /x\n" + "user-agent: FooBot\n" + "allow: /x/y\n";
169 | final String robotsTxtBodySpecific =
170 | "user-agent: FooBot\n"
171 | + "allow: /\n"
172 | + "user-agent: BarBot\n"
173 | + "disallow: /\n"
174 | + "user-agent: BazBot\n"
175 | + "disallow: /\n";
176 |
177 | final String url = "http://foo.bar/x/y";
178 |
179 | final Matcher matcherEmpty = parse(robotsTxtBodyEmpty);
180 | assertTrue(matcherEmpty.singleAgentAllowedByRobots("FooBot", url));
181 |
182 | final Matcher matcherGlobal = parse(robotsTxtBodyGlobal);
183 | assertTrue(matcherGlobal.singleAgentAllowedByRobots("FooBot", url));
184 | assertFalse(matcherGlobal.singleAgentAllowedByRobots("BarBot", url));
185 |
186 | final Matcher matcherSpecific = parse(robotsTxtBodySpecific);
187 | assertTrue(matcherSpecific.singleAgentAllowedByRobots("QuxBot", url));
188 | }
189 |
190 | /**
191 | * [Google-specific] Verifies: any user-agent with prefix "* " is considered a global wildcard.
192 | */
193 | @Test
194 | public void testGlobalGroupsPrefix() {
195 | final String robotsTxtBody =
196 | "user-agent: * baz\n" + "disallow: /x\n" + "user-agent: FooBot\n" + "allow: /x/y\n";
197 |
198 | final String url = "http://foo.bar/x/y";
199 |
200 | final Matcher matcher = parse(robotsTxtBody);
201 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url));
202 | assertFalse(matcher.singleAgentAllowedByRobots("BarBot", url));
203 | }
204 |
205 | /** Verifies: case sensitivity of URIs. */
206 | @Test
207 | public void testUriCaseSensitivity() {
208 | final String robotsTxtBodyUpper = "user-agent: FooBot\n" + "disallow: /X/\n";
209 | final String robotsTxtBodyLower = "user-agent: FooBot\n" + "disallow: /x/\n";
210 |
211 | final String url = "http://foo.bar/x/y";
212 |
213 | final Matcher matcherUpper = parse(robotsTxtBodyUpper);
214 | assertTrue(matcherUpper.singleAgentAllowedByRobots("FooBot", url));
215 |
216 | final Matcher matcherLower = parse(robotsTxtBodyLower);
217 | assertFalse(matcherLower.singleAgentAllowedByRobots("FooBot", url));
218 | }
219 |
220 | /** Verifies: longest match strategy.
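*
* <p>Worked example: for the path {@code /x/page.html}, the rule {@code allow: /x/page.html}
* matches 12 characters while {@code disallow: /x/} matches only 3, so the longer (more
* specific) rule wins; on an exact tie the verdict goes to ALLOW, as asserted below.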
*/ 221 | @Test 222 | public void testLongestMatch() { 223 | final String url = "http://foo.bar/x/page.html"; 224 | 225 | { 226 | final String robotsTxtBody = 227 | "user-agent: FooBot\n" + "disallow: /x/page.html\n" + "allow: /x/\n"; 228 | 229 | final Matcher matcher = parse(robotsTxtBody); 230 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", url)); 231 | } 232 | { 233 | final String robotsTxtBody = 234 | "user-agent: FooBot\n" + "allow: /x/page.html\n" + "disallow: /x/\n"; 235 | 236 | final Matcher matcher = parse(robotsTxtBody); 237 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 238 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/")); 239 | } 240 | { 241 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: \n" + "allow: \n"; 242 | 243 | final Matcher matcher = parse(robotsTxtBody); 244 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 245 | } 246 | { 247 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /\n"; 248 | 249 | final Matcher matcher = parse(robotsTxtBody); 250 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 251 | } 252 | { 253 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /x\n" + "allow: /x/\n"; 254 | 255 | final String url0 = "http://foo.bar/x"; 256 | final String url1 = "http://foo.bar/x/"; 257 | 258 | final Matcher matcher = parse(robotsTxtBody); 259 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", url0)); 260 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url1)); 261 | } 262 | { 263 | final String robotsTxtBody = 264 | "user-agent: FooBot\n" + "disallow: /x/page.html\n" + "allow: /x/page.html\n"; 265 | 266 | final Matcher matcher = parse(robotsTxtBody); 267 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 268 | } 269 | { 270 | final String robotsTxtBody = 271 | "user-agent: FooBot\n" + "allow: /page\n" + "disallow: /*.html\n"; 272 | 273 | final Matcher matcher = parse(robotsTxtBody); 274 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/page.html")); 275 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/page")); 276 | } 277 | { 278 | final String robotsTxtBody = 279 | "user-agent: FooBot\n" + "allow: /x/page.\n" + "disallow: /*.html\n"; 280 | 281 | final Matcher matcher = parse(robotsTxtBody); 282 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", url)); 283 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/y.html")); 284 | } 285 | { 286 | final String robotsTxtBody = 287 | "User-agent: *\n" + "Disallow: /x/\n" + "User-agent: FooBot\n" + "Disallow: /y/\n"; 288 | 289 | final Matcher matcher = parse(robotsTxtBody); 290 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/x/page")); 291 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/y/page")); 292 | } 293 | } 294 | 295 | /** Verifies: percent-encoding of characters outside the range of the US-ASCII. 
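*
* <p>E.g. the pattern {@code /foo/bar/ツ} is canonicalised at parse time to
* {@code /foo/bar/%E3%83%84} (UTF-8 percent-encoding), so only the already-encoded URL form
* matches; raw non-ASCII URLs are not re-encoded at match time, as the assertions below show.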
*/ 296 | @Test 297 | public void testPercentEncoding() { 298 | { 299 | final String robotsTxtBody = 300 | "User-agent: FooBot\n" 301 | + "Disallow: /\n" 302 | + "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n"; 303 | 304 | final Matcher matcher = parse(robotsTxtBody); 305 | assertTrue( 306 | matcher.singleAgentAllowedByRobots( 307 | "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par")); 308 | } 309 | { 310 | final String robotsTxtBody = "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/ツ\n"; 311 | 312 | final Matcher matcher = parse(robotsTxtBody); 313 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%E3%83%84")); 314 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/ツ")); 315 | } 316 | { 317 | final String robotsTxtBody = 318 | "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/%E3%83%84\n"; 319 | 320 | final Matcher matcher = parse(robotsTxtBody); 321 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%E3%83%84")); 322 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/ツ")); 323 | } 324 | { 325 | final String robotsTxtBody = 326 | "User-agent: FooBot\n" + "Disallow: /\n" + "Allow: /foo/bar/%62%61%7A\n"; 327 | 328 | final Matcher matcher = parse(robotsTxtBody); 329 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/baz")); 330 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/%62%61%7A")); 331 | } 332 | } 333 | 334 | /** Verifies: valid parsing of special characters ('*', '$', '#') */ 335 | @Test 336 | public void testSpecialCharacters() { 337 | { 338 | final String robotsTxtBody = 339 | "User-agent: FooBot\n" + "Disallow: /foo/bar/quz\n" + "Allow: /foo/*/qux\n"; 340 | 341 | final Matcher matcher = parse(robotsTxtBody); 342 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/quz")); 343 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/quz")); 344 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo//quz")); 345 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bax/quz")); 346 | } 347 | { 348 | final String robotsTxtBody = 349 | "User-agent: FooBot\n" + "Disallow: /foo/bar$\n" + "Allow: /foo/bar/qux\n"; 350 | 351 | final Matcher matcher = parse(robotsTxtBody); 352 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar")); 353 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/qux")); 354 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/")); 355 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar/baz")); 356 | } 357 | { 358 | final String robotsTxtBody = 359 | "User-agent: FooBot\n" + "# Disallow: /\n" + "Disallow: /foo/quz#qux\n" + "Allow: /\n"; 360 | 361 | final Matcher matcher = parse(robotsTxtBody); 362 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/bar")); 363 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/foo/quz")); 364 | } 365 | } 366 | 367 | /** 368 | * [Google-specific] Verifies: {@code /index.htm} or {@code /index.html} should be normalised to 369 | * {@code /}. 
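*
* <p>Concretely, {@code allow: /x/index.html} makes the parse handler add the extra rule
* {@code allow: /x/$}, so the bare directory URL {@code http://foo.bar/x/} stays allowed even
* under a broad {@code disallow} rule.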
370 | */ 371 | @Test 372 | public void testIndexNormalisation() { 373 | final String robotsTxtBody = 374 | "user-agent: FooBot\n" 375 | + "disallow: /\n" 376 | + "allow: /index.htm\n" 377 | + "allow: /index.html\n" 378 | + "allow: /x\n" 379 | + "disallow: /x/index.htm\n" 380 | + "disallow: /x/index.html\n"; 381 | 382 | final String[] urls = { 383 | "http://foo.bar/", 384 | "http://foo.bar/index.htm", 385 | "http://foo.bar/index.html", 386 | "http://foo.bar/x/", 387 | "http://foo.bar/x/index.htm", 388 | "http://foo.bar/x/index.html" 389 | }; 390 | 391 | final Matcher matcher = parse(robotsTxtBody); 392 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[0])); 393 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[1])); 394 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[2])); 395 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", urls[3])); 396 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urls[4])); 397 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", urls[5])); 398 | } 399 | 400 | /** [Google-specific] Verifies: Empty arguments corner cases. */ 401 | @Test 402 | public void testEmptyArgs() { 403 | { 404 | final String robotsTxtBody = ""; 405 | 406 | final Matcher matcher = parse(robotsTxtBody); 407 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "")); 408 | assertTrue(matcher.singleAgentAllowedByRobots("", "")); 409 | } 410 | { 411 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n"; 412 | 413 | final Matcher matcher = parse(robotsTxtBody); 414 | assertTrue(matcher.singleAgentAllowedByRobots("", "")); 415 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "")); 416 | } 417 | } 418 | 419 | /** [Google-specific] Verifies: Long lines should be ignored after 8 * 2083 bytes. 
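*
* <p>(Added note: the parser truncates an over-long value rather than dropping the whole line,
* so a pattern cut at the byte limit still matches every URL that shares the surviving prefix;
* both branches below rely on that.)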
*/ 420 | @Test 421 | public void testLongLines() { 422 | final int eolLength = "\n".length(); 423 | final int maxLength = 2083 * 8; 424 | final String allow = "allow: "; 425 | final String disallow = "disallow: "; 426 | 427 | { 428 | String robotsTxtBody = "user-agent: FooBot\n"; 429 | final StringBuilder longValueBuilder = new StringBuilder("/x/"); 430 | final int maxValueLength = 431 | maxLength - longValueBuilder.length() - disallow.length() + eolLength; 432 | while (longValueBuilder.length() < maxValueLength) { 433 | longValueBuilder.append('a'); 434 | } 435 | robotsTxtBody += disallow + longValueBuilder.append("/qux\n").toString(); 436 | 437 | final Matcher matcher = parse(robotsTxtBody); 438 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fux")); 439 | assertFalse( 440 | matcher.singleAgentAllowedByRobots( 441 | "FooBot", "http://foo.bar" + longValueBuilder.toString() + "/fux")); 442 | } 443 | { 444 | String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n"; 445 | final StringBuilder longValueBuilderA = new StringBuilder("/x/"); 446 | final StringBuilder longValueBuilderB = new StringBuilder("/x/"); 447 | final int maxValueLength = 448 | maxLength - longValueBuilderA.length() - disallow.length() + eolLength; 449 | while (longValueBuilderA.length() < maxValueLength) { 450 | longValueBuilderA.append('a'); 451 | longValueBuilderB.append('b'); 452 | } 453 | robotsTxtBody += allow + longValueBuilderA.toString() + "/qux\n"; 454 | robotsTxtBody += allow + longValueBuilderB.toString() + "/qux\n"; 455 | 456 | final Matcher matcher = parse(robotsTxtBody); 457 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/")); 458 | assertTrue( 459 | matcher.singleAgentAllowedByRobots( 460 | "FooBot", "http://foo.bar" + longValueBuilderA.toString() + "/qux")); 461 | assertTrue( 462 | matcher.singleAgentAllowedByRobots( 463 | "FooBot", "http://foo.bar" + longValueBuilderB.toString() + "/fux")); 464 | } 465 | } 466 | 467 | /** [Google-specific] Verifies: Google-only documentation compliance. 
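*
* <p>Covers the published wildcard semantics: {@code *} matches any run of characters and
* {@code $} anchors the pattern at the end of the path, e.g. {@code allow: /*.php$} matches
* {@code /filename.php} but not {@code /filename.php?parameters}.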
*/ 468 | @Test 469 | public void testGoogleOnlyDocumentationCompliance() { 470 | { 471 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish\n"; 472 | 473 | final Matcher matcher = parse(robotsTxtBody); 474 | 475 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 476 | 477 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 478 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 479 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 480 | 481 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads")); 482 | assertTrue( 483 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads/yummy.html")); 484 | assertTrue( 485 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html?id=anything")); 486 | 487 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.asp")); 488 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/catfish")); 489 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/?id=fish")); 490 | } 491 | { 492 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish*\n"; 493 | 494 | final Matcher matcher = parse(robotsTxtBody); 495 | 496 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 497 | 498 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 499 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 500 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 501 | 502 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads")); 503 | assertTrue( 504 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fishheads/yummy.html")); 505 | assertTrue( 506 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html?id=anything")); 507 | 508 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.bar")); 509 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/catfish")); 510 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/?id=fish")); 511 | } 512 | { 513 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish/\n"; 514 | 515 | final Matcher matcher = parse(robotsTxtBody); 516 | 517 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 518 | 519 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/")); 520 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon")); 521 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/?salmon")); 522 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/salmon.html")); 523 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish/?id=anything")); 524 | 525 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish")); 526 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.html")); 527 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish/Salmon.html")); 528 | } 529 | { 530 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /*.php\n"; 531 | 532 | final Matcher matcher = 
parse(robotsTxtBody); 533 | 534 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 535 | 536 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php")); 537 | assertTrue( 538 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/folder/filename.php")); 539 | assertTrue( 540 | matcher.singleAgentAllowedByRobots( 541 | "FooBot", "http://foo.bar/folder/filename.php?parameters")); 542 | assertTrue( 543 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar//folder/any.php.file.html")); 544 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php/")); 545 | assertTrue( 546 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index?f=filename.php/")); 547 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/php/")); 548 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index?php")); 549 | 550 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/windows.PHP")); 551 | } 552 | { 553 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /*.php$\n"; 554 | 555 | final Matcher matcher = parse(robotsTxtBody); 556 | 557 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 558 | 559 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php")); 560 | assertTrue( 561 | matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/folder/filename.php")); 562 | 563 | assertFalse( 564 | matcher.singleAgentAllowedByRobots( 565 | "FooBot", "http://foo.bar/folder/filename.php?parameters")); 566 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php/")); 567 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename.php5/")); 568 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/php/")); 569 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/filename?php")); 570 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/aaaphpaaa")); 571 | 572 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/windows.PHP")); 573 | } 574 | { 575 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /\n" + "allow: /fish*.php\n"; 576 | 577 | final Matcher matcher = parse(robotsTxtBody); 578 | 579 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/bar")); 580 | 581 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/fish.php")); 582 | assertTrue( 583 | matcher.singleAgentAllowedByRobots( 584 | "FooBot", "http://foo.bar/fishheads/catfish.php?parameters")); 585 | 586 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/Fish.PHP")); 587 | } 588 | { 589 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /p\n" + "disallow: /\n"; 590 | 591 | final Matcher matcher = parse(robotsTxtBody); 592 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page")); 593 | } 594 | { 595 | final String robotsTxtBody = 596 | "user-agent: FooBot\n" + "allow: /folder\n" + "disallow: /folder\n"; 597 | 598 | final Matcher matcher = parse(robotsTxtBody); 599 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/folder/page")); 600 | } 601 | { 602 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /page\n" + "disallow: /*.htm\n"; 603 | 604 | final Matcher matcher = parse(robotsTxtBody); 605 | 
assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page.htm")); 606 | } 607 | { 608 | final String robotsTxtBody = "user-agent: FooBot\n" + "allow: /$\n" + "disallow: /\n"; 609 | 610 | final Matcher matcher = parse(robotsTxtBody); 611 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/")); 612 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://example.com/page.html")); 613 | } 614 | } 615 | 616 | /** [Google-specific] Verifies: common typos in {@code DISALLOW} key should be fixed. */ 617 | @Test 618 | public void testTyposFixes() { 619 | final String robotsTxtBody = 620 | "user-agent: FooBot\n" 621 | + "disallow: /a/\n" 622 | + "dissallow: /b/\n" 623 | + "dissalow: /c/\n" 624 | + "disalow: /d/\n" 625 | + "diasllow: /e/\n" 626 | + "disallaw: /f/\n"; 627 | 628 | final Matcher matcher = parse(robotsTxtBody); 629 | assertTrue(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/index.html")); 630 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/a/")); 631 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/b/")); 632 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/c/")); 633 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/d/")); 634 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/e/")); 635 | assertFalse(matcher.singleAgentAllowedByRobots("FooBot", "http://foo.bar/f/")); 636 | } 637 | } 638 | -------------------------------------------------------------------------------- /src/test/java/com/google/search/robotstxt/RobotsParserTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.search.robotstxt; 16 | 17 | import static com.google.common.truth.Truth.assertThat; 18 | 19 | import java.nio.charset.StandardCharsets; 20 | import java.util.Arrays; 21 | import java.util.Collections; 22 | import org.junit.Test; 23 | 24 | /** 25 | * Unit tests validating parsing behavior. 26 | * 27 | * @see RobotsParser 28 | */ 29 | public class RobotsParserTest { 30 | /** 31 | * Parses given robots.txt contents via {@link RobotsParser} and compares the result with an 32 | * expected one. 
33 | *
34 | * @param robotsTxtBody Contents of robots.txt file
35 | * @param expectedContents Expected contents
36 | */
37 | private static void parseAndValidate(
38 | final String robotsTxtBody, final RobotsContents expectedContents) {
39 | final Parser parser = new RobotsParser(new RobotsParseHandler());
40 | final Matcher matcher = parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8));
41 | final RobotsContents actualContents = ((RobotsMatcher) matcher).getRobotsContents();
42 |
43 | expectedContents
44 | .getGroups()
45 | .forEach(expectedGroup -> assertThat(expectedGroup).isIn(actualContents.getGroups()));
46 | }
47 |
48 | /** Verifies: rules grouping, rules parsing, and ignoring invalid directives. */
49 | @Test
50 | public void testMultipleGroups() {
51 | final String robotsTxtBody =
52 | "allow: /foo/bar/\n"
53 | + "\n"
54 | + "user-agent: FooBot\n"
55 | + "disallow: /\n"
56 | + "allow: /x/\n"
57 | + "user-agent: BarBot\n"
58 | + "disallow: /\n"
59 | + "allow: /y/\n"
60 | + "\n"
61 | + "\n"
62 | + "allow: /w/\n"
63 | + "user-agent: BazBot\n"
64 | + "\n"
65 | + "user-agent: FooBot\n"
66 | + "allow: /z/\n"
67 | + "disallow: /\n";
68 |
69 | final RobotsContents expectedContents =
70 | new RobotsContents(
71 | Arrays.asList(
72 | new RobotsContents.Group(
73 | Collections.singletonList("FooBot"),
74 | Arrays.asList(
75 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
76 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
77 | new RobotsContents.Group(
78 | Collections.singletonList("BarBot"),
79 | Arrays.asList(
80 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
81 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/"),
82 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/w/"))),
83 | new RobotsContents.Group(
84 | Arrays.asList("BazBot", "FooBot"),
85 | Arrays.asList(
86 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/z/"),
87 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/")))));
88 |
89 | parseAndValidate(robotsTxtBody, expectedContents);
90 | }
91 |
92 | /** Verifies: the CR character must be treated as EOL; invalid directives are ignored. */
93 | @Test
94 | public void testCrParsing() {
95 | final String robotsTxtBody =
96 | "user-agent: FooBot\n"
97 | + "disallow: /\n"
98 | + "allow: /x/\rallow: /y/\n"
99 | + "al\r\r\r\r\rdisallow: /z/\n";
100 |
101 | final RobotsContents expectedContents =
102 | new RobotsContents(
103 | Collections.singletonList(
104 | new RobotsContents.Group(
105 | Collections.singletonList("FooBot"),
106 | Arrays.asList(
107 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
108 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"),
109 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/"),
110 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/z/")))));
111 |
112 | parseAndValidate(robotsTxtBody, expectedContents);
113 | }
114 |
115 | /** Verifies: CR LF must be treated as EOL.
*/
116 | @Test
117 | public void testCrLfParsing() {
118 | final String robotsTxtBody =
119 | "allow: /foo/bar/\r\n"
120 | + "\r\n"
121 | + "user-agent: FooBot\r\n"
122 | + "disallow: /\r\n"
123 | + "allow: /x/\r\n"
124 | + "user-agent: BarBot\r\n"
125 | + "disallow: /\r\n"
126 | + "allow: /y/\r\n"
127 | + "\r\n";
128 |
129 | final RobotsContents expectedContents =
130 | new RobotsContents(
131 | Arrays.asList(
132 | new RobotsContents.Group(
133 | Collections.singletonList("FooBot"),
134 | Arrays.asList(
135 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
136 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
137 | new RobotsContents.Group(
138 | Collections.singletonList("BarBot"),
139 | Arrays.asList(
140 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
141 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/")))));
142 |
143 | parseAndValidate(robotsTxtBody, expectedContents);
144 | }
145 |
146 | /** Verifies: the last line need not end with an EOL. */
147 | @Test
148 | public void testNoFinalNewline() {
149 | final String robotsTxtBody =
150 | "User-Agent: foo\n"
151 | + "Allow: /some/path\n"
152 | + "User-Agent: bar\n"
153 | + "\n"
154 | + "\n"
155 | + "Disallow: /";
156 |
157 | final RobotsContents expectedContents =
158 | new RobotsContents(
159 | Arrays.asList(
160 | new RobotsContents.Group(
161 | Collections.singletonList("foo"),
162 | Collections.singletonList(
163 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/some/path"))),
164 | new RobotsContents.Group(
165 | Collections.singletonList("bar"),
166 | Collections.singletonList(
167 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/")))));
168 |
169 | parseAndValidate(robotsTxtBody, expectedContents);
170 | }
171 |
172 | /** Verifies: ignoring surrounding whitespace characters (spaces, tabs). */
173 | @Test
174 | public void testWhitespacesParsing() {
175 | final String robotsTxtBody =
176 | "user-agent \t: \tFooBot\n"
177 | + "disallow : / \n"
178 | + " allow: /x/\n"
179 | + " \n"
180 | + " \t \t \n"
181 | + "user-agent:BarBot\n"
182 | + "\t \t disallow\t \t :\t \t /\t \t \n"
183 | + "\t\tallow\t\t:\t\t/y/\t\t\n"
184 | + "\n";
185 |
186 | final RobotsContents expectedContents =
187 | new RobotsContents(
188 | Arrays.asList(
189 | new RobotsContents.Group(
190 | Collections.singletonList("FooBot"),
191 | Arrays.asList(
192 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
193 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/x/"))),
194 | new RobotsContents.Group(
195 | Collections.singletonList("BarBot"),
196 | Arrays.asList(
197 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"),
198 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "/y/")))));
199 |
200 | parseAndValidate(robotsTxtBody, expectedContents);
201 | }
202 |
203 | /** Verifies: global rules parsing.
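*
* <p>A {@code user-agent: *} line yields a group with an empty agent list and the global flag
* set, which the expected contents below encode via the three-argument {@code Group}
* constructor.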
*/ 204 | @Test 205 | public void testGlobalGroup() { 206 | final String robotsTxtBody = 207 | "User-agent: *\n" + "Disallow: /x/\n" + "User-agent: FooBot\n" + "Disallow: /y/\n"; 208 | 209 | final RobotsContents expectedContents = 210 | new RobotsContents( 211 | Arrays.asList( 212 | new RobotsContents.Group( 213 | Collections.emptyList(), 214 | Collections.singletonList( 215 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/x/")), 216 | true), 217 | new RobotsContents.Group( 218 | Collections.singletonList("FooBot"), 219 | Collections.singletonList( 220 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/y/"))))); 221 | 222 | parseAndValidate(robotsTxtBody, expectedContents); 223 | } 224 | 225 | /** [Google-specific] Verifies: assuming colon if it's missing. */ 226 | @Test 227 | public void testMissingSeparator() { 228 | final String robotsTxtBody = "user-agent FooBot\n" + "disallow /\n" + "allow foo bar\n"; 229 | 230 | final RobotsContents expectedContents = 231 | new RobotsContents( 232 | Collections.singletonList( 233 | new RobotsContents.Group( 234 | Collections.singletonList("FooBot"), 235 | Arrays.asList( 236 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/"), 237 | new RobotsContents.Group.Rule(Parser.DirectiveType.ALLOW, "foo bar"))))); 238 | 239 | parseAndValidate(robotsTxtBody, expectedContents); 240 | } 241 | 242 | /** [Google-specific] Verifies: trimming values to specific number of bytes. */ 243 | @Test 244 | public void testTrimmingToBytes() { 245 | final String robotsTxtBody = "user-agent: FooBot\n" + "disallow: /foo/bar/baz/qux\n"; 246 | 247 | final RobotsContents expectedContents = 248 | new RobotsContents( 249 | Collections.singletonList( 250 | new RobotsContents.Group( 251 | Collections.singletonList("FooBot"), 252 | Collections.singletonList( 253 | new RobotsContents.Group.Rule(Parser.DirectiveType.DISALLOW, "/foo/b"))))); 254 | 255 | final Parser parser = new RobotsParser(new RobotsParseHandler(), 8); 256 | final Matcher matcher = parser.parse(robotsTxtBody.getBytes(StandardCharsets.UTF_8)); 257 | final RobotsContents actualContents = ((RobotsMatcher) matcher).getRobotsContents(); 258 | 259 | expectedContents 260 | .getGroups() 261 | .forEach(expectedGroup -> assertThat(expectedGroup).isIn(actualContents.getGroups())); 262 | } 263 | 264 | /** Verifies: Path normalisation corner case. */ 265 | @Test 266 | public void testPathNormalisationCornerCase() { 267 | final String robotsTxtBody = 268 | "user-agent: FooBot\n" + "disallow: /foo?bar%aa%\n" + "disallow: /foo?bar%aa%a\n"; 269 | 270 | final RobotsContents expectedContents = 271 | new RobotsContents( 272 | Collections.singletonList( 273 | new RobotsContents.Group( 274 | Collections.singletonList("FooBot"), 275 | Arrays.asList( 276 | new RobotsContents.Group.Rule( 277 | Parser.DirectiveType.DISALLOW, "/foo?bar%AA%"), 278 | new RobotsContents.Group.Rule( 279 | Parser.DirectiveType.DISALLOW, "/foo?bar%AA%a"))))); 280 | 281 | parseAndValidate(robotsTxtBody, expectedContents); 282 | } 283 | } 284 | --------------------------------------------------------------------------------