├── .github
├── FUNDING.yml
├── example.png
└── verbose.png
├── .gitignore
├── README.md
└── robotsvalidator.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: p0dalirius
4 | patreon: Podalirius
--------------------------------------------------------------------------------
/.github/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p0dalirius/RobotsValidator/916bffa6dcd662a556d5ae8cf50aa56d51a32fe5/.github/example.png
--------------------------------------------------------------------------------
/.github/verbose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p0dalirius/RobotsValidator/916bffa6dcd662a556d5ae8cf50aa56d51a32fe5/.github/verbose.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | .idea/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RobotsValidator
2 |
3 |
4 | The robotsvalidator script allows you to check if URLs are allowed or disallowed by a robots.txt file.
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 | 
14 |
15 | ## Features
16 |
17 | - [x] Getting robots.txt file from a local file
18 | - [x] Getting robots.txt file from a URL
19 | - [x] Verbose mode, showing all the rules with their results.
20 |
21 | ## Verbose mode
22 |
23 | There is a verbose mode, enabled with the `--debug` option, which prints every rule with its result:
24 |
25 | 
26 |
27 | ## Contributing
28 |
29 | Pull requests are welcome. Feel free to open an issue if you want to add other features.
30 |
--------------------------------------------------------------------------------
/robotsvalidator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # File name : robotsvalidator.py
4 | # Author : Podalirius (@podalirius_)
5 | # Date created : 30 Nov 2021
6 |
7 |
8 | # https://www.robotstxt.org/norobots-rfc.txt
9 |
10 | import argparse
11 | import re
12 | import os
13 | import sys
14 | import readline
15 | import requests
16 | import urllib.parse
17 |
18 | readline.parse_and_bind('tab: complete')
19 | readline.set_completer_delims('\n')
20 |
21 |
class Logger(object):
    """Console logger that can mirror its output to a log file.

    Messages may contain ANSI color sequences. They are printed as-is on the
    console unless ``nocolors`` is set, and are always written to the log
    file with the color sequences removed.

    Args:
        debug: when truthy, ``debug()`` messages are emitted; otherwise they
            are dropped.
        logfile: path of the file to mirror output to, or ``None`` for
            console only. If the path already exists, the first free
            ``<logfile>.<k>`` (k = 1, 2, ...) is used instead so that an
            existing log is never overwritten.
        nocolors: when truthy, color sequences are also stripped from the
            console output.
    """

    # Matches ANSI SGR sequences such as "\x1b[1;92m" and "\x1b[0m".
    _ANSI_COLOR_RE = re.compile(r"\x1b\[([0-9;]+)m")

    def __init__(self, debug=False, logfile=None, nocolors=False):
        super(Logger, self).__init__()
        self.__debug = debug
        self.__nocolors = nocolors
        self.logfile = logfile
        #
        if self.logfile is not None:
            if os.path.exists(self.logfile):
                k = 1
                while os.path.exists("%s.%d" % (self.logfile, k)):
                    k += 1
                self.logfile = "%s.%d" % (self.logfile, k)
            # Create (or truncate) the log file up front so that an
            # unwritable path fails here rather than on the first message.
            with open(self.logfile, "w", encoding="utf-8"):
                pass

    def _emit(self, message, prefix=""):
        """Print ``prefix + message`` and append the uncolored text to the log file."""
        nocolor_message = self._ANSI_COLOR_RE.sub("", message)
        if self.__nocolors:
            print(prefix + nocolor_message)
        else:
            print(prefix + message)
        if self.logfile is not None:
            with open(self.logfile, "a", encoding="utf-8") as f:
                f.write(prefix + nocolor_message + "\n")

    def print(self, message=""):
        """Emit ``message`` without any level prefix."""
        self._emit(message)

    def info(self, message):
        """Emit ``message`` prefixed with ``[info]`` (console and log file)."""
        self._emit(message, prefix="[info] ")

    def debug(self, message):
        """Emit ``message`` prefixed with ``[debug]``, only when debug mode is on."""
        if self.__debug:
            self._emit(message, prefix="[debug] ")

    def error(self, message):
        """Emit ``message`` prefixed with ``[error]`` (console and log file)."""
        self._emit(message, prefix="[error] ")
81 |
82 |
class RobotsTXT(object):
    """Parsed view of a robots.txt document that can test URLs against its rules.

    Only comment lines and ``Allow`` / ``Disallow`` rules are kept; every
    other field (``User-agent``, ``Sitemap``, ...) is ignored, so all rules
    are evaluated regardless of the user-agent group they belong to.

    Args:
        robotsdata: full text of the robots.txt file.
        logger: object exposing ``debug(message)``, used to trace each rule
            evaluation. May be ``None`` to disable tracing.

    Attributes set by parsing:
        entries: list of dicts with keys ``type`` ("commentary", "allow" or
            "disallow"), ``content`` (comment text or rule pattern) and
            ``raw`` (the original line).
    """

    def __init__(self, robotsdata, logger):
        super(RobotsTXT, self).__init__()
        self.logger = logger
        self.robotsdata = robotsdata
        self._parse()

    def _debug(self, message):
        """Forward ``message`` to the logger's debug channel, if a logger was given."""
        if self.logger is not None:
            self.logger.debug(message)

    def _parse(self):
        """Fill ``self.entries`` from ``self.robotsdata``, skipping blank and malformed lines."""
        self.entries = []
        for raw in (self.robotsdata or "").split('\n'):
            line = raw.strip()
            if not line:
                continue
            if line.startswith("#"):
                content = line.lstrip("#").strip()
                self.entries.append({"type": "commentary", "content": content, "raw": raw})
                continue
            field, separator, value = line.partition(':')
            if not separator:
                # Not a "field: value" line; nothing usable to extract.
                continue
            rule_type = field.strip().lower()
            if rule_type in ("allow", "disallow"):
                # A '#' starts a comment that runs to the end of the line.
                content = value.split('#', 1)[0].strip()
                self.entries.append({"type": rule_type, "content": content, "raw": raw})

    def _to_re_regex(self, data):
        """Translate a robots.txt path pattern into a ``re`` pattern.

        ``*`` becomes ``.*``, a trailing ``$`` anchors the end of the path,
        and every other character is matched literally (so ``.`` and ``?``
        in a path are not treated as regex operators).
        """
        anchored = data.endswith("$")
        if anchored:
            data = data[:-1]
        regex = ".*".join(re.escape(part) for part in data.split("*"))
        if anchored:
            regex += "$"
        return regex

    def validate(self, url):
        """Return the rules matching ``url``.

        Args:
            url: either an absolute URL (``scheme://host/path``) or a bare
                path. For an absolute URL, everything after the host is
                used as the path, query string included.

        Returns:
            A ``(l_allow, l_disallow)`` tuple holding the matching ``allow``
            and ``disallow`` entries, in file order. Rules with an empty
            pattern never match.
        """
        path = url
        if '://' in url:
            matched = re.match(r"([a-z][a-z0-9+.\-]*://[^/]+[/]?)(.*)", url, re.IGNORECASE)
            if matched is not None:
                path = '/' + matched.group(2)
            # Otherwise the '://' is inside a path or query string (e.g.
            # "/redirect?to=http://..."), so the input is used as the path.
        self._debug("Using path '%s'" % path)

        l_allow, l_disallow = [], []
        for entry in self.entries:
            if entry["type"] not in ("allow", "disallow"):
                continue
            if entry["type"] == "allow":
                label = "Rule 'Allow: %s'" % entry["content"]
            else:
                label = "Rule 'Disallow: %s'" % entry["content"]
            if not entry["content"]:
                # An empty pattern (e.g. a bare "Disallow:") restricts nothing;
                # compiling it would produce a regex that matches every path.
                self._debug("%-50s : ignored (empty pattern)" % label)
                continue
            is_match = re.match(self._to_re_regex(entry["content"]), path) is not None
            # A matching Allow, or a non-matching Disallow, lets the path through.
            if is_match == (entry["type"] == "allow"):
                self._debug("%-50s : \x1b[92maccepted by rule\x1b[0m" % label)
            else:
                self._debug("%-50s : \x1b[91mrejected by rule\x1b[0m" % label)
            if is_match:
                if entry["type"] == "allow":
                    l_allow.append(entry)
                else:
                    l_disallow.append(entry)

        return l_allow, l_disallow
139 |
140 |
def parseArgs():
    """Print the banner and parse the command line.

    Exactly one robots.txt source must be given: ``-r/--robots-file`` or
    ``-R/--robots-url``. When neither (or both) is supplied, argparse prints
    a usage error on stderr and exits with status 2.

    Returns:
        The ``argparse.Namespace`` with attributes ``debug``, ``no_colors``,
        ``logfile``, ``robots_file`` and ``robots_url``.
    """
    print("RobotsValidator v1.2 - by Remi GASCOU (Podalirius)\n")

    parser = argparse.ArgumentParser(
        description="Check whether URLs are allowed or disallowed by a robots.txt file."
    )
    parser.add_argument("--debug", dest="debug", action="store_true", default=False, help="Debug mode.")
    parser.add_argument("--no-colors", dest="no_colors", action="store_true", default=False, help="No colors mode.")
    parser.add_argument("-l", "--logfile", dest="logfile", type=str, default=None, help="Log file to save output to.")

    # required=True lets argparse reject a missing source itself, with a
    # non-zero exit status, instead of a hand-written check.
    parse_robots_source = parser.add_mutually_exclusive_group(required=True)
    parse_robots_source.add_argument("-r", "--robots-file", dest="robots_file", default=None, help='robots.txt file')
    parse_robots_source.add_argument("-R", "--robots-url", dest="robots_url", default=None,
                                     help='robots.txt location URL.')

    return parser.parse_args()
161 |
162 |
# Script entry point: load a robots.txt (local file or URL), then check
# every URL typed at the interactive prompt against its rules until the
# user presses Ctrl-C or Ctrl-D.
if __name__ == '__main__':
    options = parseArgs()
    # Bound at module scope on purpose, so any code in this file that refers
    # to the global name `logger` keeps working.
    logger = Logger(debug=options.debug, nocolors=options.no_colors, logfile=options.logfile)

    robotsdata = None
    prompt_label = ""
    if options.robots_file is not None:
        logger.debug("Reading file '%s' ..." % options.robots_file)
        try:
            with open(options.robots_file, 'r') as f:
                robotsdata = f.read().replace('\r', '')
        except (OSError, UnicodeDecodeError):
            logger.error("File '%s' does not exist or is not readable." % options.robots_file)
            sys.exit(1)
        logger.debug("Read %d bytes." % (len(robotsdata)))
        # No host to show in file mode; label the prompt with the file name.
        prompt_label = os.path.basename(options.robots_file)
    elif options.robots_url is not None:
        logger.debug("Querying '%s' ..." % options.robots_url)
        try:
            # Without a timeout, requests can wait forever on a silent server.
            r = requests.get(options.robots_url, timeout=30)
        except requests.exceptions.RequestException as err:
            logger.error("Could not query '%s': %s" % (options.robots_url, err))
            sys.exit(1)
        if r.status_code == 200:
            # Replace undecodable bytes rather than crash on a non-UTF-8 file.
            robotsdata = r.content.decode("UTF-8", errors="replace").replace('\r', '')
            logger.debug("HTTP %d response: %d bytes returned." % (r.status_code, len(r.content)))
        else:
            logger.error("Access to '%s' returned a %d status code." % (options.robots_url, r.status_code))
            sys.exit(1)
        prompt_label = urllib.parse.urlparse(options.robots_url).netloc

    robotstxt = RobotsTXT(robotsdata, logger=logger)

    prompt = "[%s]> " % prompt_label

    try:
        while True:
            url = input(prompt).strip()
            if len(url) == 0:
                continue
            l_allow, l_disallow = robotstxt.validate(url)
            # A URL is refused only when at least one Disallow rule matches
            # and no Allow rule does; any matching Allow rule wins.
            if len(l_disallow) != 0 and len(l_allow) == 0:
                verdict = "\x1b[1;91mNot allowed by robots.txt!\x1b[0m"
            else:
                verdict = "\x1b[1;92mAllowed by robots.txt!\x1b[0m"
            logger.print("%s (allow:%d, disallow:%d)" % (verdict, len(l_allow), len(l_disallow)))

            for rule in l_allow:
                logger.print("  | Rule '%s'" % rule["raw"])
            for rule in l_disallow:
                logger.print("  | Rule '%s'" % rule["raw"])
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C or Ctrl-D: finish the prompt line and exit quietly.
        print()
220 |
--------------------------------------------------------------------------------