├── .github
│   ├── FUNDING.yml
│   ├── example.png
│   └── verbose.png
├── .gitignore
├── README.md
└── robotsvalidator.py

/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: p0dalirius
patreon: Podalirius
--------------------------------------------------------------------------------
/.github/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p0dalirius/RobotsValidator/916bffa6dcd662a556d5ae8cf50aa56d51a32fe5/.github/example.png
--------------------------------------------------------------------------------
/.github/verbose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/p0dalirius/RobotsValidator/916bffa6dcd662a556d5ae8cf50aa56d51a32fe5/.github/verbose.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.idea/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RobotsValidator

The robotsvalidator script allows you to check if URLs are allowed or disallowed by a robots.txt file.

![](./.github/example.png)

## Features

- [x] Getting the robots.txt file from a local file
- [x] Getting the robots.txt file from a URL
- [x] Verbose mode, showing all the rules with their results
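
## Usage

Point the script at a robots.txt file, either a local copy with `-r ./robots.txt` or a remote one with `-R https://example.com/robots.txt` (the path and URL here are purely illustrative), then type the URLs you want to test at the interactive prompt. Add `--debug` for verbose rule-by-rule output, `--no-colors` to disable ANSI colors, and `-l <logfile>` to save the output to a file.

The parsing logic can also be reused from Python. Below is a minimal sketch, assuming `robotsvalidator.py` sits next to your own script and its imports (notably `requests`) are available; the robots.txt content and URL are made up for the example:

```python
from robotsvalidator import Logger, RobotsTXT

# Illustrative robots.txt content
robots_data = "User-agent: *\nDisallow: /admin/*\nAllow: /admin/login\n"
robots = RobotsTXT(robots_data, logger=Logger(debug=False))

# validate() returns the Allow and Disallow rules matching the given URL or path
l_allow, l_disallow = robots.validate("https://example.com/admin/secret/")

# Same decision rule as the script's interactive loop
is_allowed = len(l_allow) != 0 or len(l_disallow) == 0
print(is_allowed)  # False: only 'Disallow: /admin/*' matches
```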

## Verbose mode

There is a verbose mode, enabled with the `--debug` option, which prints every rule along with its result:

![](./.github/verbose.png)

## Contributing

Pull requests are welcome. Feel free to open an issue if you want to add other features.
--------------------------------------------------------------------------------
/robotsvalidator.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File name : robotsvalidator.py
# Author : Podalirius (@podalirius_)
# Date created : 30 Nov 2021


# https://www.robotstxt.org/norobots-rfc.txt

import argparse
import re
import os
import sys
import readline
import requests
import urllib.parse

readline.parse_and_bind('tab: complete')
readline.set_completer_delims('\n')


class Logger(object):
    def __init__(self, debug=False, logfile=None, nocolors=False):
        super(Logger, self).__init__()
        self.__debug = debug
        self.__nocolors = nocolors
        self.logfile = logfile
        # If the log file already exists, append a numeric suffix instead of overwriting it
        if self.logfile is not None:
            if os.path.exists(self.logfile):
                k = 1
                while os.path.exists(self.logfile + (".%d" % k)):
                    k += 1
                self.logfile = self.logfile + (".%d" % k)
            open(self.logfile, "w").close()

    def print(self, message=""):
        nocolor_message = re.sub(r"\x1b\[([0-9;]+)m", "", message)
        if self.__nocolors:
            print(nocolor_message)
        else:
            print(message)
        if self.logfile is not None:
            f = open(self.logfile, "a")
            f.write(nocolor_message + "\n")
            f.close()

    def info(self, message):
        nocolor_message = re.sub(r"\x1b\[([0-9;]+)m", "", message)
        if self.__nocolors:
            print("[info] %s" % nocolor_message)
        else:
            print("[info] %s" % message)
        if self.logfile is not None:
            f = open(self.logfile, "a")
            f.write("[info] %s" % nocolor_message + "\n")
            f.close()

    def debug(self, message):
        if self.__debug:
            nocolor_message = re.sub(r"\x1b\[([0-9;]+)m", "", message)
            if self.__nocolors:
                print("[debug] %s" % nocolor_message)
            else:
                print("[debug] %s" % message)
            if self.logfile is not None:
                f = open(self.logfile, "a")
                f.write("[debug] %s" % nocolor_message + "\n")
                f.close()

    def error(self, message):
        nocolor_message = re.sub(r"\x1b\[([0-9;]+)m", "", message)
        if self.__nocolors:
            print("[error] %s" % nocolor_message)
        else:
            print("[error] %s" % message)
        if self.logfile is not None:
            f = open(self.logfile, "a")
            f.write("[error] %s" % nocolor_message + "\n")
            f.close()


class RobotsTXT(object):
    def __init__(self, robotsdata, logger):
        super(RobotsTXT, self).__init__()
        self.logger = logger
        self.robotsdata = robotsdata
        self._parse()

    def _parse(self):
        # Cleanup empty lines and parse content
        # Note: User-agent lines are ignored, so Allow/Disallow rules from every group are kept
        self.entries = []
        for line in self.robotsdata.split('\n'):
            if len(line.strip()) != 0:
                if line.startswith("#"):
                    content = line.lstrip("#").strip()
                    self.entries.append({"type": "commentary", "content": content, "raw": line})
                elif line.lower().startswith("disallow"):
                    content = line.split(':', 1)[1].strip()
                    self.entries.append({"type": "disallow", "content": content, "raw": line})
                elif line.lower().startswith("allow"):
                    content = line.split(':', 1)[1].strip()
                    self.entries.append({"type": "allow", "content": content, "raw": line})

    def _to_re_regex(self, data):
        # replace * by .*
        data = re.sub(r"^\*", '.*', data)
        data = re.sub(r"([^.])\*", r'\1.*', data)
        return data

    def validate(self, url):
        # Keep only the path component if a full URL was given
        path = url
        if '://' in url:
            matched = re.match("([a-z]+://[^/]+[/]?)(.*)", url)
            if matched is not None:
                path = '/' + matched.group(2)
        self.logger.debug("Using path '%s'" % path)

        l_allow, l_disallow = [], []
        for entry in self.entries:
            if entry["type"] == "allow":
                rule_regex = self._to_re_regex(entry["content"])
                if re.match(rule_regex, path):
                    self.logger.debug("%-50s : \x1b[92maccepted by rule\x1b[0m" % ("Rule 'Allow: %s'" % entry["content"]))
                    l_allow.append(entry)
                else:
                    self.logger.debug("%-50s : \x1b[91mrejected by rule\x1b[0m" % ("Rule 'Allow: %s'" % entry["content"]))
            elif entry["type"] == "disallow":
                rule_regex = self._to_re_regex(entry["content"])
                if re.match(rule_regex, path):
                    self.logger.debug("%-50s : \x1b[91mrejected by rule\x1b[0m" % ("Rule 'Disallow: %s'" % entry["content"]))
                    l_disallow.append(entry)
                else:
                    self.logger.debug("%-50s : \x1b[92maccepted by rule\x1b[0m" % ("Rule 'Disallow: %s'" % entry["content"]))

        return l_allow, l_disallow


def parseArgs():
    print("RobotsValidator v1.2 - by Remi GASCOU (Podalirius)\n")

    parser = argparse.ArgumentParser(description="Checks if URLs are allowed or disallowed by a robots.txt file.")
    parser.add_argument("--debug", dest="debug", action="store_true", default=False, help="Debug mode.")
    parser.add_argument("--no-colors", dest="no_colors", action="store_true", default=False, help="No colors mode.")
    parser.add_argument("-l", "--logfile", dest="logfile", type=str, default=None, help="Log file to save output to.")

    parse_robots_source = parser.add_mutually_exclusive_group()
    parse_robots_source.add_argument("-r", "--robots-file", dest="robots_file", default=None, help="robots.txt file.")
    parse_robots_source.add_argument("-R", "--robots-url", dest="robots_url", default=None, help="robots.txt location URL.")

    options = parser.parse_args()

    if options.robots_file is None and options.robots_url is None:
        print("%s: error: Either -r/--robots-file or -R/--robots-url is required." % sys.argv[0])
        sys.exit(1)

    return options


if __name__ == '__main__':
    options = parseArgs()
    logger = Logger(debug=options.debug, nocolors=options.no_colors, logfile=options.logfile)

    robotsdata = None
    if options.robots_file is not None:
        logger.debug("Reading file '%s' ..." % options.robots_file)
        if os.path.exists(options.robots_file):
            f = open(options.robots_file, 'r')
            robotsdata = f.read().replace('\r', '')
            f.close()
            logger.debug("Read %d bytes." % (len(robotsdata)))
        else:
            logger.error("File '%s' does not exist or is not readable." % options.robots_file)
            sys.exit(1)
    elif options.robots_url is not None:
        logger.debug("Querying '%s' ..." % options.robots_url)
        r = requests.get(options.robots_url)
        if r.status_code == 200:
            robotsdata = r.content.decode("UTF-8").replace('\r', '')
            logger.debug("HTTP %d response: %d bytes returned." % (r.status_code, len(r.content)))
        else:
            logger.error("Access to '%s' returned a %d status code." % (options.robots_url, r.status_code))
            sys.exit(1)

    robotstxt = RobotsTXT(robotsdata, logger=logger)

    # Use the robots.txt host (or the local file name) as the interactive prompt
    if options.robots_url is not None:
        prompt = "[%s]> " % urllib.parse.urlparse(options.robots_url).netloc
    else:
        prompt = "[%s]> " % os.path.basename(options.robots_file)

    try:
        while True:
            url = input(prompt)
            if len(url) != 0:
                l_allow, l_disallow = robotstxt.validate(url)
                if len(l_allow) == 0 and len(l_disallow) == 0:
                    logger.print("\x1b[1;92mAllowed by robots.txt!\x1b[0m (allow:%d, disallow:%d)" % (
                        len(l_allow), len(l_disallow)
                    ))
                elif len(l_allow) != 0:
                    logger.print("\x1b[1;92mAllowed by robots.txt!\x1b[0m (allow:%d, disallow:%d)" % (
                        len(l_allow), len(l_disallow)
                    ))
                elif len(l_disallow) != 0:
                    logger.print("\x1b[1;91mNot allowed by robots.txt!\x1b[0m (allow:%d, disallow:%d)" % (
                        len(l_allow), len(l_disallow)
                    ))

                for rule in l_allow:
                    logger.print(" | Rule '%s'" % rule["raw"])
                for rule in l_disallow:
                    logger.print(" | Rule '%s'" % rule["raw"])
    except KeyboardInterrupt:
        print()
--------------------------------------------------------------------------------