├── docs
    ├── useragent.md
    ├── robotstxt.md
    ├── imageasascii.md
    ├── img
    │   ├── favicon.ico
    │   └── feature-image.png
    ├── developers.md
    ├── tutorial.md
    └── index.md
├── tests
    └── test_robot_name.py
├── examples
    ├── robots.txt
    └── complete_example.py
├── .github
    └── workflows
    │   ├── makedocs.yml
    │   └── pipy-publish.yml
├── mkdocs.yml
├── LICENSE
├── README.md
├── .gitignore
├── setup.py
└── pyrobotstxt
    └── __init__.py

/docs/useragent.md:
--------------------------------------------------------------------------------
1 | # UserAgent Class
2 | 
3 | ::: pyrobotstxt.UserAgent
--------------------------------------------------------------------------------
/docs/robotstxt.md:
--------------------------------------------------------------------------------
1 | 
2 | # RobotsTxt Class
3 | 
4 | ::: pyrobotstxt.RobotsTxt
5 | 
--------------------------------------------------------------------------------
/docs/imageasascii.md:
--------------------------------------------------------------------------------
1 | # ImageAsASCII Class
2 | 
3 | ::: pyrobotstxt.ImageAsASCII
4 | 
5 | 
--------------------------------------------------------------------------------
/docs/img/favicon.ico:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/serpwings/pyrobotstxt/main/docs/img/favicon.ico
--------------------------------------------------------------------------------
/docs/img/feature-image.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/serpwings/pyrobotstxt/main/docs/img/feature-image.png
--------------------------------------------------------------------------------
/docs/developers.md:
--------------------------------------------------------------------------------
1 | # Developer Tutorial
2 | 
3 | A developer tutorial is available at the [serpwings pyrobotstxt tutorial page](https://serpwings.com/software/python-robots-txt/#developers-tutorial)
--------------------------------------------------------------------------------
/docs/tutorial.md:
--------------------------------------------------------------------------------
1 | # Tutorial: How to Create a Robots.txt File
2 | 
3 | A user tutorial is available at the [serpwings pyrobotstxt tutorial page](https://serpwings.com/software/python-robots-txt/#installation)
--------------------------------------------------------------------------------
/tests/test_robot_name.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pyrobotstxt import RobotsTxt
3 | 
4 | 
5 | def test_robot_name():
6 |     mrt = RobotsTxt()
7 |     result = mrt.robots_name("facebook")
8 |     assert "facebot" in result, "expected 'facebot' in the lookup result for 'facebook'"
--------------------------------------------------------------------------------
/examples/robots.txt:
--------------------------------------------------------------------------------
1 | # Welcome Crawlers
2 | # Created on 2023-03-17 23:42:50.589282 using pyrobotstxtUser-agent: *
3 | # Allowed Patterns
4 | Allow: /home
5 | Allow: /deep
6 | 
7 | # Disallowed Patterns
8 | Disallow: /topi?a
9 | Disallow: /nopi$
10 | Disallow: /img*.png$
11 | 
12 | User-agent: Google
13 | # Allowed Patterns
14 | Allow: /home
15 | Allow: /deep
16 | 
17 | # Disallowed Patterns
18 | Disallow: /topi?a
19 | Disallow: /nopi$
20 | Disallow: /img*.png$
21 | 
22 | # Site Maps
23 | Sitemap: https://seowings.org/sitemap.xml
24 | 
25 | 
26 | 
27 | # Good Bye Crawlers
-------------------------------------------------------------------------------- /.github/workflows/makedocs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation Generator 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | with: 13 | fetch-depth: 0 14 | - uses: actions/setup-python@v2 15 | - run: pip install --upgrade pip && pip install mkdocs mkdocs-gen-files mkdocstrings[python] pymdown-extensions 16 | - run: git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 17 | - name: Publish docs 18 | run: mkdocs gh-deploy 19 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: pyrobotstxt 2 | site_description: Python Package to Create, Analyze and Manipulate Robots.txt Files 3 | site_author: Faisal Shahzad 4 | 5 | site_url: https://pyrobotstxt.pages.dev 6 | 7 | repo_url: https://github.com/serpwings/pyrobotstxt 8 | edit_uri: blob/main/docs/ 9 | 10 | nav: 11 | - Home: index.md 12 | - User Tutorial: tutorial.md 13 | - Developer Tutorial: developers.md 14 | - API: 15 | - ImageAsASCII: imageasascii.md 16 | - RobotsTxt: robotstxt.md 17 | - UserAgent: useragent.md 18 | 19 | plugins: 20 | - search 21 | - mkdocstrings 22 | 23 | markdown_extensions: 24 | - admonition 25 | - codehilite 26 | - smarty 27 | - meta 28 | - toc: 29 | permalink: True 30 | - attr_list 31 | 32 | theme: readthedocs 33 | 34 | copyright: © Copyright 2022-2023 Faisal Shahzad (seowings.org) -------------------------------------------------------------------------------- /.github/workflows/pipy-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | #push: 8 | # branches: [main] 9 | 10 | release: 11 | types: [published] 12 | 13 | jobs: 14 | deploy: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: '3.x' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install setuptools wheel twine 28 | - name: Build and publish 29 | env: 30 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 31 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 32 | run: | 33 | python setup.py sdist bdist_wheel 34 | twine upload dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Seo Wings 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![pyrobotstxt feature image](docs/img/feature-image.png)
2 | 
3 | # pyrobotstxt: A Python Package for **robots.txt** Files
4 | 
5 | The ``pyrobotstxt`` package can be used to (systematically) generate **robots.txt** files. It also comes in handy for creating and including ``ASCII`` images in **robots.txt** files.
6 | 
7 | In future releases, it will be possible to parse and analyze **robots.txt** files generated by any software (not limited to ``pyrobotstxt``).
8 | 
9 | ## What's in pyrobotstxt?
10 | 
11 | We believe in monolithic software development and created this tiny package that does its job without any bloat. It is useful for:
12 | 
13 | - Creating a **robots.txt** file
14 | - Parsing a **robots.txt** file [in progress]
15 | - Analyzing a **robots.txt** file [in progress]
16 | 
17 | ## How to Use pyrobotstxt?
18 | 
19 | A detailed tutorial is available on the [pyrobotstxt tutorial website](https://serpwings.com/software/python-robots-txt/), and the [pyrobotstxt documentation website](https://pyrobotstxt.pages.dev) provides an API reference. A short usage sketch is also included at the end of this README.
20 | 
21 | ## Contribute
22 | 
23 | Pull requests, feature suggestions, and collaborations are welcome.
24 | 
25 | ## About Us
26 | 
27 | This work is a collaborative effort of [seowings](https://seowings.org/) and [serpwings](https://serpwings.com/).
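## Quick Example

A condensed usage sketch, adapted from the bundled ``examples/complete_example.py`` (the sitemap URL and output path are taken from that example and are illustrative):

```python
from pyrobotstxt import RobotsTxt, UserAgent

robots_file = RobotsTxt()
robots_file.include_header("Welcome Crawlers", append_date=True)
robots_file.include_footer("Good Bye Crawlers")

# Rules applied to all crawlers
ua_general = UserAgent(ua_name="*")
ua_general.add_allow(allow_items=["/home", "/deep"], unique=True)
ua_general.add_disallow(disallow_items=["/nopi$", "/img*.png$"], unique=True)
ua_general.add_sitemap("https://seowings.org/sitemap.xml")

# Assemble and write the final file
robots_file.add_user_agent(ua_general)
robots_file.write("robots.txt")
```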
28 | 
--------------------------------------------------------------------------------
/examples/complete_example.py:
--------------------------------------------------------------------------------
1 | from pyrobotstxt import RobotsTxt, UserAgent
2 | 
3 | robots_file = RobotsTxt()
4 | 
5 | robots_file.include_header("Welcome Crawlers", append_date=True)
6 | robots_file.include_footer("Good Bye Crawlers")
7 | 
8 | ua_general = UserAgent(ua_name="*")
9 | ua_general.add_allow(
10 |     allow_items=["/home", "/deep", "/home"],
11 |     unique=True,
12 |     comments="This is a list of allowed items",
13 | )
14 | 
15 | ua_general.add_disallow(
16 |     disallow_items=["/nopi$", "/topi?a", "/img*.png$"],
17 |     unique=True,
18 |     comments="This is a list of disallowed items",
19 | )
20 | 
21 | ua_general_google = UserAgent(ua_name="Google")
22 | ua_general_google.add_allow(
23 |     allow_items=["/home", "/deep", "/home"],
24 |     unique=True,
25 |     comments="This is a list of allowed items",
26 | )
27 | ua_general_google.add_disallow(
28 |     disallow_items=["/nopi$", "/topi?a", "/img*.png$"],
29 |     unique=True,
30 |     comments="This is a list of disallowed items",
31 | )
32 | ua_general_google.add_sitemap("https://seowings.org/sitemap.xml")
33 | 
34 | robots_file.add_user_agent(ua_general)
35 | robots_file.add_user_agent(ua_general_google)
36 | 
37 | robots_file.write("robots.txt")
38 | 
39 | # Read Remote File
40 | robots_file_2 = RobotsTxt()
41 | robots_file_2.read("https://nike.com/robots.txt")
42 | robots_file_2.write("nike_robots.txt")
43 | 
44 | print(robots_file_2.robots_details("Baiduspider"))
45 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ![pyrobotstxt feature image](img/feature-image.png)
2 | 
3 | # pyrobotstxt - A Python Package for robots.txt Files
4 | 
5 | The ``pyrobotstxt`` package can be used to (systematically) generate **robots.txt** files. It also comes in handy for creating and including ``ASCII`` images in **robots.txt** files.
6 | 
7 | In future releases, it will be possible to parse, analyze and manipulate **robots.txt** files generated by any software (not limited to ``pyrobotstxt``).
8 | 
9 | ## What's in pyrobotstxt?
10 | 
11 | We believe in monolithic software development and created this tiny package that does its job without any bloat. It is useful for:
12 | 
13 | - Creating a **robots.txt** file
14 | - Parsing a **robots.txt** file [in progress]
15 | - Analyzing a **robots.txt** file [in progress]
16 | 
17 | ## How to Use pyrobotstxt?
18 | 
19 | You can follow our [basic user tutorial](tutorial.md) to learn how to use this library.
20 | 
21 | If you are a developer or want to test the latest version of pyrobotstxt, you might find the information in our [developers](developers.md) section useful. A short sketch of reading and analyzing an existing **robots.txt** file is also included at the end of this page.
22 | 
23 | ## About Us
24 | 
25 | 
26 | [SERP Wings](https://www.serpwings.com){target=_blank} is a digital organization which develops software solutions for **Boosting SERP Performance Through Opensource Tools**.
27 | 
28 | [seowings](https://www.seowings.org){target=_blank} is an open-source project to write, develop, and promote tools for Data Science and Digital Marketing.
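## Reading an Existing robots.txt

A minimal sketch of the read/analyze workflow, based on the second half of the bundled ``examples/complete_example.py`` (the target URL and output filename are illustrative):

```python
from pyrobotstxt import RobotsTxt

# Fetch a remote robots.txt, re-serialize it locally
robots_file = RobotsTxt()
robots_file.read("https://nike.com/robots.txt")
robots_file.write("nike_robots.txt")

# Look up crawler details from the built-in bot database
print(RobotsTxt.robots_name("facebook"))          # match by keyword
print(robots_file.robots_details("Baiduspider"))  # match by exact bot name
```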
29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- # 3 | 4 | """ 5 | pyrobotstxt: A Python Package for robots.txt Files. 
6 | 7 | MIT License 8 | Copyright (c) 2022 SERP Wings www.serpwings.com 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | 26 | from setuptools import setup 27 | 28 | with open("README.md", "r", encoding="utf-8") as fh: 29 | long_description = fh.read() 30 | 31 | version = "0.0.5" 32 | 33 | setup( 34 | name="pyrobotstxt", 35 | version=version, 36 | author="Faisal Shahzad", 37 | author_email="seowingsorg@gmail.com", 38 | description="Python Package to Generate and Analyse Robots.txt files", 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url="https://github.com/serpwings/pyrobotstxt/", 42 | project_urls={ 43 | "Bug Tracker": "https://github.com/serpwings/pyrobotstxt/issues", 44 | "Documentation": "https://pyrobotstxt.pages.dev/", 45 | }, 46 | classifiers=[ 47 | "Topic :: Utilities", 48 | "Development Status :: 1 - Planning", 49 | "Intended Audience :: Education", 50 | "Intended Audience :: System Administrators", 51 | "Intended Audience :: Financial and Insurance Industry", 52 | "Intended Audience :: Healthcare Industry", 53 | "Intended Audience :: Science/Research", 54 | "Programming Language :: Python :: 3", 55 | "License :: OSI Approved :: MIT License", 56 | "Operating System :: OS Independent", 57 | "Topic :: Education", 58 | "Topic :: Office/Business :: Scheduling", 59 | "Topic :: Scientific/Engineering", 60 | "Topic :: Scientific/Engineering :: Visualization", 61 | "Topic :: Software Development :: Libraries", 62 | ], 63 | packages=["pyrobotstxt"], 64 | python_requires=">=3.9", 65 | install_requires=["pillow==10.0.1", "requests==2.31.0", "beautifulsoup4==4.11.2"], 66 | extras_require={ 67 | "dev": [ 68 | "setuptools", 69 | "pytest", 70 | "pytest-cov", 71 | "twine", 72 | "wheel", 73 | "mkdocs", 74 | "mkdocs-gen-files", 75 | "mkdocstrings[python]", 76 | "pymdown-extensions", 77 | ] 78 | }, 79 | ) 80 | -------------------------------------------------------------------------------- /pyrobotstxt/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- # 3 | 4 | """ 5 | pyrobotstxt: A Python Package for robots.txt Files. 
6 | 7 | MIT License 8 | Copyright (c) 2022 SERP Wings www.serpwings.com 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | 26 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 27 | # IMPORTS Standard Library 28 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 29 | 30 | import re 31 | from unittest.mock import Mock 32 | import os 33 | import json 34 | from math import ceil 35 | from datetime import datetime 36 | 37 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 38 | # IMPORTS 3rd Party Libraries 39 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 40 | 41 | import requests 42 | from requests.adapters import HTTPAdapter 43 | from requests.models import Response 44 | from bs4 import BeautifulSoup 45 | from PIL import Image 46 | 47 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 48 | # CONSTANTS / ROBOTS DataBase 49 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 50 | 51 | ROBOTS = { 52 | "Applebot": "Apple", 53 | "AhrefsBot": "Ahrefs", 54 | "Baiduspider": "Baidu", 55 | "Bingbot": "Microsoft Bing", 56 | "Discordbot": "Discord", 57 | "DuckDuckBot": "DuckDuckGo", 58 | "Googlebot": "Google Search Bot", 59 | "Googlebot-Image": "Google Image Bot", 60 | "LinkedInBot": "LinkedIn Bot", 61 | "MJ12bot": "MJ12bot", 62 | "Pinterestbot": "Pinterest", 63 | "SemrushBot": "Semrsh", 64 | "Slurp": "Slurp", 65 | "TelegramBot": "Telegram", 66 | "Twitterbot": "Twitter Bot", 67 | "Yandex": "Yandex", 68 | "YandexBot": "YandexBot", 69 | "facebot": "Facebook", 70 | "msnbot": "MSN Bot", 71 | "rogerbot": "MOZ Bot", 72 | "xenu": "xenu", 73 | } 74 | 75 | HEADER = { 76 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", 77 | } 78 | 79 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 80 | # UTIL FUNCTIONS 81 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 82 | 83 | 84 | def mock_requests_object(url): 85 | """ """ 86 | response = Mock(spec=Response) 87 | response.text = "" 88 | response.status_code = 9999 89 | response.url = url 90 | return response 91 | 92 | 93 | def get_remote_content(url, max_retires=5): 94 | """ """ 95 | try: 96 | s = requests.Session() 97 | s.mount(url, HTTPAdapter(max_retries=max_retires)) 98 | return s.get(url, headers=HEADER) 99 | except: 100 | return mock_requests_object(url) 101 | 102 | 103 | def get_corrected_url(url, fix_slash="sitemap.xml"): 104 | """ """ 105 | if not url.startswith("http://") and not url.startswith("https://"): 106 | url = 
f"http://{url}" 107 | 108 | if not url.endswith(fix_slash): 109 | url = f"{url}/{fix_slash}" 110 | 111 | return url 112 | 113 | 114 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 115 | # CLASSES 116 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 117 | 118 | 119 | class ImageAsASCII: 120 | """Class to Convert RGB/GRAYSCALE Images to ASCII (Text) format.""" 121 | 122 | def __init__(self, image_path=None, desired_width=90): 123 | """intializes an object of ImageAsASCII class. A user need to 124 | specify desired (ascii) image width and the path of RGB/Gray Image. 125 | 126 | Args: 127 | desired_width (int, optional): width of the desired output ASCII Image. 128 | image_path (str, optional):path of the input image. If None then conversion will not work. 129 | """ 130 | 131 | if not image_path: 132 | raise ValueError 133 | 134 | self.ascii_str_map = [" ", *("*$+?.%;:,@")] 135 | self.ascii_image = "" 136 | self.image = Image.open(image_path).convert("L") 137 | 138 | desired_height = desired_width * self.image.height / self.image.width 139 | self.image = self.image.resize((ceil(desired_width), ceil(desired_height))) 140 | 141 | def map_to_ascii(self): 142 | """map each pixel of indvidual image to a respective ascii value from ascii_str_map. 143 | This is achieved by deviding each pixel to ca. 10 equal parts (//25) and then maped to respecive value. 144 | """ 145 | 146 | str_container = "" # a container to hold ascii charcters 147 | for pixel in self.image.getdata(): 148 | str_container += self.ascii_str_map[pixel // 25] 149 | 150 | self.ascii_image = "#\t" # Now transform the string container to column format. 151 | for i in range(0, len(str_container), self.image.width): 152 | self.ascii_image += ( 153 | " ".join(str_container[i : i + self.image.width]) + "\n#\t" 154 | ) 155 | 156 | 157 | class UserAgent: 158 | def __init__(self, ua_name="*", crawl_delay=0): 159 | """Initialize UserAgent objet with a user-agent name and crawl delay varible. 160 | 161 | Args: 162 | ua_name (str, optional): name of the user-agent. Defaults to "*". 163 | crawl_delay (int, optional): crawl delay value for user agent/bots. Defaults to 0. 164 | """ 165 | self.user_agent_name = ua_name 166 | self.crawl_delay = crawl_delay 167 | self.sitemaps = [] # lists of sitemap for current UserAgent 168 | self.allowed = [] # lists of Allowed Items for current UserAgent 169 | self.disallowed = [] # lists of Disallowed Items for current UserAgent 170 | self.content = "" # consolidate content for robots.txt file 171 | 172 | def add_allow(self, allow_items, unique=True, comments=""): 173 | """Add allowed items/pages/slugs to current User Agent. 174 | 175 | Args: 176 | allow_items (str, list): single item or list of items allowed for current user agnet. 177 | unique (bool, optional): If True duplicate item stripped to single value. Defaults to True. 178 | comments (str, optional): Any comments for added value for human readability. Defaults to "". 179 | """ 180 | 181 | if isinstance(allow_items, str): 182 | allow_items = [allow_items] 183 | 184 | if not isinstance(allow_items, list): 185 | print("not supported", type(allow_items)) # raise exception 186 | raise TypeError 187 | else: 188 | self.allowed += allow_items 189 | if unique: 190 | self.allowed = list(set(self.allowed)) 191 | 192 | def remove_allow(self, allow_item): 193 | """Remove any previously added allowed item from allowed list. 194 | 195 | Args: 196 | allow_item (str, list): item(s) to be removed. 
197 |         """
198 | 
199 |         if allow_item in self.allowed:
200 |             self.allowed.remove(allow_item)
201 | 
202 |     def add_disallow(self, disallow_items, unique=True, comments=""):
203 |         """Add disallowed items/pages/slugs to current User Agent.
204 | 
205 |         Args:
206 |             disallow_items (str, list): single item or list of items disallowed for current user agent.
207 |             unique (bool, optional): If True, duplicate items are reduced to a single entry. Defaults to True.
208 |             comments (str, optional): Optional comment for human readability. Defaults to "".
209 |         """
210 |         if isinstance(disallow_items, str):
211 |             disallow_items = [disallow_items]
212 | 
213 |         if not isinstance(disallow_items, list):
214 |             print("not supported", type(disallow_items))
215 |             raise TypeError
216 |         else:
217 |             self.disallowed += disallow_items
218 |             if unique:
219 |                 self.disallowed = list(set(self.disallowed))
220 | 
221 |     def remove_disallow(self, disallow_item):
222 |         """Remove any previously added disallowed item from the disallowed list.
223 | 
224 |         Args:
225 |             disallow_item (str, list): item(s) to be removed.
226 |         """
227 | 
228 |         if disallow_item in self.disallowed:
229 |             self.disallowed.remove(disallow_item)
230 | 
231 |     def add_sitemap(self, site_map_path=None, comments=""):
232 |         """Add a sitemap path/URL to the current user agent.
233 | 
234 |         Args:
235 |             site_map_path (str): location of the sitemap. Defaults to None.
236 |             comments (str): any comments to include with the sitemap path. Defaults to "".
237 |         """
238 |         if not site_map_path:
239 |             raise ValueError
240 | 
241 |         self.sitemaps.append(site_map_path)
242 | 
243 |     def remove_sitemap(self, site_map_path=None):
244 |         """Remove a sitemap from the current user agent.
245 | 
246 |         Args:
247 |             site_map_path (str): sitemap path to be removed. Defaults to None.
248 |         """
249 | 
250 |         if site_map_path in self.sitemaps:
251 |             self.sitemaps.remove(site_map_path)
252 | 
253 |     def disallow_pagination(self, prefix="/page/*", comments=""):
254 |         """Convenience method to disallow crawling of pagination pages via the robots.txt file.
255 | 
256 |         Args:
257 |             prefix (str, optional): URL pattern for pagination pages. Defaults to "/page/*".
258 |             comments (str, optional): human readable comments for inclusion. Defaults to "".
259 |         """
260 |         self.add_disallow(disallow_items=prefix, comments=comments)
261 | 
262 |     def consolidate(self):
263 |         """Consolidate all the information (allowed, disallowed, sitemaps) into a single text string."""
264 | 
265 |         self.content = f"User-agent: {self.user_agent_name}"
266 | 
267 |         # Support for including Crawl-delay, see feature request #1
268 |         if self.crawl_delay > 0:
269 |             self.content += f"\nCrawl-delay: {self.crawl_delay}\n"
270 | 
271 |         if self.allowed:
272 |             self.content += "\n# Allowed Patterns\n"
273 |             self.content += "\n".join([f"Allow: {item}" for item in self.allowed])
274 | 
275 |         if self.disallowed:
276 |             self.content += "\n\n# Disallowed Patterns\n"
277 |             self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])
278 | 
279 |         if self.sitemaps:
280 |             self.content += "\n\n# Site Maps\n"
281 |             self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])
282 | 
283 |         self.content += "\n\n"
284 | 
285 | 
286 | class RobotsTxt:
287 |     def __init__(self, version=""):
288 |         """Initializes Robots.txt operations.
289 | 
290 |         Args:
291 |             version (str, optional): Optional version number for robots.txt. Defaults to "".
292 | """ 293 | self.user_agents = [] 294 | self.create_time = datetime.now() 295 | self.version = version 296 | self.image_branding = None 297 | self.header = "" # message added to the start of the output file. 298 | self.footer = "" # message added to the end of the output file. 299 | 300 | def read(self, robots_url): 301 | """Read a Remote Robots.txt file from a given URL 302 | 303 | If robots_txt is missing a robots.txt file extention then it will be automatically added. 304 | Parsing will only be carried out if robots_url returns a valid response object. 305 | 306 | Args: 307 | robots_url (str): robots.txt url at a remote location. 308 | """ 309 | 310 | self.create_time = datetime.now() 311 | robots_url = get_corrected_url(robots_url, "") 312 | response = get_remote_content(robots_url) 313 | 314 | if response.status_code < 400: 315 | for ua_item in response.text.split("User-agent:"): 316 | if ua_item: 317 | ua_content_items = [ 318 | ua_split_item.strip() 319 | for ua_split_item in ua_item.split("\n") 320 | if ua_split_item 321 | ] 322 | if not ua_content_items[0].startswith("#"): 323 | ua = UserAgent(ua_name=ua_content_items[0]) 324 | ua.add_allow( 325 | [ 326 | it.split("Allow:")[-1] 327 | for it in ua_content_items[1:] 328 | if it.startswith("Allow:") 329 | ] 330 | ) 331 | ua.add_disallow( 332 | [ 333 | it.split("Disallow:")[-1] 334 | for it in ua_content_items[1:] 335 | if it.startswith("Disallow:") 336 | ] 337 | ) 338 | # TODO: Comments are not included Yet 339 | comment = [ 340 | it.split("# ")[-1] 341 | for it in ua_content_items[1:] 342 | if it.startswith("#") 343 | ] 344 | 345 | self.add_user_agent(ua=ua) 346 | 347 | def write(self, file_path="robots.txt"): 348 | """write robots.txt file at a given file_path location. 349 | 350 | Args: 351 | file_path (str, optional): location of robots.txt file. Defaults to "robots.txt". 352 | """ 353 | 354 | with open(file_path, "w") as f: 355 | # include header 356 | if self.header: 357 | f.write(f"# {self.header}") 358 | 359 | # include user agents with consolidate text 360 | for ua in self.user_agents: 361 | ua.consolidate() 362 | f.write(ua.content) 363 | 364 | f.write("\n") 365 | 366 | # append ascii image, if available 367 | if self.image_branding: 368 | f.write(self.image_branding) 369 | 370 | # append footer message 371 | if self.footer: 372 | f.write(f"\n# {self.footer}") 373 | 374 | def include_header(self, message="", append_date=True): 375 | """include header message with/without creation date. 376 | 377 | Args: 378 | message (str, optional): header or header message. Defaults to "". 379 | append_date (bool, optional): Append date/time to the header. Defaults to True. 380 | """ 381 | 382 | self.header = message 383 | 384 | if append_date: 385 | self.header += f"\n# Created on {self.create_time} using pyrobotstxt" 386 | 387 | def include_footer(self, message=""): 388 | """include footer message 389 | 390 | Args: 391 | message (str, optional): footer message. Defaults to "". 392 | """ 393 | self.footer = message 394 | 395 | def include_image(self, image_path=None, desired_width=90): 396 | """includes ascii image provided at image_file 397 | 398 | Args: 399 | image_path (str): location of image file. Defaults to None. 400 | desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars). 
401 |         """
402 |         img = ImageAsASCII(image_path=image_path, desired_width=desired_width)
403 |         img.map_to_ascii()
404 |         self.image_branding = img.ascii_image
405 | 
406 |     def add_user_agent(self, ua):
407 |         """Add/append a user agent to RobotsTxt.
408 | 
409 |         Args:
410 |             ua (UserAgent): user agent to be included in the final robots.txt file.
411 |         """
412 |         self.user_agents.append(ua)
413 | 
414 |     def remove_user_agent(self, ua_name=""):
415 |         """Remove a user agent from RobotsTxt.
416 | 
417 |         Args:
418 |             ua_name (str): name of the user agent to be removed from the robots.txt file.
419 |         """
420 |         self.user_agents = [ua for ua in self.user_agents if ua.user_agent_name != ua_name]
421 | 
422 |     @staticmethod
423 |     def robots_name(crawl_bot):
424 |         """Find robot name, if you know any keyword about that crawl bot.
425 | 
426 |         Args:
427 |             crawl_bot (str): description of the crawl bot, e.g. facebook
428 | 
429 |         Returns:
430 |             (dict): all matching crawl bots with relevant information
431 |         """
432 |         return {
433 |             robot: ROBOTS[robot]
434 |             for robot in ROBOTS
435 |             if crawl_bot.capitalize() in ROBOTS[robot]
436 |         }
437 | 
438 |     @staticmethod
439 |     def robots_details(crawl_bot):
440 |         """Static method to return details about any crawl bot.
441 | 
442 |         Args:
443 |             crawl_bot (str): name of the crawl bot
444 | 
445 |         Returns:
446 |             (dict): information about all crawl bots matching the input string.
447 |         """
448 |         return {
449 |             robot: ROBOTS[robot]
450 |             for robot in ROBOTS
451 |             if crawl_bot.lower() == robot.lower()
452 |         }
453 | 
--------------------------------------------------------------------------------
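The ASCII-branding feature mentioned in the README (`RobotsTxt.include_image` / `ImageAsASCII`) is not covered by `examples/complete_example.py`. A minimal sketch of how it combines with the rest of the API, assuming a local image exists at the illustrative path `logo.png`:

```python
from pyrobotstxt import RobotsTxt, UserAgent

robots_file = RobotsTxt()
robots_file.include_header("Welcome Crawlers")

# crawl_delay > 0 adds a Crawl-delay directive for this user agent
ua = UserAgent(ua_name="*", crawl_delay=5)
ua.add_disallow(disallow_items=["/private/"])  # illustrative pattern
robots_file.add_user_agent(ua)

# Convert logo.png (illustrative path) to ASCII art and append it as comments
robots_file.include_image(image_path="logo.png", desired_width=60)

robots_file.write("robots.txt")
```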