├── docs
    ├── useragent.md
    ├── robotstxt.md
    ├── imageasascii.md
    ├── img
    │   ├── favicon.ico
    │   └── feature-image.png
    ├── developers.md
    ├── tutorial.md
    └── index.md
├── tests
    └── test_robot_name.py
├── examples
    ├── robots.txt
    └── complete_example.py
├── .github
    └── workflows
    │   ├── makedocs.yml
    │   └── pipy-publish.yml
├── mkdocs.yml
├── LICENSE
├── README.md
├── .gitignore
├── setup.py
└── pyrobotstxt
    └── __init__.py

/docs/useragent.md:
--------------------------------------------------------------------------------
1 | # UserAgent Class
2 | 
3 | ::: pyrobotstxt.UserAgent
--------------------------------------------------------------------------------
/docs/robotstxt.md:
--------------------------------------------------------------------------------
1 | 
2 | # RobotsTxt Class
3 | 
4 | ::: pyrobotstxt.RobotsTxt
5 | 
--------------------------------------------------------------------------------
/docs/imageasascii.md:
--------------------------------------------------------------------------------
1 | # ImageAsASCII Class
2 | 
3 | ::: pyrobotstxt.ImageAsASCII
4 | 
5 | 
--------------------------------------------------------------------------------
/docs/img/favicon.ico:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/serpwings/pyrobotstxt/main/docs/img/favicon.ico
--------------------------------------------------------------------------------
/docs/img/feature-image.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/serpwings/pyrobotstxt/main/docs/img/feature-image.png
--------------------------------------------------------------------------------
/docs/developers.md:
--------------------------------------------------------------------------------
1 | # Developer Tutorial
2 | 
3 | A developer tutorial is available at the [serpwings pyrobotstxt tutorial page](https://serpwings.com/software/python-robots-txt/#developers-tutorial)
--------------------------------------------------------------------------------
/docs/tutorial.md:
--------------------------------------------------------------------------------
1 | # Tutorial: How to Create a Robots.txt File
2 | 
3 | A user tutorial is available at the [serpwings pyrobotstxt tutorial page](https://serpwings.com/software/python-robots-txt/#installation)
--------------------------------------------------------------------------------
/tests/test_robot_name.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pyrobotstxt import RobotsTxt
3 | 
4 | 
5 | def test_robot_name():
6 |     mrt = RobotsTxt()
7 |     result = mrt.robots_name("facebook")
8 |     assert "facebot" in result, "expected 'facebot' in the lookup result for 'facebook'"
--------------------------------------------------------------------------------
/examples/robots.txt:
--------------------------------------------------------------------------------
1 | # Welcome Crawlers
2 | # Created on 2023-03-17 23:42:50.589282 using pyrobotstxtUser-agent: *
3 | # Allowed Patterns
4 | Allow: /home
5 | Allow: /deep
6 | 
7 | # Disallowed Patterns
8 | Disallow: /topi?a
9 | Disallow: /nopi$
10 | Disallow: /img*.png$
11 | 
12 | User-agent: Google
13 | # Allowed Patterns
14 | Allow: /home
15 | Allow: /deep
16 | 
17 | # Disallowed Patterns
18 | Disallow: /topi?a
19 | Disallow: /nopi$
20 | Disallow: /img*.png$
21 | 
22 | # Site Maps
23 | Sitemap: https://seowings.org/sitemap.xml
24 | 
25 | 
26 | 
27 | # Good Bye Crawlers
-------------------------------------------------------------------------------- /.github/workflows/makedocs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation Generator 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | with: 13 | fetch-depth: 0 14 | - uses: actions/setup-python@v2 15 | - run: pip install --upgrade pip && pip install mkdocs mkdocs-gen-files mkdocstrings[python] pymdown-extensions 16 | - run: git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 17 | - name: Publish docs 18 | run: mkdocs gh-deploy 19 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: pyrobotstxt 2 | site_description: Python Package to Create, Analyze and Manipulate Robots.txt Files 3 | site_author: Faisal Shahzad 4 | 5 | site_url: https://pyrobotstxt.pages.dev 6 | 7 | repo_url: https://github.com/serpwings/pyrobotstxt 8 | edit_uri: blob/main/docs/ 9 | 10 | nav: 11 | - Home: index.md 12 | - User Tutorial: tutorial.md 13 | - Developer Tutorial: developers.md 14 | - API: 15 | - ImageAsASCII: imageasascii.md 16 | - RobotsTxt: robotstxt.md 17 | - UserAgent: useragent.md 18 | 19 | plugins: 20 | - search 21 | - mkdocstrings 22 | 23 | markdown_extensions: 24 | - admonition 25 | - codehilite 26 | - smarty 27 | - meta 28 | - toc: 29 | permalink: True 30 | - attr_list 31 | 32 | theme: readthedocs 33 | 34 | copyright: © Copyright 2022-2023 Faisal Shahzad (seowings.org) -------------------------------------------------------------------------------- /.github/workflows/pipy-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | #push: 8 | # branches: [main] 9 | 10 | release: 11 | types: [published] 12 | 13 | jobs: 14 | deploy: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: '3.x' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install setuptools wheel twine 28 | - name: Build and publish 29 | env: 30 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 31 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 32 | run: | 33 | python setup.py sdist bdist_wheel 34 | twine upload dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Seo Wings 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![pyrobotstxt feature image](docs/img/feature-image.png)
2 | 
3 | # pyrobotstxt: A Python Package for **robots.txt** Files
4 | 
5 | The ``pyrobotstxt`` package can be used to (systematically) generate **robots.txt** files. It also comes in handy for creating and including ``ASCII`` images in **robots.txt** files.
6 | 
7 | In future releases, it will be possible to parse and analyze **robots.txt** files generated by any software (not limited to ``pyrobotstxt``).
8 | 
9 | ## What's in pyrobotstxt?
10 | 
11 | We believe in monolithic software development and created this tiny package that does its job without any bloat. It is useful for:
12 | 
13 | - Creating a **robots.txt** file
14 | - Parsing a **robots.txt** file [in progress]
15 | - Analyzing a **robots.txt** file [in progress]
16 | 
17 | ## How to Use pyrobotstxt?
18 | 
19 | A detailed tutorial is available on the [pyrobotstxt tutorial website](https://serpwings.com/software/python-robots-txt/), and the [pyrobotstxt documentation website](https://pyrobotstxt.pages.dev) provides an API reference. A short usage sketch is also included at the end of this README.
20 | 
21 | ## Contribute
22 | 
23 | Pull requests, feature suggestions, and collaborations are welcome.
24 | 
25 | ## About Us
26 | 
27 | This work is a collaborative effort of [seowings](https://seowings.org/) and [serpwings](https://serpwings.com/).
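## Quick Example

A condensed usage sketch, adapted from the bundled ``examples/complete_example.py`` (the sitemap URL and output path are taken from that example and are illustrative):

```python
from pyrobotstxt import RobotsTxt, UserAgent

robots_file = RobotsTxt()
robots_file.include_header("Welcome Crawlers", append_date=True)
robots_file.include_footer("Good Bye Crawlers")

# Rules applied to all crawlers
ua_general = UserAgent(ua_name="*")
ua_general.add_allow(allow_items=["/home", "/deep"], unique=True)
ua_general.add_disallow(disallow_items=["/nopi$", "/img*.png$"], unique=True)
ua_general.add_sitemap("https://seowings.org/sitemap.xml")

# Assemble and write the final file
robots_file.add_user_agent(ua_general)
robots_file.write("robots.txt")
```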
28 | 
--------------------------------------------------------------------------------
/examples/complete_example.py:
--------------------------------------------------------------------------------
1 | from pyrobotstxt import RobotsTxt, UserAgent
2 | 
3 | robots_file = RobotsTxt()
4 | 
5 | robots_file.include_header("Welcome Crawlers", append_date=True)
6 | robots_file.include_footer("Good Bye Crawlers")
7 | 
8 | ua_general = UserAgent(ua_name="*")
9 | ua_general.add_allow(
10 |     allow_items=["/home", "/deep", "/home"],
11 |     unique=True,
12 |     comments="This is a list of allowed items",
13 | )
14 | 
15 | ua_general.add_disallow(
16 |     disallow_items=["/nopi$", "/topi?a", "/img*.png$"],
17 |     unique=True,
18 |     comments="This is a list of disallowed items",
19 | )
20 | 
21 | ua_general_google = UserAgent(ua_name="Google")
22 | ua_general_google.add_allow(
23 |     allow_items=["/home", "/deep", "/home"],
24 |     unique=True,
25 |     comments="This is a list of allowed items",
26 | )
27 | ua_general_google.add_disallow(
28 |     disallow_items=["/nopi$", "/topi?a", "/img*.png$"],
29 |     unique=True,
30 |     comments="This is a list of disallowed items",
31 | )
32 | ua_general_google.add_sitemap("https://seowings.org/sitemap.xml")
33 | 
34 | robots_file.add_user_agent(ua_general)
35 | robots_file.add_user_agent(ua_general_google)
36 | 
37 | robots_file.write("robots.txt")
38 | 
39 | # Read Remote File
40 | robots_file_2 = RobotsTxt()
41 | robots_file_2.read("https://nike.com/robots.txt")
42 | robots_file_2.write("nike_robots.txt")
43 | 
44 | print(robots_file_2.robots_details("Baiduspider"))
45 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ![pyrobotstxt feature image](img/feature-image.png)
2 | 
3 | # pyrobotstxt - A Python Package for robots.txt Files
4 | 
5 | The ``pyrobotstxt`` package can be used to (systematically) generate **robots.txt** files. It also comes in handy for creating and including ``ASCII`` images in **robots.txt** files.
6 | 
7 | In future releases, it will be possible to parse, analyze and manipulate **robots.txt** files generated by any software (not limited to ``pyrobotstxt``).
8 | 
9 | ## What's in pyrobotstxt?
10 | 
11 | We believe in monolithic software development and created this tiny package that does its job without any bloat. It is useful for:
12 | 
13 | - Creating a **robots.txt** file
14 | - Parsing a **robots.txt** file [in progress]
15 | - Analyzing a **robots.txt** file [in progress]
16 | 
17 | ## How to Use pyrobotstxt?
18 | 
19 | You can follow our [basic user tutorial](tutorial.md) to learn how to use this library.
20 | 
21 | If you are a developer or want to test the latest version of pyrobotstxt, you might find the information in our [developers](developers.md) section useful. A short sketch of reading and analyzing an existing **robots.txt** file is also included at the end of this page.
22 | 
23 | ## About Us
24 | 
25 | 
26 | [SERP Wings](https://www.serpwings.com){target=_blank} is a digital organization which develops software solutions for **Boosting SERP Performance Through Opensource Tools**.
27 | 
28 | [seowings](https://www.seowings.org){target=_blank} is an open-source project to write, develop, and promote tools for Data Science and Digital Marketing.
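## Reading an Existing robots.txt

A minimal sketch of the read/analyze workflow, based on the second half of the bundled ``examples/complete_example.py`` (the target URL and output filename are illustrative):

```python
from pyrobotstxt import RobotsTxt

# Fetch a remote robots.txt, re-serialize it locally
robots_file = RobotsTxt()
robots_file.read("https://nike.com/robots.txt")
robots_file.write("nike_robots.txt")

# Look up crawler details from the built-in bot database
print(RobotsTxt.robots_name("facebook"))          # match by keyword
print(robots_file.robots_details("Baiduspider"))  # match by exact bot name
```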
29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- # 3 | 4 | """ 5 | pyrobotstxt: A Python Package for robots.txt Files. 
6 | 7 | MIT License 8 | Copyright (c) 2022 SERP Wings www.serpwings.com 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | 26 | from setuptools import setup 27 | 28 | with open("README.md", "r", encoding="utf-8") as fh: 29 | long_description = fh.read() 30 | 31 | version = "0.0.5" 32 | 33 | setup( 34 | name="pyrobotstxt", 35 | version=version, 36 | author="Faisal Shahzad", 37 | author_email="seowingsorg@gmail.com", 38 | description="Python Package to Generate and Analyse Robots.txt files", 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url="https://github.com/serpwings/pyrobotstxt/", 42 | project_urls={ 43 | "Bug Tracker": "https://github.com/serpwings/pyrobotstxt/issues", 44 | "Documentation": "https://pyrobotstxt.pages.dev/", 45 | }, 46 | classifiers=[ 47 | "Topic :: Utilities", 48 | "Development Status :: 1 - Planning", 49 | "Intended Audience :: Education", 50 | "Intended Audience :: System Administrators", 51 | "Intended Audience :: Financial and Insurance Industry", 52 | "Intended Audience :: Healthcare Industry", 53 | "Intended Audience :: Science/Research", 54 | "Programming Language :: Python :: 3", 55 | "License :: OSI Approved :: MIT License", 56 | "Operating System :: OS Independent", 57 | "Topic :: Education", 58 | "Topic :: Office/Business :: Scheduling", 59 | "Topic :: Scientific/Engineering", 60 | "Topic :: Scientific/Engineering :: Visualization", 61 | "Topic :: Software Development :: Libraries", 62 | ], 63 | packages=["pyrobotstxt"], 64 | python_requires=">=3.9", 65 | install_requires=["pillow==10.0.1", "requests==2.31.0", "beautifulsoup4==4.11.2"], 66 | extras_require={ 67 | "dev": [ 68 | "setuptools", 69 | "pytest", 70 | "pytest-cov", 71 | "twine", 72 | "wheel", 73 | "mkdocs", 74 | "mkdocs-gen-files", 75 | "mkdocstrings[python]", 76 | "pymdown-extensions", 77 | ] 78 | }, 79 | ) 80 | -------------------------------------------------------------------------------- /pyrobotstxt/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- # 3 | 4 | """ 5 | pyrobotstxt: A Python Package for robots.txt Files. 
6 | 7 | MIT License 8 | Copyright (c) 2022 SERP Wings www.serpwings.com 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | 26 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 27 | # IMPORTS Standard Library 28 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 29 | 30 | import re 31 | from unittest.mock import Mock 32 | import os 33 | import json 34 | from math import ceil 35 | from datetime import datetime 36 | 37 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 38 | # IMPORTS 3rd Party Libraries 39 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 40 | 41 | import requests 42 | from requests.adapters import HTTPAdapter 43 | from requests.models import Response 44 | from bs4 import BeautifulSoup 45 | from PIL import Image 46 | 47 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 48 | # CONSTANTS / ROBOTS DataBase 49 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 50 | 51 | ROBOTS = { 52 | "Applebot": "Apple", 53 | "AhrefsBot": "Ahrefs", 54 | "Baiduspider": "Baidu", 55 | "Bingbot": "Microsoft Bing", 56 | "Discordbot": "Discord", 57 | "DuckDuckBot": "DuckDuckGo", 58 | "Googlebot": "Google Search Bot", 59 | "Googlebot-Image": "Google Image Bot", 60 | "LinkedInBot": "LinkedIn Bot", 61 | "MJ12bot": "MJ12bot", 62 | "Pinterestbot": "Pinterest", 63 | "SemrushBot": "Semrsh", 64 | "Slurp": "Slurp", 65 | "TelegramBot": "Telegram", 66 | "Twitterbot": "Twitter Bot", 67 | "Yandex": "Yandex", 68 | "YandexBot": "YandexBot", 69 | "facebot": "Facebook", 70 | "msnbot": "MSN Bot", 71 | "rogerbot": "MOZ Bot", 72 | "xenu": "xenu", 73 | } 74 | 75 | HEADER = { 76 | "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", 77 | } 78 | 79 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 80 | # UTIL FUNCTIONS 81 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 82 | 83 | 84 | def mock_requests_object(url): 85 | """ """ 86 | response = Mock(spec=Response) 87 | response.text = "" 88 | response.status_code = 9999 89 | response.url = url 90 | return response 91 | 92 | 93 | def get_remote_content(url, max_retires=5): 94 | """ """ 95 | try: 96 | s = requests.Session() 97 | s.mount(url, HTTPAdapter(max_retries=max_retires)) 98 | return s.get(url, headers=HEADER) 99 | except: 100 | return mock_requests_object(url) 101 | 102 | 103 | def get_corrected_url(url, fix_slash="sitemap.xml"): 104 | """ """ 105 | if not url.startswith("http://") and not url.startswith("https://"): 106 | url = 
f"http://{url}" 107 | 108 | if not url.endswith(fix_slash): 109 | url = f"{url}/{fix_slash}" 110 | 111 | return url 112 | 113 | 114 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 115 | # CLASSES 116 | # +++++++++++++++++++++++++++++++++++++++++++++++++++++ 117 | 118 | 119 | class ImageAsASCII: 120 | """Class to Convert RGB/GRAYSCALE Images to ASCII (Text) format.""" 121 | 122 | def __init__(self, image_path=None, desired_width=90): 123 | """intializes an object of ImageAsASCII class. A user need to 124 | specify desired (ascii) image width and the path of RGB/Gray Image. 125 | 126 | Args: 127 | desired_width (int, optional): width of the desired output ASCII Image. 128 | image_path (str, optional):path of the input image. If None then conversion will not work. 129 | """ 130 | 131 | if not image_path: 132 | raise ValueError 133 | 134 | self.ascii_str_map = [" ", *("*$+?.%;:,@")] 135 | self.ascii_image = "" 136 | self.image = Image.open(image_path).convert("L") 137 | 138 | desired_height = desired_width * self.image.height / self.image.width 139 | self.image = self.image.resize((ceil(desired_width), ceil(desired_height))) 140 | 141 | def map_to_ascii(self): 142 | """map each pixel of indvidual image to a respective ascii value from ascii_str_map. 143 | This is achieved by deviding each pixel to ca. 10 equal parts (//25) and then maped to respecive value. 144 | """ 145 | 146 | str_container = "" # a container to hold ascii charcters 147 | for pixel in self.image.getdata(): 148 | str_container += self.ascii_str_map[pixel // 25] 149 | 150 | self.ascii_image = "#\t" # Now transform the string container to column format. 151 | for i in range(0, len(str_container), self.image.width): 152 | self.ascii_image += ( 153 | " ".join(str_container[i : i + self.image.width]) + "\n#\t" 154 | ) 155 | 156 | 157 | class UserAgent: 158 | def __init__(self, ua_name="*", crawl_delay=0): 159 | """Initialize UserAgent objet with a user-agent name and crawl delay varible. 160 | 161 | Args: 162 | ua_name (str, optional): name of the user-agent. Defaults to "*". 163 | crawl_delay (int, optional): crawl delay value for user agent/bots. Defaults to 0. 164 | """ 165 | self.user_agent_name = ua_name 166 | self.crawl_delay = crawl_delay 167 | self.sitemaps = [] # lists of sitemap for current UserAgent 168 | self.allowed = [] # lists of Allowed Items for current UserAgent 169 | self.disallowed = [] # lists of Disallowed Items for current UserAgent 170 | self.content = "" # consolidate content for robots.txt file 171 | 172 | def add_allow(self, allow_items, unique=True, comments=""): 173 | """Add allowed items/pages/slugs to current User Agent. 174 | 175 | Args: 176 | allow_items (str, list): single item or list of items allowed for current user agnet. 177 | unique (bool, optional): If True duplicate item stripped to single value. Defaults to True. 178 | comments (str, optional): Any comments for added value for human readability. Defaults to "". 179 | """ 180 | 181 | if isinstance(allow_items, str): 182 | allow_items = [allow_items] 183 | 184 | if not isinstance(allow_items, list): 185 | print("not supported", type(allow_items)) # raise exception 186 | raise TypeError 187 | else: 188 | self.allowed += allow_items 189 | if unique: 190 | self.allowed = list(set(self.allowed)) 191 | 192 | def remove_allow(self, allow_item): 193 | """Remove any previously added allowed item from allowed list. 194 | 195 | Args: 196 | allow_item (str, list): item(s) to be removed. 
197 |         """
198 | 
199 |         if allow_item in self.allowed:
200 |             self.allowed.remove(allow_item)
201 | 
202 |     def add_disallow(self, disallow_items, unique=True, comments=""):
203 |         """Add disallowed items/pages/slugs to current User Agent.
204 | 
205 |         Args:
206 |             disallow_items (str, list): single item or list of items disallowed for current user agent.
207 |             unique (bool, optional): If True, duplicate items are reduced to a single entry. Defaults to True.
208 |             comments (str, optional): Optional comment for human readability. Defaults to "".
209 |         """
210 |         if isinstance(disallow_items, str):
211 |             disallow_items = [disallow_items]
212 | 
213 |         if not isinstance(disallow_items, list):
214 |             print("not supported", type(disallow_items))
215 |             raise TypeError
216 |         else:
217 |             self.disallowed += disallow_items
218 |             if unique:
219 |                 self.disallowed = list(set(self.disallowed))
220 | 
221 |     def remove_disallow(self, disallow_item):
222 |         """Remove any previously added disallowed item from the disallowed list.
223 | 
224 |         Args:
225 |             disallow_item (str, list): item(s) to be removed.
226 |         """
227 | 
228 |         if disallow_item in self.disallowed:
229 |             self.disallowed.remove(disallow_item)
230 | 
231 |     def add_sitemap(self, site_map_path=None, comments=""):
232 |         """Add a sitemap path/URL to the current user agent.
233 | 
234 |         Args:
235 |             site_map_path (str): location of the sitemap. Defaults to None.
236 |             comments (str): any comments to include with the sitemap path. Defaults to "".
237 |         """
238 |         if not site_map_path:
239 |             raise ValueError
240 | 
241 |         self.sitemaps.append(site_map_path)
242 | 
243 |     def remove_sitemap(self, site_map_path=None):
244 |         """Remove a sitemap from the current user agent.
245 | 
246 |         Args:
247 |             site_map_path (str): sitemap path to be removed. Defaults to None.
248 |         """
249 | 
250 |         if site_map_path in self.sitemaps:
251 |             self.sitemaps.remove(site_map_path)
252 | 
253 |     def disallow_pagination(self, prefix="/page/*", comments=""):
254 |         """Convenience method to disallow crawling of pagination pages via the robots.txt file.
255 | 
256 |         Args:
257 |             prefix (str, optional): URL pattern for pagination pages. Defaults to "/page/*".
258 |             comments (str, optional): human readable comments for inclusion. Defaults to "".
259 |         """
260 |         self.add_disallow(disallow_items=prefix, comments=comments)
261 | 
262 |     def consolidate(self):
263 |         """Consolidate all the information (allowed, disallowed, sitemaps) into a single text string."""
264 | 
265 |         self.content = f"User-agent: {self.user_agent_name}"
266 | 
267 |         # Support for including Crawl-delay, see feature request #1
268 |         if self.crawl_delay > 0:
269 |             self.content += f"\nCrawl-delay: {self.crawl_delay}\n"
270 | 
271 |         if self.allowed:
272 |             self.content += "\n# Allowed Patterns\n"
273 |             self.content += "\n".join([f"Allow: {item}" for item in self.allowed])
274 | 
275 |         if self.disallowed:
276 |             self.content += "\n\n# Disallowed Patterns\n"
277 |             self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])
278 | 
279 |         if self.sitemaps:
280 |             self.content += "\n\n# Site Maps\n"
281 |             self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])
282 | 
283 |         self.content += "\n\n"
284 | 
285 | 
286 | class RobotsTxt:
287 |     def __init__(self, version=""):
288 |         """Initializes Robots.txt operations.
289 | 
290 |         Args:
291 |             version (str, optional): Optional version number for robots.txt. Defaults to "".
292 | """ 293 | self.user_agents = [] 294 | self.create_time = datetime.now() 295 | self.version = version 296 | self.image_branding = None 297 | self.header = "" # message added to the start of the output file. 298 | self.footer = "" # message added to the end of the output file. 299 | 300 | def read(self, robots_url): 301 | """Read a Remote Robots.txt file from a given URL 302 | 303 | If robots_txt is missing a robots.txt file extention then it will be automatically added. 304 | Parsing will only be carried out if robots_url returns a valid response object. 305 | 306 | Args: 307 | robots_url (str): robots.txt url at a remote location. 308 | """ 309 | 310 | self.create_time = datetime.now() 311 | robots_url = get_corrected_url(robots_url, "") 312 | response = get_remote_content(robots_url) 313 | 314 | if response.status_code < 400: 315 | for ua_item in response.text.split("User-agent:"): 316 | if ua_item: 317 | ua_content_items = [ 318 | ua_split_item.strip() 319 | for ua_split_item in ua_item.split("\n") 320 | if ua_split_item 321 | ] 322 | if not ua_content_items[0].startswith("#"): 323 | ua = UserAgent(ua_name=ua_content_items[0]) 324 | ua.add_allow( 325 | [ 326 | it.split("Allow:")[-1] 327 | for it in ua_content_items[1:] 328 | if it.startswith("Allow:") 329 | ] 330 | ) 331 | ua.add_disallow( 332 | [ 333 | it.split("Disallow:")[-1] 334 | for it in ua_content_items[1:] 335 | if it.startswith("Disallow:") 336 | ] 337 | ) 338 | # TODO: Comments are not included Yet 339 | comment = [ 340 | it.split("# ")[-1] 341 | for it in ua_content_items[1:] 342 | if it.startswith("#") 343 | ] 344 | 345 | self.add_user_agent(ua=ua) 346 | 347 | def write(self, file_path="robots.txt"): 348 | """write robots.txt file at a given file_path location. 349 | 350 | Args: 351 | file_path (str, optional): location of robots.txt file. Defaults to "robots.txt". 352 | """ 353 | 354 | with open(file_path, "w") as f: 355 | # include header 356 | if self.header: 357 | f.write(f"# {self.header}") 358 | 359 | # include user agents with consolidate text 360 | for ua in self.user_agents: 361 | ua.consolidate() 362 | f.write(ua.content) 363 | 364 | f.write("\n") 365 | 366 | # append ascii image, if available 367 | if self.image_branding: 368 | f.write(self.image_branding) 369 | 370 | # append footer message 371 | if self.footer: 372 | f.write(f"\n# {self.footer}") 373 | 374 | def include_header(self, message="", append_date=True): 375 | """include header message with/without creation date. 376 | 377 | Args: 378 | message (str, optional): header or header message. Defaults to "". 379 | append_date (bool, optional): Append date/time to the header. Defaults to True. 380 | """ 381 | 382 | self.header = message 383 | 384 | if append_date: 385 | self.header += f"\n# Created on {self.create_time} using pyrobotstxt" 386 | 387 | def include_footer(self, message=""): 388 | """include footer message 389 | 390 | Args: 391 | message (str, optional): footer message. Defaults to "". 392 | """ 393 | self.footer = message 394 | 395 | def include_image(self, image_path=None, desired_width=90): 396 | """includes ascii image provided at image_file 397 | 398 | Args: 399 | image_path (str): location of image file. Defaults to None. 400 | desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars). 
401 |         """
402 |         img = ImageAsASCII(image_path=image_path, desired_width=desired_width)
403 |         img.map_to_ascii()
404 |         self.image_branding = img.ascii_image
405 | 
406 |     def add_user_agent(self, ua):
407 |         """Add/append a user agent to RobotsTxt.
408 | 
409 |         Args:
410 |             ua (UserAgent): user agent to be included in the final robots.txt file.
411 |         """
412 |         self.user_agents.append(ua)
413 | 
414 |     def remove_user_agent(self, ua_name=""):
415 |         """Remove a user agent from RobotsTxt.
416 | 
417 |         Args:
418 |             ua_name (str): name of the user agent to be removed from the robots.txt file.
419 |         """
420 |         self.user_agents = [ua for ua in self.user_agents if ua.user_agent_name != ua_name]
421 | 
422 |     @staticmethod
423 |     def robots_name(crawl_bot):
424 |         """Find robot name, if you know any keyword about that crawl bot.
425 | 
426 |         Args:
427 |             crawl_bot (str): description of the crawl bot, e.g. facebook
428 | 
429 |         Returns:
430 |             (dict): all matching crawl bots with relevant information
431 |         """
432 |         return {
433 |             robot: ROBOTS[robot]
434 |             for robot in ROBOTS
435 |             if crawl_bot.capitalize() in ROBOTS[robot]
436 |         }
437 | 
438 |     @staticmethod
439 |     def robots_details(crawl_bot):
440 |         """Static method to return details about any crawl bot.
441 | 
442 |         Args:
443 |             crawl_bot (str): name of the crawl bot
444 | 
445 |         Returns:
446 |             (dict): information about all crawl bots matching the input string.
447 |         """
448 |         return {
449 |             robot: ROBOTS[robot]
450 |             for robot in ROBOTS
451 |             if crawl_bot.lower() == robot.lower()
452 |         }
453 | 
--------------------------------------------------------------------------------
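The ASCII-branding feature mentioned in the README (`RobotsTxt.include_image` / `ImageAsASCII`) is not covered by `examples/complete_example.py`. A minimal sketch of how it combines with the rest of the API, assuming a local image exists at the illustrative path `logo.png`:

```python
from pyrobotstxt import RobotsTxt, UserAgent

robots_file = RobotsTxt()
robots_file.include_header("Welcome Crawlers")

# crawl_delay > 0 adds a Crawl-delay directive for this user agent
ua = UserAgent(ua_name="*", crawl_delay=5)
ua.add_disallow(disallow_items=["/private/"])  # illustrative pattern
robots_file.add_user_agent(ua)

# Convert logo.png (illustrative path) to ASCII art and append it as comments
robots_file.include_image(image_path="logo.png", desired_width=60)

robots_file.write("robots.txt")
```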