├── tests ├── __init__.py ├── web_client │ ├── __init__.py │ └── test_requests_client.py ├── helpers.py ├── test_helpers.py └── test_tree.py ├── usp ├── objects │ ├── __init__.py │ ├── sitemap.py │ └── page.py ├── web_client │ ├── __init__.py │ ├── requests_client.py │ └── abstract_client.py ├── __init__.py ├── __about__.py ├── exceptions.py ├── log.py ├── tree.py ├── helpers.py └── fetch_parse.py ├── docs ├── modules.rst ├── index.rst ├── usp.objects.rst ├── Makefile ├── usp.rst ├── usp.web_client.rst └── conf.py ├── .coveragerc ├── MANIFEST.in ├── .idea ├── encodings.xml ├── vcs.xml ├── misc.xml ├── modules.xml ├── mediacloud-ultimate-sitemap-parser.iml ├── inspectionProfiles │ └── Project_Default.xml └── runConfigurations │ ├── pytest_in_test_helpers_py.xml │ └── pytest_in_test_tree_py.xml ├── setup.cfg ├── LICENSE.txt ├── .travis.yml ├── setup.py ├── README.rst └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /usp/objects/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/web_client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /usp/web_client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /usp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["tree"] 2 | -------------------------------------------------------------------------------- /usp/__about__.py: -------------------------------------------------------------------------------- 1 | """Package version.""" 2 | 3 | __version__ = "0.5" 4 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | usp 2 | === 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | usp 8 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = usp 3 | 4 | [report] 5 | omit = 6 | */python?.?/* 7 | tests/* 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.txt 3 | include setup.* 4 | recursive-include usp *.py 5 | include MANIFEST.in 6 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | [check-manifest] 5 | ignore = 6 | .travis.yml 7 | .gitignore 8 | .idea 9 | 10 | [aliases] 11 | test=pytest 12 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Ultimate Sitemap Parser 2 | ======================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | modules 8 | 9 | Indices and tables 10 | ================== 11 | 12 | * :ref:`genindex` 13 | * :ref:`modindex` 14 | * :ref:`search` 15 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2018 Linas Valiukas, Hal Roberts, 2018 Media Cloud project 2 | 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | any later version. 7 | 8 | This program is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU General Public License for more details . 12 | -------------------------------------------------------------------------------- /usp/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions used by the sitemap parser.""" 2 | 3 | 4 | class SitemapException(Exception): 5 | """ 6 | Problem due to which we can't run further, e.g. wrong input parameters. 7 | """ 8 | pass 9 | 10 | 11 | class SitemapXMLParsingException(Exception): 12 | """ 13 | XML parsing exception to be handled gracefully. 14 | """ 15 | pass 16 | 17 | 18 | class GunzipException(Exception): 19 | """ 20 | gunzip() exception. 
21 | """ 22 | pass 23 | 24 | 25 | class StripURLToHomepageException(Exception): 26 | """ 27 | strip_url_to_homepage() exception. 28 | """ 29 | pass 30 | -------------------------------------------------------------------------------- /docs/usp.objects.rst: -------------------------------------------------------------------------------- 1 | usp.objects package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | usp.objects.page module 8 | --------------------------------------- 9 | 10 | .. automodule:: usp.objects.page 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | usp.objects.sitemap module 16 | --------------------------------------- 17 | 18 | .. automodule:: usp.objects.sitemap 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: usp.objects 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/usp.rst: -------------------------------------------------------------------------------- 1 | usp package 2 | =========== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | usp.objects 10 | usp.web_client 11 | 12 | Submodules 13 | ---------- 14 | 15 | usp.exceptions module 16 | --------------------- 17 | 18 | .. automodule:: usp.exceptions 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | usp.tree module 24 | --------------- 25 | 26 | .. automodule:: usp.tree 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: usp 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/usp.web_client.rst: -------------------------------------------------------------------------------- 1 | usp.web\_client package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | usp.web\_client.abstract\_client module 8 | --------------------------------------- 9 | 10 | .. automodule:: usp.web_client.abstract_client 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | usp.web\_client.requests\_client module 16 | --------------------------------------- 17 | 18 | .. automodule:: usp.web_client.requests_client 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: usp.web_client 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /.idea/mediacloud-ultimate-sitemap-parser.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 16 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | import gzip as gzip_lib 2 | 3 | from typing import Union 4 | 5 | 6 | def gzip(data: Union[str, bytes]) -> bytes: 7 | """Gzip data.""" 8 | 9 | if data is None: 10 | raise Exception("Data is None.") 11 | 12 | if isinstance(data, str): 13 | data = data.encode('utf-8') 14 | 15 | if not isinstance(data, bytes): 16 | raise Exception("Data is not str or bytes: %s" % str(data)) 17 | 18 | try: 19 | gzipped_data = gzip_lib.compress(data, compresslevel=9) 20 | except Exception as ex: 21 | raise Exception("Unable to gzip data: %s" % str(ex)) 22 | 23 | if gzipped_data is None: 24 | raise Exception("Gzipped data is None.") 25 | 26 | if not isinstance(gzipped_data, bytes): 27 | raise Exception("Gzipped data is not bytes.") 28 | 29 | return gzipped_data 30 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 19 | -------------------------------------------------------------------------------- /.idea/runConfigurations/pytest_in_test_helpers_py.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | -------------------------------------------------------------------------------- /.idea/runConfigurations/pytest_in_test_tree_py.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.5' 4 | - '3.6' 5 | - nightly 6 | matrix: 7 | include: 8 | - python: 3.7 9 | dist: xenial 10 | sudo: true 11 | before_install: 12 | - pip install coverage coveralls 13 | install: 14 | - pip install . 
15 | script: 16 | - pip install .[test] 17 | - coverage run --source=usp setup.py test 18 | after_success: 19 | - coveralls 20 | deploy: 21 | provider: pypi 22 | skip_existing: true 23 | user: mediacloud-travis 24 | on: 25 | tags: true 26 | distributions: "sdist bdist_wheel" 27 | password: 28 | secure: d2oQd9ojE8K50uGgjuisonPGEC4NLGVHbAx/IFDCC3K5/oVDeNG7BuIqQdNS0ObFJWH8yjHDcjoq1J1RvRhJlNNWYercm5qN+3ANMePINBt7iCgzcoSA8/MyyKvlId/8VqEnbU1ZD4ou3QBfG5y2AXzrGZSS3qJ7TlT5mt8N31bDAdB2CsR3bRcVjtylu8zPuFarhpnn0X7y/T/jOWhVuO8OI2kd2P+3h9zR88nJVv8xThsCclwHqZns48yDmOHKpAjuSAeexUdnbLNPadSS3ial79WGcnjsnfb5vNTrp3H9dhQLoIJbCXjemwgWiGadOee0HQJDZvdNPJzojw6QXiXASORmVhLV3I1IKa0g+m2HPcGqBKWMloAvVQEd4d9SKH6/lf0unSIOb1UAeMASPsZTw//60pBH8L7SmcwtskJNfAr2RnUDK7P6C/vkwEYiET44DCPzRGzcoaQfp/Cybh9tSxbHpdqkCkW59VeVFA0dWgSrVfywwDbFACky0gHK/YEQK45dzRrUxKTBcYBD3RH2iIjZsMEesTSfjZ9ePT3gUdA4sqsYRASGq8nEqFWY32dqr/4JQulzFffdVQcbnrk/gxk1mRFIjboyx6c0vJaroO+tsNRQSlnMa8PbH6BE02YIRFgHPauLx8XZpOruXYCap6Mr8w6VvWjYH8M41uk= 29 | 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | from usp.__about__ import __version__ 6 | 7 | 8 | def __readme(): 9 | with open('README.rst', mode='r', encoding='utf-8') as f: 10 | return f.read() 11 | 12 | 13 | tests_require = [ 14 | 15 | # Mock HTTP server 16 | 'requests_mock>=1.6.0,<2.0', 17 | 18 | # Running tests 19 | 'pytest>=2.8', 20 | 21 | ] 22 | 23 | setup( 24 | name='ultimate-sitemap-parser', 25 | version=__version__, 26 | description='Ultimate Sitemap Parser', 27 | long_description=__readme(), 28 | author='Linas Valiukas, Hal Roberts, Media Cloud project', 29 | author_email='linas@media.mit.edu, hroberts@cyber.law.harvard.edu', 30 | url='https://github.com/mediacloud/ultimate-sitemap-parser', 31 | license='GPLv3+', 32 | keywords="sitemap sitemap-xml parser", 33 | packages=find_packages(exclude=['tests']), 34 | zip_safe=True, 35 | python_requires='>=3.5', 36 | install_requires=[ 37 | 38 | # Parsing arbitrary dates (sitemap date format is standardized but some implementations take liberties) 39 | 'python-dateutil>=2.1,<3.0.0', 40 | 41 | # Making HTTP requests 42 | 'requests>=2.2.1', 43 | 44 | ], 45 | setup_requires=[ 46 | 47 | # Running tests as part of setup.py 48 | 'pytest-runner>=4.2,<5.0', 49 | 50 | ], 51 | tests_require=tests_require, 52 | extras_require={ 53 | 'test': tests_require, 54 | }, 55 | classifiers=[ 56 | 'Development Status :: 3 - Alpha', 57 | 'Intended Audience :: Developers', 58 | 'Intended Audience :: Information Technology', 59 | 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 60 | 'Programming Language :: Python', 61 | 'Operating System :: OS Independent', 62 | 'Programming Language :: Python :: 3.5', 63 | 'Programming Language :: Python :: 3.6', 64 | 'Programming Language :: Python :: 3.7', 65 | 'Programming Language :: Python :: 3.8', 66 | 'Programming Language :: Python :: 3 :: Only', 67 | 'Programming Language :: Python :: Implementation :: PyPy', 68 | 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 69 | 'Topic :: Text Processing :: Indexing', 70 | 'Topic :: Text Processing :: Markup :: XML', 71 | ] 72 | ) 73 | -------------------------------------------------------------------------------- /usp/log.py: -------------------------------------------------------------------------------- 1 | """Logging utilities.""" 2 | 3 | import logging 4 | 
5 | 6 | class Logger(object): 7 | """ 8 | Logging helper class. 9 | """ 10 | 11 | __LEVELS = { 12 | 'CRITICAL': logging.CRITICAL, 13 | 'ERROR': logging.ERROR, 14 | 'WARNING': logging.WARNING, 15 | 'INFO': logging.INFO, 16 | 'DEBUG': logging.DEBUG, 17 | } 18 | """Valid logging levels and their "logging" counterparts.""" 19 | 20 | __DEFAULT_LEVEL = 'INFO' 21 | """Default logging level.""" 22 | 23 | __slots__ = [ 24 | # "logging" object 25 | '__l', 26 | ] 27 | 28 | def __init__(self, name: str): 29 | """ 30 | Initialize logger object for a given name. 31 | 32 | :param name: Module name that the logger should be initialized for. 33 | """ 34 | 35 | self.__l = logging.getLogger(name) 36 | if not self.__l.handlers: 37 | formatter = logging.Formatter( 38 | fmt='%(asctime)s %(levelname)s %(name)s [%(process)d/%(threadName)s]: %(message)s' 39 | ) 40 | 41 | handler = logging.StreamHandler() 42 | handler.setFormatter(formatter) 43 | self.__l.addHandler(handler) 44 | 45 | self.__l.setLevel(self.__LEVELS[self.__DEFAULT_LEVEL]) 46 | 47 | # Don't propagate handler to root logger 48 | # (http://stackoverflow.com/a/21127526/200603) 49 | self.__l.propagate = False 50 | 51 | def error(self, message: str) -> None: 52 | """ 53 | Log error message. 54 | 55 | :param message: Message to log. 56 | """ 57 | self.__l.error(message) 58 | 59 | def warning(self, message: str) -> None: 60 | """ 61 | Log warning message. 62 | 63 | :param message: Message to log. 64 | """ 65 | self.__l.warning(message) 66 | 67 | def info(self, message: str) -> None: 68 | """ 69 | Log informational message. 70 | 71 | :param message: Message to log. 72 | """ 73 | self.__l.info(message) 74 | 75 | def debug(self, message: str) -> None: 76 | """ 77 | Log debugging message. 78 | 79 | :param message: Message to log. 80 | """ 81 | self.__l.debug(message) 82 | 83 | 84 | def create_logger(name: str) -> Logger: 85 | """ 86 | Create and return Logger object. 87 | 88 | :param name: Module name that the logger should be initialized for. 89 | :return: Logger object. 90 | """ 91 | return Logger(name=name) 92 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/mediacloud/ultimate-sitemap-parser.svg?branch=develop 2 | :target: https://travis-ci.org/mediacloud/ultimate-sitemap-parser 3 | :alt: Build Status 4 | 5 | .. image:: https://readthedocs.org/projects/ultimate-sitemap-parser/badge/?version=latest 6 | :target: https://ultimate-sitemap-parser.readthedocs.io/en/latest/?badge=latest 7 | :alt: Documentation Status 8 | 9 | .. image:: https://coveralls.io/repos/github/mediacloud/ultimate-sitemap-parser/badge.svg?branch=develop 10 | :target: https://coveralls.io/github/mediacloud/ultimate-sitemap-parser?branch=develop 11 | :alt: Coverage Status 12 | 13 | .. image:: https://badge.fury.io/py/ultimate-sitemap-parser.svg 14 | :target: https://badge.fury.io/py/ultimate-sitemap-parser 15 | :alt: PyPI package 16 | 17 | .. image:: https://pepy.tech/badge/ultimate-sitemap-parser 18 | :target: https://pepy.tech/project/ultimate-sitemap-parser 19 | :alt: Download stats 20 | 21 | 22 | Website sitemap parser for Python 3.5+. 
23 | 24 | 25 | Features 26 | ======== 27 | 28 | - Supports all sitemap formats: 29 | 30 | - `XML sitemaps `_ 31 | - `Google News sitemaps `_ 32 | - `plain text sitemaps `_ 33 | - `RSS 2.0 / Atom 0.3 / Atom 1.0 sitemaps `_ 34 | - `Sitemaps linked from robots.txt `_ 35 | 36 | - Field-tested with ~1 million URLs as part of the `Media Cloud project `_ 37 | - Tolerant of the more common sitemap bugs 38 | - Tries to find sitemaps not listed in ``robots.txt`` 39 | - Uses fast and memory-efficient Expat XML parsing 40 | - Doesn't consume much memory even with massive sitemap hierarchies 41 | - Provides the generated sitemap tree as an easy-to-use object tree 42 | - Supports using a custom web client 43 | - Uses a small number of actively maintained third-party modules 44 | - Reasonably tested 45 | 46 | 47 | Installation 48 | ============ 49 | 50 | .. code:: sh 51 | 52 | pip install ultimate-sitemap-parser 53 | 54 | 55 | Usage 56 | ===== 57 | 58 | .. code:: python 59 | 60 | from usp.tree import sitemap_tree_for_homepage 61 | 62 | tree = sitemap_tree_for_homepage('https://www.nytimes.com/') 63 | print(tree) 64 | 65 | ``sitemap_tree_for_homepage()`` will return a tree of ``AbstractSitemap`` subclass objects that represent the sitemap 66 | hierarchy found on the website; see a `reference of AbstractSitemap subclasses `_. 67 | 68 | If you'd like to list all the pages found in all of the sitemaps within the website, consider using the ``all_pages()`` method: 69 | 70 | .. code:: python 71 | 72 | # all_pages() returns an Iterator 73 | for page in tree.all_pages(): 74 | print(page) 75 | 76 | The ``all_pages()`` method will return an iterator yielding ``SitemapPage`` objects; see a `reference of SitemapPage `_. 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 118 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 119 | 120 | # User-specific stuff 121 | .idea/**/workspace.xml 122 | .idea/**/tasks.xml 123 | .idea/**/usage.statistics.xml 124 | .idea/**/dictionaries 125 | .idea/**/shelf 126 | 127 | # Generated files 128 | .idea/**/contentModel.xml 129 | 130 | # Sensitive or high-churn files 131 | .idea/**/dataSources/ 132 | .idea/**/dataSources.ids 133 | .idea/**/dataSources.local.xml 134 | .idea/**/sqlDataSources.xml 135 | .idea/**/dynamic.xml 136 | .idea/**/uiDesigner.xml 137 | .idea/**/dbnavigator.xml 138 | 139 | # Gradle 140 | .idea/**/gradle.xml 141 | .idea/**/libraries 142 | 143 | # Gradle and Maven with auto-import 144 | # When using Gradle or Maven with auto-import, you should exclude module files, 145 | # since they will be recreated, and may cause churn. Uncomment if using 146 | # auto-import. 
147 | # .idea/modules.xml 148 | # .idea/*.iml 149 | # .idea/modules 150 | 151 | # CMake 152 | cmake-build-*/ 153 | 154 | # Mongo Explorer plugin 155 | .idea/**/mongoSettings.xml 156 | 157 | # File-based project format 158 | *.iws 159 | 160 | # IntelliJ 161 | out/ 162 | 163 | # mpeltonen/sbt-idea plugin 164 | .idea_modules/ 165 | 166 | # JIRA plugin 167 | atlassian-ide-plugin.xml 168 | 169 | # Cursive Clojure plugin 170 | .idea/replstate.xml 171 | 172 | # Crashlytics plugin (for Android Studio and IntelliJ) 173 | com_crashlytics_export_strings.xml 174 | crashlytics.properties 175 | crashlytics-build.properties 176 | fabric.properties 177 | 178 | # Editor-based Rest Client 179 | .idea/httpRequests 180 | 181 | # Android studio 3.1+ serialized cache file 182 | .idea/caches/build_file_checksums.ser 183 | 184 | -------------------------------------------------------------------------------- /usp/web_client/requests_client.py: -------------------------------------------------------------------------------- 1 | """requests-based implementation of web client class.""" 2 | 3 | from http import HTTPStatus 4 | from typing import Optional, Dict 5 | 6 | import requests 7 | 8 | from .abstract_client import ( 9 | AbstractWebClient, 10 | AbstractWebClientResponse, 11 | AbstractWebClientSuccessResponse, 12 | WebClientErrorResponse, 13 | RETRYABLE_HTTP_STATUS_CODES, 14 | ) 15 | from usp.__about__ import __version__ 16 | 17 | 18 | class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse): 19 | """ 20 | requests-based successful response. 21 | """ 22 | 23 | __slots__ = [ 24 | '__requests_response', 25 | '__max_response_data_length', 26 | ] 27 | 28 | def __init__(self, requests_response: requests.Response, max_response_data_length: Optional[int] = None): 29 | self.__requests_response = requests_response 30 | self.__max_response_data_length = max_response_data_length 31 | 32 | def status_code(self) -> int: 33 | return int(self.__requests_response.status_code) 34 | 35 | def status_message(self) -> str: 36 | message = self.__requests_response.reason 37 | if not message: 38 | message = HTTPStatus(self.status_code(), None).phrase 39 | return message 40 | 41 | def header(self, case_insensitive_name: str) -> Optional[str]: 42 | return self.__requests_response.headers.get(case_insensitive_name.lower(), None) 43 | 44 | def raw_data(self) -> bytes: 45 | if self.__max_response_data_length: 46 | data = self.__requests_response.content[:self.__max_response_data_length] 47 | else: 48 | data = self.__requests_response.content 49 | 50 | return data 51 | 52 | 53 | class RequestsWebClientErrorResponse(WebClientErrorResponse): 54 | """ 55 | requests-based error response. 56 | """ 57 | pass 58 | 59 | 60 | class RequestsWebClient(AbstractWebClient): 61 | """requests-based web client to be used by the sitemap fetcher.""" 62 | 63 | __USER_AGENT = 'ultimate_sitemap_parser/{}'.format(__version__) 64 | 65 | __HTTP_REQUEST_TIMEOUT = 60 66 | """ 67 | HTTP request timeout. 68 | 69 | Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big. 
70 | """ 71 | 72 | __slots__ = [ 73 | '__max_response_data_length', 74 | '__timeout', 75 | '__proxies', 76 | ] 77 | 78 | def __init__(self): 79 | self.__max_response_data_length = None 80 | self.__timeout = self.__HTTP_REQUEST_TIMEOUT 81 | self.__proxies = {} 82 | 83 | def set_timeout(self, timeout: int) -> None: 84 | """Set HTTP request timeout.""" 85 | # Used mostly for testing 86 | self.__timeout = timeout 87 | 88 | def set_proxies(self, proxies: Dict[str, str]) -> None: 89 | """ 90 | Set proxies from a dictionary where: 91 | 92 | * keys are schemes, e.g. "http" or "https"; 93 | * values are "scheme://user:password@host:port/". 94 | 95 | For example: 96 | 97 | proxies = {'http': 'http://user:pass@10.10.1.10:3128/'} 98 | """ 99 | # Used mostly for testing 100 | self.__proxies = proxies 101 | 102 | def set_max_response_data_length(self, max_response_data_length: int) -> None: 103 | self.__max_response_data_length = max_response_data_length 104 | 105 | def get(self, url: str) -> AbstractWebClientResponse: 106 | try: 107 | response = requests.get( 108 | url, 109 | timeout=self.__timeout, 110 | stream=True, 111 | headers={'User-Agent': self.__USER_AGENT}, 112 | proxies=self.__proxies 113 | ) 114 | except requests.exceptions.Timeout as ex: 115 | # Retryable timeouts 116 | return RequestsWebClientErrorResponse(message=str(ex), retryable=True) 117 | 118 | except requests.exceptions.RequestException as ex: 119 | # Other errors, e.g. redirect loops 120 | return RequestsWebClientErrorResponse(message=str(ex), retryable=False) 121 | 122 | else: 123 | 124 | if 200 <= response.status_code < 300: 125 | return RequestsWebClientSuccessResponse( 126 | requests_response=response, 127 | max_response_data_length=self.__max_response_data_length, 128 | ) 129 | else: 130 | 131 | message = '{} {}'.format(response.status_code, response.reason) 132 | 133 | if response.status_code in RETRYABLE_HTTP_STATUS_CODES: 134 | return RequestsWebClientErrorResponse(message=message, retryable=True) 135 | else: 136 | return RequestsWebClientErrorResponse(message=message, retryable=False) 137 | -------------------------------------------------------------------------------- /usp/tree.py: -------------------------------------------------------------------------------- 1 | """Helpers to generate a sitemap tree.""" 2 | 3 | from typing import Optional 4 | 5 | from .exceptions import SitemapException 6 | from .fetch_parse import SitemapFetcher 7 | from .helpers import is_http_url, strip_url_to_homepage 8 | from .log import create_logger 9 | from .objects.sitemap import AbstractSitemap, InvalidSitemap, IndexWebsiteSitemap, IndexRobotsTxtSitemap 10 | from .web_client.abstract_client import AbstractWebClient 11 | 12 | log = create_logger(__name__) 13 | 14 | _UNPUBLISHED_SITEMAP_PATHS = { 15 | 'sitemap.xml', 16 | 'sitemap.xml.gz', 17 | 'sitemap_index.xml', 18 | 'sitemap-index.xml', 19 | 'sitemap_index.xml.gz', 20 | 'sitemap-index.xml.gz', 21 | '.sitemap.xml', 22 | 'sitemap', 23 | 'admin/config/search/xmlsitemap', 24 | 'sitemap/sitemap-index.xml', 25 | 'sitemap_news.xml', 26 | 'sitemap-news.xml', 27 | 'sitemap_news.xml.gz', 28 | 'sitemap-news.xml.gz', 29 | } 30 | """Paths which are not exposed in robots.txt but might still contain a sitemap.""" 31 | 32 | 33 | def sitemap_tree(homepage_url: str, sitemap_url: str, web_client: Optional[AbstractWebClient] = None) -> AbstractSitemap: 34 | """ 35 | Using a sitemap URL, fetch the tree of sitemaps and pages listed in them.
36 | :param homepage_url: Homepage URL of the website that the sitemap belongs to, e.g. "http://www.example.com/". 37 | :param sitemap_url: URL of the sitemap to fetch and parse. 38 | :param web_client: Web client implementation to use for fetching sitemaps. 39 | :return: Root sitemap object of the fetched sitemap tree. 40 | """ 41 | 42 | if not is_http_url(homepage_url): 43 | raise SitemapException("URL {} is not a HTTP(s) URL.".format(homepage_url)) 44 | 45 | stripped_homepage_url = strip_url_to_homepage(url=homepage_url) 46 | if homepage_url != stripped_homepage_url: 47 | log.warning("Assuming that the homepage of {} is {}".format(homepage_url, stripped_homepage_url)) 48 | homepage_url = stripped_homepage_url 49 | 50 | if not homepage_url.endswith('/'): 51 | homepage_url += '/' 52 | 53 | sitemaps = [] 54 | 55 | unpublished_sitemap_fetcher = SitemapFetcher( 56 | url=sitemap_url, 57 | web_client=web_client, 58 | recursion_level=0, 59 | ) 60 | unpublished_sitemap = unpublished_sitemap_fetcher.sitemap() 61 | 62 | # Skip the ones that weren't found 63 | if not isinstance(unpublished_sitemap, InvalidSitemap): 64 | sitemaps.append(unpublished_sitemap) 65 | 66 | index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps) 67 | 68 | return index_sitemap 69 | 70 | 71 | def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWebClient] = None) -> AbstractSitemap: 72 | """ 73 | Using a homepage URL, fetch the tree of sitemaps and pages listed in them. 74 | 75 | :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. "http://www.example.com/". 76 | :param web_client: Web client implementation to use for fetching sitemaps. 77 | :return: Root sitemap object of the fetched sitemap tree. 78 | """ 79 | 80 | if not is_http_url(homepage_url): 81 | raise SitemapException("URL {} is not a HTTP(s) URL.".format(homepage_url)) 82 | 83 | stripped_homepage_url = strip_url_to_homepage(url=homepage_url) 84 | if homepage_url != stripped_homepage_url: 85 | log.warning("Assuming that the homepage of {} is {}".format(homepage_url, stripped_homepage_url)) 86 | homepage_url = stripped_homepage_url 87 | 88 | if not homepage_url.endswith('/'): 89 | homepage_url += '/' 90 | robots_txt_url = homepage_url + 'robots.txt' 91 | 92 | sitemaps = [] 93 | 94 | robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, web_client=web_client, recursion_level=0) 95 | robots_txt_sitemap = robots_txt_fetcher.sitemap() 96 | sitemaps.append(robots_txt_sitemap) 97 | 98 | sitemap_urls_found_in_robots_txt = set() 99 | if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap): 100 | for sub_sitemap in robots_txt_sitemap.sub_sitemaps: 101 | sitemap_urls_found_in_robots_txt.add(sub_sitemap.url) 102 | 103 | for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS: 104 | unpublished_sitemap_url = homepage_url + unpublished_sitemap_path 105 | 106 | # Don't refetch URLs already found in robots.txt 107 | if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt: 108 | 109 | unpublished_sitemap_fetcher = SitemapFetcher( 110 | url=unpublished_sitemap_url, 111 | web_client=web_client, 112 | recursion_level=0, 113 | ) 114 | unpublished_sitemap = unpublished_sitemap_fetcher.sitemap() 115 | 116 | # Skip the ones that weren't found 117 | if not isinstance(unpublished_sitemap, InvalidSitemap): 118 | sitemaps.append(unpublished_sitemap) 119 | 120 | index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps) 121 | 122 | return index_sitemap 123 | --------------------------------------------------------------------------------
/tests/web_client/test_requests_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from http import HTTPStatus 3 | from unittest import TestCase 4 | 5 | import requests_mock 6 | 7 | from usp.__about__ import __version__ 8 | from usp.web_client.abstract_client import ( 9 | AbstractWebClientSuccessResponse, 10 | WebClientErrorResponse, 11 | ) 12 | from usp.web_client.requests_client import RequestsWebClient 13 | 14 | 15 | class TestRequestsClient(TestCase): 16 | TEST_BASE_URL = 'http://test-ultimate-sitemap-parser.com' # mocked by HTTPretty 17 | TEST_CONTENT_TYPE = 'text/html' 18 | 19 | __slots__ = [ 20 | '__client', 21 | ] 22 | 23 | def setUp(self) -> None: 24 | super().setUp() 25 | 26 | self.__client = RequestsWebClient() 27 | 28 | def test_get(self): 29 | with requests_mock.Mocker() as m: 30 | test_url = self.TEST_BASE_URL + '/' 31 | test_content = 'This is a homepage.' 32 | 33 | m.get( 34 | test_url, 35 | headers={'Content-Type': self.TEST_CONTENT_TYPE}, 36 | text=test_content, 37 | ) 38 | 39 | response = self.__client.get(test_url) 40 | 41 | assert response 42 | assert isinstance(response, AbstractWebClientSuccessResponse) 43 | assert response.status_code() == HTTPStatus.OK.value 44 | assert response.status_message() == HTTPStatus.OK.phrase 45 | assert response.header('Content-Type') == self.TEST_CONTENT_TYPE 46 | assert response.header('content-type') == self.TEST_CONTENT_TYPE 47 | assert response.header('nonexistent') is None 48 | assert response.raw_data().decode('utf-8') == test_content 49 | 50 | def test_get_user_agent(self): 51 | with requests_mock.Mocker() as m: 52 | test_url = self.TEST_BASE_URL + '/' 53 | 54 | def content_user_agent(request, context): 55 | context.status_code = HTTPStatus.OK.value 56 | return request.headers.get('User-Agent', 'unknown') 57 | 58 | m.get( 59 | test_url, 60 | text=content_user_agent, 61 | ) 62 | 63 | response = self.__client.get(test_url) 64 | 65 | assert response 66 | assert isinstance(response, AbstractWebClientSuccessResponse) 67 | 68 | content = response.raw_data().decode('utf-8') 69 | assert content == 'ultimate_sitemap_parser/{}'.format(__version__) 70 | 71 | def test_get_not_found(self): 72 | with requests_mock.Mocker() as m: 73 | test_url = self.TEST_BASE_URL + '/404.html' 74 | 75 | m.get( 76 | test_url, 77 | status_code=HTTPStatus.NOT_FOUND.value, 78 | reason=HTTPStatus.NOT_FOUND.phrase, 79 | headers={'Content-Type': self.TEST_CONTENT_TYPE}, 80 | text='This page does not exist.', 81 | ) 82 | 83 | response = self.__client.get(test_url) 84 | 85 | assert response 86 | assert isinstance(response, WebClientErrorResponse) 87 | assert response.retryable() is False 88 | 89 | def test_get_nonexistent_domain(self): 90 | test_url = 'http://www.totallydoesnotexisthjkfsdhkfsd.com/some_page.html' 91 | 92 | response = self.__client.get(test_url) 93 | 94 | assert response 95 | assert isinstance(response, WebClientErrorResponse) 96 | assert response.retryable() is False 97 | assert 'Failed to establish a new connection' in response.message() 98 | 99 | def test_get_timeout(self): 100 | sock = socket.socket() 101 | sock.bind(('', 0)) 102 | socket_port = sock.getsockname()[1] 103 | assert socket_port 104 | sock.listen(1) 105 | 106 | test_timeout = 1 107 | test_url = 'http://127.0.0.1:{}/slow_page.html'.format(socket_port) 108 | 109 | self.__client.set_timeout(test_timeout) 110 | 111 | response = self.__client.get(test_url) 112 | 113 | sock.close() 114 | 115 | assert response 116 | assert 
isinstance(response, WebClientErrorResponse) 117 | assert response.retryable() is True 118 | assert 'Read timed out' in response.message() 119 | 120 | def test_get_max_response_data_length(self): 121 | with requests_mock.Mocker() as m: 122 | actual_length = 1024 * 1024 123 | max_length = 1024 * 512 124 | 125 | test_url = self.TEST_BASE_URL + '/huge_page.html' 126 | test_content = 'a' * actual_length 127 | 128 | m.get( 129 | test_url, 130 | headers={'Content-Type': self.TEST_CONTENT_TYPE}, 131 | text=test_content, 132 | ) 133 | 134 | self.__client.set_max_response_data_length(max_length) 135 | 136 | response = self.__client.get(test_url) 137 | 138 | assert response 139 | assert isinstance(response, AbstractWebClientSuccessResponse) 140 | 141 | response_length = len(response.raw_data()) 142 | assert response_length == max_length 143 | -------------------------------------------------------------------------------- /usp/web_client/abstract_client.py: -------------------------------------------------------------------------------- 1 | """Abstract web client class.""" 2 | 3 | import abc 4 | from http import HTTPStatus 5 | from typing import Optional 6 | 7 | RETRYABLE_HTTP_STATUS_CODES = { 8 | 9 | # Some servers return "400 Bad Request" initially but upon retry start working again, no idea why 10 | int(HTTPStatus.BAD_REQUEST), 11 | 12 | # If we timed out requesting stuff, we can just try again 13 | int(HTTPStatus.REQUEST_TIMEOUT), 14 | 15 | # If we got rate limited, it makes sense to wait a bit 16 | int(HTTPStatus.TOO_MANY_REQUESTS), 17 | 18 | # Server might be just fine on a subsequent attempt 19 | int(HTTPStatus.INTERNAL_SERVER_ERROR), 20 | 21 | # Upstream might reappear on a retry 22 | int(HTTPStatus.BAD_GATEWAY), 23 | 24 | # Service might become available again on a retry 25 | int(HTTPStatus.SERVICE_UNAVAILABLE), 26 | 27 | # Upstream might reappear on a retry 28 | int(HTTPStatus.GATEWAY_TIMEOUT), 29 | 30 | # (unofficial) 509 Bandwidth Limit Exceeded (Apache Web Server/cPanel) 31 | 509, 32 | 33 | # (unofficial) 598 Network read timeout error 34 | 598, 35 | 36 | # (unofficial, nginx) 499 Client Closed Request 37 | 499, 38 | 39 | # (unofficial, Cloudflare) 520 Unknown Error 40 | 520, 41 | 42 | # (unofficial, Cloudflare) 521 Web Server Is Down 43 | 521, 44 | 45 | # (unofficial, Cloudflare) 522 Connection Timed Out 46 | 522, 47 | 48 | # (unofficial, Cloudflare) 523 Origin Is Unreachable 49 | 523, 50 | 51 | # (unofficial, Cloudflare) 524 A Timeout Occurred 52 | 524, 53 | 54 | # (unofficial, Cloudflare) 525 SSL Handshake Failed 55 | 525, 56 | 57 | # (unofficial, Cloudflare) 526 Invalid SSL Certificate 58 | 526, 59 | 60 | # (unofficial, Cloudflare) 527 Railgun Error 61 | 527, 62 | 63 | # (unofficial, Cloudflare) 530 Origin DNS Error 64 | 530, 65 | 66 | } 67 | """HTTP status codes on which a request should be retried.""" 68 | 69 | 70 | class AbstractWebClientResponse(object, metaclass=abc.ABCMeta): 71 | """ 72 | Abstract response. 73 | """ 74 | pass 75 | 76 | 77 | class AbstractWebClientSuccessResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta): 78 | """ 79 | Successful response. 80 | """ 81 | 82 | @abc.abstractmethod 83 | def status_code(self) -> int: 84 | """ 85 | Return HTTP status code of the response. 86 | 87 | :return: HTTP status code of the response, e.g. 200. 88 | """ 89 | raise NotImplementedError("Abstract method.") 90 | 91 | @abc.abstractmethod 92 | def status_message(self) -> str: 93 | """ 94 | Return HTTP status message of the response. 
95 | 96 | :return: HTTP status message of the response, e.g. "OK". 97 | """ 98 | raise NotImplementedError("Abstract method.") 99 | 100 | @abc.abstractmethod 101 | def header(self, case_insensitive_name: str) -> Optional[str]: 102 | """ 103 | Return HTTP header value for a given case-insensitive name, or None if such header wasn't set. 104 | 105 | :param case_insensitive_name: HTTP header's name, e.g. "Content-Type". 106 | :return: HTTP header's value, or None if it was unset. 107 | """ 108 | raise NotImplementedError("Abstract method.") 109 | 110 | @abc.abstractmethod 111 | def raw_data(self) -> bytes: 112 | """ 113 | Return encoded raw data of the response. 114 | 115 | :return: Encoded raw data of the response. 116 | """ 117 | raise NotImplementedError("Abstract method.") 118 | 119 | 120 | class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta): 121 | """ 122 | Error response. 123 | """ 124 | 125 | __slots__ = [ 126 | '_message', 127 | '_retryable', 128 | ] 129 | 130 | def __init__(self, message: str, retryable: bool): 131 | """ 132 | Constructor. 133 | 134 | :param message: Message describing what went wrong. 135 | :param retryable: True if the request should be retried. 136 | """ 137 | super().__init__() 138 | self._message = message 139 | self._retryable = retryable 140 | 141 | def message(self) -> str: 142 | """ 143 | Return message describing what went wrong. 144 | 145 | :return: Message describing what went wrong. 146 | """ 147 | return self._message 148 | 149 | def retryable(self) -> bool: 150 | """ 151 | Return True if request should be retried. 152 | 153 | :return: True if request should be retried. 154 | """ 155 | return self._retryable 156 | 157 | 158 | class AbstractWebClient(object, metaclass=abc.ABCMeta): 159 | """ 160 | Abstract web client to be used by the sitemap fetcher. 161 | """ 162 | 163 | @abc.abstractmethod 164 | def set_max_response_data_length(self, max_response_data_length: int) -> None: 165 | """ 166 | Set the maximum number of bytes that the web client will fetch. 167 | 168 | :param max_response_data_length: Maximum number of bytes that the web client will fetch. 169 | """ 170 | raise NotImplementedError("Abstract method.") 171 | 172 | @abc.abstractmethod 173 | def get(self, url: str) -> AbstractWebClientResponse: 174 | """ 175 | Fetch an URL and return a response. 176 | 177 | Method shouldn't throw exceptions on connection errors (including timeouts); instead, such errors should be 178 | reported via Response object. 179 | 180 | :param url: URL to fetch. 181 | :return: Response object. 
182 | """ 183 | raise NotImplementedError("Abstract method.") 184 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pytest 4 | 5 | from usp.exceptions import StripURLToHomepageException, SitemapException, GunzipException 6 | from usp.helpers import ( 7 | html_unescape_strip, 8 | parse_iso8601_date, 9 | is_http_url, 10 | strip_url_to_homepage, 11 | parse_rfc2822_date, 12 | gunzip, 13 | ) 14 | 15 | 16 | def test_html_unescape_strip(): 17 | assert html_unescape_strip(" tests & tests ") == "tests & tests" 18 | assert html_unescape_strip(None) is None 19 | 20 | 21 | def test_parse_iso8601_date(): 22 | with pytest.raises(SitemapException): 23 | # noinspection PyTypeChecker 24 | parse_iso8601_date(None) 25 | 26 | with pytest.raises(SitemapException): 27 | parse_iso8601_date('') 28 | 29 | assert parse_iso8601_date("1997-07-16") == datetime.datetime(year=1997, month=7, day=16) 30 | assert parse_iso8601_date("1997-07-16T19:20+01:00") == datetime.datetime( 31 | year=1997, month=7, day=16, hour=19, minute=20, 32 | tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), 33 | ) 34 | assert parse_iso8601_date("1997-07-16T19:20:30+01:00") == datetime.datetime( 35 | year=1997, month=7, day=16, hour=19, minute=20, second=30, 36 | tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), 37 | ) 38 | assert parse_iso8601_date("1997-07-16T19:20:30.45+01:00") == datetime.datetime( 39 | year=1997, month=7, day=16, hour=19, minute=20, second=30, microsecond=450000, 40 | tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)), 41 | ) 42 | 43 | # "Z" timezone instead of "+\d\d:\d\d" 44 | assert parse_iso8601_date("2018-01-12T21:57:27Z") == datetime.datetime( 45 | year=2018, month=1, day=12, hour=21, minute=57, second=27, tzinfo=datetime.timezone.utc, 46 | ) 47 | 48 | 49 | def test_parse_rfc2822_date(): 50 | assert parse_rfc2822_date("Tue, 10 Aug 2010 20:43:53 -0000") == datetime.datetime( 51 | year=2010, month=8, day=10, hour=20, minute=43, second=53, microsecond=0, 52 | tzinfo=datetime.timezone(datetime.timedelta(seconds=0)), 53 | ) 54 | 55 | assert parse_rfc2822_date("Thu, 17 Dec 2009 12:04:56 +0200") == datetime.datetime( 56 | year=2009, month=12, day=17, hour=12, minute=4, second=56, microsecond=0, 57 | tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), 58 | ) 59 | 60 | 61 | # noinspection SpellCheckingInspection 62 | def test_is_http_url(): 63 | # noinspection PyTypeChecker 64 | assert not is_http_url(None) 65 | assert not is_http_url('') 66 | 67 | assert not is_http_url('abc') 68 | assert not is_http_url('/abc') 69 | assert not is_http_url('//abc') 70 | assert not is_http_url('///abc') 71 | 72 | assert not is_http_url('gopher://gopher.floodgap.com/0/v2/vstat') 73 | assert not is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/') 74 | 75 | assert is_http_url('http://cyber.law.harvard.edu/about') 76 | assert is_http_url('https://github.com/mediacloud/backend') 77 | 78 | # URLs with port, HTTP auth, localhost 79 | assert is_http_url('https://username:password@domain.com:12345/path?query=string#fragment') 80 | assert is_http_url('http://localhost:9998/feed') 81 | assert is_http_url('http://127.0.0.1:12345/456789') 82 | assert is_http_url('http://127.0.00000000.1:8899/tweet_url?id=47') 83 | 84 | # Travis URL 85 | assert is_http_url('http://testing-gce-286b4005-b1ae-4b1a-a0d8-faf85e39ca92:37873/gv/tests.rss') 86 | 87 | # URLs with 
mistakes fixable by fix_common_url_mistakes() 88 | assert not is_http_url( 89 | 'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled' 90 | ) 91 | 92 | # UTF-8 in paths 93 | assert is_http_url('http://www.example.com/šiaurė.html') 94 | 95 | # IDN 96 | assert is_http_url('http://www.šiaurė.lt/šiaurė.html') 97 | assert is_http_url('http://www.xn--iaur-yva35b.lt/šiaurė.html') 98 | assert is_http_url('http://.xn--iaur-yva35b.lt') is False # Invalid Punycode 99 | 100 | 101 | def test_strip_url_to_homepage(): 102 | assert strip_url_to_homepage('http://www.cwi.nl:80/%7Eguido/Python.html') == 'http://www.cwi.nl:80/' 103 | 104 | # HTTP auth 105 | assert strip_url_to_homepage( 106 | 'http://username:password@www.cwi.nl/page.html' 107 | ) == 'http://username:password@www.cwi.nl/' 108 | 109 | # UTF-8 in paths 110 | assert strip_url_to_homepage('http://www.example.com/šiaurė.html') == 'http://www.example.com/' 111 | 112 | # IDN 113 | assert strip_url_to_homepage('https://www.šiaurė.lt/šiaurė.html') == 'https://www.šiaurė.lt/' 114 | assert strip_url_to_homepage('http://www.xn--iaur-yva35b.lt/šiaurė.html') == 'http://www.xn--iaur-yva35b.lt/' 115 | 116 | with pytest.raises(StripURLToHomepageException): 117 | # noinspection PyTypeChecker 118 | strip_url_to_homepage(None) 119 | 120 | with pytest.raises(StripURLToHomepageException): 121 | strip_url_to_homepage('') 122 | 123 | with pytest.raises(StripURLToHomepageException): 124 | strip_url_to_homepage('not an URL') 125 | 126 | 127 | def test_gunzip(): 128 | with pytest.raises(GunzipException): 129 | # noinspection PyTypeChecker 130 | gunzip(None) 131 | with pytest.raises(GunzipException): 132 | # noinspection PyTypeChecker 133 | gunzip('') 134 | with pytest.raises(GunzipException): 135 | # noinspection PyTypeChecker 136 | gunzip(b'') 137 | with pytest.raises(GunzipException): 138 | # noinspection PyTypeChecker 139 | gunzip('foo') 140 | with pytest.raises(GunzipException): 141 | # noinspection PyTypeChecker 142 | gunzip(b'foo') 143 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | from pathlib import Path 20 | project_path = Path(__file__).absolute().parent.joinpath('..') 21 | 22 | import sys 23 | sys.path.insert(0, project_path.as_posix()) 24 | 25 | from usp.__about__ import __version__ 26 | 27 | # -- Project information ----------------------------------------------------- 28 | 29 | project = 'Ultimate Sitemap Parser' 30 | copyright = '2018, Linas Valiukas, Hal Roberts, Media Cloud project' 31 | author = 'Linas Valiukas, Hal Roberts, Media Cloud project' 32 | 33 | # The short X.Y version 34 | version = __version__ 35 | # The full version, including alpha/beta/rc tags 36 | release = version 37 | 38 | 39 | # -- General configuration --------------------------------------------------- 40 | 41 | # If your documentation needs a minimal Sphinx version, state it here. 42 | # 43 | # needs_sphinx = '1.0' 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = [ 49 | 'sphinx.ext.autodoc', 50 | 'sphinx.ext.doctest', 51 | 'sphinx.ext.viewcode', 52 | ] 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ['_templates'] 56 | 57 | # The suffix(es) of source filenames. 58 | # You can specify multiple suffix as a list of string: 59 | # 60 | # source_suffix = ['.rst', '.md'] 61 | source_suffix = '.rst' 62 | 63 | # The master toctree document. 64 | master_doc = 'index' 65 | 66 | # The language for content autogenerated by Sphinx. Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | # This pattern also affects html_static_path and html_extra_path. 76 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 77 | 78 | # The name of the Pygments (syntax highlighting) style to use. 79 | pygments_style = None 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'alabaster' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | # Custom sidebar templates, must be a dictionary that maps document names 101 | # to template names. 102 | # 103 | # The default sidebars (for documents that don't match any pattern) are 104 | # defined by theme itself. Builtin themes are using these templates by 105 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 106 | # 'searchbox.html']``. 
107 | # 108 | # html_sidebars = {} 109 | 110 | 111 | # -- Options for HTMLHelp output --------------------------------------------- 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'UltimateSitemapParserdoc' 115 | 116 | 117 | # -- Options for LaTeX output ------------------------------------------------ 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'UltimateSitemapParser.tex', 'Ultimate Sitemap Parser Documentation', 142 | 'Linas Valiukas, Hal Roberts, Media Cloud project', 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output ------------------------------------------ 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, 'ultimatesitemapparser', 'Ultimate Sitemap Parser Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ---------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, 'UltimateSitemapParser', 'Ultimate Sitemap Parser Documentation', 163 | author, 'UltimateSitemapParser', 'One line description of project.', 164 | 'Miscellaneous'), 165 | ] 166 | 167 | 168 | # -- Options for Epub output ------------------------------------------------- 169 | 170 | # Bibliographic Dublin Core info. 171 | epub_title = project 172 | 173 | # The unique identifier of the text. This can be a ISBN number 174 | # or the project homepage. 175 | # 176 | # epub_identifier = '' 177 | 178 | # A unique identification for the text. 179 | # 180 | # epub_uid = '' 181 | 182 | # A list of files that should not be packed into the epub file. 183 | epub_exclude_files = ['search.html'] 184 | 185 | 186 | # -- Extension configuration ------------------------------------------------- 187 | -------------------------------------------------------------------------------- /usp/objects/sitemap.py: -------------------------------------------------------------------------------- 1 | """Objects that represent one of the found sitemaps.""" 2 | 3 | import abc 4 | import os 5 | import pickle 6 | import tempfile 7 | from typing import List, Iterator 8 | 9 | from .page import SitemapPage 10 | 11 | 12 | class AbstractSitemap(object, metaclass=abc.ABCMeta): 13 | """ 14 | Abstract sitemap. 15 | """ 16 | 17 | __slots__ = [ 18 | '__url', 19 | ] 20 | 21 | def __init__(self, url: str): 22 | """ 23 | Initialize a new sitemap. 24 | 25 | :param url: Sitemap URL. 
26 | """ 27 | self.__url = url 28 | 29 | def __eq__(self, other) -> bool: 30 | if not isinstance(other, AbstractSitemap): 31 | return NotImplemented 32 | 33 | if self.url != other.url: 34 | return False 35 | 36 | return True 37 | 38 | def __hash__(self): 39 | return hash(( 40 | self.url, 41 | )) 42 | 43 | def __repr__(self): 44 | return ( 45 | "{self.__class__.__name__}(" 46 | "url={self.url}" 47 | ")" 48 | ).format(self=self) 49 | 50 | @property 51 | def url(self) -> str: 52 | """ 53 | Return sitemap URL. 54 | 55 | :return: Sitemap URL. 56 | """ 57 | return self.__url 58 | 59 | @abc.abstractmethod 60 | def all_pages(self) -> Iterator[SitemapPage]: 61 | """ 62 | Return iterator which yields all pages of this sitemap and linked sitemaps (if any). 63 | 64 | :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). 65 | """ 66 | raise NotImplementedError("Abstract method") 67 | 68 | 69 | class InvalidSitemap(AbstractSitemap): 70 | """Invalid sitemap, e.g. the one that can't be parsed.""" 71 | 72 | __slots__ = [ 73 | '__reason', 74 | ] 75 | 76 | def __init__(self, url: str, reason: str): 77 | """ 78 | Initialize a new invalid sitemap. 79 | 80 | :param url: Sitemap URL. 81 | :param reason: Reason why the sitemap is deemed invalid. 82 | """ 83 | super().__init__(url=url) 84 | self.__reason = reason 85 | 86 | def __eq__(self, other) -> bool: 87 | if not isinstance(other, InvalidSitemap): 88 | return NotImplemented 89 | 90 | if self.url != other.url: 91 | return False 92 | 93 | if self.reason != other.reason: 94 | return False 95 | 96 | return True 97 | 98 | def __repr__(self): 99 | return ( 100 | "{self.__class__.__name__}(" 101 | "url={self.url}, " 102 | "reason={self.reason}" 103 | ")" 104 | ).format(self=self) 105 | 106 | @property 107 | def reason(self) -> str: 108 | """ 109 | Return reason why the sitemap is deemed invalid. 110 | 111 | :return: Reason why the sitemap is deemed invalid. 112 | """ 113 | return self.__reason 114 | 115 | def all_pages(self) -> Iterator[SitemapPage]: 116 | """ 117 | Return iterator which yields all pages of this sitemap and linked sitemaps (if any). 118 | 119 | :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). 120 | """ 121 | yield from [] 122 | 123 | 124 | class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta): 125 | """Abstract sitemap that contains URLs to pages.""" 126 | 127 | __slots__ = [ 128 | '__pages_temp_file_path', 129 | ] 130 | 131 | def __init__(self, url: str, pages: List[SitemapPage]): 132 | """ 133 | Initialize new pages sitemap. 134 | 135 | :param url: Sitemap URL. 136 | :param pages: List of pages found in a sitemap. 137 | """ 138 | super().__init__(url=url) 139 | 140 | temp_file, self.__pages_temp_file_path = tempfile.mkstemp() 141 | with os.fdopen(temp_file, 'wb') as tmp: 142 | pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL) 143 | 144 | def __del__(self): 145 | os.unlink(self.__pages_temp_file_path) 146 | 147 | def __eq__(self, other) -> bool: 148 | if not isinstance(other, AbstractPagesSitemap): 149 | return NotImplemented 150 | 151 | if self.url != other.url: 152 | return False 153 | 154 | if self.pages != other.pages: 155 | return False 156 | 157 | return True 158 | 159 | def __repr__(self): 160 | return ( 161 | "{self.__class__.__name__}(" 162 | "url={self.url}, " 163 | "pages={self.pages}" 164 | ")" 165 | ).format(self=self) 166 | 167 | @property 168 | def pages(self) -> List[SitemapPage]: 169 | """ 170 | Return list of pages found in a sitemap.
171 | 172 | :return: List of pages found in a sitemap. 173 | """ 174 | with open(self.__pages_temp_file_path, 'rb') as tmp: 175 | pages = pickle.load(tmp) 176 | return pages 177 | 178 | def all_pages(self) -> Iterator[SitemapPage]: 179 | """ 180 | Return iterator which yields all pages of this sitemap and linked sitemaps (if any). 181 | 182 | :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). 183 | """ 184 | for page in self.pages: 185 | yield page 186 | 187 | 188 | class PagesXMLSitemap(AbstractPagesSitemap): 189 | """ 190 | XML sitemap that contains URLs to pages. 191 | """ 192 | pass 193 | 194 | 195 | class PagesTextSitemap(AbstractPagesSitemap): 196 | """ 197 | Plain text sitemap that contains URLs to pages. 198 | """ 199 | pass 200 | 201 | 202 | class PagesRSSSitemap(AbstractPagesSitemap): 203 | """ 204 | RSS 2.0 sitemap that contains URLs to pages. 205 | """ 206 | pass 207 | 208 | 209 | class PagesAtomSitemap(AbstractPagesSitemap): 210 | """ 211 | RSS 0.3 / 1.0 sitemap that contains URLs to pages. 212 | """ 213 | pass 214 | 215 | 216 | class AbstractIndexSitemap(AbstractSitemap): 217 | """ 218 | Abstract sitemap with URLs to other sitemaps. 219 | """ 220 | 221 | __slots__ = [ 222 | '__sub_sitemaps', 223 | ] 224 | 225 | def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]): 226 | """ 227 | Initialize index sitemap. 228 | 229 | :param url: Sitemap URL. 230 | :param sub_sitemaps: Sub-sitemaps that are linked to from this sitemap. 231 | """ 232 | super().__init__(url=url) 233 | self.__sub_sitemaps = sub_sitemaps 234 | 235 | def __eq__(self, other) -> bool: 236 | if not isinstance(other, AbstractIndexSitemap): 237 | raise NotImplemented 238 | 239 | if self.url != other.url: 240 | return False 241 | 242 | if self.sub_sitemaps != other.sub_sitemaps: 243 | return False 244 | 245 | return True 246 | 247 | def __repr__(self): 248 | return ( 249 | "{self.__class__.__name__}(" 250 | "url={self.url}, " 251 | "sub_sitemaps={self.sub_sitemaps}" 252 | ")" 253 | ).format(self=self) 254 | 255 | @property 256 | def sub_sitemaps(self) -> List[AbstractSitemap]: 257 | """ 258 | Return sub-sitemaps that are linked to from this sitemap. 259 | 260 | :return: Sub-sitemaps that are linked to from this sitemap. 261 | """ 262 | return self.__sub_sitemaps 263 | 264 | def all_pages(self) -> Iterator[SitemapPage]: 265 | """ 266 | Return iterator which yields all pages of this sitemap and linked sitemaps (if any). 267 | 268 | :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any). 269 | """ 270 | for sub_sitemap in self.sub_sitemaps: 271 | for page in sub_sitemap.all_pages(): 272 | yield page 273 | 274 | 275 | class IndexWebsiteSitemap(AbstractIndexSitemap): 276 | """ 277 | Website's root sitemaps, including robots.txt and extra ones. 278 | """ 279 | pass 280 | 281 | 282 | class IndexXMLSitemap(AbstractIndexSitemap): 283 | """ 284 | XML sitemap with URLs to other sitemaps. 285 | """ 286 | pass 287 | 288 | 289 | class IndexRobotsTxtSitemap(AbstractIndexSitemap): 290 | """ 291 | robots.txt sitemap with URLs to other sitemaps. 
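# Usage sketch for the classes above: all_pages() recurses through sub-sitemaps,
# so one loop is enough to visit every page in a tree. Assumes only the public
# constructors shown in this module; the URLs are illustrative.
from usp.objects.page import SitemapPage
from usp.objects.sitemap import IndexWebsiteSitemap, PagesXMLSitemap

tree = IndexWebsiteSitemap(
    url='http://www.example.com/',
    sub_sitemaps=[
        PagesXMLSitemap(
            url='http://www.example.com/sitemap.xml',
            pages=[SitemapPage(url='http://www.example.com/page.html')],
        ),
    ],
)

for page in tree.all_pages():
    print(page.url)  # http://www.example.com/page.html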
292 | """ 293 | pass 294 | -------------------------------------------------------------------------------- /usp/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper utilities.""" 2 | 3 | import datetime 4 | import gzip as gzip_lib 5 | import html 6 | import re 7 | import time 8 | from typing import Optional 9 | from urllib.parse import urlparse, unquote_plus, urlunparse 10 | 11 | from dateutil.parser import parse as dateutil_parse 12 | 13 | from .exceptions import SitemapException, GunzipException, StripURLToHomepageException 14 | from .log import create_logger 15 | from .web_client.abstract_client import ( 16 | AbstractWebClient, 17 | AbstractWebClientSuccessResponse, 18 | WebClientErrorResponse, 19 | AbstractWebClientResponse, 20 | ) 21 | 22 | log = create_logger(__name__) 23 | 24 | __URL_REGEX = re.compile(r'^https?://[^\s/$.?#].[^\s]*$', re.IGNORECASE) 25 | """Regular expression to match HTTP(s) URLs.""" 26 | 27 | 28 | def is_http_url(url: str) -> bool: 29 | """ 30 | Returns true if URL is of the "http" ("https") scheme. 31 | 32 | :param url: URL to test. 33 | :return: True if argument URL is of the "http" ("https") scheme. 34 | """ 35 | if url is None: 36 | log.debug("URL is None") 37 | return False 38 | if len(url) == 0: 39 | log.debug("URL is empty") 40 | return False 41 | 42 | log.debug("Testing if URL '{}' is HTTP(s) URL".format(url)) 43 | 44 | if not re.search(__URL_REGEX, url): 45 | log.debug("URL '{}' does not match URL's regexp".format(url)) 46 | return False 47 | 48 | try: 49 | # Try parsing the URL 50 | uri = urlparse(url) 51 | _ = urlunparse(uri) 52 | 53 | except Exception as ex: 54 | log.debug("Cannot parse URL {}: {}".format(url, ex)) 55 | return False 56 | 57 | if not uri.scheme: 58 | log.debug("Scheme is undefined for URL {}.".format(url)) 59 | return False 60 | if not uri.scheme.lower() in ['http', 'https']: 61 | log.debug("Scheme is not HTTP(s) for URL {}.".format(url)) 62 | return False 63 | if not uri.hostname: 64 | log.debug("Host is undefined for URL {}.".format(url)) 65 | return False 66 | 67 | return True 68 | 69 | 70 | def html_unescape_strip(string: Optional[str]) -> Optional[str]: 71 | """ 72 | Decode HTML entities, strip string, set to None if it's empty; ignore None as input. 73 | 74 | :param string: String to decode HTML entities in. 75 | :return: Stripped string with HTML entities decoded; None if parameter string was empty or None. 76 | """ 77 | if string: 78 | string = html.unescape(string) 79 | string = string.strip() 80 | if not string: 81 | string = None 82 | return string 83 | 84 | 85 | def parse_iso8601_date(date_string: str) -> datetime.datetime: 86 | """ 87 | Parse ISO 8601 date (e.g. from sitemap's ) into datetime.datetime object. 88 | 89 | :param date_string: ISO 8601 date, e.g. "2018-01-12T21:57:27Z" or "1997-07-16T19:20:30+01:00". 90 | :return: datetime.datetime object of a parsed date. 91 | """ 92 | # FIXME parse known date formats faster 93 | 94 | if not date_string: 95 | raise SitemapException("Date string is unset.") 96 | 97 | date = dateutil_parse(date_string) 98 | 99 | return date 100 | 101 | 102 | def parse_rfc2822_date(date_string: str) -> datetime.datetime: 103 | """ 104 | Parse RFC 2822 date (e.g. from Atom's ) into datetime.datetime object. 105 | 106 | :param date_string: RFC 2822 date, e.g. "Tue, 10 Aug 2010 20:43:53 -0000". 107 | :return: datetime.datetime object of a parsed date. 
108 | """ 109 | # FIXME parse known date formats faster 110 | return parse_iso8601_date(date_string) 111 | 112 | 113 | def get_url_retry_on_client_errors(url: str, 114 | web_client: AbstractWebClient, 115 | retry_count: int = 5, 116 | sleep_between_retries: int = 1) -> AbstractWebClientResponse: 117 | """ 118 | Fetch URL, retry on retryable errors. 119 | 120 | :param url: URL to fetch. 121 | :param web_client: Web client object to use for fetching. 122 | :param retry_count: How many times to retry fetching the same URL. 123 | :param sleep_between_retries: How long to sleep between retries, in seconds. 124 | :return: Web client response object. 125 | """ 126 | assert retry_count > 0, "Retry count must be positive." 127 | 128 | response = None 129 | for retry in range(0, retry_count): 130 | log.info("Fetching URL {}...".format(url)) 131 | response = web_client.get(url) 132 | 133 | if isinstance(response, WebClientErrorResponse): 134 | log.warning( 135 | "Request for URL {} failed: {}".format( 136 | url, response.message() 137 | ) 138 | ) 139 | 140 | if response.retryable(): 141 | log.info("Retrying URL {} in {} seconds...".format(url, sleep_between_retries)) 142 | time.sleep(sleep_between_retries) 143 | 144 | else: 145 | log.info("Not retrying for URL {}".format(url)) 146 | return response 147 | 148 | else: 149 | return response 150 | 151 | log.info("Giving up on URL {}".format(url)) 152 | return response 153 | 154 | 155 | def __response_is_gzipped_data(url: str, response: AbstractWebClientSuccessResponse) -> bool: 156 | """ 157 | Return True if Response looks like it's gzipped. 158 | 159 | :param url: URL the response was fetched from. 160 | :param response: Response object. 161 | :return: True if response looks like it might contain gzipped data. 162 | """ 163 | uri = urlparse(url) 164 | url_path = unquote_plus(uri.path) 165 | content_type = response.header('content-type') or '' 166 | 167 | if url_path.lower().endswith('.gz') or 'gzip' in content_type.lower(): 168 | return True 169 | 170 | else: 171 | return False 172 | 173 | 174 | def gunzip(data: bytes) -> bytes: 175 | """ 176 | Gunzip data. 177 | 178 | :param data: Gzipped data. 179 | :return: Gunzipped data. 180 | """ 181 | 182 | if data is None: 183 | raise GunzipException("Data is None.") 184 | 185 | if not isinstance(data, bytes): 186 | raise GunzipException("Data is not bytes: %s" % str(data)) 187 | 188 | if len(data) == 0: 189 | raise GunzipException("Data is empty (no way an empty string is a valid Gzip archive).") 190 | 191 | try: 192 | gunzipped_data = gzip_lib.decompress(data) 193 | except Exception as ex: 194 | raise GunzipException("Unable to gunzip data: %s" % str(ex)) 195 | 196 | if gunzipped_data is None: 197 | raise GunzipException("Gunzipped data is None.") 198 | 199 | if not isinstance(gunzipped_data, bytes): 200 | raise GunzipException("Gunzipped data is not bytes.") 201 | 202 | return gunzipped_data 203 | 204 | 205 | def ungzipped_response_content(url: str, response: AbstractWebClientSuccessResponse) -> str: 206 | """ 207 | Return HTTP response's decoded content, gunzip it if necessary. 208 | 209 | :param url: URL the response was fetched from. 210 | :param response: Response object. 211 | :return: Decoded and (if necessary) gunzipped response string. 
212 | """ 213 | 214 | data = response.raw_data() 215 | 216 | if __response_is_gzipped_data(url=url, response=response): 217 | try: 218 | data = gunzip(data) 219 | except GunzipException as ex: 220 | # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension 221 | log.error("Unable to gunzip response {}, maybe it's a non-gzipped sitemap: {}".format(response, ex)) 222 | 223 | # FIXME other encodings 224 | data = data.decode('utf-8-sig', errors='replace') 225 | 226 | assert isinstance(data, str) 227 | 228 | return data 229 | 230 | 231 | def strip_url_to_homepage(url: str) -> str: 232 | """ 233 | Strip URL to its homepage. 234 | 235 | :param url: URL to strip, e.g. "http://www.example.com/page.html". 236 | :return: Stripped homepage URL, e.g. "http://www.example.com/" 237 | """ 238 | if not url: 239 | raise StripURLToHomepageException("URL is empty.") 240 | 241 | try: 242 | uri = urlparse(url) 243 | assert uri.scheme, "Scheme must be set." 244 | assert uri.scheme.lower() in ['http', 'https'], "Scheme must be http:// or https://" 245 | uri = ( 246 | uri.scheme, 247 | uri.netloc, 248 | '/', # path 249 | '', # params 250 | '', # query 251 | '', # fragment 252 | ) 253 | url = urlunparse(uri) 254 | except Exception as ex: 255 | raise StripURLToHomepageException("Unable to parse URL {}: {}".format(url, ex)) 256 | 257 | return url 258 | -------------------------------------------------------------------------------- /usp/objects/page.py: -------------------------------------------------------------------------------- 1 | """Objects that represent a page found in one of the sitemaps.""" 2 | 3 | import datetime 4 | from decimal import Decimal 5 | from enum import Enum, unique 6 | from typing import List, Optional 7 | 8 | SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal('0.5') 9 | """Default sitemap page priority, as per the spec.""" 10 | 11 | 12 | class SitemapNewsStory(object): 13 | """ 14 | Single story derived from Google News XML sitemap. 15 | """ 16 | 17 | __slots__ = [ 18 | '__title', 19 | '__publish_date', 20 | '__publication_name', 21 | '__publication_language', 22 | '__access', 23 | '__genres', 24 | '__keywords', 25 | '__stock_tickers', 26 | ] 27 | 28 | def __init__(self, 29 | title: str, 30 | publish_date: datetime.datetime, 31 | publication_name: Optional[str] = None, 32 | publication_language: Optional[str] = None, 33 | access: Optional[str] = None, 34 | genres: List[str] = None, 35 | keywords: List[str] = None, 36 | stock_tickers: List[str] = None): 37 | """ 38 | Initialize a new Google News story. 39 | 40 | :param title: Story title. 41 | :param publish_date: Story publication date. 42 | :param publication_name: Name of the news publication in which the article appears in. 43 | :param publication_language: Primary language of the news publication in which the article appears in. 44 | :param access: Accessibility of the article. 45 | :param genres: List of properties characterizing the content of the article. 46 | :param keywords: List of keywords describing the topic of the article. 47 | :param stock_tickers: List of up to 5 stock tickers that are the main subject of the article. 48 | """ 49 | 50 | # Spec defines that some of the properties below are "required" but in practice not every website provides the 51 | # required properties. So, we require only "title" and "publish_date" to be set. 
52 | 53 | self.__title = title 54 | self.__publish_date = publish_date 55 | self.__publication_name = publication_name 56 | self.__publication_language = publication_language 57 | self.__access = access 58 | self.__genres = genres if genres else [] 59 | self.__keywords = keywords if keywords else [] 60 | self.__stock_tickers = stock_tickers if stock_tickers else [] 61 | 62 | def __eq__(self, other) -> bool: 63 | if not isinstance(other, SitemapNewsStory): 64 | raise NotImplemented 65 | 66 | if self.title != other.title: 67 | return False 68 | 69 | if self.publish_date != other.publish_date: 70 | return False 71 | 72 | if self.publication_name != other.publication_name: 73 | return False 74 | 75 | if self.publication_language != other.publication_language: 76 | return False 77 | 78 | if self.access != other.access: 79 | return False 80 | 81 | if self.genres != other.genres: 82 | return False 83 | 84 | if self.keywords != other.keywords: 85 | return False 86 | 87 | if self.stock_tickers != other.stock_tickers: 88 | return False 89 | 90 | return True 91 | 92 | def __hash__(self): 93 | return hash(( 94 | self.title, 95 | self.publish_date, 96 | self.publication_name, 97 | self.publication_language, 98 | self.access, 99 | self.genres, 100 | self.keywords, 101 | self.stock_tickers, 102 | )) 103 | 104 | def __repr__(self): 105 | return ( 106 | "{self.__class__.__name__}(" 107 | "title={self.title}, " 108 | "publish_date={self.publish_date}, " 109 | "publication_name={self.publication_name}, " 110 | "publication_language={self.publication_language}, " 111 | "access={self.access}, " 112 | "genres={self.genres}, " 113 | "keywords={self.keywords}, " 114 | "stock_tickers={self.stock_tickers}" 115 | ")" 116 | ).format(self=self) 117 | 118 | @property 119 | def title(self) -> str: 120 | """ 121 | Return story title. 122 | 123 | :return: Story title. 124 | """ 125 | return self.__title 126 | 127 | @property 128 | def publish_date(self) -> datetime.datetime: 129 | """ 130 | Return story publication date. 131 | 132 | :return: Story publication date. 133 | """ 134 | return self.__publish_date 135 | 136 | @property 137 | def publication_name(self) -> Optional[str]: 138 | """ 139 | Return name of the news publication in which the article appears in. 140 | 141 | :return: Name of the news publication in which the article appears in. 142 | """ 143 | return self.__publication_name 144 | 145 | @property 146 | def publication_language(self) -> Optional[str]: 147 | """Return primary language of the news publication in which the article appears in. 148 | 149 | It should be an ISO 639 Language Code (either 2 or 3 letters). 150 | 151 | :return: Primary language of the news publication in which the article appears in. 152 | """ 153 | return self.__publication_language 154 | 155 | @property 156 | def access(self) -> Optional[str]: 157 | """ 158 | Return accessibility of the article. 159 | 160 | :return: Accessibility of the article. 161 | """ 162 | return self.__access 163 | 164 | @property 165 | def genres(self) -> List[str]: 166 | """ 167 | Return list of properties characterizing the content of the article. 168 | 169 | Returns genres such as "PressRelease" or "UserGenerated". 170 | 171 | :return: List of properties characterizing the content of the article 172 | """ 173 | return self.__genres 174 | 175 | @property 176 | def keywords(self) -> List[str]: 177 | """ 178 | Return list of keywords describing the topic of the article. 179 | 180 | :return: List of keywords describing the topic of the article. 
181 | """ 182 | return self.__keywords 183 | 184 | @property 185 | def stock_tickers(self) -> List[str]: 186 | """ 187 | Return list of up to 5 stock tickers that are the main subject of the article. 188 | 189 | Each ticker must be prefixed by the name of its stock exchange, and must match its entry in Google Finance. 190 | For example, "NASDAQ:AMAT" (but not "NASD:AMAT"), or "BOM:500325" (but not "BOM:RIL"). 191 | 192 | :return: List of up to 5 stock tickers that are the main subject of the article. 193 | """ 194 | return self.__stock_tickers 195 | 196 | 197 | @unique 198 | class SitemapPageChangeFrequency(Enum): 199 | """Change frequency of a sitemap URL.""" 200 | 201 | ALWAYS = 'always' 202 | HOURLY = 'hourly' 203 | DAILY = 'daily' 204 | WEEKLY = 'weekly' 205 | MONTHLY = 'monthly' 206 | YEARLY = 'yearly' 207 | NEVER = 'never' 208 | 209 | @classmethod 210 | def has_value(cls, value: str) -> bool: 211 | """Test if enum has specified value.""" 212 | return any(value == item.value for item in cls) 213 | 214 | 215 | class SitemapPage(object): 216 | """Single sitemap-derived page.""" 217 | 218 | __slots__ = [ 219 | '__url', 220 | '__priority', 221 | '__last_modified', 222 | '__change_frequency', 223 | '__news_story', 224 | ] 225 | 226 | def __init__(self, 227 | url: str, 228 | priority: Decimal = SITEMAP_PAGE_DEFAULT_PRIORITY, 229 | last_modified: Optional[datetime.datetime] = None, 230 | change_frequency: Optional[SitemapPageChangeFrequency] = None, 231 | news_story: Optional[SitemapNewsStory] = None): 232 | """ 233 | Initialize a new sitemap-derived page. 234 | 235 | :param url: Page URL. 236 | :param priority: Priority of this URL relative to other URLs on your site. 237 | :param last_modified: Date of last modification of the URL. 238 | :param change_frequency: Change frequency of a sitemap URL. 239 | :param news_story: Google News story attached to the URL. 240 | """ 241 | self.__url = url 242 | self.__priority = priority 243 | self.__last_modified = last_modified 244 | self.__change_frequency = change_frequency 245 | self.__news_story = news_story 246 | 247 | def __eq__(self, other) -> bool: 248 | if not isinstance(other, SitemapPage): 249 | raise NotImplemented 250 | 251 | if self.url != other.url: 252 | return False 253 | 254 | if self.priority != other.priority: 255 | return False 256 | 257 | if self.last_modified != other.last_modified: 258 | return False 259 | 260 | if self.change_frequency != other.change_frequency: 261 | return False 262 | 263 | if self.news_story != other.news_story: 264 | return False 265 | 266 | return True 267 | 268 | def __hash__(self): 269 | return hash(( 270 | # Hash only the URL to be able to find unique pages later on 271 | self.url, 272 | )) 273 | 274 | def __repr__(self): 275 | return ( 276 | "{self.__class__.__name__}(" 277 | "url={self.url}, " 278 | "priority={self.priority}, " 279 | "last_modified={self.last_modified}, " 280 | "change_frequency={self.change_frequency}, " 281 | "news_story={self.news_story}" 282 | ")" 283 | ).format(self=self) 284 | 285 | @property 286 | def url(self) -> str: 287 | """ 288 | Return page URL. 289 | 290 | :return: Page URL. 291 | """ 292 | return self.__url 293 | 294 | @property 295 | def priority(self) -> Decimal: 296 | """ 297 | Return priority of this URL relative to other URLs on your site. 298 | 299 | :return: Priority of this URL relative to other URLs on your site. 
300 | """ 301 | return self.__priority 302 | 303 | @property 304 | def last_modified(self) -> Optional[datetime.datetime]: 305 | """ 306 | Return date of last modification of the URL. 307 | 308 | :return: Date of last modification of the URL. 309 | """ 310 | return self.__last_modified 311 | 312 | @property 313 | def change_frequency(self) -> Optional[SitemapPageChangeFrequency]: 314 | """ 315 | Return change frequency of a sitemap URL. 316 | 317 | :return: Change frequency of a sitemap URL. 318 | """ 319 | return self.__change_frequency 320 | 321 | @property 322 | def news_story(self) -> Optional[SitemapNewsStory]: 323 | """ 324 | Return Google News story attached to the URL. 325 | 326 | :return: Google News story attached to the URL. 327 | """ 328 | return self.__news_story 329 | -------------------------------------------------------------------------------- /usp/fetch_parse.py: -------------------------------------------------------------------------------- 1 | """Sitemap fetchers and parsers.""" 2 | 3 | import abc 4 | import re 5 | import xml.parsers.expat 6 | from collections import OrderedDict 7 | from decimal import Decimal 8 | from typing import Optional, Dict 9 | 10 | from .exceptions import SitemapException, SitemapXMLParsingException 11 | from .helpers import ( 12 | html_unescape_strip, 13 | parse_iso8601_date, 14 | get_url_retry_on_client_errors, 15 | ungzipped_response_content, 16 | is_http_url, 17 | parse_rfc2822_date, 18 | ) 19 | from .log import create_logger 20 | from .objects.page import ( 21 | SitemapPage, 22 | SitemapNewsStory, 23 | SitemapPageChangeFrequency, 24 | SITEMAP_PAGE_DEFAULT_PRIORITY, 25 | ) 26 | from .objects.sitemap import ( 27 | AbstractSitemap, 28 | InvalidSitemap, 29 | IndexRobotsTxtSitemap, 30 | IndexXMLSitemap, 31 | PagesXMLSitemap, 32 | PagesTextSitemap, 33 | PagesRSSSitemap, 34 | PagesAtomSitemap, 35 | ) 36 | from .web_client.abstract_client import ( 37 | AbstractWebClient, 38 | AbstractWebClientSuccessResponse, 39 | WebClientErrorResponse, 40 | ) 41 | from .web_client.requests_client import RequestsWebClient 42 | 43 | log = create_logger(__name__) 44 | 45 | 46 | class SitemapFetcher(object): 47 | """robots.txt / XML / plain text sitemap fetcher.""" 48 | 49 | __MAX_SITEMAP_SIZE = 100 * 1024 * 1024 50 | """Max. uncompressed sitemap size. 51 | 52 | Spec says it might be up to 50 MB but let's go for the full 100 MB here.""" 53 | 54 | __MAX_RECURSION_LEVEL = 10 55 | """Max. 
recursion level in iterating over sub-sitemaps.""" 56 | 57 | __slots__ = [ 58 | '_url', 59 | '_recursion_level', 60 | '_web_client', 61 | ] 62 | 63 | def __init__(self, url: str, recursion_level: int, web_client: Optional[AbstractWebClient] = None): 64 | 65 | if recursion_level > self.__MAX_RECURSION_LEVEL: 66 | raise SitemapException("Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url)) 67 | 68 | if not is_http_url(url): 69 | raise SitemapException("URL {} is not a HTTP(s) URL.".format(url)) 70 | 71 | if not web_client: 72 | web_client = RequestsWebClient() 73 | 74 | web_client.set_max_response_data_length(self.__MAX_SITEMAP_SIZE) 75 | 76 | self._url = url 77 | self._web_client = web_client 78 | self._recursion_level = recursion_level 79 | 80 | def sitemap(self) -> AbstractSitemap: 81 | log.info("Fetching level {} sitemap from {}...".format(self._recursion_level, self._url)) 82 | response = get_url_retry_on_client_errors(url=self._url, web_client=self._web_client) 83 | 84 | if isinstance(response, WebClientErrorResponse): 85 | return InvalidSitemap( 86 | url=self._url, 87 | reason="Unable to fetch sitemap from {}: {}".format(self._url, response.message()), 88 | ) 89 | 90 | assert isinstance(response, AbstractWebClientSuccessResponse) 91 | 92 | response_content = ungzipped_response_content(url=self._url, response=response) 93 | 94 | # MIME types returned in Content-Type are unpredictable, so peek into the content instead 95 | if response_content[:20].strip().startswith('<'): 96 | # XML sitemap (the specific kind is to be determined later) 97 | parser = XMLSitemapParser( 98 | url=self._url, 99 | content=response_content, 100 | recursion_level=self._recursion_level, 101 | web_client=self._web_client, 102 | ) 103 | 104 | else: 105 | # Assume that it's some sort of a text file (robots.txt or plain text sitemap) 106 | if self._url.endswith('/robots.txt'): 107 | parser = IndexRobotsTxtSitemapParser( 108 | url=self._url, 109 | content=response_content, 110 | recursion_level=self._recursion_level, 111 | web_client=self._web_client, 112 | ) 113 | else: 114 | parser = PlainTextSitemapParser( 115 | url=self._url, 116 | content=response_content, 117 | recursion_level=self._recursion_level, 118 | web_client=self._web_client, 119 | ) 120 | 121 | log.info("Parsing sitemap from URL {}...".format(self._url)) 122 | sitemap = parser.sitemap() 123 | 124 | return sitemap 125 | 126 | 127 | class AbstractSitemapParser(object, metaclass=abc.ABCMeta): 128 | """Abstract robots.txt / XML / plain text sitemap parser.""" 129 | 130 | __slots__ = [ 131 | '_url', 132 | '_content', 133 | '_web_client', 134 | '_recursion_level', 135 | ] 136 | 137 | def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): 138 | self._url = url 139 | self._content = content 140 | self._recursion_level = recursion_level 141 | self._web_client = web_client 142 | 143 | @abc.abstractmethod 144 | def sitemap(self) -> AbstractSitemap: 145 | raise NotImplementedError("Abstract method.") 146 | 147 | 148 | class IndexRobotsTxtSitemapParser(AbstractSitemapParser): 149 | """robots.txt index sitemap parser.""" 150 | 151 | def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): 152 | super().__init__(url=url, content=content, recursion_level=recursion_level, web_client=web_client) 153 | 154 | if not self._url.endswith('/robots.txt'): 155 | raise SitemapException("URL does not look like robots.txt URL: {}".format(self._url)) 156 | 157 | def 
sitemap(self) -> AbstractSitemap: 158 | 159 | # Serves as an ordered set because we want to deduplicate URLs but also retain the order 160 | sitemap_urls = OrderedDict() 161 | 162 | for robots_txt_line in self._content.splitlines(): 163 | robots_txt_line = robots_txt_line.strip() 164 | # robots.txt is supposed to be case sensitive but who cares in these Node.js times? 165 | robots_txt_line = robots_txt_line.lower() 166 | sitemap_match = re.search(r'^site-?map:\s*(.+?)$', robots_txt_line, flags=re.IGNORECASE) 167 | if sitemap_match: 168 | sitemap_url = sitemap_match.group(1) 169 | if is_http_url(sitemap_url): 170 | sitemap_urls[sitemap_url] = True 171 | else: 172 | log.warning("Sitemap URL {} doesn't look like an URL, skipping".format(sitemap_url)) 173 | 174 | sub_sitemaps = [] 175 | 176 | for sitemap_url in sitemap_urls.keys(): 177 | fetcher = SitemapFetcher( 178 | url=sitemap_url, 179 | recursion_level=self._recursion_level, 180 | web_client=self._web_client, 181 | ) 182 | fetched_sitemap = fetcher.sitemap() 183 | sub_sitemaps.append(fetched_sitemap) 184 | 185 | index_sitemap = IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps) 186 | 187 | return index_sitemap 188 | 189 | 190 | class PlainTextSitemapParser(AbstractSitemapParser): 191 | """Plain text sitemap parser.""" 192 | 193 | def sitemap(self) -> AbstractSitemap: 194 | 195 | story_urls = OrderedDict() 196 | 197 | for story_url in self._content.splitlines(): 198 | story_url = story_url.strip() 199 | if not story_url: 200 | continue 201 | if is_http_url(story_url): 202 | story_urls[story_url] = True 203 | else: 204 | log.warning("Story URL {} doesn't look like an URL, skipping".format(story_url)) 205 | 206 | pages = [] 207 | for page_url in story_urls.keys(): 208 | page = SitemapPage(url=page_url) 209 | pages.append(page) 210 | 211 | text_sitemap = PagesTextSitemap(url=self._url, pages=pages) 212 | 213 | return text_sitemap 214 | 215 | 216 | class XMLSitemapParser(AbstractSitemapParser): 217 | """XML sitemap parser.""" 218 | 219 | __XML_NAMESPACE_SEPARATOR = ' ' 220 | 221 | __slots__ = [ 222 | '_concrete_parser', 223 | ] 224 | 225 | def __init__(self, url: str, content: str, recursion_level: int, web_client: AbstractWebClient): 226 | super().__init__(url=url, content=content, recursion_level=recursion_level, web_client=web_client) 227 | 228 | # Will be initialized when the type of sitemap is known 229 | self._concrete_parser = None 230 | 231 | def sitemap(self) -> AbstractSitemap: 232 | 233 | parser = xml.parsers.expat.ParserCreate(namespace_separator=self.__XML_NAMESPACE_SEPARATOR) 234 | parser.StartElementHandler = self._xml_element_start 235 | parser.EndElementHandler = self._xml_element_end 236 | parser.CharacterDataHandler = self._xml_char_data 237 | 238 | try: 239 | is_final = True 240 | parser.Parse(self._content, is_final) 241 | except Exception as ex: 242 | # Some sitemap XML files might end abruptly because webservers might be timing out on returning huge XML 243 | # files so don't return InvalidSitemap() but try to get as much pages as possible 244 | log.error("Parsing sitemap from URL {} failed: {}".format(self._url, ex)) 245 | 246 | if not self._concrete_parser: 247 | return InvalidSitemap( 248 | url=self._url, 249 | reason="No parsers support sitemap from {}".format(self._url), 250 | ) 251 | 252 | return self._concrete_parser.sitemap() 253 | 254 | @classmethod 255 | def __normalize_xml_element_name(cls, name: str): 256 | """ 257 | Replace the namespace URL in the argument element name with internal namespace. 
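# The robots.txt parser above only cares about "Sitemap:" (or "Site-map:") lines;
# everything else in the file is ignored. A standalone illustration of that regex,
# with made-up URLs:
import re

robots_txt = """
User-agent: *
Disallow: /secret/
Sitemap: http://www.example.com/sitemap.xml
Site-map: http://www.example.com/sitemap_news.xml
"""

for line in robots_txt.splitlines():
    match = re.search(r'^site-?map:\s*(.+?)$', line.strip().lower(), flags=re.IGNORECASE)
    if match:
        print(match.group(1))
# http://www.example.com/sitemap.xml
# http://www.example.com/sitemap_news.xml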
258 | 259 | * Elements from http://www.sitemaps.org/schemas/sitemap/0.9 namespace will be prefixed with "sitemap:", 260 | e.g. "" will become "" 261 | 262 | * Elements from http://www.google.com/schemas/sitemap-news/0.9 namespace will be prefixed with "news:", 263 | e.g. "" will become "" 264 | 265 | For non-sitemap namespaces, return the element name with the namespace stripped. 266 | 267 | :param name: Namespace URL plus XML element name, e.g. "http://www.sitemaps.org/schemas/sitemap/0.9 loc" 268 | :return: Internal namespace name plus element name, e.g. "sitemap loc" 269 | """ 270 | 271 | name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR) 272 | 273 | if len(name_parts) == 1: 274 | namespace_url = '' 275 | name = name_parts[0] 276 | 277 | elif len(name_parts) == 2: 278 | namespace_url = name_parts[0] 279 | name = name_parts[1] 280 | 281 | else: 282 | raise SitemapXMLParsingException("Unable to determine namespace for element '{}'".format(name)) 283 | 284 | if '/sitemap/' in namespace_url: 285 | name = 'sitemap:{}'.format(name) 286 | elif '/sitemap-news/' in namespace_url: 287 | name = 'news:{}'.format(name) 288 | else: 289 | # We don't care about the rest of the namespaces, so just keep the plain element name 290 | pass 291 | 292 | return name 293 | 294 | def _xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: 295 | 296 | name = self.__normalize_xml_element_name(name) 297 | 298 | if self._concrete_parser: 299 | self._concrete_parser.xml_element_start(name=name, attrs=attrs) 300 | 301 | else: 302 | 303 | # Root element -- initialize concrete parser 304 | if name == 'sitemap:urlset': 305 | self._concrete_parser = PagesXMLSitemapParser( 306 | url=self._url, 307 | ) 308 | 309 | elif name == 'sitemap:sitemapindex': 310 | self._concrete_parser = IndexXMLSitemapParser( 311 | url=self._url, 312 | web_client=self._web_client, 313 | recursion_level=self._recursion_level, 314 | ) 315 | 316 | elif name == 'rss': 317 | self._concrete_parser = PagesRSSSitemapParser( 318 | url=self._url, 319 | ) 320 | 321 | elif name == 'feed': 322 | self._concrete_parser = PagesAtomSitemapParser( 323 | url=self._url, 324 | ) 325 | 326 | else: 327 | raise SitemapXMLParsingException("Unsupported root element '{}'.".format(name)) 328 | 329 | def _xml_element_end(self, name: str) -> None: 330 | 331 | name = self.__normalize_xml_element_name(name) 332 | 333 | if not self._concrete_parser: 334 | raise SitemapXMLParsingException("Concrete sitemap parser should be set by now.") 335 | 336 | self._concrete_parser.xml_element_end(name=name) 337 | 338 | def _xml_char_data(self, data: str) -> None: 339 | 340 | if not self._concrete_parser: 341 | raise SitemapXMLParsingException("Concrete sitemap parser should be set by now.") 342 | 343 | self._concrete_parser.xml_char_data(data=data) 344 | 345 | 346 | class AbstractXMLSitemapParser(object, metaclass=abc.ABCMeta): 347 | """ 348 | Abstract XML sitemap parser. 
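# What the namespace_separator=' ' setting above produces: expat reports every
# element name as "<namespace URL><space><local name>", which is exactly what
# __normalize_xml_element_name() splits apart. A small standalone demonstration:
import xml.parsers.expat

seen = []
parser = xml.parsers.expat.ParserCreate(namespace_separator=' ')
parser.StartElementHandler = lambda name, attrs: seen.append(name)
parser.Parse(
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    '<url><loc>http://www.example.com/</loc></url>'
    '</urlset>',
    True,
)
print(seen[0])  # http://www.sitemaps.org/schemas/sitemap/0.9 urlset
print(seen[2])  # http://www.sitemaps.org/schemas/sitemap/0.9 loc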
349 | """ 350 | 351 | __slots__ = [ 352 | # URL of the sitemap that is being parsed 353 | '_url', 354 | 355 | # Last encountered character data 356 | '_last_char_data', 357 | 358 | '_last_handler_call_was_xml_char_data', 359 | ] 360 | 361 | def __init__(self, url: str): 362 | self._url = url 363 | self._last_char_data = '' 364 | self._last_handler_call_was_xml_char_data = False 365 | 366 | def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: 367 | self._last_handler_call_was_xml_char_data = False 368 | pass 369 | 370 | def xml_element_end(self, name: str) -> None: 371 | # End of any element always resets last encountered character data 372 | self._last_char_data = '' 373 | self._last_handler_call_was_xml_char_data = False 374 | 375 | def xml_char_data(self, data: str) -> None: 376 | # Handler might be called multiple times for what essentially is a single string, e.g. in case of entities 377 | # ("ABC & DEF"), so this is why we're appending 378 | if self._last_handler_call_was_xml_char_data: 379 | self._last_char_data += data 380 | else: 381 | self._last_char_data = data 382 | 383 | self._last_handler_call_was_xml_char_data = True 384 | 385 | @abc.abstractmethod 386 | def sitemap(self) -> AbstractSitemap: 387 | raise NotImplementedError("Abstract method.") 388 | 389 | 390 | class IndexXMLSitemapParser(AbstractXMLSitemapParser): 391 | """ 392 | Index XML sitemap parser. 393 | """ 394 | 395 | __slots__ = [ 396 | '_web_client', 397 | '_recursion_level', 398 | 399 | # List of sub-sitemap URLs found in this index sitemap 400 | '_sub_sitemap_urls', 401 | ] 402 | 403 | def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int): 404 | super().__init__(url=url) 405 | 406 | self._web_client = web_client 407 | self._recursion_level = recursion_level 408 | self._sub_sitemap_urls = [] 409 | 410 | def xml_element_end(self, name: str) -> None: 411 | 412 | if name == 'sitemap:loc': 413 | sub_sitemap_url = html_unescape_strip(self._last_char_data) 414 | if not is_http_url(sub_sitemap_url): 415 | log.warning("Sub-sitemap URL does not look like one: {}".format(sub_sitemap_url)) 416 | 417 | else: 418 | if sub_sitemap_url not in self._sub_sitemap_urls: 419 | self._sub_sitemap_urls.append(sub_sitemap_url) 420 | 421 | super().xml_element_end(name=name) 422 | 423 | def sitemap(self) -> AbstractSitemap: 424 | 425 | sub_sitemaps = [] 426 | 427 | for sub_sitemap_url in self._sub_sitemap_urls: 428 | 429 | # URL might be invalid, or recursion limit might have been reached 430 | try: 431 | fetcher = SitemapFetcher(url=sub_sitemap_url, 432 | recursion_level=self._recursion_level + 1, 433 | web_client=self._web_client) 434 | fetched_sitemap = fetcher.sitemap() 435 | except Exception as ex: 436 | fetched_sitemap = InvalidSitemap( 437 | url=sub_sitemap_url, 438 | reason="Unable to add sub-sitemap from URL {}: {}".format(sub_sitemap_url, str(ex)), 439 | ) 440 | 441 | sub_sitemaps.append(fetched_sitemap) 442 | 443 | index_sitemap = IndexXMLSitemap(url=self._url, sub_sitemaps=sub_sitemaps) 444 | 445 | return index_sitemap 446 | 447 | 448 | class PagesXMLSitemapParser(AbstractXMLSitemapParser): 449 | """ 450 | Pages XML sitemap parser. 
451 | """ 452 | 453 | class Page(object): 454 | """Simple data class for holding various properties for a single entry while parsing.""" 455 | 456 | __slots__ = [ 457 | 'url', 458 | 'last_modified', 459 | 'change_frequency', 460 | 'priority', 461 | 'news_title', 462 | 'news_publish_date', 463 | 'news_publication_name', 464 | 'news_publication_language', 465 | 'news_access', 466 | 'news_genres', 467 | 'news_keywords', 468 | 'news_stock_tickers', 469 | ] 470 | 471 | def __init__(self): 472 | self.url = None 473 | self.last_modified = None 474 | self.change_frequency = None 475 | self.priority = None 476 | self.news_title = None 477 | self.news_publish_date = None 478 | self.news_publication_name = None 479 | self.news_publication_language = None 480 | self.news_access = None 481 | self.news_genres = None 482 | self.news_keywords = None 483 | self.news_stock_tickers = None 484 | 485 | def __hash__(self): 486 | return hash(( 487 | # Hash only the URL to be able to find unique ones 488 | self.url, 489 | )) 490 | 491 | def page(self) -> Optional[SitemapPage]: 492 | """Return constructed sitemap page if one has been completed, otherwise None.""" 493 | 494 | # Required 495 | url = html_unescape_strip(self.url) 496 | if not url: 497 | log.error("URL is unset") 498 | return None 499 | 500 | last_modified = html_unescape_strip(self.last_modified) 501 | if last_modified: 502 | last_modified = parse_iso8601_date(last_modified) 503 | 504 | change_frequency = html_unescape_strip(self.change_frequency) 505 | if change_frequency: 506 | change_frequency = change_frequency.lower() 507 | if SitemapPageChangeFrequency.has_value(change_frequency): 508 | change_frequency = SitemapPageChangeFrequency(change_frequency) 509 | else: 510 | log.warning("Invalid change frequency, defaulting to 'always'.".format(change_frequency)) 511 | change_frequency = SitemapPageChangeFrequency.ALWAYS 512 | assert isinstance(change_frequency, SitemapPageChangeFrequency) 513 | 514 | priority = html_unescape_strip(self.priority) 515 | if priority: 516 | priority = Decimal(priority) 517 | 518 | comp_zero = priority.compare(Decimal('0.0')) 519 | comp_one = priority.compare(Decimal('1.0')) 520 | if comp_zero in (Decimal('0'), Decimal('1') and comp_one in (Decimal('0'), Decimal('-1'))): 521 | # 0 <= priority <= 1 522 | pass 523 | else: 524 | log.warning("Priority is not within 0 and 1: {}".format(priority)) 525 | priority = SITEMAP_PAGE_DEFAULT_PRIORITY 526 | 527 | else: 528 | priority = SITEMAP_PAGE_DEFAULT_PRIORITY 529 | 530 | news_title = html_unescape_strip(self.news_title) 531 | 532 | news_publish_date = html_unescape_strip(self.news_publish_date) 533 | if news_publish_date: 534 | news_publish_date = parse_iso8601_date(date_string=news_publish_date) 535 | 536 | news_publication_name = html_unescape_strip(self.news_publication_name) 537 | news_publication_language = html_unescape_strip(self.news_publication_language) 538 | news_access = html_unescape_strip(self.news_access) 539 | 540 | news_genres = html_unescape_strip(self.news_genres) 541 | if news_genres: 542 | news_genres = [x.strip() for x in news_genres.split(',')] 543 | else: 544 | news_genres = [] 545 | 546 | news_keywords = html_unescape_strip(self.news_keywords) 547 | if news_keywords: 548 | news_keywords = [x.strip() for x in news_keywords.split(',')] 549 | else: 550 | news_keywords = [] 551 | 552 | news_stock_tickers = html_unescape_strip(self.news_stock_tickers) 553 | if news_stock_tickers: 554 | news_stock_tickers = [x.strip() for x in news_stock_tickers.split(',')] 555 
| else: 556 | news_stock_tickers = [] 557 | 558 | sitemap_news_story = None 559 | if news_title and news_publish_date: 560 | sitemap_news_story = SitemapNewsStory( 561 | title=news_title, 562 | publish_date=news_publish_date, 563 | publication_name=news_publication_name, 564 | publication_language=news_publication_language, 565 | access=news_access, 566 | genres=news_genres, 567 | keywords=news_keywords, 568 | stock_tickers=news_stock_tickers, 569 | ) 570 | 571 | return SitemapPage( 572 | url=url, 573 | last_modified=last_modified, 574 | change_frequency=change_frequency, 575 | priority=priority, 576 | news_story=sitemap_news_story, 577 | ) 578 | 579 | __slots__ = [ 580 | '_current_page', 581 | '_pages', 582 | ] 583 | 584 | def __init__(self, url: str): 585 | super().__init__(url=url) 586 | 587 | self._current_page = None 588 | self._pages = [] 589 | 590 | def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: 591 | 592 | super().xml_element_start(name=name, attrs=attrs) 593 | 594 | if name == 'sitemap:url': 595 | if self._current_page: 596 | raise SitemapXMLParsingException("Page is expected to be unset by .") 597 | self._current_page = self.Page() 598 | 599 | def __require_last_char_data_to_be_set(self, name: str) -> None: 600 | if not self._last_char_data: 601 | raise SitemapXMLParsingException( 602 | "Character data is expected to be set at the end of <{}>.".format(name) 603 | ) 604 | 605 | def xml_element_end(self, name: str) -> None: 606 | 607 | if not self._current_page and name != 'sitemap:urlset': 608 | raise SitemapXMLParsingException("Page is expected to be set at the end of <{}>.".format(name)) 609 | 610 | if name == 'sitemap:url': 611 | if self._current_page not in self._pages: 612 | self._pages.append(self._current_page) 613 | self._current_page = None 614 | 615 | else: 616 | 617 | if name == 'sitemap:loc': 618 | # Every entry must have 619 | self.__require_last_char_data_to_be_set(name=name) 620 | self._current_page.url = self._last_char_data 621 | 622 | elif name == 'sitemap:lastmod': 623 | # Element might be present but character data might be empty 624 | self._current_page.last_modified = self._last_char_data 625 | 626 | elif name == 'sitemap:changefreq': 627 | # Element might be present but character data might be empty 628 | self._current_page.change_frequency = self._last_char_data 629 | 630 | elif name == 'sitemap:priority': 631 | # Element might be present but character data might be empty 632 | self._current_page.priority = self._last_char_data 633 | 634 | elif name == 'news:name': # news/publication/name 635 | # Element might be present but character data might be empty 636 | self._current_page.news_publication_name = self._last_char_data 637 | 638 | elif name == 'news:language': # news/publication/language 639 | # Element might be present but character data might be empty 640 | self._current_page.news_publication_language = self._last_char_data 641 | 642 | elif name == 'news:publication_date': 643 | # Element might be present but character data might be empty 644 | self._current_page.news_publish_date = self._last_char_data 645 | 646 | elif name == 'news:title': 647 | # Every Google News sitemap entry must have 648 | self.__require_last_char_data_to_be_set(name=name) 649 | self._current_page.news_title = self._last_char_data 650 | 651 | elif name == 'news:access': 652 | # Element might be present but character data might be empty 653 | self._current_page.news_access = self._last_char_data 654 | 655 | elif name == 'news:keywords': 656 | # Element 
might be present but character data might be empty 657 | self._current_page.news_keywords = self._last_char_data 658 | 659 | elif name == 'news:stock_tickers': 660 | # Element might be present but character data might be empty 661 | self._current_page.news_stock_tickers = self._last_char_data 662 | 663 | super().xml_element_end(name=name) 664 | 665 | def sitemap(self) -> AbstractSitemap: 666 | 667 | pages = [] 668 | 669 | for page_row in self._pages: 670 | page = page_row.page() 671 | if page: 672 | pages.append(page) 673 | 674 | pages_sitemap = PagesXMLSitemap(url=self._url, pages=pages) 675 | 676 | return pages_sitemap 677 | 678 | 679 | class PagesRSSSitemapParser(AbstractXMLSitemapParser): 680 | """ 681 | Pages RSS 2.0 sitemap parser. 682 | 683 | https://validator.w3.org/feed/docs/rss2.html 684 | """ 685 | 686 | class Page(object): 687 | """ 688 | Data class for holding various properties for a single RSS <item> while parsing. 689 | """ 690 | 691 | __slots__ = [ 692 | 'link', 693 | 'title', 694 | 'description', 695 | 'publication_date', 696 | ] 697 | 698 | def __init__(self): 699 | self.link = None 700 | self.title = None 701 | self.description = None 702 | self.publication_date = None 703 | 704 | def __hash__(self): 705 | return hash(( 706 | # Hash only the URL 707 | self.link, 708 | )) 709 | 710 | def page(self) -> Optional[SitemapPage]: 711 | """Return constructed sitemap page if one has been completed, otherwise None.""" 712 | 713 | # Required 714 | link = html_unescape_strip(self.link) 715 | if not link: 716 | log.error("Link is unset") 717 | return None 718 | 719 | title = html_unescape_strip(self.title) 720 | description = html_unescape_strip(self.description) 721 | if not (title or description): 722 | log.error("Both title and description are unset") 723 | return None 724 | 725 | publication_date = html_unescape_strip(self.publication_date) 726 | if publication_date: 727 | publication_date = parse_rfc2822_date(publication_date) 728 | 729 | return SitemapPage( 730 | url=link, 731 | news_story=SitemapNewsStory( 732 | title=title or description, 733 | publish_date=publication_date, 734 | ), 735 | ) 736 | 737 | __slots__ = [ 738 | '_current_page', 739 | '_pages', 740 | ] 741 | 742 | def __init__(self, url: str): 743 | super().__init__(url=url) 744 | 745 | self._current_page = None 746 | self._pages = [] 747 | 748 | def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: 749 | 750 | super().xml_element_start(name=name, attrs=attrs) 751 | 752 | if name == 'item': 753 | if self._current_page: 754 | raise SitemapXMLParsingException("Page is expected to be unset by <item>.") 755 | self._current_page = self.Page() 756 | 757 | def __require_last_char_data_to_be_set(self, name: str) -> None: 758 | if not self._last_char_data: 759 | raise SitemapXMLParsingException( 760 | "Character data is expected to be set at the end of <{}>.".format(name) 761 | ) 762 | 763 | def xml_element_end(self, name: str) -> None: 764 | 765 | # If within <item> already 766 | if self._current_page: 767 | 768 | if name == 'item': 769 | if self._current_page not in self._pages: 770 | self._pages.append(self._current_page) 771 | self._current_page = None 772 | 773 | else: 774 | 775 | if name == 'link': 776 | # Every entry must have <link> 777 | self.__require_last_char_data_to_be_set(name=name) 778 | self._current_page.link = self._last_char_data 779 | 780 | elif name == 'title': 781 | # Title (if set) can't be empty 782 | self.__require_last_char_data_to_be_set(name=name) 783 | self._current_page.title 
= self._last_char_data 784 | 785 | elif name == 'description': 786 | # Description (if set) can't be empty 787 | self.__require_last_char_data_to_be_set(name=name) 788 | self._current_page.description = self._last_char_data 789 | 790 | elif name == 'pubDate': 791 | # Element might be present but character data might be empty 792 | self._current_page.publication_date = self._last_char_data 793 | 794 | super().xml_element_end(name=name) 795 | 796 | def sitemap(self) -> AbstractSitemap: 797 | 798 | pages = [] 799 | 800 | for page_row in self._pages: 801 | page = page_row.page() 802 | if page: 803 | pages.append(page) 804 | 805 | pages_sitemap = PagesRSSSitemap(url=self._url, pages=pages) 806 | 807 | return pages_sitemap 808 | 809 | 810 | class PagesAtomSitemapParser(AbstractXMLSitemapParser): 811 | """ 812 | Pages Atom 0.3 / 1.0 sitemap parser. 813 | 814 | https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3 815 | https://www.ietf.org/rfc/rfc4287.txt 816 | http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html 817 | """ 818 | 819 | # FIXME merge with RSS parser class as there are too many similarities 820 | 821 | class Page(object): 822 | """Data class for holding various properties for a single Atom <entry> while parsing.""" 823 | 824 | __slots__ = [ 825 | 'link', 826 | 'title', 827 | 'description', 828 | 'publication_date', 829 | ] 830 | 831 | def __init__(self): 832 | self.link = None 833 | self.title = None 834 | self.description = None 835 | self.publication_date = None 836 | 837 | def __hash__(self): 838 | return hash(( 839 | # Hash only the URL 840 | self.link, 841 | )) 842 | 843 | def page(self) -> Optional[SitemapPage]: 844 | """Return constructed sitemap page if one has been completed, otherwise None.""" 845 | 846 | # Required 847 | link = html_unescape_strip(self.link) 848 | if not link: 849 | log.error("Link is unset") 850 | return None 851 | 852 | title = html_unescape_strip(self.title) 853 | description = html_unescape_strip(self.description) 854 | if not (title or description): 855 | log.error("Both title and description are unset") 856 | return None 857 | 858 | publication_date = html_unescape_strip(self.publication_date) 859 | if publication_date: 860 | publication_date = parse_rfc2822_date(publication_date) 861 | 862 | return SitemapPage( 863 | url=link, 864 | news_story=SitemapNewsStory( 865 | title=title or description, 866 | publish_date=publication_date, 867 | ), 868 | ) 869 | 870 | __slots__ = [ 871 | '_current_page', 872 | '_pages', 873 | '_last_link_rel_self_href', 874 | ] 875 | 876 | def __init__(self, url: str): 877 | super().__init__(url=url) 878 | 879 | self._current_page = None 880 | self._pages = [] 881 | self._last_link_rel_self_href = None 882 | 883 | def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None: 884 | 885 | super().xml_element_start(name=name, attrs=attrs) 886 | 887 | if name == 'entry': 888 | if self._current_page: 889 | raise SitemapXMLParsingException("Page is expected to be unset by <entry>.") 890 | self._current_page = self.Page() 891 | 892 | elif name == 'link': 893 | if self._current_page: 894 | if attrs.get('rel', 'self').lower() == 'self' or self._last_link_rel_self_href is None: 895 | self._last_link_rel_self_href = attrs.get('href', None) 896 | 897 | def __require_last_char_data_to_be_set(self, name: str) -> None: 898 | if not self._last_char_data: 899 | raise SitemapXMLParsingException( 900 | "Character data is expected to be set at the end of <{}>.".format(name) 901 | ) 902 | 903 | def xml_element_end(self, name: 
str) -> None: 904 | 905 | # If within <entry> already 906 | if self._current_page: 907 | 908 | if name == 'entry': 909 | 910 | if self._last_link_rel_self_href: 911 | self._current_page.link = self._last_link_rel_self_href 912 | self._last_link_rel_self_href = None 913 | 914 | if self._current_page not in self._pages: 915 | self._pages.append(self._current_page) 916 | 917 | self._current_page = None 918 | 919 | else: 920 | 921 | if name == 'title': 922 | # Title (if set) can't be empty 923 | self.__require_last_char_data_to_be_set(name=name) 924 | self._current_page.title = self._last_char_data 925 | 926 | elif name == 'tagline' or name == 'summary': 927 | # Description (if set) can't be empty 928 | self.__require_last_char_data_to_be_set(name=name) 929 | self._current_page.description = self._last_char_data 930 | 931 | elif name == 'issued' or name == 'published': 932 | # Element might be present but character data might be empty 933 | self._current_page.publication_date = self._last_char_data 934 | 935 | elif name == 'updated': 936 | # No 'issued' or 'published' were set before 937 | if not self._current_page.publication_date: 938 | self._current_page.publication_date = self._last_char_data 939 | 940 | super().xml_element_end(name=name) 941 | 942 | def sitemap(self) -> AbstractSitemap: 943 | 944 | pages = [] 945 | 946 | for page_row in self._pages: 947 | page = page_row.page() 948 | if page: 949 | pages.append(page) 950 | 951 | pages_sitemap = PagesAtomSitemap(url=self._url, pages=pages) 952 | 953 | return pages_sitemap 954 | -------------------------------------------------------------------------------- /tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import difflib 3 | import textwrap 4 | from decimal import Decimal 5 | from email.utils import format_datetime 6 | from unittest import TestCase 7 | 8 | import requests_mock 9 | from dateutil.tz import tzoffset 10 | 11 | from tests.helpers import gzip 12 | from usp.log import create_logger 13 | from usp.objects.page import ( 14 | SitemapPage, 15 | SitemapNewsStory, 16 | SitemapPageChangeFrequency, 17 | ) 18 | from usp.objects.sitemap import ( 19 | IndexRobotsTxtSitemap, 20 | PagesXMLSitemap, 21 | IndexXMLSitemap, 22 | InvalidSitemap, 23 | PagesTextSitemap, 24 | IndexWebsiteSitemap, 25 | PagesRSSSitemap, 26 | PagesAtomSitemap, 27 | ) 28 | from usp.tree import sitemap_tree_for_homepage 29 | 30 | # FIXME various exotic properties 31 | # FIXME XML vulnerabilities with Expat 32 | # FIXME max. 
recursion level 33 | # FIXME tests responses that are too big 34 | 35 | 36 | log = create_logger(__name__) 37 | 38 | 39 | class TestSitemapTree(TestCase): 40 | TEST_BASE_URL = 'http://test_ultimate-sitemap-parser.com' # mocked by HTTPretty 41 | 42 | # Publication / "last modified" date 43 | TEST_DATE_DATETIME = datetime.datetime( 44 | year=2009, month=12, day=17, hour=12, minute=4, second=56, 45 | tzinfo=tzoffset(None, 7200), 46 | ) 47 | TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat() 48 | """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps).""" 49 | 50 | TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME) 51 | """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps).""" 52 | 53 | TEST_PUBLICATION_NAME = 'Test publication' 54 | TEST_PUBLICATION_LANGUAGE = 'en' 55 | 56 | @staticmethod 57 | def fallback_to_404_not_found_matcher(request): 58 | """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress.""" 59 | return requests_mock.create_response( 60 | request, 61 | status_code=404, 62 | reason='Not Found', 63 | headers={'Content-Type': 'text/html'}, 64 | text="<h1>404 Not Found!</h1>", 65 | ) 66 | 67 | # noinspection DuplicatedCode 68 | def test_sitemap_tree_for_homepage(self): 69 | """Test sitemap_tree_for_homepage().""" 70 | 71 | with requests_mock.Mocker() as m: 72 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 73 | 74 | m.get( 75 | self.TEST_BASE_URL + '/', 76 | text='This is a homepage.', 77 | ) 78 | 79 | m.get( 80 | self.TEST_BASE_URL + '/robots.txt', 81 | headers={'Content-Type': 'text/plain'}, 82 | text=textwrap.dedent(""" 83 | User-agent: * 84 | Disallow: /whatever 85 | 86 | Sitemap: {base_url}/sitemap_pages.xml 87 | 88 | # Intentionally spelled as "Site-map" as Google tolerates this: 89 | # https://github.com/google/robotstxt/blob/master/robots.cc#L703 90 | Site-map: {base_url}/sitemap_news_index_1.xml 91 | """.format(base_url=self.TEST_BASE_URL)).strip(), 92 | ) 93 | 94 | # One sitemap for random static pages 95 | m.get( 96 | self.TEST_BASE_URL + '/sitemap_pages.xml', 97 | headers={'Content-Type': 'application/xml'}, 98 | text=textwrap.dedent(""" 99 | <?xml version="1.0" encoding="UTF-8"?> 100 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 101 | <url> 102 | <loc>{base_url}/about.html</loc> 103 | <lastmod>{last_modified_date}</lastmod> 104 | <changefreq>monthly</changefreq> 105 | <priority>0.8</priority> 106 | </url> 107 | <url> 108 | <loc>{base_url}/contact.html</loc> 109 | <lastmod>{last_modified_date}</lastmod> 110 | 111 | <!-- Invalid change frequency --> 112 | <changefreq>when we feel like it</changefreq> 113 | 114 | <!-- Invalid priority --> 115 | <priority>1.1</priority> 116 | 117 | </url> 118 | </urlset> 119 | """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(), 120 | ) 121 | 122 | # Index sitemap pointing to sitemaps with stories 123 | m.get( 124 | self.TEST_BASE_URL + '/sitemap_news_index_1.xml', 125 | headers={'Content-Type': 'application/xml'}, 126 | text=textwrap.dedent(""" 127 | <?xml version="1.0" encoding="UTF-8"?> 128 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 129 | <sitemap> 130 | <loc>{base_url}/sitemap_news_1.xml</loc> 131 | <lastmod>{last_modified}</lastmod> 132 | </sitemap> 133 | <sitemap> 134 | <loc>{base_url}/sitemap_news_index_2.xml</loc> 135 | <lastmod>{last_modified}</lastmod> 136 | </sitemap> 137 | </sitemapindex> 138 | """.format(base_url=self.TEST_BASE_URL, 
last_modified=self.TEST_DATE_STR_ISO8601)).strip(), 139 | ) 140 | 141 | # First sitemap with actual stories 142 | m.get( 143 | self.TEST_BASE_URL + '/sitemap_news_1.xml', 144 | headers={'Content-Type': 'application/xml'}, 145 | text=textwrap.dedent(""" 146 | <?xml version="1.0" encoding="UTF-8"?> 147 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 148 | xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" 149 | xmlns:xhtml="http://www.w3.org/1999/xhtml"> 150 | 151 | <url> 152 | <loc>{base_url}/news/foo.html</loc> 153 | 154 | <!-- Element present but empty --> 155 | <lastmod /> 156 | 157 | <!-- Some other XML namespace --> 158 | <xhtml:link rel="alternate" 159 | media="only screen and (max-width: 640px)" 160 | href="{base_url}/news/foo.html?mobile=1" /> 161 | 162 | <news:news> 163 | <news:publication> 164 | <news:name>{publication_name}</news:name> 165 | <news:language>{publication_language}</news:language> 166 | </news:publication> 167 | <news:publication_date>{publication_date}</news:publication_date> 168 | <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> 169 | </news:news> 170 | </url> 171 | 172 | <!-- Has a duplicate story in /sitemap_news_2.xml --> 173 | <url> 174 | <loc>{base_url}/news/bar.html</loc> 175 | <xhtml:link rel="alternate" 176 | media="only screen and (max-width: 640px)" 177 | href="{base_url}/news/bar.html?mobile=1" /> 178 | <news:news> 179 | <news:publication> 180 | <news:name>{publication_name}</news:name> 181 | <news:language>{publication_language}</news:language> 182 | </news:publication> 183 | <news:publication_date>{publication_date}</news:publication_date> 184 | <news:title>Bar & bar</news:title> 185 | </news:news> 186 | </url> 187 | 188 | </urlset> 189 | """.format( 190 | base_url=self.TEST_BASE_URL, 191 | publication_name=self.TEST_PUBLICATION_NAME, 192 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 193 | publication_date=self.TEST_DATE_STR_ISO8601, 194 | )).strip(), 195 | ) 196 | 197 | # Another index sitemap pointing to a second sitemaps with stories 198 | m.get( 199 | self.TEST_BASE_URL + '/sitemap_news_index_2.xml', 200 | headers={'Content-Type': 'application/xml'}, 201 | text=textwrap.dedent(""" 202 | <?xml version="1.0" encoding="UTF-8"?> 203 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 204 | 205 | <sitemap> 206 | <!-- Extra whitespace added around URL --> 207 | <loc> {base_url}/sitemap_news_2.xml </loc> 208 | <lastmod>{last_modified}</lastmod> 209 | </sitemap> 210 | 211 | <!-- Nonexistent sitemap --> 212 | <sitemap> 213 | <loc>{base_url}/sitemap_news_missing.xml</loc> 214 | <lastmod>{last_modified}</lastmod> 215 | </sitemap> 216 | 217 | </sitemapindex> 218 | """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(), 219 | ) 220 | 221 | # Second sitemap with actual stories 222 | m.get( 223 | self.TEST_BASE_URL + '/sitemap_news_2.xml', 224 | headers={'Content-Type': 'application/xml'}, 225 | text=textwrap.dedent(""" 226 | <?xml version="1.0" encoding="UTF-8"?> 227 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 228 | xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" 229 | xmlns:xhtml="http://www.w3.org/1999/xhtml"> 230 | 231 | <!-- Has a duplicate story in /sitemap_news_1.xml --> 232 | <url> 233 | <!-- Extra whitespace added around URL --> 234 | <loc> {base_url}/news/bar.html </loc> 235 | <xhtml:link rel="alternate" 236 | media="only screen and (max-width: 640px)" 237 | href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" 
/> 238 | <news:news> 239 | <news:publication> 240 | <news:name>{publication_name}</news:name> 241 | <news:language>{publication_language}</news:language> 242 | </news:publication> 243 | <news:publication_date>{publication_date}</news:publication_date> 244 | 245 | <tag_without_inner_character_data name="value" /> 246 | 247 | <news:title>Bar & bar</news:title> 248 | </news:news> 249 | </url> 250 | 251 | <url> 252 | <loc>{base_url}/news/baz.html</loc> 253 | <xhtml:link rel="alternate" 254 | media="only screen and (max-width: 640px)" 255 | href="{base_url}/news/baz.html?mobile=1" /> 256 | <news:news> 257 | <news:publication> 258 | <news:name>{publication_name}</news:name> 259 | <news:language>{publication_language}</news:language> 260 | </news:publication> 261 | <news:publication_date>{publication_date}</news:publication_date> 262 | <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> 263 | </news:news> 264 | </url> 265 | 266 | </urlset> 267 | """.format( 268 | base_url=self.TEST_BASE_URL, 269 | publication_name=self.TEST_PUBLICATION_NAME, 270 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 271 | publication_date=self.TEST_DATE_STR_ISO8601, 272 | )).strip(), 273 | ) 274 | 275 | # Nonexistent sitemap 276 | m.get( 277 | self.TEST_BASE_URL + '/sitemap_news_missing.xml', 278 | status_code=404, 279 | reason='Not Found', 280 | headers={'Content-Type': 'text/html'}, 281 | text="<h1>404 Not Found!</h1>", 282 | ) 283 | 284 | expected_sitemap_tree = IndexWebsiteSitemap( 285 | url='{}/'.format(self.TEST_BASE_URL), 286 | sub_sitemaps=[ 287 | IndexRobotsTxtSitemap( 288 | url='{}/robots.txt'.format(self.TEST_BASE_URL), 289 | sub_sitemaps=[ 290 | PagesXMLSitemap( 291 | url='{}/sitemap_pages.xml'.format(self.TEST_BASE_URL), 292 | pages=[ 293 | SitemapPage( 294 | url='{}/about.html'.format(self.TEST_BASE_URL), 295 | last_modified=self.TEST_DATE_DATETIME, 296 | news_story=None, 297 | change_frequency=SitemapPageChangeFrequency.MONTHLY, 298 | priority=Decimal('0.8'), 299 | ), 300 | SitemapPage( 301 | url='{}/contact.html'.format(self.TEST_BASE_URL), 302 | last_modified=self.TEST_DATE_DATETIME, 303 | news_story=None, 304 | 305 | # Invalid input -- should be reset to "always" 306 | change_frequency=SitemapPageChangeFrequency.ALWAYS, 307 | 308 | # Invalid input -- should be reset to 0.5 (the default as per the spec) 309 | priority=Decimal('0.5'), 310 | 311 | ) 312 | ], 313 | ), 314 | IndexXMLSitemap( 315 | url='{}/sitemap_news_index_1.xml'.format(self.TEST_BASE_URL), 316 | sub_sitemaps=[ 317 | PagesXMLSitemap( 318 | url='{}/sitemap_news_1.xml'.format(self.TEST_BASE_URL), 319 | pages=[ 320 | SitemapPage( 321 | url='{}/news/foo.html'.format(self.TEST_BASE_URL), 322 | news_story=SitemapNewsStory( 323 | title='Foo <foo>', 324 | publish_date=self.TEST_DATE_DATETIME, 325 | publication_name=self.TEST_PUBLICATION_NAME, 326 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 327 | ), 328 | ), 329 | SitemapPage( 330 | url='{}/news/bar.html'.format(self.TEST_BASE_URL), 331 | news_story=SitemapNewsStory( 332 | title='Bar & bar', 333 | publish_date=self.TEST_DATE_DATETIME, 334 | publication_name=self.TEST_PUBLICATION_NAME, 335 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 336 | ), 337 | ), 338 | ] 339 | ), 340 | IndexXMLSitemap( 341 | url='{}/sitemap_news_index_2.xml'.format(self.TEST_BASE_URL), 342 | sub_sitemaps=[ 343 | PagesXMLSitemap( 344 | url='{}/sitemap_news_2.xml'.format(self.TEST_BASE_URL), 345 | pages=[ 346 | SitemapPage( 347 | url='{}/news/bar.html'.format(self.TEST_BASE_URL), 348 | 
news_story=SitemapNewsStory( 349 | title='Bar & bar', 350 | publish_date=self.TEST_DATE_DATETIME, 351 | publication_name=self.TEST_PUBLICATION_NAME, 352 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 353 | ), 354 | ), 355 | SitemapPage( 356 | url='{}/news/baz.html'.format(self.TEST_BASE_URL), 357 | news_story=SitemapNewsStory( 358 | title='Bąž', 359 | publish_date=self.TEST_DATE_DATETIME, 360 | publication_name=self.TEST_PUBLICATION_NAME, 361 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 362 | ), 363 | ), 364 | ], 365 | ), 366 | InvalidSitemap( 367 | url='{}/sitemap_news_missing.xml'.format(self.TEST_BASE_URL), 368 | reason=( 369 | 'Unable to fetch sitemap from {base_url}/sitemap_news_missing.xml: ' 370 | '404 Not Found' 371 | ).format(base_url=self.TEST_BASE_URL), 372 | ), 373 | ], 374 | ), 375 | ], 376 | ), 377 | ], 378 | ) 379 | ] 380 | ) 381 | 382 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 383 | 384 | expected_lines = str(expected_sitemap_tree).split() 385 | actual_lines = str(actual_sitemap_tree).split() 386 | diff = difflib.ndiff(expected_lines, actual_lines) 387 | diff_str = '\n'.join(diff) 388 | 389 | assert expected_sitemap_tree == actual_sitemap_tree, diff_str 390 | 391 | assert len(list(actual_sitemap_tree.all_pages())) == 6 392 | 393 | def test_sitemap_tree_for_homepage_gzip(self): 394 | """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" 395 | 396 | with requests_mock.Mocker() as m: 397 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 398 | 399 | m.get( 400 | self.TEST_BASE_URL + '/', 401 | text='This is a homepage.', 402 | ) 403 | 404 | m.get( 405 | self.TEST_BASE_URL + '/robots.txt', 406 | headers={'Content-Type': 'text/plain'}, 407 | text=textwrap.dedent(""" 408 | User-agent: * 409 | Disallow: /whatever 410 | 411 | Sitemap: {base_url}/sitemap_1.gz 412 | Sitemap: {base_url}/sitemap_2.dat 413 | Sitemap: {base_url}/sitemap_3.xml.gz 414 | """.format(base_url=self.TEST_BASE_URL)).strip(), 415 | ) 416 | 417 | # Gzipped sitemap without correct HTTP header but with .gz extension 418 | m.get( 419 | self.TEST_BASE_URL + '/sitemap_1.gz', 420 | content=gzip(textwrap.dedent(""" 421 | <?xml version="1.0" encoding="UTF-8"?> 422 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 423 | xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> 424 | <url> 425 | <loc>{base_url}/news/foo.html</loc> 426 | <news:news> 427 | <news:publication> 428 | <news:name>{publication_name}</news:name> 429 | <news:language>{publication_language}</news:language> 430 | </news:publication> 431 | <news:publication_date>{publication_date}</news:publication_date> 432 | <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> 433 | </news:news> 434 | </url> 435 | </urlset> 436 | """.format( 437 | base_url=self.TEST_BASE_URL, 438 | publication_name=self.TEST_PUBLICATION_NAME, 439 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 440 | publication_date=self.TEST_DATE_STR_ISO8601, 441 | )).strip()), 442 | ) 443 | 444 | # Gzipped sitemap with correct HTTP header but without .gz extension 445 | m.get( 446 | self.TEST_BASE_URL + '/sitemap_2.dat', 447 | headers={'Content-Type': 'application/x-gzip'}, 448 | content=gzip(textwrap.dedent(""" 449 | <?xml version="1.0" encoding="UTF-8"?> 450 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 451 | xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> 452 | <url> 453 | <loc>{base_url}/news/bar.html</loc> 454 | <news:news> 455 | <news:publication> 456 
| <news:name>{publication_name}</news:name> 457 | <news:language>{publication_language}</news:language> 458 | </news:publication> 459 | <news:publication_date>{publication_date}</news:publication_date> 460 | <news:title><![CDATA[Bąr]]></news:title> <!-- CDATA and UTF-8 --> 461 | </news:news> 462 | </url> 463 | </urlset> 464 | """.format( 465 | base_url=self.TEST_BASE_URL, 466 | publication_name=self.TEST_PUBLICATION_NAME, 467 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 468 | publication_date=self.TEST_DATE_STR_ISO8601, 469 | )).strip()), 470 | ) 471 | 472 | # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't 473 | m.get( 474 | self.TEST_BASE_URL + '/sitemap_3.xml.gz', 475 | headers={'Content-Type': 'application/x-gzip'}, 476 | text=textwrap.dedent(""" 477 | <?xml version="1.0" encoding="UTF-8"?> 478 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 479 | xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> 480 | <url> 481 | <loc>{base_url}/news/baz.html</loc> 482 | <news:news> 483 | <news:publication> 484 | <news:name>{publication_name}</news:name> 485 | <news:language>{publication_language}</news:language> 486 | </news:publication> 487 | <news:publication_date>{publication_date}</news:publication_date> 488 | <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> 489 | </news:news> 490 | </url> 491 | </urlset> 492 | """.format( 493 | base_url=self.TEST_BASE_URL, 494 | publication_name=self.TEST_PUBLICATION_NAME, 495 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 496 | publication_date=self.TEST_DATE_STR_ISO8601, 497 | )).strip(), 498 | ) 499 | 500 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 501 | 502 | # Don't do an in-depth check, we just need to make sure that gunzip works 503 | assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) 504 | assert len(actual_sitemap_tree.sub_sitemaps) == 1 505 | 506 | assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) 507 | # noinspection PyUnresolvedReferences 508 | assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3 509 | 510 | # noinspection PyUnresolvedReferences 511 | sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] 512 | assert isinstance(sitemap_1, PagesXMLSitemap) 513 | assert len(sitemap_1.pages) == 1 514 | 515 | # noinspection PyUnresolvedReferences 516 | sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] 517 | assert isinstance(sitemap_2, PagesXMLSitemap) 518 | assert len(sitemap_2.pages) == 1 519 | 520 | # noinspection PyUnresolvedReferences 521 | sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2] 522 | assert isinstance(sitemap_3, PagesXMLSitemap) 523 | assert len(sitemap_3.pages) == 1 524 | 525 | def test_sitemap_tree_for_homepage_plain_text(self): 526 | """Test sitemap_tree_for_homepage() with plain text sitemaps.""" 527 | 528 | with requests_mock.Mocker() as m: 529 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 530 | 531 | m.get( 532 | self.TEST_BASE_URL + '/', 533 | text='This is a homepage.', 534 | ) 535 | 536 | m.get( 537 | self.TEST_BASE_URL + '/robots.txt', 538 | headers={'Content-Type': 'text/plain'}, 539 | text=textwrap.dedent(""" 540 | User-agent: * 541 | Disallow: /whatever 542 | 543 | Sitemap: {base_url}/sitemap_1.txt 544 | Sitemap: {base_url}/sitemap_2.txt.dat 545 | """.format(base_url=self.TEST_BASE_URL)).strip(), 546 | ) 547 | 548 | # Plain text uncompressed sitemap (no Content-Type header) 549 | 
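            # Note on the two plain-text sitemaps mocked below: a text sitemap is just one URL
            # per line, and lines that do not look like URLs (such as the "Some other stuff..."
            # line in sitemap_1.txt) are expected to be skipped, which is why that sitemap
            # yields 2 pages rather than 3 in the assertions further down. A rough sketch of
            # such filtering (illustrative only, not the library's actual implementation):
            #
            #     candidate_lines = (line.strip() for line in body.splitlines())
            #     urls = [line for line in candidate_lines
            #             if line.lower().startswith(('http://', 'https://'))]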
m.get( 550 | self.TEST_BASE_URL + '/sitemap_1.txt', 551 | text=textwrap.dedent(""" 552 | 553 | {base_url}/news/foo.html 554 | 555 | 556 | {base_url}/news/bar.html 557 | 558 | Some other stuff which totally doesn't look like an URL 559 | """.format(base_url=self.TEST_BASE_URL)).strip(), 560 | ) 561 | 562 | # Plain text compressed sitemap without .gz extension 563 | m.get( 564 | self.TEST_BASE_URL + '/sitemap_2.txt.dat', 565 | headers={'Content-Type': 'application/x-gzip'}, 566 | content=gzip(textwrap.dedent(""" 567 | {base_url}/news/bar.html 568 | {base_url}/news/baz.html 569 | """.format(base_url=self.TEST_BASE_URL)).strip()), 570 | ) 571 | 572 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 573 | 574 | assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap) 575 | assert len(actual_sitemap_tree.sub_sitemaps) == 1 576 | 577 | assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap) 578 | # noinspection PyUnresolvedReferences 579 | assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2 580 | 581 | # noinspection PyUnresolvedReferences 582 | sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0] 583 | assert isinstance(sitemap_1, PagesTextSitemap) 584 | assert len(sitemap_1.pages) == 2 585 | 586 | # noinspection PyUnresolvedReferences 587 | sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1] 588 | assert isinstance(sitemap_2, PagesTextSitemap) 589 | assert len(sitemap_2.pages) == 2 590 | 591 | pages = list(actual_sitemap_tree.all_pages()) 592 | assert len(pages) == 4 593 | assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages 594 | assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages 595 | assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages 596 | 597 | # noinspection DuplicatedCode 598 | def test_sitemap_tree_for_homepage_rss_atom(self): 599 | """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" 600 | 601 | with requests_mock.Mocker() as m: 602 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 603 | 604 | m.get( 605 | self.TEST_BASE_URL + '/', 606 | text='This is a homepage.', 607 | ) 608 | 609 | m.get( 610 | self.TEST_BASE_URL + '/robots.txt', 611 | headers={'Content-Type': 'text/plain'}, 612 | text=textwrap.dedent(""" 613 | User-agent: * 614 | Disallow: /whatever 615 | 616 | Sitemap: {base_url}/sitemap_rss.xml 617 | Sitemap: {base_url}/sitemap_atom_0_3.xml 618 | Sitemap: {base_url}/sitemap_atom_1_0.xml 619 | """.format(base_url=self.TEST_BASE_URL)).strip(), 620 | ) 621 | 622 | # RSS 2.0 sitemap 623 | m.get( 624 | self.TEST_BASE_URL + '/sitemap_rss.xml', 625 | headers={'Content-Type': 'application/rss+xml'}, 626 | text=textwrap.dedent(""" 627 | <?xml version="1.0" encoding="UTF-8"?> 628 | <rss version="2.0"> 629 | <channel> 630 | <title>Test RSS 2.0 feed 631 | This is a test RSS 2.0 feed. 632 | {base_url} 633 | {pub_date} 634 | 635 | 636 | Test RSS 2.0 story #1 637 | This is a test RSS 2.0 story #1. 638 | {base_url}/rss_story_1.html 639 | {base_url}/rss_story_1.html 640 | {pub_date} 641 | 642 | 643 | 644 | Test RSS 2.0 story #2 645 | This is a test RSS 2.0 story #2. 
646 | {base_url}/rss_story_2.html 647 | {base_url}/rss_story_2.html 648 | {pub_date} 649 | 650 | 651 | 652 | 653 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(), 654 | ) 655 | 656 | # Atom 0.3 sitemap 657 | m.get( 658 | self.TEST_BASE_URL + '/sitemap_atom_0_3.xml', 659 | headers={'Content-Type': 'application/atom+xml'}, 660 | text=textwrap.dedent(""" 661 | 662 | 663 | Test Atom 0.3 feed 664 | 665 | {pub_date} 666 | 667 | 668 | Test Atom 0.3 story #1 669 | 670 | {base_url}/atom_0_3_story_1.html 671 | {pub_date} 672 | 673 | 674 | 675 | Test Atom 0.3 story #2 676 | 677 | {base_url}/atom_0_3_story_2.html 678 | {pub_date} 679 | 680 | 681 | 682 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), 683 | ) 684 | 685 | # Atom 1.0 sitemap 686 | m.get( 687 | self.TEST_BASE_URL + '/sitemap_atom_1_0.xml', 688 | headers={'Content-Type': 'application/atom+xml'}, 689 | text=textwrap.dedent(""" 690 | 691 | 692 | Test Atom 1.0 feed 693 | This is a test Atom 1.0 feed. 694 | 695 | 696 | {base_url} 697 | {pub_date} 698 | 699 | 700 | Test Atom 1.0 story #1 701 | 702 | 703 | 704 | {base_url}/atom_1_0_story_1.html 705 | {pub_date} 706 | This is test atom 1.0 story #1. 707 | 708 |
709 | This is test atom 1.0 story #1. 710 |  711 |  712 |  713 | John Doe 714 | johndoe@example.com 715 |  716 | 
717 | 718 | 719 | Test Atom 1.0 story #2 720 | 721 | 722 | 723 | {base_url}/atom_1_0_story_2.html 724 | {pub_date} 725 | This is test atom 1.0 story #2. 726 | 727 |
728 | This is test atom 1.0 story #2. 729 |  730 |  731 |  732 |  733 | John Doe 734 | johndoe@example.com 735 |  736 | 
738 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), 739 | ) 740 | 741 | expected_sitemap_tree = IndexWebsiteSitemap( 742 | url='{}/'.format(self.TEST_BASE_URL), 743 | sub_sitemaps=[ 744 | IndexRobotsTxtSitemap( 745 | url='{}/robots.txt'.format(self.TEST_BASE_URL), 746 | sub_sitemaps=[ 747 | PagesRSSSitemap( 748 | url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL), 749 | pages=[ 750 | SitemapPage( 751 | url='{}/rss_story_1.html'.format(self.TEST_BASE_URL), 752 | news_story=SitemapNewsStory( 753 | title='Test RSS 2.0 story #1', 754 | publish_date=self.TEST_DATE_DATETIME, 755 | ), 756 | ), 757 | SitemapPage( 758 | url='{}/rss_story_2.html'.format(self.TEST_BASE_URL), 759 | news_story=SitemapNewsStory( 760 | title='Test RSS 2.0 story #2', 761 | publish_date=self.TEST_DATE_DATETIME, 762 | ) 763 | ) 764 | ] 765 | ), 766 | PagesAtomSitemap( 767 | url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL), 768 | pages=[ 769 | SitemapPage( 770 | url='{}/atom_0_3_story_1.html'.format(self.TEST_BASE_URL), 771 | news_story=SitemapNewsStory( 772 | title='Test Atom 0.3 story #1', 773 | publish_date=self.TEST_DATE_DATETIME, 774 | ), 775 | ), 776 | SitemapPage( 777 | url='{}/atom_0_3_story_2.html'.format(self.TEST_BASE_URL), 778 | news_story=SitemapNewsStory( 779 | title='Test Atom 0.3 story #2', 780 | publish_date=self.TEST_DATE_DATETIME, 781 | ) 782 | ) 783 | ] 784 | ), 785 | PagesAtomSitemap( 786 | url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL), 787 | pages=[ 788 | SitemapPage( 789 | url='{}/atom_1_0_story_1.html'.format(self.TEST_BASE_URL), 790 | news_story=SitemapNewsStory( 791 | title='Test Atom 1.0 story #1', 792 | publish_date=self.TEST_DATE_DATETIME, 793 | ), 794 | ), 795 | SitemapPage( 796 | url='{}/atom_1_0_story_2.html'.format(self.TEST_BASE_URL), 797 | news_story=SitemapNewsStory( 798 | title='Test Atom 1.0 story #2', 799 | publish_date=self.TEST_DATE_DATETIME, 800 | ) 801 | ) 802 | ] 803 | ), 804 | ] 805 | ) 806 | ] 807 | ) 808 | 809 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 810 | 811 | expected_lines = str(expected_sitemap_tree).split() 812 | actual_lines = str(actual_sitemap_tree).split() 813 | diff = difflib.ndiff(expected_lines, actual_lines) 814 | diff_str = '\n'.join(diff) 815 | 816 | assert expected_sitemap_tree == actual_sitemap_tree, diff_str 817 | 818 | assert len(list(actual_sitemap_tree.all_pages())) == 6 819 | 820 | def test_sitemap_tree_for_homepage_rss_atom_empty(self): 821 | """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds.""" 822 | 823 | with requests_mock.Mocker() as m: 824 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 825 | 826 | m.get( 827 | self.TEST_BASE_URL + '/', 828 | text='This is a homepage.', 829 | ) 830 | 831 | m.get( 832 | self.TEST_BASE_URL + '/robots.txt', 833 | headers={'Content-Type': 'text/plain'}, 834 | text=textwrap.dedent(""" 835 | User-agent: * 836 | Disallow: /whatever 837 | 838 | Sitemap: {base_url}/sitemap_rss.xml 839 | Sitemap: {base_url}/sitemap_atom_0_3.xml 840 | Sitemap: {base_url}/sitemap_atom_1_0.xml 841 | """.format(base_url=self.TEST_BASE_URL)).strip(), 842 | ) 843 | 844 | # RSS 2.0 sitemap 845 | m.get( 846 | self.TEST_BASE_URL + '/sitemap_rss.xml', 847 | headers={'Content-Type': 'application/rss+xml'}, 848 | text=textwrap.dedent(""" 849 | 850 | 851 | 852 | Test RSS 2.0 feed 853 | This is a test RSS 2.0 feed. 
854 | {base_url} 855 | {pub_date} 856 | 857 | 858 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(), 859 | ) 860 | 861 | # Atom 0.3 sitemap 862 | m.get( 863 | self.TEST_BASE_URL + '/sitemap_atom_0_3.xml', 864 | headers={'Content-Type': 'application/atom+xml'}, 865 | text=textwrap.dedent(""" 866 | 867 | 868 | Test Atom 0.3 feed 869 | 870 | {pub_date} 871 | 872 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), 873 | ) 874 | 875 | # Atom 1.0 sitemap 876 | m.get( 877 | self.TEST_BASE_URL + '/sitemap_atom_1_0.xml', 878 | headers={'Content-Type': 'application/atom+xml'}, 879 | text=textwrap.dedent(""" 880 | 881 | 882 | Test Atom 1.0 feed 883 | This is a test Atom 1.0 feed. 884 | 885 | 886 | {base_url} 887 | {pub_date} 888 | 889 | """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(), 890 | ) 891 | 892 | expected_sitemap_tree = IndexWebsiteSitemap( 893 | url='{}/'.format(self.TEST_BASE_URL), 894 | sub_sitemaps=[ 895 | IndexRobotsTxtSitemap( 896 | url='{}/robots.txt'.format(self.TEST_BASE_URL), 897 | sub_sitemaps=[ 898 | PagesRSSSitemap( 899 | url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL), 900 | pages=[] 901 | ), 902 | PagesAtomSitemap( 903 | url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL), 904 | pages=[] 905 | ), 906 | PagesAtomSitemap( 907 | url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL), 908 | pages=[] 909 | ), 910 | ] 911 | ) 912 | ] 913 | ) 914 | 915 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 916 | 917 | assert expected_sitemap_tree == actual_sitemap_tree 918 | 919 | assert len(list(actual_sitemap_tree.all_pages())) == 0 920 | 921 | def test_sitemap_tree_for_homepage_prematurely_ending_xml(self): 922 | """Test sitemap_tree_for_homepage() with clipped XML. 923 | 924 | Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the 925 | server is unable to generate and compress a 50 MB sitemap XML. Google News doesn't seem to have a problem with 926 | this behavior, so we have to support this too. 
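        In practice this means the fetched body simply stops mid-element. An illustrative
        (made-up) example of such a clipped response:

            <url>
                <loc>http://www.example.com/news/first.html</loc>
                <news:news>
                    <news:publication>
                        <news:name>Example Publi

        The parser is expected to keep the pages that were fully read before the cut-off
        point instead of discarding the whole sitemap.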
927 | """ 928 | 929 | with requests_mock.Mocker() as m: 930 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 931 | 932 | m.get( 933 | self.TEST_BASE_URL + '/', 934 | text='This is a homepage.', 935 | ) 936 | 937 | m.get( 938 | self.TEST_BASE_URL + '/robots.txt', 939 | headers={'Content-Type': 'text/plain'}, 940 | text=textwrap.dedent(""" 941 | User-agent: * 942 | Disallow: /whatever 943 | 944 | Sitemap: {base_url}/sitemap.xml 945 | """.format(base_url=self.TEST_BASE_URL)).strip(), 946 | ) 947 | 948 | m.get( 949 | self.TEST_BASE_URL + '/sitemap.xml', 950 | text=textwrap.dedent(""" 951 | 952 | 954 | 955 | {base_url}/news/first.html 956 | 957 | 958 | {publication_name} 959 | {publication_language} 960 | 961 | {publication_date} 962 | First story 963 | 964 | 965 | 966 | {base_url}/news/second.html 967 | 968 | 969 | {publication_name} 970 | {publication_language} 971 | 972 | {publication_date} 973 | Second story 974 | 975 | 976 | 977 | 978 | 979 | {base_url}/news/third.html 980 | 981 | 982 | {publication_name} 983 | {publication_language} 984 | 985 | 1069 | 1070 | 1071 | {base_url}/news/public.html 1072 | 1073 | 1074 | """.format( 1075 | base_url=self.TEST_BASE_URL, 1076 | publication_name=self.TEST_PUBLICATION_NAME, 1077 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 1078 | publication_date=self.TEST_DATE_STR_ISO8601, 1079 | )).strip(), 1080 | ) 1081 | 1082 | # Private sitemap (to be discovered by trying out a few paths) 1083 | m.get( 1084 | self.TEST_BASE_URL + '/sitemap_index.xml', 1085 | text=textwrap.dedent(""" 1086 | 1087 | 1088 | 1089 | {base_url}/news/private.html 1090 | 1091 | 1092 | """.format( 1093 | base_url=self.TEST_BASE_URL, 1094 | publication_name=self.TEST_PUBLICATION_NAME, 1095 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 1096 | publication_date=self.TEST_DATE_STR_ISO8601, 1097 | )).strip(), 1098 | ) 1099 | 1100 | expected_sitemap_tree = IndexWebsiteSitemap( 1101 | url='{}/'.format(self.TEST_BASE_URL), 1102 | sub_sitemaps=[ 1103 | IndexRobotsTxtSitemap( 1104 | url='{}/robots.txt'.format(self.TEST_BASE_URL), 1105 | sub_sitemaps=[ 1106 | PagesXMLSitemap( 1107 | url='{}/sitemap_public.xml'.format(self.TEST_BASE_URL), 1108 | pages=[ 1109 | SitemapPage( 1110 | url='{}/news/public.html'.format(self.TEST_BASE_URL), 1111 | ), 1112 | ], 1113 | ), 1114 | ], 1115 | ), 1116 | PagesXMLSitemap( 1117 | url='{}/sitemap_index.xml'.format(self.TEST_BASE_URL), 1118 | pages=[ 1119 | SitemapPage( 1120 | url='{}/news/private.html'.format(self.TEST_BASE_URL), 1121 | ), 1122 | ], 1123 | ), 1124 | ] 1125 | ) 1126 | 1127 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1128 | 1129 | assert expected_sitemap_tree == actual_sitemap_tree 1130 | 1131 | def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self): 1132 | """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt.""" 1133 | 1134 | with requests_mock.Mocker() as m: 1135 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 1136 | 1137 | m.get( 1138 | self.TEST_BASE_URL + '/', 1139 | text='This is a homepage.', 1140 | ) 1141 | 1142 | m.get( 1143 | self.TEST_BASE_URL + '/robots.txt', 1144 | headers={'Content-Type': ''}, 1145 | text=textwrap.dedent(""" 1146 | User-agent: * 1147 | Disallow: /whatever 1148 | """.format(base_url=self.TEST_BASE_URL)).strip(), 1149 | ) 1150 | 1151 | expected_sitemap_tree = IndexWebsiteSitemap( 1152 | url='{}/'.format(self.TEST_BASE_URL), 1153 | sub_sitemaps=[ 1154 | IndexRobotsTxtSitemap( 1155 | 
url='{}/robots.txt'.format(self.TEST_BASE_URL), 1156 | sub_sitemaps=[], 1157 | ) 1158 | ] 1159 | ) 1160 | 1161 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1162 | 1163 | assert expected_sitemap_tree == actual_sitemap_tree 1164 | 1165 | def test_sitemap_tree_for_homepage_no_robots_txt(self): 1166 | """Test sitemap_tree_for_homepage() with no robots.txt.""" 1167 | 1168 | with requests_mock.Mocker() as m: 1169 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 1170 | 1171 | m.get( 1172 | self.TEST_BASE_URL + '/', 1173 | text='This is a homepage.', 1174 | ) 1175 | 1176 | # Nonexistent robots.txt 1177 | m.get( 1178 | self.TEST_BASE_URL + '/robots.txt', 1179 | status_code=404, 1180 | reason='Not Found', 1181 | headers={'Content-Type': 'text/html'}, 1182 | text="

<h1>404 Not Found!</h1>
", 1183 | ) 1184 | 1185 | expected_sitemap_tree = IndexWebsiteSitemap( 1186 | url='{}/'.format(self.TEST_BASE_URL), 1187 | sub_sitemaps=[ 1188 | InvalidSitemap( 1189 | url='{}/robots.txt'.format(self.TEST_BASE_URL), 1190 | reason=( 1191 | 'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found' 1192 | ).format(base_url=self.TEST_BASE_URL), 1193 | ) 1194 | ] 1195 | ) 1196 | 1197 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1198 | 1199 | assert expected_sitemap_tree == actual_sitemap_tree 1200 | 1201 | def test_sitemap_tree_for_homepage_huge_sitemap(self): 1202 | """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).""" 1203 | 1204 | page_count = 1000 1205 | 1206 | sitemap_xml = """ 1207 | 1210 | """ 1211 | for x in range(page_count): 1212 | sitemap_xml += """ 1213 | 1214 | {base_url}/news/page_{x}.html 1215 | 1216 | 1217 | 1218 | 1219 | 1220 | 1223 | 1224 | 1225 | 1226 | {publication_name} 1227 | {publication_language} 1228 | 1229 | {publication_date} 1230 | Foo <foo> 1231 | 1232 | 1233 | """.format( 1234 | x=x, 1235 | base_url=self.TEST_BASE_URL, 1236 | publication_name=self.TEST_PUBLICATION_NAME, 1237 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 1238 | publication_date=self.TEST_DATE_STR_ISO8601, 1239 | ) 1240 | 1241 | sitemap_xml += "" 1242 | 1243 | with requests_mock.Mocker() as m: 1244 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 1245 | 1246 | m.get( 1247 | self.TEST_BASE_URL + '/', 1248 | text='This is a homepage.', 1249 | ) 1250 | 1251 | m.get( 1252 | self.TEST_BASE_URL + '/robots.txt', 1253 | headers={'Content-Type': 'text/plain'}, 1254 | text=textwrap.dedent(""" 1255 | User-agent: * 1256 | Disallow: /whatever 1257 | 1258 | Sitemap: {base_url}/sitemap.xml.gz 1259 | """.format(base_url=self.TEST_BASE_URL)).strip(), 1260 | ) 1261 | 1262 | m.get( 1263 | self.TEST_BASE_URL + '/sitemap.xml.gz', 1264 | headers={'Content-Type': 'application/x-gzip'}, 1265 | content=gzip(sitemap_xml), 1266 | ) 1267 | 1268 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1269 | 1270 | assert len(list(actual_sitemap_tree.all_pages())) == page_count 1271 | 1272 | def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self): 1273 | """Test sitemap_tree_for_homepage() with weird (but valid) spacing.""" 1274 | 1275 | with requests_mock.Mocker() as m: 1276 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 1277 | 1278 | m.get( 1279 | self.TEST_BASE_URL + '/', 1280 | text='This is a homepage.', 1281 | ) 1282 | 1283 | robots_txt_body = "" 1284 | robots_txt_body += "User-agent: *\n" 1285 | # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL 1286 | robots_txt_body += " Sitemap:{base_url}/sitemap.xml ".format(base_url=self.TEST_BASE_URL) 1287 | 1288 | m.get( 1289 | self.TEST_BASE_URL + '/robots.txt', 1290 | headers={'Content-Type': 'text/plain'}, 1291 | text=robots_txt_body, 1292 | ) 1293 | 1294 | m.get( 1295 | self.TEST_BASE_URL + '/sitemap.xml', 1296 | text=textwrap.dedent(""" 1297 | 1298 | 1300 | 1301 | {base_url}/news/first.html 1302 | 1303 | 1304 | {publication_name} 1305 | {publication_language} 1306 | 1307 | {publication_date} 1308 | First story 1309 | 1310 | 1311 | 1312 | """.format( 1313 | base_url=self.TEST_BASE_URL, 1314 | publication_name=self.TEST_PUBLICATION_NAME, 1315 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 1316 | publication_date=self.TEST_DATE_STR_ISO8601, 1317 | )).strip(), 1318 | ) 1319 
| 1320 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1321 | assert len(list(actual_sitemap_tree.all_pages())) == 1 1322 | 1323 | def test_sitemap_tree_for_homepage_utf8_bom(self): 1324 | """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap.""" 1325 | 1326 | robots_txt_body = textwrap.dedent(""" 1327 | User-agent: * 1328 | Disallow: /whatever 1329 | 1330 | Sitemap: {base_url}/sitemap.xml 1331 | """.format(base_url=self.TEST_BASE_URL)).strip() 1332 | 1333 | sitemap_xml_body = textwrap.dedent(""" 1334 | 1335 | 1337 | 1338 | {base_url}/news/first.html 1339 | 1340 | 1341 | {publication_name} 1342 | {publication_language} 1343 | 1344 | {publication_date} 1345 | First story 1346 | 1347 | 1348 | 1349 | """.format( 1350 | base_url=self.TEST_BASE_URL, 1351 | publication_name=self.TEST_PUBLICATION_NAME, 1352 | publication_language=self.TEST_PUBLICATION_LANGUAGE, 1353 | publication_date=self.TEST_DATE_STR_ISO8601, 1354 | )).strip() 1355 | 1356 | robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig') 1357 | sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig') 1358 | 1359 | with requests_mock.Mocker() as m: 1360 | m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher) 1361 | 1362 | m.get( 1363 | self.TEST_BASE_URL + '/', 1364 | text='This is a homepage.', 1365 | ) 1366 | 1367 | m.get( 1368 | self.TEST_BASE_URL + '/robots.txt', 1369 | headers={'Content-Type': 'text/plain'}, 1370 | content=robots_txt_body_encoded, 1371 | ) 1372 | 1373 | m.get( 1374 | self.TEST_BASE_URL + '/sitemap.xml', 1375 | content=sitemap_xml_body_encoded, 1376 | ) 1377 | 1378 | actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL) 1379 | assert len(list(actual_sitemap_tree.all_pages())) == 1 1380 | --------------------------------------------------------------------------------
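For reference, the usage pattern exercised throughout the tests above is roughly the following
(a minimal sketch; it assumes the public entry point is usp.tree.sitemap_tree_for_homepage, which
is what these tests call, and that the homepage is reachable over the network):

    from usp.tree import sitemap_tree_for_homepage

    # Fetches robots.txt, follows every "Sitemap:" entry in it (XML sitemaps, sitemap
    # indexes, plain-text, RSS/Atom, gzipped or not), and also probes a few well-known
    # sitemap paths as a fallback, returning the whole tree of (sub)sitemaps.
    tree = sitemap_tree_for_homepage(homepage_url='https://www.example.com/')

    # all_pages() flattens every sub-sitemap into a single iterable of SitemapPage objects.
    for page in tree.all_pages():
        print(page.url, page.last_modified, page.priority)
        if page.news_story:
            print('  news:', page.news_story.title, page.news_story.publish_date)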