├── tests ├── __init__.py ├── crawler │ ├── __init__.py │ └── lib_test.py └── feed_spider │ ├── __init__.py │ ├── favicon_test.py │ ├── link_filter_test.py │ └── feed_info_parser_test.py ├── feedsearch_crawler ├── crawler │ ├── item.py │ ├── item_parser.py │ ├── __init__.py │ ├── duplicatefilter.py │ ├── queueable.py │ ├── trace.py │ ├── response.py │ ├── lib.py │ ├── request.py │ └── crawler.py ├── feed_spider │ ├── __init__.py │ ├── dupefilter.py │ ├── site_meta.py │ ├── regexes.py │ ├── favicon.py │ ├── feed_info.py │ ├── lib.py │ ├── site_meta_parser.py │ ├── link_filter.py │ ├── spider.py │ └── feed_info_parser.py └── __init__.py ├── LICENSE ├── pyproject.toml ├── .gitignore ├── .github └── workflows │ └── codeql-analysis.yml ├── app.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/feed_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/item.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | 4 | class Item(ABC): 5 | ignore_item = False 6 | 7 | def __init__(self, **kwargs): 8 | for k in kwargs.keys(): 9 | if hasattr(self, k): 10 | self.__setattr__(k, kwargs[k]) 11 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/__init__.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 2 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 3 | from feedsearch_crawler.feed_spider.spider import FeedsearchSpider 4 | 5 | __all__ = ["FeedsearchSpider", "FeedInfo", "SiteMeta"] 6 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/item_parser.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from types import AsyncGeneratorType 3 | from typing import Union 4 | 5 | from feedsearch_crawler.crawler.item import Item 6 | from feedsearch_crawler.crawler.request import Request 7 | from feedsearch_crawler.crawler.response import Response 8 | 9 | 10 | class ItemParser(ABC): 11 | def __init__(self, crawler): 12 | self.crawler = crawler 13 | self.follow = crawler.follow 14 | 15 | @abstractmethod 16 | async def parse_item( 17 | self, request: Request, response: Response, *args, **kwargs 18 | ) -> Union[Item, AsyncGeneratorType]: 19 | raise NotImplementedError("Not Implemented") 20 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.crawler.crawler import Crawler 2 | from feedsearch_crawler.crawler.duplicatefilter import DuplicateFilter 3 | from feedsearch_crawler.crawler.item import Item 4 | from feedsearch_crawler.crawler.item_parser import ItemParser 5 | from feedsearch_crawler.crawler.lib import ( 6 | to_string, 7 | to_bytes, 8 | coerce_url, 9 | 
CallbackResult, 10 | ) 11 | from feedsearch_crawler.crawler.request import Request 12 | from feedsearch_crawler.crawler.response import Response 13 | 14 | __all__ = [ 15 | "Crawler", 16 | "Item", 17 | "ItemParser", 18 | "DuplicateFilter", 19 | "Request", 20 | "Response", 21 | "to_bytes", 22 | "to_string", 23 | "coerce_url", 24 | "CallbackResult", 25 | ] 26 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/dupefilter.py: -------------------------------------------------------------------------------- 1 | from w3lib.url import url_query_cleaner, canonicalize_url 2 | from yarl import URL 3 | 4 | from feedsearch_crawler.crawler import DuplicateFilter 5 | 6 | 7 | class NoQueryDupeFilter(DuplicateFilter): 8 | valid_keys = ["feedformat", "feed", "rss", "atom", "jsonfeed", "format", "podcast"] 9 | 10 | def parse_url(self, url: URL) -> str: 11 | # Keep the query strings if they might be feed strings. 12 | # Wikipedia for example uses query strings to differentiate feeds. 13 | if any(key in url.query for key in self.valid_keys): 14 | return canonicalize_url(str(url)) 15 | 16 | # Canonicalizing the URL is about 4x slower, but worth it to prevent duplicate requests. 17 | return canonicalize_url(url_query_cleaner(str(url))) 18 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/site_meta.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from yarl import URL 4 | 5 | from feedsearch_crawler.crawler import Item 6 | 7 | 8 | class SiteMeta(Item): 9 | url: URL = None 10 | site_url: str = "" 11 | site_name: str = "" 12 | icon_url: URL = None 13 | icon_data_uri: str = "" 14 | possible_icons: List = [] 15 | host: str = "" 16 | 17 | def __init__(self, url: URL, **kwargs) -> None: 18 | super().__init__(**kwargs) 19 | self.url = url 20 | 21 | def serialize(self): 22 | return dict( 23 | url=str(self.url), site_name=self.site_name, icon_url=str(self.icon_url) 24 | ) 25 | 26 | def __eq__(self, other): 27 | return isinstance(other, self.__class__) and self.url == other.url 28 | 29 | def __hash__(self): 30 | return hash(self.url) 31 | 32 | def __repr__(self): 33 | return f"{self.__class__.__name__}({str(self.url)})" 34 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/regexes.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # Regex to check if possible RSS data. 4 | rss_regex = re.compile("( bool: 24 | """ 25 | Check that the Favicon site_host is a match for the host. 
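        A match requires that both url and site_host are set and that site_host is contained within the given host string; if requires_data_uri is True the Favicon must also have a data_uri.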
26 | 27 | :param host: domain host url string 28 | :param requires_data_uri: Whether the Favicon is required to have a data_uri 29 | :return: bool 30 | """ 31 | return ( 32 | self.url 33 | and self.site_host 34 | and self.site_host in host 35 | and (self.data_uri if requires_data_uri else True) 36 | ) 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feedsearch-crawler" 3 | version = "1.0.3" 4 | description = "Search sites for RSS, Atom, and JSON feeds" 5 | authors = ["David Beath "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/DBeath/feedsearch-crawler" 9 | keywords = [ 10 | "RSS", 11 | "Search", 12 | "Crawler", 13 | "Feeds", 14 | "Atom" 15 | ] 16 | classifiers = [ 17 | "License :: OSI Approved :: MIT License", 18 | "Intended Audience :: Developers", 19 | "Development Status :: 5 - Production/Stable", 20 | "Natural Language :: English", 21 | "Operating System :: POSIX", 22 | "Operating System :: Microsoft :: Windows", 23 | "Operating System :: MacOS :: MacOS X", 24 | "Programming Language :: Python :: 3.7", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 28 | "Typing :: Typed", 29 | "Framework :: AsyncIO", 30 | ] 31 | packages = [ 32 | { include = "feedsearch_crawler" }, 33 | ] 34 | 35 | [tool.poetry.dependencies] 36 | python = "^3.8" 37 | aiohttp = "^3.7.4" 38 | beautifulsoup4 = "^4.9.3" 39 | cchardet = "^2.1.7" 40 | aiodns = "^2.0.0" 41 | uvloop = "^0.15.2" 42 | w3lib = "^1.22.0" 43 | feedparser = "^6.0.10" 44 | brotlipy = "^0.7.0" 45 | python-dateutil = "^2.8.1" 46 | yarl = "^1.6.3" 47 | 48 | [tool.poetry.dev-dependencies] 49 | twine = "*" 50 | pytest = "*" 51 | 52 | [build-system] 53 | requires = ["poetry-core>=1.0.0"] 54 | build-backend = "poetry.core.masonry.api" 55 | -------------------------------------------------------------------------------- /tests/feed_spider/favicon_test.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.feed_spider.favicon import Favicon 2 | 3 | 4 | def test_matches_host(): 5 | favicon = Favicon( 6 | site_host="test.com", 7 | url="test.com/favicon.ico", 8 | priority=1, 9 | data_uri="data_uri", 10 | ) 11 | assert favicon.matches_host("test.com") 12 | 13 | 14 | def test_matches_host_no_match(): 15 | favicon = Favicon( 16 | site_host="test.com", 17 | url="test.com/favicon.ico", 18 | priority=1, 19 | data_uri="data_uri", 20 | ) 21 | assert not favicon.matches_host("test2.com") 22 | 23 | 24 | def test_matches_host_no_site_host(): 25 | favicon = Favicon( 26 | site_host="", 27 | url="test.com/favicon.ico", 28 | priority=1, 29 | data_uri="data_uri", 30 | ) 31 | assert not favicon.matches_host("test2.com") 32 | 33 | 34 | def test_matches_host_data_uri(): 35 | favicon = Favicon( 36 | site_host="test.com", 37 | url="test.com/favicon.ico", 38 | priority=1, 39 | data_uri="data_uri", 40 | ) 41 | assert favicon.matches_host("test.com", requires_data_uri=True) 42 | 43 | 44 | def test_matches_host_no_data_uri(): 45 | favicon = Favicon( 46 | site_host="test.com", 47 | url="test.com/favicon.ico", 48 | priority=1, 49 | ) 50 | assert not favicon.matches_host("test.com", requires_data_uri=True) 51 | 52 | 53 | def test_matches_host_no_url(): 54 | favicon = Favicon(site_host="test.com", priority=1, data_uri="data_uri") 55 | 
assert not favicon.matches_host("test.com", requires_data_uri=True) 56 | -------------------------------------------------------------------------------- /tests/crawler/lib_test.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.crawler.lib import coerce_url, is_same_domain 2 | from yarl import URL 3 | 4 | 5 | def test_coerce_url(): 6 | assert coerce_url("test.com") == URL("http://test.com") 7 | assert coerce_url("https://test.com") == URL("https://test.com") 8 | assert coerce_url(" https://test.com") == URL("https://test.com") 9 | assert coerce_url("test.com/path/path2") == URL("http://test.com/path/path2") 10 | 11 | assert coerce_url("test.com", https=True) == URL("https://test.com") 12 | assert coerce_url("https://test.com", https=True) == URL("https://test.com") 13 | assert coerce_url(" https://test.com", https=True) == URL("https://test.com") 14 | assert coerce_url("http://test.com", https=True) == URL("https://test.com") 15 | assert coerce_url("test.com/path/path2", https=True) == URL( 16 | "https://test.com/path/path2" 17 | ) 18 | assert coerce_url("//test.com") == URL("http://test.com") 19 | assert coerce_url("feed://test.com") == URL("feed://test.com") 20 | assert coerce_url("feed://www.internet-law.de/?feed=/feed/") == URL( 21 | "feed://www.internet-law.de/?feed=/feed/" 22 | ) 23 | 24 | 25 | def test_is_same_domain(): 26 | assert is_same_domain("test.com", "test.com") is True 27 | assert is_same_domain("example.com", "test.com") is False 28 | assert is_same_domain("feeds.test.com", "test.com") is False 29 | assert is_same_domain("test.com", "feeds.test.com") is True 30 | assert is_same_domain("test.com", "test.feeds.test.com") is True 31 | assert is_same_domain("www.test.com", "test.com") is True 32 | assert is_same_domain("www.test.com", "feed.test.com") is True 33 | assert is_same_domain("test.www.test.com", "test.com") is False 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea/ 107 | .vscode/ 108 | 109 | logs/ 110 | experiments/ 111 | timed.sh 112 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/duplicatefilter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler.lib import to_bytes 7 | 8 | 9 | class DuplicateFilter: 10 | """ 11 | Filters duplicate URLs. 12 | """ 13 | 14 | def __init__(self): 15 | # Dictionary whose keys are the hashed fingerprints of the URLs 16 | self.fingerprints = dict() 17 | # Locks the fingerprints dict when accessing keys. 18 | self._seen_lock = asyncio.Lock() 19 | 20 | async def url_seen(self, url: URL, method: str = "") -> bool: 21 | """ 22 | Checks if the URL has already been seen, and adds the URL fingerprint if not. 23 | 24 | :param url: URL object 25 | :param method: Optional HTTP method to use for hashing 26 | :return: True if URL already seen 27 | """ 28 | url_str: str = self.parse_url(url) 29 | fp = self.url_fingerprint_hash(url_str, method) 30 | async with self._seen_lock: 31 | if fp in self.fingerprints: 32 | return True 33 | self.fingerprints[fp] = url_str 34 | return False 35 | 36 | def parse_url(self, url: URL) -> str: 37 | """ 38 | Parse the URL object to a string. Used for functionality such as filtering query strings. 39 | 40 | :param url: URL object 41 | :return: URL as string 42 | """ 43 | return str(url) 44 | 45 | @staticmethod 46 | def url_fingerprint_hash(url: str, method: str = "") -> str: 47 | """ 48 | Create a fingerprint hash of a URL string along with the method if provided. 
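        The fingerprint is the SHA1 hex digest of the URL bytes, updated with the method bytes when a method is given, so the same URL requested with different HTTP methods is not treated as a duplicate.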
49 | 50 | :param url: URL as string 51 | :param method: Optional HTTP method 52 | :return: Hashed string 53 | """ 54 | # noinspection InsecureHash 55 | fp = hashlib.sha1() 56 | fp.update(to_bytes(url)) 57 | if method: 58 | fp.update(to_bytes(method)) 59 | return fp.hexdigest() 60 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/feed_info.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler import Item, to_string 7 | 8 | 9 | class FeedInfo(Item): 10 | bozo: int = 0 11 | content_length: int = 0 12 | content_type: str = "" 13 | description: str = "" 14 | favicon: URL = "" 15 | favicon_data_uri: str = "" 16 | hubs: List[str] = [] 17 | is_podcast: bool = False 18 | is_push: bool = False 19 | item_count: int = 0 20 | last_updated: datetime = None 21 | score: int = 0 22 | self_url: URL = "" 23 | site_name: str = "" 24 | site_url: URL = "" 25 | title: str = "" 26 | url: URL = "" 27 | velocity: float = 0 28 | version: str = "" 29 | 30 | def serialize(self): 31 | last_updated = self.last_updated.isoformat() if self.last_updated else "" 32 | 33 | return dict( 34 | bozo=self.bozo, 35 | description=self.description, 36 | content_length=self.content_length, 37 | content_type=self.content_type, 38 | favicon=to_string(self.favicon), 39 | favicon_data_uri=self.favicon_data_uri, 40 | hubs=self.hubs, 41 | is_podcast=self.is_podcast, 42 | is_push=self.is_push, 43 | item_count=self.item_count, 44 | last_updated=last_updated, 45 | score=self.score, 46 | self_url=to_string(self.self_url), 47 | site_name=self.site_name, 48 | site_url=to_string(self.site_url), 49 | title=self.title, 50 | url=to_string(self.url), 51 | velocity=self.velocity, 52 | version=self.version, 53 | ) 54 | 55 | def __eq__(self, other): 56 | return isinstance(other, self.__class__) and self.url == other.url 57 | 58 | def __hash__(self): 59 | return hash(self.url) 60 | 61 | def __repr__(self): 62 | return f"{self.__class__.__name__}({str(self.url)})" 63 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/queueable.py: -------------------------------------------------------------------------------- 1 | from asyncio import Queue 2 | from typing import Union 3 | 4 | import time 5 | 6 | 7 | class Queueable: 8 | queue_put_time = None 9 | queue_get_time = None 10 | # Default lowest queue priority is 100 (higher number means lower priority) 11 | priority = 100 12 | 13 | def get_queue_wait_time(self) -> Union[float, None]: 14 | """ 15 | Get the time in Milliseconds that this object has been on the queue. 16 | 17 | :return: Queue wait time in Milliseconds as float 18 | """ 19 | # Only set queue_get_time if not already set, so that the value of this method doesn't change each time 20 | # it's called. 21 | if not self.queue_get_time: 22 | self.queue_get_time = time.perf_counter() 23 | if self.queue_put_time: 24 | return (self.queue_get_time - self.queue_put_time) * 1000 25 | return None 26 | 27 | def set_queue_put_time(self) -> None: 28 | """ 29 | Set the time that this object was put onto the queue. 30 | """ 31 | # Set queue_get_time to None, because this method is called whenever a Queueable is added to the queue 32 | # and it may be added to a queue multiple times in it's life. 
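        # Resetting queue_get_time ensures that get_queue_wait_time() measures only the most recent put/get cycle.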
33 | self.queue_get_time = None 34 | self.queue_put_time = time.perf_counter() 35 | 36 | def add_to_queue(self, queue: Queue) -> None: 37 | """ 38 | Add the Queueable to the queue and set the queue put time. 39 | 40 | :param queue: An Queue instance 41 | """ 42 | self.set_queue_put_time() 43 | queue.put_nowait(self) 44 | 45 | def __lt__(self, other) -> bool: 46 | """ 47 | Compare Queueable priority for Queue ordering. 48 | Lower priority has precedence in the Queue. 49 | 50 | :param other: Another Queueable object 51 | :return: boolean 52 | """ 53 | if not isinstance(other, Queueable): 54 | return True 55 | return self.priority < other.priority 56 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [master] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [master] 14 | schedule: 15 | - cron: '0 23 * * 6' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v2 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # If this run was triggered by a pull request event, then checkout 40 | # the head of the pull request instead of the merge commit. 41 | - run: git checkout HEAD^2 42 | if: ${{ github.event_name == 'pull_request' }} 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /feedsearch_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from xml.etree import ElementTree 4 | from typing import List, Union 5 | 6 | from yarl import URL 7 | 8 | from feedsearch_crawler.feed_spider import FeedsearchSpider, FeedInfo 9 | 10 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 11 | 12 | name = "Feedsearch Crawler" 13 | 14 | 15 | def search( 16 | url: Union[URL, str, List[Union[URL, str]]], 17 | try_urls: Union[List[str], bool] = False, 18 | *args, 19 | **kwargs 20 | ) -> List[FeedInfo]: 21 | """ 22 | Search for feeds at a URL. 23 | 24 | :param url: URL or list of URLs to search 25 | :param try_urls: Tries different paths that may contain feeds. 26 | :return: List of FeedInfo objects 27 | """ 28 | results = asyncio.run(search_async(url, try_urls=try_urls, *args, **kwargs)) 29 | return results 30 | 31 | 32 | async def search_async( 33 | url: Union[URL, str, List[Union[URL, str]]], 34 | try_urls: Union[List[str], bool] = False, 35 | *args, 36 | **kwargs 37 | ) -> List[FeedInfo]: 38 | """ 39 | Search asynchronously for feeds at a URL. 40 | 41 | :param url: URL or list of URLs to search 42 | :param try_urls: Tries different paths that may contain feeds. 43 | :return: List of FeedInfo objects 44 | """ 45 | crawler = FeedsearchSpider(try_urls=try_urls, *args, **kwargs) 46 | await crawler.crawl(url) 47 | 48 | return sort_urls(list(crawler.items)) 49 | 50 | 51 | def sort_urls(feeds: List[FeedInfo]) -> List[FeedInfo]: 52 | """ 53 | Sort list of feeds based on Url score 54 | 55 | :param feeds: List of FeedInfo objects 56 | :return: List of FeedInfo objects sorted by score 57 | """ 58 | feeds = [f for f in feeds if isinstance(f, FeedInfo)] 59 | sorted_urls = sorted(list(set(feeds)), key=lambda x: x.score, reverse=True) 60 | return sorted_urls 61 | 62 | 63 | def output_opml(feeds: List[FeedInfo]) -> bytes: 64 | """ 65 | Return feeds as a subscriptionlist OPML file. 
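    Each feed is written as an outline element of type "rss" with an xmlUrl attribute, plus text, title, htmlUrl, description, and version attributes when available.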
66 | http://dev.opml.org/spec2.html#subscriptionLists 67 | 68 | :param feeds: List of FeedInfo objects 69 | :return: OPML file as XML bytestring 70 | """ 71 | root = ElementTree.Element("opml", version="2.0") 72 | head = ElementTree.SubElement(root, "head") 73 | title = ElementTree.SubElement(head, "title") 74 | title.text = "Feeds" 75 | body = ElementTree.SubElement(root, "body") 76 | 77 | for feed in feeds: 78 | if not feed.url: 79 | continue 80 | 81 | fe = ElementTree.SubElement(body, "outline", type="rss", xmlUrl=str(feed.url)) 82 | 83 | if feed.title: 84 | fe.set("text", feed.title) 85 | fe.set("title", feed.title) 86 | if feed.site_url: 87 | fe.set("htmlUrl", str(feed.site_url)) 88 | if feed.description: 89 | fe.set("description", feed.description) 90 | if feed.version: 91 | fe.set("version", feed.version) 92 | 93 | return ElementTree.tostring(root, encoding="utf8", method="xml") 94 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/lib.py: -------------------------------------------------------------------------------- 1 | import cgi 2 | from datetime import datetime 3 | from typing import Union, List 4 | 5 | from dateutil import tz, parser 6 | from yarl import URL 7 | 8 | 9 | class ParseTypes: 10 | JSON = "json" 11 | XML = "xml" 12 | 13 | 14 | def get_site_root(url: Union[str, URL]) -> str: 15 | """ 16 | Find the root domain of a url 17 | """ 18 | if isinstance(url, URL): 19 | return url.host 20 | return URL(url).host 21 | 22 | 23 | def create_allowed_domains(url: Union[str, URL]) -> List[str]: 24 | if isinstance(url, URL): 25 | return [url.host] 26 | return [URL(url).host] 27 | 28 | 29 | def parse_header_links(value): 30 | """ 31 | Return a list of Dicts of parsed link headers proxies. 32 | i.e. Link: ; rel=front; type="image/jpeg", 33 | ; rel=back;type="image/jpeg" 34 | 35 | :param value: HTTP Link header to parse 36 | :return: List of Dicts 37 | """ 38 | 39 | links = [] 40 | 41 | replace_chars = " '\"" 42 | 43 | for val in value.split(","): 44 | try: 45 | url, params = val.split(";", 1) 46 | except ValueError: 47 | url, params = val, "" 48 | 49 | link = {"url": url.strip("<> '\"")} 50 | 51 | for param in params.split(";"): 52 | try: 53 | key, value = param.split("=") 54 | except ValueError: 55 | break 56 | 57 | link[key.strip(replace_chars)] = value.strip(replace_chars) 58 | 59 | links.append(link) 60 | 61 | return links 62 | 63 | 64 | def force_utc(dt: datetime) -> datetime: 65 | """ 66 | Change a datetime to UTC, and convert naive datetimes to tz-aware UTC. 67 | 68 | :param dt: datetime to change to UTC 69 | :return: tz-aware UTC datetime 70 | """ 71 | if dt.tzinfo is None: 72 | dt = dt.replace(tzinfo=tz.tzutc()) 73 | return dt.astimezone(tz.tzutc()) 74 | 75 | 76 | def datestring_to_utc_datetime(date_string: str) -> datetime: 77 | """ 78 | Convert a date string to a tz-aware UTC datetime. 79 | 80 | :param date_string: A datetime as a string in almost any format. 81 | :return: tz-aware UTC datetime 82 | """ 83 | dt = parser.parse(date_string) 84 | return force_utc(dt) 85 | 86 | 87 | def create_content_type(parse_type: str, encoding: str, content_type: str) -> str: 88 | """ 89 | Create the actual content type of the feed. 90 | 91 | :param parse_type: How the feed is being parsed. 
XML or JSON 92 | :param encoding: Charset encoding of the response 93 | :param content_type: Content-Type header string of the response 94 | :return: Content-Type string 95 | """ 96 | ctype, pdict = cgi.parse_header(content_type) 97 | 98 | if parse_type == ParseTypes.JSON and ParseTypes.JSON not in ctype.lower(): 99 | ctype = "application/json" 100 | elif parse_type == ParseTypes.XML and ParseTypes.XML not in ctype.lower(): 101 | ctype = "application/xml" 102 | 103 | return f"{ctype}; charset={encoding}".lower() 104 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/trace.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | 4 | import aiohttp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | async def on_request_start(session, trace_config_ctx, params): 10 | loop = asyncio.get_event_loop() 11 | trace_config_ctx.start = loop.time() 12 | logger.debug("Request Start: %s", params.url) 13 | 14 | 15 | async def on_request_end(session, trace_config_ctx, params): 16 | loop = asyncio.get_event_loop() 17 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 18 | logger.debug("Request END: %s %s %dms", params.url, params.response.url, elapsed) 19 | 20 | 21 | async def on_connection_create_start(session, trace_config_ctx, params): 22 | loop = asyncio.get_event_loop() 23 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 24 | logger.debug("Connection create Start: %dms", elapsed) 25 | 26 | 27 | async def on_connection_create_end(session, trace_config_ctx, params): 28 | loop = asyncio.get_event_loop() 29 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 30 | logger.debug("Connection create END: %dms", elapsed) 31 | 32 | 33 | async def on_dns_resolvehost_start(session, trace_config_ctx, params): 34 | loop = asyncio.get_event_loop() 35 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 36 | logger.debug("DNS Resolve Host Start: %s %dms", params.host, elapsed) 37 | 38 | 39 | async def on_dns_resolvehost_end(session, trace_config_ctx, params): 40 | loop = asyncio.get_event_loop() 41 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 42 | logger.debug("DNS Resolve Host END: %s %dms", params.host, elapsed) 43 | 44 | 45 | async def on_dns_cache_hit(session, trace_config_ctx, params): 46 | loop = asyncio.get_event_loop() 47 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 48 | logger.debug("DNS Cache Hit: %s %dms", params.host, elapsed) 49 | 50 | 51 | async def on_dns_cache_miss(session, trace_config_ctx, params): 52 | loop = asyncio.get_event_loop() 53 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 54 | logger.debug("DNS Cache Miss: %s %dms", params.host, elapsed) 55 | 56 | 57 | async def on_request_redirect(session, trace_config_ctx, params): 58 | loop = asyncio.get_event_loop() 59 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 60 | logger.debug( 61 | "Request redirect: %s %s %dms", params.url, params.response.url, elapsed 62 | ) 63 | 64 | 65 | def add_trace_config(): 66 | trace_config = aiohttp.TraceConfig() 67 | trace_config.on_request_start.append(on_request_start) 68 | trace_config.on_dns_resolvehost_start.append(on_dns_resolvehost_start) 69 | trace_config.on_dns_cache_hit.append(on_dns_cache_hit) 70 | trace_config.on_dns_cache_miss.append(on_dns_cache_miss) 71 | trace_config.on_dns_resolvehost_end.append(on_dns_resolvehost_end) 72 | 
trace_config.on_request_end.append(on_request_end) 73 | trace_config.on_request_redirect.append(on_request_redirect) 74 | trace_config.on_connection_create_start.append(on_connection_create_start) 75 | trace_config.on_connection_create_end.append(on_connection_create_end) 76 | return trace_config 77 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/response.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import List, Dict, Any, Optional 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler.lib import is_same_domain 7 | 8 | 9 | class Response: 10 | _xml = None 11 | 12 | def __init__( 13 | self, 14 | url: URL, 15 | method: str, 16 | encoding: str = "", 17 | text: str = "", 18 | json: Dict = None, 19 | data: bytes = b"", 20 | history: List[URL] = None, 21 | headers=None, 22 | status_code: int = -1, 23 | cookies=None, 24 | xml_parser=None, 25 | redirect_history=None, 26 | content_length: int = 0, 27 | meta: Dict = None, 28 | ): 29 | self.url = url 30 | self.encoding = encoding 31 | self.method = method 32 | self.text = text 33 | self.json = json 34 | self.data = data 35 | self.history = history or [] 36 | self.headers = headers or {} 37 | self.status_code = status_code 38 | self.cookies = cookies 39 | self.id = uuid.uuid4() 40 | self._xml_parser = xml_parser 41 | self.redirect_history = redirect_history 42 | self.content_length = content_length 43 | self.meta = meta 44 | self.origin: URL = url.origin() 45 | 46 | @property 47 | def ok(self) -> bool: 48 | return self.status_code == 0 or 200 <= self.status_code <= 299 49 | 50 | @property 51 | def domain(self) -> str: 52 | return self.url.host 53 | 54 | @property 55 | def scheme(self) -> str: 56 | return self.url.scheme 57 | 58 | @property 59 | def previous_domain(self) -> str: 60 | if not self.history: 61 | return "" 62 | return self.history[-1].host 63 | 64 | @property 65 | def originator_url(self) -> Optional[URL]: 66 | if not self.history or len(self.history) == 1: 67 | return None 68 | return self.history[-2] 69 | 70 | @property 71 | async def xml(self) -> Any: 72 | if self._xml: 73 | return self._xml 74 | 75 | if not self._xml_parser: 76 | return None 77 | 78 | if not self.text and self.data and self.encoding: 79 | self.text = self.data.decode(self.encoding) 80 | 81 | self._xml = await self._xml_parser(self.text) 82 | return self._xml 83 | 84 | def is_max_depth_reached(self, max_depth: int) -> bool: 85 | """ 86 | Check if the max response depth has been reached. 87 | 88 | :param max_depth: Max length of response history 89 | :return: boolean 90 | """ 91 | if max_depth and len(self.history) >= max_depth: 92 | return True 93 | return False 94 | 95 | def is_original_domain(self) -> bool: 96 | """ 97 | Check if this response is still at the original domain in the response chain. 
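        The first Response in the chain always counts as original, as do sub-domains of the first requested domain (checked via is_same_domain).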
98 | 99 | :return: boolean 100 | """ 101 | # This is the first Response in the chain 102 | if len(self.history) < 2: 103 | return True 104 | # URL is same domain or sub-domain 105 | if is_same_domain(self.history[0].host, self.url.host): 106 | return True 107 | 108 | return False 109 | 110 | def __repr__(self): 111 | return f"{self.__class__.__name__}({str(self.url)})" 112 | -------------------------------------------------------------------------------- /tests/feed_spider/link_filter_test.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | from feedsearch_crawler.feed_spider.link_filter import ( 4 | LinkFilter as lf, 5 | ) 6 | from feedsearch_crawler.feed_spider.regexes import feedlike_regex, podcast_regex 7 | 8 | 9 | def test_feedlike_regex(): 10 | valid = [ 11 | "rss", 12 | "testing/rss", 13 | "testing/rss-test", 14 | "test-rss-test", 15 | "test.rss.test", 16 | "RSS", 17 | "test/RSS/test", 18 | "feed", 19 | "testing/feed/", 20 | "test-feed-test", 21 | "test.feed.test", 22 | "FEED", 23 | "FeeD", 24 | "test/FEED/test", 25 | "feeds", 26 | "testing/feeds", 27 | "test-feeds-test", 28 | "test.feeds.test", 29 | "FEEDS", 30 | "FeedS", 31 | "test/FEEDS/test", 32 | "atom", 33 | "json", 34 | "xml", 35 | "rdf", 36 | "blog", 37 | "blogs", 38 | "test/subscribe/testing" 39 | ] 40 | for value in valid: 41 | assert feedlike_regex.search(value) 42 | 43 | 44 | def test_feedlike_regex_invalid(): 45 | invalid = ["rsss", "rs-s", "feedss", "tfeed", "fee-d", "fee.d"] 46 | for value in invalid: 47 | assert not feedlike_regex.search(value) 48 | 49 | 50 | def test_podcast_regex(): 51 | pass 52 | 53 | 54 | def test_is_feedlike_href(): 55 | assert lf.is_href_matching("test.com/feed", feedlike_regex) is True 56 | assert lf.is_href_matching("feed", feedlike_regex) is True 57 | assert lf.is_href_matching("feeds", feedlike_regex) is True 58 | assert lf.is_href_matching("test.com/feeds", feedlike_regex) is True 59 | assert lf.is_href_matching("test.com/feeds/test", feedlike_regex) is True 60 | assert lf.is_href_matching("test.com/podcasts/test", feedlike_regex) is False 61 | assert lf.is_href_matching("test.com/podcast/test", feedlike_regex) is False 62 | assert lf.is_href_matching("test.com/podcasts", feedlike_regex) is False 63 | assert lf.is_href_matching("test.com/podcast", feedlike_regex) is False 64 | 65 | 66 | def test_is_feedlike_querystring(): 67 | assert lf.is_querystring_matching(URL("test.com?feed"), feedlike_regex) is True 68 | assert lf.is_querystring_matching(URL("test.com/test?feed"), feedlike_regex) is True 69 | assert ( 70 | lf.is_querystring_matching( 71 | URL("test.com/test?url=feed&test=true"), feedlike_regex 72 | ) 73 | is False 74 | ) 75 | assert ( 76 | lf.is_querystring_matching(URL("test.com/test?url=feed"), feedlike_regex) 77 | is False 78 | ) 79 | assert ( 80 | lf.is_querystring_matching(URL("test.com/feed?url=test"), feedlike_regex) 81 | is False 82 | ) 83 | assert ( 84 | lf.is_querystring_matching(URL("test.com/test?feed=test"), feedlike_regex) 85 | is True 86 | ) 87 | assert ( 88 | lf.is_querystring_matching(URL("test.com?podcast=test"), feedlike_regex) 89 | is False 90 | ) 91 | assert ( 92 | lf.is_querystring_matching(URL("test.com?feeds=test"), feedlike_regex) is True 93 | ) 94 | assert ( 95 | lf.is_querystring_matching(URL("test.com?podcasts=test"), feedlike_regex) 96 | is False 97 | ) 98 | 99 | 100 | def test_is_podcast_href(): 101 | assert lf.is_href_matching("test.com/podcasts/test", podcast_regex) is True 102 | assert 
lf.is_href_matching("test.com/podcast/test", podcast_regex) is True 103 | assert lf.is_href_matching("test.com/podcasts", podcast_regex) is True 104 | assert lf.is_href_matching("test.com/podcast", podcast_regex) is True 105 | 106 | 107 | def test_is_podcast_querystring(): 108 | assert ( 109 | lf.is_querystring_matching(URL("test.com?podcast=test"), podcast_regex) is True 110 | ) 111 | assert ( 112 | lf.is_querystring_matching(URL("test.com?podcasts=test"), podcast_regex) is True 113 | ) 114 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/site_meta_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler import ItemParser, Request, Response 7 | from feedsearch_crawler.crawler.lib import remove_www 8 | from feedsearch_crawler.feed_spider.favicon import Favicon 9 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class SiteMetaParser(ItemParser): 15 | async def parse_item(self, request: Request, response: Response, *args, **kwargs): 16 | logger.info("Parsing: SiteMeta %s", response.url) 17 | url = response.url 18 | site_meta: SiteMeta = SiteMeta(url) 19 | 20 | xml = await response.xml 21 | if not xml: 22 | return 23 | 24 | site_meta.url = self.find_site_url(xml, url) 25 | site_meta.host = remove_www(site_meta.url.host) 26 | site_meta.site_name = self.find_site_name(xml) 27 | site_meta.possible_icons = self.find_site_icon_urls(xml, url, site_meta.host) 28 | 29 | for icon in site_meta.possible_icons: 30 | if icon.url: 31 | # Only follow favicon urls if we want to create a data uri 32 | if self.crawler.favicon_data_uri: 33 | yield self.follow( 34 | icon.url, 35 | self.crawler.parse_favicon_data_uri, 36 | cb_kwargs=dict(favicon=icon), 37 | allow_domain=True, 38 | max_content_length=51200, 39 | ) 40 | else: 41 | yield icon 42 | 43 | yield site_meta 44 | 45 | @staticmethod 46 | def find_site_icon_urls(soup, url, host) -> List[Favicon]: 47 | search_icons = [ 48 | Favicon( 49 | url=url.join(URL("favicon.ico")), 50 | rel="favicon", 51 | priority=3, 52 | site_host=host, 53 | ), 54 | Favicon(url="", rel="shortcut icon", priority=1, site_host=host), 55 | Favicon(url="", rel="icon", priority=2, site_host=host), 56 | ] 57 | 58 | possible_icons = [] 59 | for icon in search_icons: 60 | link = soup.find(name="link", rel=icon.rel) 61 | if link: 62 | href = link.get("href", None) 63 | if href: 64 | icon.url = url.join(URL(href)) 65 | if icon.url: 66 | possible_icons.append(icon) 67 | 68 | return sorted(possible_icons, key=lambda x: x.priority) 69 | 70 | @staticmethod 71 | def find_site_url(soup, url: URL) -> URL: 72 | """ 73 | Attempts to find the canonical Url of the Site 74 | 75 | :param soup: BeautifulSoup of site 76 | :param url: Current Url of site 77 | :return: str 78 | """ 79 | try: 80 | canonical = soup.find(name="link", rel="canonical") 81 | site = canonical.get("href") 82 | if site: 83 | if site.strip() == "/": 84 | return url 85 | return URL(site).origin() 86 | except (AttributeError, ValueError): 87 | pass 88 | 89 | try: 90 | meta = soup.find(name="meta", property="og:url") 91 | site = meta.get("content") 92 | if site: 93 | if site.strip() == "/": 94 | return url 95 | return URL(site).origin() 96 | except (AttributeError, ValueError): 97 | pass 98 | 99 | return url.origin() 100 | 101 | @staticmethod 102 | def 
find_site_name(soup) -> str: 103 | """ 104 | Attempts to find Site Name 105 | 106 | :param soup: BeautifulSoup of site 107 | :return: str 108 | """ 109 | site_name_meta = [ 110 | "og:site_name", 111 | "og:title", 112 | "application:name", 113 | "twitter:app:name:iphone", 114 | ] 115 | 116 | for p in site_name_meta: 117 | try: 118 | name = soup.find(name="meta", property=p).get("content") 119 | if name: 120 | return name 121 | except AttributeError: 122 | pass 123 | 124 | try: 125 | title = soup.find(name="title").text 126 | if title: 127 | return title 128 | except AttributeError: 129 | pass 130 | 131 | return "" 132 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import json 4 | import time 5 | from pprint import pprint 6 | from feedsearch_crawler import search, FeedsearchSpider, output_opml, sort_urls 7 | from feedsearch_crawler.crawler import coerce_url 8 | from datetime import datetime 9 | import collections 10 | 11 | urls = [ 12 | # "arstechnica.com", 13 | # "https://davidbeath.com", 14 | # "http://xkcd.com", 15 | # "http://jsonfeed.org", 16 | # "en.wikipedia.com", 17 | # "scientificamerican.com", 18 | # "newyorktimes.com", 19 | # "https://www.dancarlin.com", 20 | # "https://www.hanselminutes.com/", 21 | # "nytimes.com", 22 | # "https://www.jeremydaly.com/serverless-microservice-patterns-for-aws/", 23 | # "feedhandbook.com", 24 | # "https://americanaffairsjournal.org/2019/05/ubers-path-of-destruction/", 25 | # "localhost:8080/test", 26 | # "theatlantic.com", 27 | # "nypost.com", 28 | # "https://www.washingtonpost.com", 29 | # "localhost:5000", 30 | # "latimes.com", 31 | # "http://feeds.washingtonpost.com/rss/rss_fact-checker?noredirect=on", 32 | # "http://tabletopwhale.com/index.html" 33 | # "www.vanityfair.com", 34 | # "bloomberg.com", 35 | # "http://www.bloomberg.com/politics/feeds/site.xml", 36 | # "propublica.org" 37 | # "npr.org", 38 | # "rifters.com", 39 | # "https://www.bbc.co.uk/podcasts" 40 | # "https://www.bbc.co.uk/programmes/p02nrsln/episodes/downloads", 41 | # "https://breebird33.tumblr.com/", 42 | # "https://neurocorp.tumblr.com/", 43 | # "https://breebird33.tumblr.com/rss" 44 | # "https://resel.fr/rss-news" 45 | # "https://muhammadraza.me" 46 | # "https://www.franceinter.fr/rss/a-la-une.xml", 47 | # "harpers.org", 48 | # "slashdot.com", 49 | # "https://bearblog.dev", 50 | # "aeon.co", 51 | # "https://davidgerard.co.uk/blockchain/" 52 | # "raymii.org/s/" 53 | # "stratechery.com", 54 | # "www.internet-law.de", 55 | # "https://medium.com/zendesk-engineering/the-joys-of-story-estimation-cda0cd807903", 56 | # "https://danwang.co/", 57 | "http://matthewdickens.me/podcasts/TWIS-feed.xml" 58 | ] 59 | 60 | 61 | def get_pretty_print(json_object: object): 62 | return json.dumps(json_object, sort_keys=True, indent=2, separators=(",", ": ")) 63 | 64 | 65 | # @profile() 66 | def run_crawl(): 67 | # user_agent = "Mozilla/5.0 (Compatible; Bot)" 68 | user_agent = "Mozilla/5.0 (Compatible; Feedsearch Bot)" 69 | # user_agent = "curl/7.58.0" 70 | # user_agent = ( 71 | # "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" 72 | # ) 73 | # user_agent = ( 74 | # "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 75 | # ) 76 | 77 | # headers = { 78 | # "User-Agent": user_agent, 79 | # "DNT": "1", 80 | # "Upgrade-Insecure-Requests": "1", 81 | # "Accept-Language": 
"en-US,en;q=0.5", 82 | # "Accept-Encoding": "gzip, deflate, br", 83 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 84 | # "Referrer": "https://www.google.com/", 85 | # } 86 | 87 | crawler = FeedsearchSpider( 88 | concurrency=10, 89 | total_timeout=30, 90 | request_timeout=30, 91 | user_agent=user_agent, 92 | # headers=headers, 93 | favicon_data_uri=False, 94 | max_depth=5, 95 | max_retries=3, 96 | ssl=True, 97 | full_crawl=False, 98 | delay=0, 99 | try_urls=True, 100 | ) 101 | crawler.start_urls = urls 102 | # crawler.allowed_domains = create_allowed_domains(urls) 103 | asyncio.run(crawler.crawl()) 104 | # asyncio.run(crawler.crawl(urls[0])) 105 | # items = search(urls, crawl_hosts=True) 106 | 107 | items = sort_urls(list(crawler.items)) 108 | 109 | serialized = [item.serialize() for item in items] 110 | 111 | # items = search(urls[0], concurrency=40, try_urls=False, favicon_data_uri=False) 112 | # serialized = [item.serialize() for item in items] 113 | 114 | results = get_pretty_print(serialized) 115 | print(results) 116 | 117 | site_metas = [item.serialize() for item in crawler.site_metas] 118 | metas = get_pretty_print(site_metas) 119 | print(metas) 120 | # pprint(site_metas) 121 | 122 | pprint(crawler.favicons) 123 | pprint(crawler._duplicate_filter.fingerprints) 124 | 125 | print(output_opml(items).decode()) 126 | 127 | pprint([result["url"] for result in serialized]) 128 | pprint(crawler.get_stats()) 129 | 130 | print(f"Feeds found: {len(items)}") 131 | print(f"SiteMetas: {len(crawler.site_metas)}") 132 | print(f"Favicons fetched: {len(crawler.favicons)}") 133 | # pprint(crawler.queue_wait_times) 134 | pprint(list((x.score, x.url) for x in items)) 135 | 136 | 137 | def create_allowed_domains(urls): 138 | domain_patterns = [] 139 | for url in urls: 140 | url = coerce_url(url) 141 | host = url.host 142 | pattern = f"*.{host}" 143 | domain_patterns.append(host) 144 | domain_patterns.append(pattern) 145 | return domain_patterns 146 | 147 | 148 | if __name__ == "__main__": 149 | logger = logging.getLogger("feedsearch_crawler") 150 | logger.setLevel(logging.DEBUG) 151 | ch = logging.StreamHandler() 152 | ch.setLevel(logging.DEBUG) 153 | formatter = logging.Formatter( 154 | "%(asctime)s - %(levelname)s - %(name)s - %(message)s [in %(pathname)s:%(lineno)d]" 155 | ) 156 | ch.setFormatter(formatter) 157 | fl = logging.FileHandler( 158 | f"/home/dbeath/code/feedsearch-crawler/logs/feedsearch_crawl_{datetime.utcnow().isoformat()}" 159 | ) 160 | fl.setLevel((logging.DEBUG)) 161 | fl.setFormatter(formatter) 162 | logger.addHandler(ch) 163 | logger.addHandler(fl) 164 | 165 | start = time.perf_counter() 166 | run_crawl() 167 | duration = int((time.perf_counter() - start) * 1000) 168 | print(f"Entire process ran in {duration}ms") 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feedsearch Crawler 2 | [![PyPI](https://img.shields.io/pypi/v/feedsearch-crawler.svg)](https://pypi.org/project/feedsearch-crawler/) 3 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/feedsearch-crawler.svg) 4 | ![PyPI - License](https://img.shields.io/pypi/l/feedsearch-crawler.svg) 5 | 6 | Feedsearch Crawler is a Python library for searching websites for [RSS](https://en.wikipedia.org/wiki/RSS), [Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)), and [JSON](https://jsonfeed.org/) feeds. 
7 | 8 | It is a continuation of my work on [Feedsearch](https://github.com/DBeath/feedsearch), which is itself a continuation of the work done by [Dan Foreman-Mackey](http://dfm.io/) on [Feedfinder2](https://github.com/dfm/feedfinder2), which in turn is based on [feedfinder](http://www.aaronsw.com/2002/feedfinder/) - originally written by [Mark Pilgrim](http://en.wikipedia.org/wiki/Mark_Pilgrim_(software_developer)) and subsequently maintained by 9 | [Aaron Swartz](http://en.wikipedia.org/wiki/Aaron_Swartz) until his untimely death. 10 | 11 | Feedsearch Crawler differs with all of the above in that it is now built as an asynchronous [Web crawler](https://en.wikipedia.org/wiki/Web_crawler) for [Python 3.7](https://www.python.org/downloads/release/python-370/) and above, using [asyncio](https://docs.python.org/3/library/asyncio.html) and [aiohttp](https://aiohttp.readthedocs.io/en/stable/), to allow much more rapid scanning of possible feed urls. 12 | 13 | An implementation using this library to provide a public Feed Search API is available at https://feedsearch.dev 14 | 15 | Pull requests and suggestions are welcome. 16 | 17 | ## Installation 18 | The library is available on [PyPI](https://pypi.org/project/feedsearch-crawler/): 19 | 20 | ``` 21 | pip install feedsearch-crawler 22 | ``` 23 | 24 | The library requires Python 3.7+. 25 | 26 | ## Usage 27 | Feedsearch Crawler is called with the single function ``search``: 28 | 29 | ``` python 30 | >>> from feedsearch_crawler import search 31 | >>> feeds = search('xkcd.com') 32 | >>> feeds 33 | [FeedInfo('https://xkcd.com/rss.xml'), FeedInfo('https://xkcd.com/atom.xml')] 34 | >>> feeds[0].url 35 | URL('https://xkcd.com/rss.xml') 36 | >>> str(feeds[0].url) 37 | 'https://xkcd.com/rss.xml' 38 | >>> feeds[0].serialize() 39 | {'url': 'https://xkcd.com/rss.xml', 'title': 'xkcd.com', 'version': 'rss20', 'score': 24, 'hubs': [], 'description': 'xkcd.com: A webcomic of romance and math humor.', 'is_push': False, 'self_url': '', 'favicon': 'https://xkcd.com/s/919f27.ico', 'content_type': 'text/xml; charset=UTF-8', 'bozo': 0, 'site_url': 'https://xkcd.com/', 'site_name': 'xkcd: Chernobyl', 'favicon_data_uri': '', 'content_length': 2847} 40 | ``` 41 | 42 | If you are already running in an [asyncio event loop](https://docs.python.org/3/library/asyncio-eventloop.html), then you can import and await ``search_async`` instead. The ``search`` function is only a wrapper that runs ``search_async`` in a new asyncio event loop. 43 | 44 | ``` python 45 | from feedsearch_crawler import search_async 46 | 47 | feeds = await search_async('xkcd.com') 48 | ``` 49 | 50 | A search will always return a list of *FeedInfo* objects, each of which will always have a *url* property, which is a [URL](https://yarl.readthedocs.io/en/latest/api.html) object that can be decoded to a string with ``str(url)``. 51 | The returned *FeedInfo* are sorted by the *score* value from highest to lowest, with a higher score theoretically indicating a more relevant feed compared to the original URL provided. A *FeedInfo* can also be serialized to a JSON compatible dictionary by calling it's ``.serialize()`` method. 52 | 53 | The crawl logs can be accessed with: 54 | 55 | ``` python 56 | import logging 57 | 58 | logger = logging.getLogger("feedsearch_crawler") 59 | ``` 60 | 61 | Feedsearch Crawler also provides a handy function to output the returned feeds as an [OPML](https://en.wikipedia.org/wiki/OPML) subscription list, encoded as a UTF-8 bytestring. 
62 | 63 | ``` python 64 | from feedsearch_crawler import output_opml 65 | 66 | output_opml(feeds).decode() 67 | ``` 68 | 69 | ## Search Arguments 70 | ``search`` and ``search_async`` take the following arguments: 71 | 72 | ``` python 73 | search( 74 | url: Union[URL, str, List[Union[URL, str]]], 75 | crawl_hosts: bool=True, 76 | try_urls: Union[List[str], bool]=False, 77 | concurrency: int=10, 78 | total_timeout: Union[float, aiohttp.ClientTimeout]=10, 79 | request_timeout: Union[float, aiohttp.ClientTimeout]=3, 80 | user_agent: str="Feedsearch Bot", 81 | max_content_length: int=1024 * 1024 * 10, 82 | max_depth: int=10, 83 | headers: dict={"X-Custom-Header": "Custom Header"}, 84 | favicon_data_uri: bool=True, 85 | delay: float=0 86 | ) 87 | ``` 88 | 89 | - **url**: *Union[str, List[str]]*: The initial URL or list of URLs at which to search for feeds. You may also provide [URL](https://yarl.readthedocs.io/en/latest/api.html) objects. 90 | - **crawl_hosts**: *bool*: (default True): An optional argument to add the site host origin URL to the list of initial crawl URLs. (e.g. add "example.com" if crawling "example.com/path/rss.xml"). If **False**, site metadata and favicon data may not be found. 91 | - **try_urls**: *Union[List[str], bool]*: (default False): An optional list of URL paths to query for feeds. Takes the origins of the *url* parameter and appends the provided paths. If no list is provided, but *try_urls* is **True**, then a list of common feed locations will be used. 92 | - **concurrency**: *int*: (default 10): An optional argument to specify the maximum number of concurrent HTTP requests. 93 | - **total_timeout**: *float*: (default 30.0): An optional argument to specify the time this function may run before timing out. 94 | - **request_timeout**: *float*: (default 3.0): An optional argument that controls how long before each individual HTTP request times out. 95 | - **user_agent**: *str*: An optional argument to override the default User-Agent header. 96 | - **max_content_length**: *int*: (default 10Mb): An optional argument to specify the maximum size in bytes of each HTTP Response. 97 | - **max_depth**: *int*: (default 10): An optional argument to limit the maximum depth of requests while following urls. 98 | - **headers**: *dict*: An optional dictionary of headers to pass to each HTTP request. 99 | - **favicon_data_uri**: *bool*: (default True): Optionally control whether to fetch found favicons and return them as a Data Uri. 100 | - **delay**: *float*: (default 0.0): An optional argument to delay each HTTP request by the specified time in seconds. Used in conjunction with the concurrency setting to avoid overloading sites. 101 | 102 | ## FeedInfo Values 103 | In addition to the *url*, FeedInfo objects may have the following values: 104 | 105 | - **bozo**: *int*: Set to 1 when feed data is not well formed or may not be a feed. Defaults 0. 106 | - **content_length**: *int*: Current length of the feed in bytes. 107 | - **content_type**: *str*: [Content-Type](https://en.wikipedia.org/wiki/Media_type) value of the returned feed. 108 | - **description**: *str*: Feed description. 109 | - **favicon**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) of feed or site [Favicon](https://en.wikipedia.org/wiki/Favicon). 110 | - **favicon_data_uri**: *str*: [Data Uri](https://en.wikipedia.org/wiki/Data_URI_scheme) of Favicon. 111 | - **hubs**: *List[str]*: List of [Websub](https://en.wikipedia.org/wiki/WebSub) hubs of feed if available. 
112 | - **is_podcast**: *bool*: True if the feed contains valid [podcast](https://en.wikipedia.org/wiki/Podcast) elements and enclosures. 113 | - **is_push**: *bool*: True if feed contains valid Websub data. 114 | - **item_count**: *int*: Number of items currently in the feed. 115 | - **last_updated**: *datetime*: Date of the latest published entry. 116 | - **score**: *int*: Computed relevance of feed url value to provided URL. May be safely ignored. 117 | - **self_url**: *URL*: *ref="self"* value returned from feed links. In some cases may be different from feed url. 118 | - **site_name**: *str*: Name of feed's website. 119 | - **site_url**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) of feed's website. 120 | - **title**: *str*: Feed Title. 121 | - **url**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) location of feed. 122 | - **velocity**: *float*: Mean number of items per day in the feed at the current time. 123 | - **version**: *str*: Feed version [XML values](https://pythonhosted.org/feedparser/version-detection.html), 124 | or [JSON feed](https://jsonfeed.org/version/1). 125 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/link_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import re 4 | from typing import Optional, Tuple, List 5 | 6 | import bs4 7 | from w3lib.url import url_query_cleaner 8 | from yarl import URL 9 | 10 | from feedsearch_crawler.crawler import Response, Request 11 | from feedsearch_crawler.crawler.lib import parse_href_to_url 12 | from feedsearch_crawler.feed_spider.regexes import ( 13 | feedlike_regex, 14 | podcast_regex, 15 | author_regex, 16 | date_regex, 17 | ) 18 | 19 | # List of invalid filetypes 20 | invalid_filetypes: List[str] = [ 21 | "jpeg", 22 | "jpg", 23 | "png", 24 | "gif", 25 | "bmp", 26 | "mp4", 27 | "mp3", 28 | "mkv", 29 | "md", 30 | "css", 31 | "avi", 32 | "pdf", 33 | "js", 34 | "woff", 35 | "woff2", 36 | "svg", 37 | "ttf", 38 | ] 39 | 40 | # List of strings that are invalid as querystring keys 41 | invalid_querystring_keys: List[str] = ["comment", "comments", "post", "view", "theme"] 42 | 43 | # List of strings that indicate a URL is invalid for crawling 44 | invalid_url_contents: List[str] = [ 45 | "wp-includes", 46 | "wp-content", 47 | "wp-json", 48 | "xmlrpc", 49 | "wp-admin", 50 | "/amp/", # Theoretically there could be a feed at an AMP url, but not worth checking. 51 | "mailto:", 52 | "//font.", 53 | ] 54 | 55 | # List of strings that indicate a URL should be low priority 56 | low_priority_urls: List[str] = [ 57 | "/archive/", # Archives are less likely to contain feeds. 58 | "/page/", # Articles pages are less likely to contain feeds. 59 | "forum", # Forums are not likely to contain interesting feeds. 60 | "//cdn.", # Can't guarantee that someone won't put a feed at a CDN url, so we can't outright ignore it. 
61 | "video", 62 | ] 63 | 64 | # Link Types that should always be searched for feeds 65 | feed_link_types: List[str] = ["application/json", "rss", "atom", "rdf"] 66 | 67 | 68 | logger = logging.getLogger(__name__) 69 | 70 | 71 | class LinkFilter: 72 | def __init__(self, response: Response, request: Request, full_crawl: bool = False): 73 | self.response = response 74 | self.request = request 75 | self.full_crawl = full_crawl 76 | 77 | def should_follow_link(self, link: bs4.Tag) -> Optional[Tuple[URL, int]]: 78 | """ 79 | Check that the link should be followed if it may contain feed information. 80 | 81 | :param link: Link tag 82 | :return: boolean 83 | """ 84 | href: str = link.get("href") 85 | link_type: str = link.get("type") 86 | 87 | url: URL = parse_href_to_url(href) 88 | if not url: 89 | return None 90 | 91 | # If the link may have a valid feed type then follow it regardless of the url text. 92 | if ( 93 | link_type 94 | and any(map(link_type.lower().count, feed_link_types)) 95 | and "json+oembed" not in link_type 96 | ): 97 | # A link with a possible feed type has the highest priority after callbacks. 98 | return url, 2 99 | 100 | is_feedlike_href: bool = self.is_href_matching(str(url), feedlike_regex) 101 | is_feedlike_querystring: bool = self.is_querystring_matching( 102 | url, feedlike_regex 103 | ) 104 | 105 | is_podcast_href: bool = self.is_href_matching(str(url), podcast_regex) 106 | is_podcast_querystring: bool = self.is_querystring_matching(url, podcast_regex) 107 | 108 | is_feedlike_url = is_feedlike_querystring or is_feedlike_href 109 | is_podcast_url = is_podcast_href or is_podcast_querystring 110 | 111 | if not self.full_crawl and not is_feedlike_url and not is_podcast_url: 112 | return 113 | 114 | # This check is deprecated, as it has been moved to the spider to prevent the crawling of any links 115 | # from responses that are not the same as the original domain 116 | # 117 | # is_one_jump: bool = self.is_one_jump_from_original_domain(url, self.response) 118 | # if not is_one_jump: 119 | # return 120 | 121 | has_author_info: bool = self.is_href_matching(href, author_regex) 122 | is_low_priority: bool = self.is_low_priority(href) 123 | 124 | priority: int = Request.priority 125 | # A low priority url should be fetched last. 126 | if is_low_priority: 127 | priority = Request.priority + 2 128 | # Podcast pages are lower priority than authors or feeds. 129 | if is_podcast_url: 130 | priority = 5 131 | # Potential author info has a medium priority. 132 | if has_author_info: 133 | priority = 4 134 | # A feedlike url has high priority. 135 | if is_feedlike_url: 136 | priority = 3 137 | 138 | # Validate the actual URL string. 139 | follow = ( 140 | # is_one_jump 141 | not self.has_invalid_contents(href) 142 | and self.is_valid_filetype(href) 143 | and not self.has_invalid_querystring(url) 144 | ) 145 | # If full_crawl then follow all valid URLs regardless of the feedlike quality of the URL. 146 | # Otherwise only follow URLs if they look like they might contain feed information. 147 | if follow and (self.full_crawl or is_feedlike_url or is_podcast_href): 148 | 149 | # Remove the querystring unless it may point to a feed. 150 | if not is_feedlike_querystring: 151 | url = url.with_query(None) 152 | 153 | return url, priority 154 | 155 | @staticmethod 156 | def is_one_jump_from_original_domain(url: URL, response: Response) -> bool: 157 | """ 158 | Check that the current URL is only one response away from the originally queried domain. 
159 | 160 | We want to be able to follow potential feed links that point to a different domain than 161 | the originally queried domain, but not to follow any deeper than that. 162 | 163 | Sub-domains of the original domain are ok. 164 | 165 | i.e: the following are ok 166 | "test.com" -> "feedhost.com" 167 | "test.com/feeds" -> "example.com/feed.xml" 168 | "test.com" -> "feeds.test.com" 169 | 170 | not ok: 171 | "test.com" -> "feedhost.com" (we stop here) -> "feedhost.com/feeds" 172 | 173 | :param url: URL object or string 174 | :param response: Response object 175 | :return: boolean 176 | """ 177 | 178 | # This is the first Response in the chain 179 | if len(response.history) < 2: 180 | return True 181 | 182 | # The URL is relative, so on the same domain 183 | if not url.is_absolute(): 184 | return True 185 | 186 | # URL is same domain 187 | if url.host == response.history[0].host: 188 | return True 189 | 190 | # URL is sub-domain 191 | if response.history[0].host in url.host: 192 | return True 193 | 194 | # URL domain and current Response domain are different from original domain 195 | if ( 196 | response.history[-1].host != response.history[0].host 197 | and url.host != response.history[0].host 198 | ): 199 | return False 200 | 201 | return True 202 | 203 | @staticmethod 204 | def is_valid_filetype(url: str) -> bool: 205 | """ 206 | Check if url string has an invalid filetype extension. 207 | 208 | :param url: URL string 209 | :return: boolean 210 | """ 211 | # if file_regex.search(url.strip()): 212 | # return False 213 | # return True 214 | suffix = pathlib.Path(url_query_cleaner(url)).suffix.strip(".").lower() 215 | if suffix in invalid_filetypes: 216 | return False 217 | return True 218 | 219 | @staticmethod 220 | def has_invalid_querystring(url: URL) -> bool: 221 | """ 222 | Check if URL querystring contains invalid keys. 223 | 224 | :param url: URL object 225 | :return: boolean 226 | """ 227 | return any(key in url.query for key in invalid_querystring_keys) 228 | 229 | @staticmethod 230 | def is_href_matching(url_string: str, regex: re) -> bool: 231 | """ 232 | Check if the regex has any match in the url string. 233 | 234 | :param url_string: URL as string 235 | :param regex: Regex used to search URL 236 | :return: boolean 237 | """ 238 | if regex.search(url_query_cleaner(url_string)): 239 | return True 240 | return False 241 | 242 | @staticmethod 243 | def is_querystring_matching(url: URL, regex: re) -> bool: 244 | """ 245 | Check if the regex has any match in the URL query parameters. 246 | 247 | :param url: URL object 248 | :param regex: Regex used to search query 249 | :return: boolean 250 | """ 251 | for key in url.query: 252 | if regex.search(key): 253 | return True 254 | return False 255 | 256 | @staticmethod 257 | def has_invalid_contents(string: str) -> bool: 258 | """ 259 | Ignore any string containing the following strings. 260 | 261 | :param string: String to check 262 | :return: boolean 263 | """ 264 | return any(value in string.lower() for value in invalid_url_contents) 265 | 266 | @staticmethod 267 | def is_low_priority(url_string: str) -> bool: 268 | """ 269 | Check if the url contains any strings that indicate the url should be low priority. 270 | 271 | :param url_string: URL string 272 | :return: boolean 273 | """ 274 | if any(value in url_string.lower() for value in low_priority_urls): 275 | return True 276 | 277 | # Search for dates in url, this generally indicates an article page. 
278 | if date_regex.search(url_string): 279 | return True 280 | return False 281 | 282 | @staticmethod 283 | def is_subdomain_matching(url: URL, regex: re) -> bool: 284 | """ 285 | Check if the url subdomain matches the regex 286 | 287 | :param url: URL object 288 | :param regex: regex object 289 | :return: boolean 290 | """ 291 | if not url.host: 292 | return False 293 | 294 | split = url.host.split(".") 295 | if len(split) <= 2: 296 | return False 297 | 298 | sub_domains = ".".join(split[:-2]) 299 | if regex.search(sub_domains): 300 | return True 301 | return False 302 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/lib.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import PriorityQueue 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import Any, Union, Dict 6 | 7 | from yarl import URL 8 | 9 | from feedsearch_crawler.crawler.queueable import Queueable 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # noinspection PyUnresolvedReferences 15 | class CrawlerPriorityQueue(PriorityQueue): 16 | _unfinished_tasks: int 17 | 18 | def clear(self): 19 | """ 20 | Clear the Queue of any unfinished tasks. 21 | """ 22 | self._queue.clear() 23 | self._unfinished_tasks = 0 24 | self._finished.set() 25 | 26 | 27 | @dataclass 28 | class CallbackResult(Queueable): 29 | """Dataclass for holding callback results and recording recursion""" 30 | 31 | result: Any 32 | callback_recursion: int 33 | # CallbackResult priority is high so that we clear Callbacks off the queue and process them as fast as possible. 34 | # Otherwise the workers always process Requests and don't often process the Request results. 35 | priority = 1 36 | 37 | def __repr__(self): 38 | return f"{self.__class__.__name__}({self.result.__class__.__name__})" 39 | 40 | 41 | class Stats(Enum): 42 | # Number of Requests added to the queue. 43 | REQUESTS_QUEUED = "requests_queued" 44 | # Number of HTTP Requests that were successful (HTTP Status code 200-299). 45 | REQUESTS_SUCCESSFUL = "requests_successful" 46 | # Number of HTTP Requests that were unsuccessful (HTTP Status code not in 200s). 47 | REQUESTS_FAILED = "requests_failed" 48 | # Total size in bytes of all HTTP Responses. 49 | CONTENT_LENGTH_TOTAL = "content_length_total" 50 | # Harmonic mean of total HTTP Response content length in bytes. 51 | CONTENT_LENGTH_AVG = "content_length_avg" 52 | # Highest HTTP Response content length in bytes. 53 | CONTENT_LENGTH_MAX = "content_length_max" 54 | # Lowest HTTP Response content length in bytes. 55 | CONTENT_LENGTH_MIN = "content_length_min" 56 | # Median HTTP Response content length in bytes. 57 | CONTENT_LENGTH_MEDIAN = "content_length_med" 58 | # Number of Items processed. 59 | ITEMS_PROCESSED = "items_processed" 60 | # Number of URls seen and added to duplicate filter. 61 | URLS_SEEN = "urls_seen" 62 | # Harmonic mean of Request duration in Milliseconds. 63 | REQUESTS_DURATION_AVG = "requests_duration_avg" 64 | # Highest Request duration in Milliseconds. 65 | REQUESTS_DURATION_MAX = "requests_duration_max" 66 | # Lowest Request duration in Milliseconds. 67 | REQUESTS_DURATION_MIN = "requests_duration_min" 68 | # Total Request duration in Milliseconds. 69 | REQUESTS_DURATION_TOTAL = "requests_duration_total" 70 | # Median Request duration in Milliseconds. 
71 | REQUESTS_DURATION_MEDIAN = "requests_duration_med" 72 | # Harmonic mean of HTTP request latency in Milliseconds. 73 | REQUESTS_LATENCY_AVG = "requests_latency_avg" 74 | # Highest HTTP Request latency in Milliseconds. 75 | REQUESTS_LATENCY_MAX = "requests_latency_max" 76 | # Lowest HTTP Request latency in Milliseconds. 77 | REQUESTS_LATENCY_MIN = "requests_latency_min" 78 | # Median HTTP Request latency in Milliseconds. 79 | REQUESTS_LATENCY_MEDIAN = "requests_latency_med" 80 | # Total HTTP Request latency in Milliseconds. 81 | REQUESTS_LATENCY_TOTAL = "requests_latency_total" 82 | # Total duration of crawl in Milliseconds. 83 | TOTAL_DURATION = "total_duration" 84 | # Response status codes. 85 | STATUS_CODES = "status_codes" 86 | # Highest queue wait time in Milliseconds. 87 | QUEUE_WAIT_MAX = "queue_wait_max" 88 | # Lowest queue wait time in Milliseconds. 89 | QUEUE_WAIT_MIN = "queue_wait_min" 90 | # Harmonic mean of queue wait time in Milliseconds. 91 | QUEUE_WAIT_AVG = "queue_wait_avg" 92 | # Median queue wait time in Milliseconds. 93 | QUEUE_WAIT_MEDIAN = "queue_wait_med" 94 | # Highest queue size. 95 | QUEUE_SIZE_MAX = "queue_size_max" 96 | # Harmonic mean of queue size. 97 | QUEUE_SIZE_AVG = "queue_size_avg" 98 | # Median queue size. 99 | QUEUE_SIZE_MEDIAN = "queue_size_med" 100 | # Total objects put on queue. 101 | QUEUED_TOTAL = "queued_total" 102 | # Total number of retried Requests 103 | REQUESTS_RETRIED = "requests_retried" 104 | 105 | def __repr__(self): 106 | return self.value 107 | 108 | def __str__(self): 109 | return str(self.value) 110 | 111 | def __lt__(self, other): 112 | if not isinstance(other, Stats): 113 | return False 114 | return self.value < other.value 115 | 116 | 117 | def coerce_url( 118 | url: Union[URL, str], https: bool = False, default_scheme: str = "http" 119 | ) -> URL: 120 | """ 121 | Coerce URL to valid format 122 | 123 | :param url: URL 124 | :param https: Force https if no scheme in url 125 | :param default_scheme: Default scheme if not forcing https 126 | :return: str 127 | """ 128 | if isinstance(url, str): 129 | url = URL(url.strip()) 130 | 131 | scheme = "https" if https else default_scheme 132 | 133 | if not url.is_absolute(): 134 | url_string = str(url) 135 | split = url_string.split("/", 1) 136 | url = URL.build(scheme=scheme, host=split[0]) 137 | if len(split) > 1: 138 | url = url.with_path(split[1]) 139 | 140 | if (url.scheme == "http" and https) or not url.scheme: 141 | url = url.with_scheme(scheme) 142 | 143 | return url 144 | 145 | 146 | def to_bytes(text, encoding: str = "utf-8", errors: str = "strict"): 147 | """Return the binary representation of `text`. If `text` 148 | is already a bytes object, return it as-is.""" 149 | if not text: 150 | return b"" 151 | if isinstance(text, bytes): 152 | return text 153 | return text.encode(encoding, errors) 154 | 155 | 156 | def to_string(item: Any, encoding: str = "utf-8", errors: str = "strict") -> str: 157 | """ 158 | Return the string representation of 'item'. 159 | """ 160 | if item is None: 161 | return "" 162 | if isinstance(item, bytes): 163 | return item.decode(encoding, errors) 164 | return str(item) 165 | 166 | 167 | def case_insensitive_key(key: str, dictionary: Dict) -> bool: 168 | """ 169 | Check if a case-insensitive key is in a dictionary. 
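For example, case_insensitive_key("content-type", {"Content-Type": "text/html"}) returns True.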
170 | """ 171 | k = key.lower() 172 | for key in dictionary.keys(): 173 | if key.lower() == k: 174 | return True 175 | 176 | 177 | def headers_to_dict(headers: Any) -> Dict[str, str]: 178 | """ 179 | Convert various header classes to a simple dictionary 180 | 181 | :param headers: Dict subclass of HTTP headers 182 | :return: Dict of HTTP headers 183 | """ 184 | if isinstance(headers, dict): 185 | return headers 186 | 187 | new_headers = {} 188 | try: 189 | new_headers.update({k.lower(): v for (k, v) in headers.items()}) 190 | except Exception as e: 191 | logger.warning("Exception parsing headers to dict: %s", e) 192 | pass 193 | return new_headers 194 | 195 | 196 | def ignore_aiohttp_ssl_error(loop, aiohttpversion="3.5.4"): 197 | """Ignore aiohttp #3535 issue with SSL data after close 198 | There appears to be an issue on Python 3.7 and aiohttp SSL that throws a 199 | ssl.SSLError fatal error (ssl.SSLError: [SSL: KRB5_S_INIT] application data 200 | after close notify (_ssl.c:2609)) after we are already done with the 201 | connection. See GitHub issue aio-libs/aiohttp#3535 202 | Given a loop, this sets up a exception handler that ignores this specific 203 | exception, but passes everything else on to the previous exception handler 204 | this one replaces. 205 | If the current aiohttp version is not exactly equal to aiohttpversion 206 | nothing is done, assuming that the next version will have this bug fixed. 207 | This can be disabled by setting this parameter to None 208 | """ 209 | import ssl 210 | import aiohttp 211 | import asyncio 212 | 213 | try: 214 | # noinspection PyUnresolvedReferences 215 | import uvloop 216 | 217 | protocol_class = uvloop.loop.SSLProtocol 218 | except ImportError: 219 | protocol_class = asyncio.sslproto.SSLProtocol 220 | pass 221 | 222 | if aiohttpversion is not None and aiohttp.__version__ != aiohttpversion: 223 | return 224 | 225 | orig_handler = loop.get_exception_handler() 226 | 227 | # noinspection PyUnresolvedReferences 228 | def ignore_ssl_error(this_loop, context): 229 | errors = ["SSL error", "Fatal error"] 230 | if any(x in context.get("message") for x in errors): 231 | # validate we have the right exception, transport and protocol 232 | exception = context.get("exception") 233 | protocol = context.get("protocol") 234 | if ( 235 | isinstance(exception, ssl.SSLError) 236 | and exception.reason == "KRB5_S_INIT" 237 | and isinstance(protocol, protocol_class) 238 | ): 239 | if this_loop.get_debug(): 240 | asyncio.log.logger.debug("Ignoring aiohttp SSL KRB5_S_INIT error") 241 | return 242 | if orig_handler is not None: 243 | orig_handler(this_loop, context) 244 | else: 245 | this_loop.default_exception_handler(context) 246 | 247 | loop.set_exception_handler(ignore_ssl_error) 248 | 249 | 250 | def parse_href_to_url(href: str) -> Union[URL, None]: 251 | """ 252 | Parse an href string to a URL object. 253 | 254 | :param href: An href string that may be a valid url. 255 | :return: URL or None. 256 | """ 257 | if not href: 258 | return None 259 | 260 | if not isinstance(href, str): 261 | raise TypeError("href must be string") 262 | 263 | try: 264 | return URL(href) 265 | except (UnicodeError, ValueError) as e: 266 | logger.warning("Failed to encode href: %s : %s", href, e) 267 | return None 268 | 269 | 270 | def remove_www(host: str) -> str: 271 | """ 272 | Remove www. subdomain from URL host strings. 273 | 274 | :param host: URL host without scheme or path. e.g. www.test.com 275 | :return: URL host string. 
276 | """ 277 | if host.startswith("www."): 278 | return host[4:] 279 | return host 280 | 281 | 282 | def is_same_domain(root_domain: str, url_domain: str) -> bool: 283 | """ 284 | Check if the url domain is the same or a subdomain of the root domain. 285 | 286 | :param root_domain: Original root domain of this crawl 287 | :param url_domain: Domain of the url to filter 288 | :return: boolean 289 | """ 290 | return remove_www(root_domain) in url_domain 291 | -------------------------------------------------------------------------------- /tests/feed_spider/feed_info_parser_test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from dateutil.tz import tzutc 4 | 5 | from feedsearch_crawler.feed_spider.feed_info_parser import FeedInfoParser 6 | 7 | 8 | def test_entry_velocity_no_dates(): 9 | dates = [] 10 | result = FeedInfoParser.entry_velocity(dates) 11 | assert result == 0 12 | 13 | 14 | def test_entry_velocity_identical_dates(): 15 | dates = [datetime(2020, 1, 1), datetime(2020, 1, 1), datetime(2020, 1, 1)] 16 | result = FeedInfoParser.entry_velocity(dates) 17 | assert result == 0 18 | 19 | 20 | def test_entry_velocity(): 21 | dates = [ 22 | datetime(2019, 1, 1), 23 | datetime(2019, 1, 2), 24 | datetime(2019, 1, 3), 25 | datetime(2019, 1, 4), 26 | datetime(2019, 1, 5), 27 | ] 28 | result = FeedInfoParser.entry_velocity(dates) 29 | assert result == 1.0 30 | 31 | dates = [ 32 | datetime(2019, 1, 1, 1), 33 | datetime(2019, 1, 1, 2), 34 | datetime(2019, 1, 1, 3), 35 | datetime(2019, 1, 1, 4), 36 | datetime(2019, 1, 1, 5), 37 | ] 38 | result = FeedInfoParser.entry_velocity(dates) 39 | assert result == 24 40 | 41 | dates = [ 42 | datetime(2019, 1, 1), 43 | datetime(2019, 1, 7), 44 | datetime(2019, 1, 14), 45 | datetime(2019, 1, 21), 46 | datetime(2019, 1, 27), 47 | ] 48 | dates = sorted(dates, reverse=True) 49 | result = FeedInfoParser.entry_velocity(dates) 50 | assert result == 0.154 51 | 52 | dates = [ 53 | datetime(2019, 9, 21, 13, 10, 3, tzinfo=tzutc()), 54 | datetime(2019, 9, 21, 13, 10, 3, tzinfo=tzutc()), 55 | datetime(2019, 9, 21, 2, 20, 32, tzinfo=tzutc()), 56 | datetime(2019, 9, 21, 2, 20, 32, tzinfo=tzutc()), 57 | datetime(2019, 9, 20, 20, 15, 45, tzinfo=tzutc()), 58 | datetime(2019, 9, 20, 20, 15, 45, tzinfo=tzutc()), 59 | datetime(2019, 9, 20, 19, 40, 40, tzinfo=tzutc()), 60 | datetime(2019, 9, 20, 19, 40, 40, tzinfo=tzutc()), 61 | datetime(2019, 9, 20, 19, 38, 23, tzinfo=tzutc()), 62 | datetime(2019, 9, 20, 19, 38, 23, tzinfo=tzutc()), 63 | datetime(2019, 9, 20, 19, 23, 20, tzinfo=tzutc()), 64 | datetime(2019, 9, 20, 19, 23, 20, tzinfo=tzutc()), 65 | datetime(2019, 9, 20, 19, 8, tzinfo=tzutc()), 66 | datetime(2019, 9, 20, 19, 8, tzinfo=tzutc()), 67 | datetime(2019, 9, 20, 18, 41, 57, tzinfo=tzutc()), 68 | datetime(2019, 9, 20, 18, 41, 57, tzinfo=tzutc()), 69 | datetime(2019, 9, 20, 17, 36, 30, tzinfo=tzutc()), 70 | datetime(2019, 9, 20, 17, 36, 30, tzinfo=tzutc()), 71 | datetime(2019, 9, 20, 17, 18, 2, tzinfo=tzutc()), 72 | datetime(2019, 9, 20, 17, 18, 2, tzinfo=tzutc()), 73 | datetime(2019, 9, 20, 16, 35, 53, tzinfo=tzutc()), 74 | datetime(2019, 9, 20, 16, 35, 53, tzinfo=tzutc()), 75 | datetime(2019, 9, 20, 16, 25, 13, tzinfo=tzutc()), 76 | datetime(2019, 9, 20, 16, 25, 13, tzinfo=tzutc()), 77 | datetime(2019, 9, 20, 16, 0, 49, tzinfo=tzutc()), 78 | datetime(2019, 9, 20, 16, 0, 49, tzinfo=tzutc()), 79 | datetime(2019, 9, 20, 15, 35, 50, tzinfo=tzutc()), 80 | datetime(2019, 9, 20, 15, 35, 50, 
tzinfo=tzutc()), 81 | datetime(2019, 9, 20, 15, 31, 35, tzinfo=tzutc()), 82 | datetime(2019, 9, 20, 15, 31, 35, tzinfo=tzutc()), 83 | datetime(2019, 9, 20, 15, 30, 48, tzinfo=tzutc()), 84 | datetime(2019, 9, 20, 15, 30, 48, tzinfo=tzutc()), 85 | datetime(2019, 9, 20, 11, 0, 53, tzinfo=tzutc()), 86 | datetime(2019, 9, 20, 11, 0, 53, tzinfo=tzutc()), 87 | datetime(2019, 9, 20, 10, 45, 16, tzinfo=tzutc()), 88 | datetime(2019, 9, 20, 10, 45, 16, tzinfo=tzutc()), 89 | datetime(2019, 9, 20, 10, 0, 49, tzinfo=tzutc()), 90 | datetime(2019, 9, 20, 10, 0, 49, tzinfo=tzutc()), 91 | datetime(2019, 9, 19, 22, 6, 47, tzinfo=tzutc()), 92 | datetime(2019, 9, 19, 22, 6, 47, tzinfo=tzutc()), 93 | ] 94 | result = FeedInfoParser.entry_velocity(dates) 95 | assert result == 11.676 96 | 97 | dates = [ 98 | datetime(2019, 9, 16, 14, 8, 51, tzinfo=tzutc()), 99 | datetime(2019, 9, 16, 14, 8, 51, tzinfo=tzutc()), 100 | datetime(2019, 9, 18, 4, 44, 14, tzinfo=tzutc()), 101 | datetime(2019, 9, 18, 4, 44, 14, tzinfo=tzutc()), 102 | datetime(2019, 9, 18, 9, 0, 16, tzinfo=tzutc()), 103 | datetime(2019, 9, 18, 9, 0, 16, tzinfo=tzutc()), 104 | datetime(2019, 9, 19, 14, 1, 56, tzinfo=tzutc()), 105 | datetime(2019, 9, 19, 14, 1, 56, tzinfo=tzutc()), 106 | datetime(2019, 9, 19, 20, 58, 52, tzinfo=tzutc()), 107 | datetime(2019, 9, 19, 20, 58, 52, tzinfo=tzutc()), 108 | datetime(2019, 9, 20, 19, 41, 7, tzinfo=tzutc()), 109 | datetime(2019, 9, 20, 19, 41, 7, tzinfo=tzutc()), 110 | datetime(2019, 9, 20, 23, 2, 15, tzinfo=tzutc()), 111 | datetime(2019, 9, 20, 23, 2, 15, tzinfo=tzutc()), 112 | datetime(2019, 9, 21, 2, 53, 22, tzinfo=tzutc()), 113 | datetime(2019, 9, 21, 2, 53, 22, tzinfo=tzutc()), 114 | datetime(2019, 9, 21, 5, 28, 43, tzinfo=tzutc()), 115 | datetime(2019, 9, 21, 5, 28, 43, tzinfo=tzutc()), 116 | datetime(2019, 9, 21, 5, 28, 44, tzinfo=tzutc()), 117 | datetime(2019, 9, 21, 5, 28, 44, tzinfo=tzutc()), 118 | datetime(2019, 9, 21, 5, 38, 3, tzinfo=tzutc()), 119 | datetime(2019, 9, 21, 5, 38, 3, tzinfo=tzutc()), 120 | datetime(2019, 9, 21, 9, 0, 1, tzinfo=tzutc()), 121 | datetime(2019, 9, 21, 9, 0, 1, tzinfo=tzutc()), 122 | datetime(2019, 9, 21, 9, 36, 12, tzinfo=tzutc()), 123 | datetime(2019, 9, 21, 9, 36, 12, tzinfo=tzutc()), 124 | datetime(2019, 9, 21, 9, 39, 2, tzinfo=tzutc()), 125 | datetime(2019, 9, 21, 9, 39, 2, tzinfo=tzutc()), 126 | datetime(2019, 9, 21, 9, 54, 27, tzinfo=tzutc()), 127 | datetime(2019, 9, 21, 9, 54, 27, tzinfo=tzutc()), 128 | datetime(2019, 9, 21, 11, 24, 14, tzinfo=tzutc()), 129 | datetime(2019, 9, 21, 11, 24, 14, tzinfo=tzutc()), 130 | datetime(2019, 9, 21, 11, 57, 54, tzinfo=tzutc()), 131 | datetime(2019, 9, 21, 11, 57, 54, tzinfo=tzutc()), 132 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 133 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 134 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 135 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 136 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 137 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 138 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 139 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 140 | datetime(2019, 9, 21, 12, 50, 21, tzinfo=tzutc()), 141 | datetime(2019, 9, 21, 12, 50, 21, tzinfo=tzutc()), 142 | datetime(2019, 9, 21, 13, 26, 45, tzinfo=tzutc()), 143 | datetime(2019, 9, 21, 13, 26, 45, tzinfo=tzutc()), 144 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 145 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 146 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 147 | 
datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 148 | datetime(2019, 9, 21, 17, 50, 13, tzinfo=tzutc()), 149 | datetime(2019, 9, 21, 17, 50, 13, tzinfo=tzutc()), 150 | datetime(2019, 9, 21, 18, 3, 9, tzinfo=tzutc()), 151 | datetime(2019, 9, 21, 18, 3, 9, tzinfo=tzutc()), 152 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 153 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 154 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 155 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 156 | datetime(2019, 9, 21, 18, 3, 12, tzinfo=tzutc()), 157 | datetime(2019, 9, 21, 18, 3, 12, tzinfo=tzutc()), 158 | datetime(2019, 9, 21, 18, 39, 5, tzinfo=tzutc()), 159 | datetime(2019, 9, 21, 18, 39, 5, tzinfo=tzutc()), 160 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 161 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 162 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 163 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 164 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 165 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 166 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 167 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 168 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 169 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 170 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 171 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 172 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 173 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 174 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 175 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 176 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 177 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 178 | datetime(2019, 9, 21, 18, 44, 6, tzinfo=tzutc()), 179 | datetime(2019, 9, 21, 18, 44, 6, tzinfo=tzutc()), 180 | datetime(2019, 9, 21, 18, 44, 7, tzinfo=tzutc()), 181 | datetime(2019, 9, 21, 18, 44, 7, tzinfo=tzutc()), 182 | datetime(2019, 9, 21, 18, 50, 33, tzinfo=tzutc()), 183 | datetime(2019, 9, 21, 18, 50, 33, tzinfo=tzutc()), 184 | datetime(2019, 9, 21, 19, 32, 23, tzinfo=tzutc()), 185 | datetime(2019, 9, 21, 19, 32, 23, tzinfo=tzutc()), 186 | datetime(2019, 9, 21, 19, 32, 25, tzinfo=tzutc()), 187 | datetime(2019, 9, 21, 19, 32, 25, tzinfo=tzutc()), 188 | datetime(2019, 9, 21, 19, 50, 21, tzinfo=tzutc()), 189 | datetime(2019, 9, 21, 19, 50, 21, tzinfo=tzutc()), 190 | datetime(2019, 9, 21, 19, 50, 22, tzinfo=tzutc()), 191 | datetime(2019, 9, 21, 19, 50, 22, tzinfo=tzutc()), 192 | datetime(2019, 9, 21, 19, 50, 42, tzinfo=tzutc()), 193 | datetime(2019, 9, 21, 19, 50, 42, tzinfo=tzutc()), 194 | datetime(2019, 9, 21, 19, 50, 44, tzinfo=tzutc()), 195 | datetime(2019, 9, 21, 19, 50, 44, tzinfo=tzutc()), 196 | datetime(2019, 9, 21, 19, 50, 45, tzinfo=tzutc()), 197 | datetime(2019, 9, 21, 19, 50, 45, tzinfo=tzutc()), 198 | ] 199 | result = FeedInfoParser.entry_velocity(dates) 200 | assert result == 7.255 201 | 202 | 203 | def test_is_podcast_no_data(): 204 | data = {} 205 | result = FeedInfoParser.is_podcast(data) 206 | assert result is False 207 | 208 | 209 | def test_is_podcast_not_podcast(): 210 | data = {"entries": [{}]} 211 | result = FeedInfoParser.is_podcast(data) 212 | assert result is False 213 | 214 | 215 | def test_is_podcast_no_namespace(): 216 | data = {"entries": [{"enclosures": [{"media": "file_url"}]}]} 217 | result = FeedInfoParser.is_podcast(data) 218 | assert result is False 219 | 220 | 221 | def test_is_podcast_is_true(): 222 | data = { 223 | "namespaces": {"itunes": 
"testing"}, 224 | "entries": [{"enclosures": [{"media": "file_url"}]}], 225 | } 226 | result = FeedInfoParser.is_podcast(data) 227 | assert result is True 228 | 229 | 230 | def test_is_podcast_no_enclosures(): 231 | data = {"namespaces": {"itunes": "testing"}, "entries": [{}]} 232 | result = FeedInfoParser.is_podcast(data) 233 | assert result is False 234 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/spider.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | from types import AsyncGeneratorType 4 | from typing import Union, Any, List, Set 5 | 6 | import bs4 7 | from yarl import URL 8 | 9 | from feedsearch_crawler.crawler import Crawler, Item, Request, Response 10 | from feedsearch_crawler.crawler.lib import parse_href_to_url 11 | from feedsearch_crawler.feed_spider.dupefilter import NoQueryDupeFilter 12 | from feedsearch_crawler.feed_spider.favicon import Favicon 13 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 14 | from feedsearch_crawler.feed_spider.feed_info_parser import FeedInfoParser 15 | from feedsearch_crawler.feed_spider.lib import ParseTypes 16 | from feedsearch_crawler.feed_spider.link_filter import LinkFilter 17 | from feedsearch_crawler.feed_spider.regexes import rss_regex 18 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 19 | from feedsearch_crawler.feed_spider.site_meta_parser import SiteMetaParser 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class FeedsearchSpider(Crawler): 25 | duplicate_filter_class = NoQueryDupeFilter 26 | htmlparser = "html.parser" 27 | favicon_data_uri = True 28 | try_urls: Union[List[str], bool] = False 29 | full_crawl: bool = False 30 | crawl_hosts: bool = True 31 | 32 | def __init__(self, *args, **kwargs): 33 | super().__init__(*args, **kwargs) 34 | self.site_meta_processor = SiteMetaParser(self) 35 | self.feed_info_parser = FeedInfoParser(self) 36 | self.site_metas = set() 37 | self.favicons = dict() 38 | self.feeds_seen = dict() 39 | self.post_crawl_callback = self.populate_feed_site_meta 40 | if "try_urls" in kwargs: 41 | self.try_urls = kwargs["try_urls"] 42 | if "favicon_data_uri" in kwargs: 43 | self.favicon_data_uri = kwargs["favicon_data_uri"] 44 | if "full_crawl" in kwargs: 45 | self.full_crawl = kwargs["full_crawl"] 46 | if "crawl_hosts" in kwargs: 47 | self.crawl_hosts = kwargs["crawl_hosts"] 48 | 49 | async def parse(self, request: Request, response: Response) -> AsyncGeneratorType: 50 | """ 51 | Parse a Response for feeds or site metadata. 52 | 53 | :param request: Request 54 | :param response: Response 55 | :return: AsyncGenerator yielding Items, Requests, or iterative AsyncGenerators 56 | """ 57 | 58 | # If the Response is not OK then there's no data to parse. 59 | if not response.ok: 60 | return 61 | 62 | # If the Response contains JSON then attempt to parse it as a JsonFeed. 63 | if response.json: 64 | if "version" and "jsonfeed" and "feed_url" in response.json: 65 | yield self.feed_info_parser.parse_item( 66 | request, response, parse_type=ParseTypes.JSON 67 | ) 68 | return 69 | 70 | if not isinstance(response.text, str): 71 | logger.debug("No text in %s", response) 72 | return 73 | 74 | yield self.parse_site_meta(request, response) 75 | 76 | # Restrict the RSS check to the first 1000 characters, otherwise it's almost definitely not an actual feed. 
77 | if rss_regex.search(response.text, endpos=1000): 78 | yield self.feed_info_parser.parse_item( 79 | request, response, parse_type=ParseTypes.XML 80 | ) 81 | return 82 | 83 | # Don't waste time trying to parse and follow urls if the max depth is already reached. 84 | if response.is_max_depth_reached(self.max_depth): 85 | logger.debug("Max depth %d reached: %s", self.max_depth, response) 86 | return 87 | 88 | # Make sure the Response XML has been parsed if it exists. 89 | soup = await response.xml 90 | if not soup: 91 | return 92 | 93 | # Don't crawl links from pages that are not from the original domain 94 | if not response.is_original_domain(): 95 | return 96 | 97 | link_filter = LinkFilter( 98 | request=request, response=response, full_crawl=self.full_crawl 99 | ) 100 | 101 | # Find all links in the Response. 102 | links = soup.find_all(self.tag_has_href) 103 | for link in links: 104 | # Check each href for validity and queue priority. 105 | values = link_filter.should_follow_link(link) 106 | if values: 107 | url, priority = values 108 | yield await self.follow( 109 | url, self.parse, response, priority=priority, allow_domain=True 110 | ) 111 | 112 | async def parse_site_meta( 113 | self, request: Request, response: Response 114 | ) -> AsyncGeneratorType: 115 | """ 116 | Parses site metadata if the returned URL is a site origin URL. 117 | 118 | If the returned url is an origin url, or the request url is an origin url (and there may have been a redirect) 119 | then parse the site meta. 120 | 121 | :param request: Request 122 | :param response: Response 123 | :return: AsyncGenerator yielding SiteMeta items 124 | """ 125 | url_origin = response.url.origin() 126 | request_url_origin = request.url.origin() 127 | 128 | if response.url == url_origin or request.url == request_url_origin: 129 | yield self.site_meta_processor.parse_item(request, response) 130 | 131 | async def parse_xml(self, response_text: str) -> Any: 132 | """ 133 | Parse Response text as XML. 134 | Used to allow implementations to provide their own XML parser. 135 | 136 | :param response_text: Response text as string. 137 | :return: None 138 | """ 139 | return bs4.BeautifulSoup(response_text, self.htmlparser) 140 | 141 | async def process_item(self, item: Item) -> None: 142 | """ 143 | Process parsed items. 144 | 145 | :param item: Item object 146 | :return: None 147 | """ 148 | if isinstance(item, FeedInfo): 149 | self.items.add(item) 150 | elif isinstance(item, SiteMeta): 151 | self.site_metas.add(item) 152 | elif isinstance(item, Favicon): 153 | self.add_favicon(item) 154 | 155 | def add_favicon(self, favicon: Favicon) -> None: 156 | """ 157 | Add a favicon to the spider's favicon dictionary. 
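A Favicon already stored with a data_uri is not overwritten by a new Favicon for the same URL that lacks one.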
158 | 159 | :param favicon: Favicon object 160 | """ 161 | existing: Favicon = self.favicons.get(favicon.url) 162 | if existing and existing.data_uri and not favicon.data_uri: 163 | return 164 | self.favicons[favicon.url] = favicon 165 | 166 | # noinspection PyPep8 167 | async def populate_feed_site_meta(self) -> None: 168 | """ 169 | Populate FeedInfo site information with data from the relevant SiteMeta item 170 | """ 171 | for feed in self.items: 172 | # Check each SiteMeta for a url host match 173 | site_meta = next( 174 | (x for x in self.site_metas if x.host in feed.url.host), None 175 | ) 176 | if site_meta: 177 | feed.site_url = site_meta.url 178 | feed.site_name = site_meta.site_name 179 | 180 | # Populate favicon directly if available 181 | if feed.favicon: 182 | favicon = self.favicons.get(feed.favicon) 183 | if favicon: 184 | feed.favicon_data_uri = favicon.data_uri 185 | feed.favicon = favicon.resp_url if favicon.resp_url else favicon.url 186 | 187 | # If a favicon hasn't been found yet or there is no data_uri then try and find a suitable favicon 188 | if not feed.favicon or ( 189 | self.favicon_data_uri and not feed.favicon_data_uri 190 | ): 191 | feed_host = feed.url.host 192 | favicons = list( 193 | x 194 | for x in self.favicons.values() 195 | if x.matches_host(feed_host, self.favicon_data_uri) 196 | ) 197 | 198 | if favicons: 199 | favicon = min(favicons, key=lambda x: x.priority) 200 | 201 | feed.favicon_data_uri = favicon.data_uri 202 | feed.favicon = favicon.resp_url if favicon.resp_url else favicon.url 203 | 204 | # noinspection PyUnusedLocal 205 | async def parse_favicon_data_uri( 206 | self, request: Request, response: Response, favicon: Favicon 207 | ) -> None: 208 | """ 209 | Create a data uri from a favicon image. 210 | 211 | :param request: Request 212 | :param response: Response 213 | :param favicon: Favicon object 214 | :return: None 215 | """ 216 | if not response.ok or not response.data or not isinstance(response.data, bytes): 217 | return 218 | 219 | def is_png(data: bytes) -> bool: 220 | return data[:8] in bytes.fromhex("89 50 4E 47 0D 0A 1A 0A") 221 | 222 | def is_ico(data: bytes) -> bool: 223 | return data[:4] in bytes.fromhex("00 00 01 00") 224 | 225 | try: 226 | if not is_png(response.data) and not is_ico(response.data): 227 | logger.debug("Response data is not a valid image type: %s", response) 228 | return 229 | except Exception as e: 230 | logger.exception("Failure validation image type: %s: %s", response, e) 231 | 232 | try: 233 | encoded = base64.b64encode(response.data) 234 | uri = "data:image/png;base64," + encoded.decode(response.encoding) 235 | favicon.resp_url = response.url 236 | favicon.data_uri = uri 237 | self.add_favicon(favicon) 238 | except Exception as e: 239 | logger.exception("Failure encoding image: %s: %s", response, e) 240 | 241 | def create_start_urls(self, urls: List[Union[URL, str]]) -> List[URL]: 242 | """ 243 | Create the start URLs for the crawl from an initial URL. May be overridden. 244 | 245 | :param urls: Initial URLs 246 | """ 247 | crawl_start_urls: Set[URL] = set() 248 | 249 | for url in urls + self.start_urls: 250 | if isinstance(url, str): 251 | if "//" not in url: 252 | url = f"//{url}" 253 | url = parse_href_to_url(url) 254 | if not url: 255 | continue 256 | 257 | if url.scheme.lower() not in ["http", "https"]: 258 | url = url.with_scheme("http") 259 | 260 | crawl_start_urls.add(url) 261 | 262 | origins = set(url.origin() for url in crawl_start_urls) 263 | 264 | if self.try_urls: 265 | # Common paths for feeds. 
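# These defaults apply only when try_urls is True; an explicit list passed as try_urls is used instead (see below).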
266 | suffixes = { 267 | "index.xml", 268 | "atom.xml", 269 | "feeds", 270 | "feeds/default", 271 | "feed", 272 | "feed/default", 273 | "feeds/posts/default", 274 | "?feed=rss", 275 | "?feed=atom", 276 | "?feed=rss2", 277 | "?feed=rdf", 278 | "rss", 279 | "atom", 280 | "rdf", 281 | "index.rss", 282 | "index.rdf", 283 | "index.atom", 284 | "data/rss", 285 | "rss.xml", 286 | "index.json", 287 | "about", 288 | "about/feeds", 289 | "rss-feeds", 290 | } 291 | 292 | for origin in origins: 293 | if isinstance(self.try_urls, list): 294 | crawl_start_urls.update( 295 | origin.join(URL(suffix)) for suffix in self.try_urls 296 | ) 297 | else: 298 | crawl_start_urls.update( 299 | origin.join(URL(suffix)) for suffix in suffixes 300 | ) 301 | 302 | # Crawl the origin urls of the start urls for Site metadata. 303 | if self.crawl_hosts: 304 | crawl_start_urls.update(origins) 305 | 306 | return list(crawl_start_urls) 307 | 308 | @staticmethod 309 | def tag_has_href(tag: bs4.Tag) -> bool: 310 | """ 311 | Find all tags that contain links. 312 | 313 | :param tag: XML tag 314 | :return: boolean 315 | """ 316 | return tag.has_attr("href") 317 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import json 4 | import logging 5 | import uuid 6 | from asyncio import Semaphore, IncompleteReadError, LimitOverrunError, CancelledError 7 | from random import random 8 | from typing import List, Tuple, Any, Union, Optional, Dict 9 | 10 | import aiohttp 11 | import time 12 | from aiohttp import ClientSession, ClientTimeout, hdrs 13 | from yarl import URL 14 | 15 | from feedsearch_crawler.crawler.queueable import Queueable 16 | from feedsearch_crawler.crawler.response import Response 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Request(Queueable): 22 | METHOD = ["GET", "POST"] 23 | 24 | def __init__( 25 | self, 26 | url: URL, 27 | request_session: ClientSession, 28 | params: Dict = None, 29 | data: Union[dict, bytes] = None, 30 | json_data: Dict = None, 31 | encoding: str = None, 32 | method: str = "GET", 33 | headers: Dict = None, 34 | timeout: Union[float, ClientTimeout] = 5.0, 35 | history: List = None, 36 | callback=None, 37 | xml_parser=None, 38 | failure_callback=None, 39 | max_content_length: int = 1024 * 1024 * 10, 40 | delay: float = 0, 41 | retries: int = 3, 42 | cb_kwargs: Dict = None, 43 | **kwargs, 44 | ): 45 | """ 46 | A pending HTTP request to a URL. Wraps an aiohttp ClientSession request. 47 | https://aiohttp.readthedocs.io/en/stable/client_reference.html 48 | 49 | :param params: Mapping of query string parameters 50 | :param data: Dictionary, bytes, or file-like object to send in the body of the request 51 | :param json_data: Json dict to send as body. 
Not compatible with data 52 | :param url: Request URL 53 | :param request_session: aiohttp ClientSession 54 | :param encoding: Default Response encoding 55 | :param method: HTTP method 56 | :param headers: HTTP headers for the request 57 | :param timeout: Seconds before Request times out 58 | :param history: Response history, list of previous URLs 59 | :param callback: Callback function to run after request is successful 60 | :param xml_parser: Function to parse Response XML 61 | :param failure_callback: Callback function to run if request is unsuccessful 62 | :param max_content_length: Maximum allowed size in bytes of Response content 63 | :param delay: Time in seconds to delay Request 64 | :param retries: Number of times to retry a failed Request 65 | :param cb_kwargs: Optional Dictionary of keyword arguments to be passed to the callback function. 66 | :param kwargs: Optional keyword arguments 67 | """ 68 | self.url = url 69 | self.method = method.upper() 70 | if self.method not in self.METHOD: 71 | raise ValueError(f"{self.method} is not supported") 72 | if not isinstance(request_session, ClientSession): 73 | raise ValueError(f"request_session must be of type ClientSession") 74 | self.request_session = request_session 75 | self.headers = headers 76 | if not isinstance(timeout, ClientTimeout): 77 | timeout = aiohttp.ClientTimeout(total=self.timeout) 78 | self.timeout = timeout 79 | self.history = history or [] 80 | self.encoding = encoding 81 | self._callback = callback 82 | self._failure_callback = failure_callback 83 | self.id = uuid.uuid4() 84 | self._xml_parser = xml_parser 85 | self.max_content_length = max_content_length 86 | self.json_data = json_data 87 | self.data = data 88 | self.params = params 89 | self.has_run: bool = False 90 | self.delay = delay 91 | self.cb_kwargs = cb_kwargs or {} 92 | 93 | self.should_retry: bool = False 94 | self._max_retries = retries 95 | # Number of times this request has been retried. 96 | self._num_retries: int = 0 97 | # Time in Milliseconds for the HTTP response to arrive. 98 | self.req_latency: int = 0 99 | # Time in Milliseconds for the HTTP response content to be read. 100 | self.content_read: int = 0 101 | 102 | for key, value in kwargs: 103 | if hasattr(self, key): 104 | setattr(self, key, value) 105 | 106 | async def fetch_callback(self, semaphore: Semaphore = None) -> Tuple[Any, Response]: 107 | """ 108 | Fetch HTTP Response and run Callbacks. 109 | 110 | :param semaphore: asyncio Semaphore 111 | :returns: Tuple of Callback result and Response object 112 | """ 113 | if semaphore: 114 | async with semaphore: 115 | response = await self._fetch() 116 | else: 117 | response = await self._fetch() 118 | 119 | callback_result = None 120 | 121 | if response.ok and self._callback: 122 | callback_result = self._callback( 123 | request=self, response=response, **self.cb_kwargs 124 | ) 125 | elif not response.ok and self._failure_callback: 126 | callback_result = self._failure_callback( 127 | request=self, response=response, **self.cb_kwargs 128 | ) 129 | 130 | return callback_result, response 131 | 132 | # noinspection PyProtectedMember 133 | async def _fetch(self) -> Response: 134 | """ 135 | Run HTTP Request and fetch HTTP Response. 136 | 137 | :return: Response object 138 | """ 139 | # Delay the request if self.delay is > 0 140 | await self.delay_request() 141 | 142 | # Copy the Request history so that it isn't a pointer. 143 | history = copy.deepcopy(self.history) 144 | 145 | # Make sure that retry is reset. 
146 | self.should_retry = False 147 | response = None 148 | start = time.perf_counter() 149 | 150 | try: 151 | async with self._create_request() as resp: 152 | resp_recieved = time.perf_counter() 153 | self.req_latency = int((resp_recieved - start) * 1000) 154 | history.append(resp.url) 155 | 156 | # Fail the response if the content length header is too large. 157 | content_length: int = int(resp.headers.get(hdrs.CONTENT_LENGTH, "0")) 158 | if content_length > self.max_content_length: 159 | logger.debug( 160 | "Content-Length of Response header %d greater than max %d: %s", 161 | content_length, 162 | self.max_content_length, 163 | self, 164 | ) 165 | return self._failed_response(413) 166 | 167 | # Read the response content, and fail the response if the actual content size is too large. 168 | content_read, actual_content_length = await self._read_response(resp) 169 | if not content_read: 170 | return self._failed_response(413) 171 | 172 | if content_length and content_length != actual_content_length: 173 | logger.debug( 174 | "Header Content-Length %d different from actual content-length %d: %s", 175 | content_length, 176 | actual_content_length, 177 | self, 178 | ) 179 | 180 | # Set encoding automatically from response if not specified. 181 | if not self.encoding: 182 | self.encoding = resp.get_encoding() 183 | 184 | # Read response content 185 | try: 186 | # Read response content as text 187 | resp_text = await resp.text(encoding=self.encoding) 188 | 189 | # Attempt to read response content as JSON 190 | resp_json = await self._read_json(resp_text) 191 | # If response content can't be decoded then neither text or JSON can be set. 192 | except UnicodeDecodeError: 193 | resp_text = None 194 | resp_json = None 195 | 196 | # Close the asyncio response 197 | if not resp.closed: 198 | resp.close() 199 | 200 | self.content_read = int((time.perf_counter() - resp_recieved) * 1000) 201 | 202 | response = Response( 203 | url=resp.url, 204 | method=resp.method, 205 | encoding=self.encoding, 206 | status_code=resp.status, 207 | history=history, 208 | text=resp_text, 209 | data=resp._body, 210 | json=resp_json, 211 | headers=resp.headers, 212 | xml_parser=self._parse_xml, 213 | cookies=resp.cookies, 214 | redirect_history=resp.history, 215 | content_length=actual_content_length, 216 | meta=copy.copy(self.cb_kwargs), 217 | ) 218 | 219 | # Raise exception after the Response object is created, because we only catch TimeoutErrors and 220 | # asyncio.ClientResponseErrors, and there may be valid data otherwise. 221 | resp.raise_for_status() 222 | 223 | except asyncio.TimeoutError: 224 | logger.debug("Failed fetch: url=%s reason=timeout", self.url) 225 | history.append(self.url) 226 | response = self._failed_response(408, history) 227 | except aiohttp.ClientResponseError as e: 228 | logger.debug("Failed fetch: url=%s reason=%s", self.url, e.message) 229 | if not response: 230 | response = self._failed_response(e.status, history) 231 | except Exception as e: 232 | logger.debug("Failed fetch: url=%s reason=%s", self.url, e) 233 | if isinstance(e, CancelledError) and not response: 234 | response = self._failed_response(499, history) 235 | finally: 236 | self.has_run = True 237 | # Make sure there is a valid Response object. 
238 | if not response: 239 | response = self._failed_response(500, history) 240 | 241 | # Tell the crawler to retry this Request 242 | if response.status_code in [429, 503, 408]: 243 | self.set_retry() 244 | 245 | return response 246 | 247 | def _create_request(self): 248 | """ 249 | Create an asyncio HTTP Request. 250 | 251 | :return: asyncio HTTP Request 252 | """ 253 | if self.method.upper() == "GET": 254 | return self.request_session.get( 255 | self.url, headers=self.headers, timeout=self.timeout, params=self.params 256 | ) 257 | elif self.method.upper() == "POST": 258 | return self.request_session.post( 259 | self.url, 260 | headers=self.headers, 261 | timeout=self.timeout, 262 | params=self.params, 263 | data=self.data, 264 | json=self.json_data, 265 | ) 266 | else: 267 | raise ValueError( 268 | "HTTP method %s is not valid. Must be GET or POST", self.method 269 | ) 270 | 271 | async def _read_response(self, resp) -> Tuple[bool, int]: 272 | """ 273 | Read HTTP Response content as bytes. 274 | 275 | :param resp: asyncio HTTP Response 276 | :return: Tuple (read status, content length in bytes) 277 | """ 278 | body: bytes = b"" 279 | try: 280 | async for chunk in resp.content.iter_chunked(1024): 281 | if not chunk: 282 | break 283 | body += chunk 284 | if len(body) > self.max_content_length: 285 | logger.debug( 286 | "Content Length of Response body greater than max %d: %s", 287 | self.max_content_length, 288 | self, 289 | ) 290 | return False, 0 291 | except (IncompleteReadError, LimitOverrunError) as e: 292 | logger.exception("Failed to read Response content: %s: %s", self, e) 293 | return False, 0 294 | resp._body = body 295 | return True, len(body) 296 | 297 | @staticmethod 298 | async def _read_json(resp_text: Union[str, None]) -> Optional[dict]: 299 | """ 300 | Attempt to read Response content as JSON. 301 | 302 | :param resp_text: HTTP response context as text string 303 | :return: JSON dict or None 304 | """ 305 | 306 | # If the text hasn't been parsed then we won't be able to parse JSON either. 307 | if not resp_text: 308 | return None 309 | 310 | stripped = resp_text.strip() # type: ignore 311 | if not stripped: 312 | return None 313 | 314 | try: 315 | return json.loads(stripped) 316 | except ValueError: 317 | return None 318 | 319 | def _failed_response( 320 | self, status: int, history: List[URL] = None, headers=None 321 | ) -> Response: 322 | """ 323 | Create a failed Response object with the provided Status Code. 324 | 325 | :param status: HTTP Status Code 326 | :param history: Response History as list of URLs 327 | :param headers: Response Headers 328 | :return: Failed Response object 329 | """ 330 | return Response( 331 | url=self.url, 332 | method=self.method, 333 | encoding=self.encoding, 334 | history=history or [], 335 | status_code=status, 336 | headers=headers or {}, 337 | ) 338 | 339 | async def _parse_xml(self, response_text: str) -> Any: 340 | """ 341 | Use provided XML Parsers method to attempt to parse Response content as XML. 342 | 343 | :param response_text: Response content as text string. 344 | :return: Response content as parsed XML. Type depends on XML parser. 345 | """ 346 | try: 347 | return await self._xml_parser(response_text) 348 | except Exception as e: 349 | logger.exception("Error parsing response xml: %s", e) 350 | return None 351 | 352 | def set_retry(self) -> None: 353 | """ 354 | Set the Request to retry. 
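The retry is only scheduled while the retry count is below the configured maximum, and the request delay grows by one second per retry.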
355 | """ 356 | if self._num_retries < self._max_retries: 357 | self.should_retry = True 358 | self._num_retries += 1 359 | self.delay = self._num_retries * 1 360 | 361 | async def delay_request(self) -> None: 362 | """ 363 | Delay the request by sleeping. 364 | """ 365 | if self.delay > 0: 366 | # Sleep for the delay plus up to one extra second of random time, to spread out requests. 367 | await asyncio.sleep(self.delay + random()) 368 | 369 | def __repr__(self): 370 | return f"{self.__class__.__name__}({str(self.url)})" 371 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/feed_info_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime, date 3 | from statistics import mean 4 | from types import AsyncGeneratorType 5 | from typing import Tuple, List, Union, Dict 6 | 7 | import feedparser 8 | import time 9 | from aiohttp import hdrs 10 | from bs4 import BeautifulSoup 11 | from yarl import URL 12 | 13 | from feedsearch_crawler.crawler import ItemParser, Request, Response, to_string 14 | from feedsearch_crawler.crawler.lib import headers_to_dict, remove_www 15 | from feedsearch_crawler.feed_spider.favicon import Favicon 16 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 17 | from feedsearch_crawler.feed_spider.lib import ( 18 | parse_header_links, 19 | datestring_to_utc_datetime, 20 | create_content_type, 21 | ParseTypes, 22 | ) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class FeedInfoParser(ItemParser): 28 | async def parse_item( 29 | self, request: Request, response: Response, *args, **kwargs 30 | ) -> AsyncGeneratorType: 31 | logger.info("Parsing: Feed %s", response.url) 32 | 33 | if "parse_type" not in kwargs: 34 | raise ValueError("type keyword argument is required") 35 | 36 | parse_type = kwargs["parse_type"] 37 | 38 | content_type = create_content_type( 39 | parse_type, 40 | response.encoding, 41 | response.headers.get(hdrs.CONTENT_TYPE, "").lower(), 42 | ) 43 | 44 | item = FeedInfo(url=response.url, content_type=content_type) 45 | 46 | # Check link headers first for WebSub content discovery 47 | # https://www.w3.org/TR/websub/#discovery 48 | if response.headers: 49 | item.hubs, item.self_url = self.header_links(response.headers) 50 | 51 | try: 52 | valid_feed = False 53 | 54 | if parse_type == ParseTypes.JSON: 55 | valid_feed = self.parse_json(item, response.json) 56 | elif parse_type == ParseTypes.XML: 57 | valid_feed = self.parse_xml( 58 | item, 59 | response.data, 60 | response.encoding, 61 | headers_to_dict(response.headers), 62 | ) 63 | 64 | if not valid_feed: 65 | logger.debug("Invalid Feed: %s", item) 66 | return 67 | except Exception as e: 68 | logger.exception("Failed to parse feed %s, Error: %s", item, e) 69 | return 70 | 71 | if item.favicon and self.crawler.favicon_data_uri: 72 | favicon = Favicon( 73 | url=item.favicon, 74 | priority=1, 75 | ) 76 | yield self.follow( 77 | item.favicon, 78 | self.crawler.parse_favicon_data_uri, 79 | cb_kwargs=dict(favicon=favicon), 80 | ) 81 | 82 | self.validate_self_url(item) 83 | 84 | item.content_length = response.content_length 85 | self.score_item(item, response.history[0]) 86 | yield item 87 | 88 | def parse_xml( 89 | self, item: FeedInfo, data: Union[str, bytes], encoding: str, headers: Dict 90 | ) -> bool: 91 | """ 92 | Get info from XML (RSS or ATOM) feed. 
93 | """ 94 | 95 | # Parse data with feedparser 96 | try: 97 | parsed: dict = self.parse_raw_data(data, encoding, headers) 98 | except Exception as e: 99 | logger.exception("Unable to parse feed %s: %s", item, e) 100 | return False 101 | 102 | if not parsed: 103 | logger.warning("No valid feed data for %s", item) 104 | return False 105 | 106 | if parsed.get("bozo") == 1: 107 | bozo_exception = parsed.get("bozo_exception", None) 108 | if isinstance(bozo_exception, feedparser.CharacterEncodingOverride): 109 | item.bozo = 1 110 | elif isinstance( 111 | bozo_exception, 112 | (feedparser.CharacterEncodingUnknown, feedparser.UndeclaredNamespace), 113 | ): 114 | logger.warning("No valid feed data for %s: %s", item, bozo_exception) 115 | return False 116 | 117 | feed = parsed.get("feed") 118 | if not feed: 119 | return False 120 | if not parsed.get("entries"): 121 | return False 122 | 123 | # Only search if no hubs already present from headers 124 | if not item.hubs: 125 | item.hubs, item.self_url = self.websub_links(feed) 126 | 127 | if item.hubs and item.self_url: 128 | item.is_push = True 129 | 130 | item.version = parsed.get("version") 131 | item.title = self.feed_title(feed) 132 | item.description = self.feed_description(feed) 133 | item.is_podcast = self.is_podcast(parsed) 134 | 135 | try: 136 | dates = [] 137 | now_date = datetime.utcnow().date() 138 | 139 | entries = parsed.get("entries", []) 140 | item.item_count = len(entries) 141 | 142 | dates.extend( 143 | FeedInfoParser.entry_dates(entries, ["updated", "published"], now_date) 144 | ) 145 | 146 | if dates: 147 | item.last_updated = sorted(dates, reverse=True)[0] 148 | item.velocity = self.entry_velocity(dates) 149 | elif feed.get("updated"): 150 | item.last_updated = datestring_to_utc_datetime(feed.get("updated")) 151 | except Exception as e: 152 | logger.exception("Unable to get feed published date: %s", e) 153 | pass 154 | 155 | return True 156 | 157 | def parse_json(self, item: FeedInfo, data: dict) -> bool: 158 | """ 159 | Get info from JSON feed. 
160 | 
161 |         :param item: FeedInfo object
162 |         :param data: JSON object
163 |         :return: bool
164 |         """
165 |         item.version = data.get("version")
166 |         if not item.version or "https://jsonfeed.org/version/" not in item.version:
167 |             item.bozo = 1
168 |             return False
169 | 
170 |         if not data.get("items"):
171 |             return False
172 | 
173 |         item.title = data.get("title")
174 |         item.description = data.get("description")
175 | 
176 |         favicon = data.get("favicon")
177 |         if favicon:
178 |             item.favicon = URL(favicon)
179 | 
180 |         # Only search if no hubs already present from headers
181 |         if not item.hubs:
182 |             try:
183 |                 item.hubs = list(hub.get("url") for hub in data.get("hubs", []))
184 |             except (IndexError, AttributeError):
185 |                 pass
186 | 
187 |         if item.hubs:
188 |             item.is_push = True
189 | 
190 |         try:
191 |             dates = []
192 |             now_date: date = datetime.utcnow().date()
193 | 
194 |             entries = data.get("items", [])
195 |             item.item_count = len(entries)
196 | 
197 |             dates.extend(
198 |                 FeedInfoParser.entry_dates(
199 |                     entries, ["date_modified", "date_published"], now_date
200 |                 )
201 |             )
202 | 
203 |             if dates:
204 |                 item.last_updated = sorted(dates, reverse=True)[0]
205 |                 item.velocity = self.entry_velocity(dates)
206 |         except Exception as e:
207 |             logger.exception("Unable to get feed published date: %s", e)
208 |             pass
209 | 
210 |         return True
211 | 
212 |     @staticmethod
213 |     def parse_raw_data(
214 |         raw_data: Union[str, bytes], encoding: str = "utf-8", headers: Dict = None
215 |     ) -> Dict:
216 |         """
217 |         Loads the raw RSS/Atom XML data.
218 |         Returns feedparser Dict.
219 |         https://pythonhosted.org/feedparser/
220 | 
221 |         :param raw_data: RSS/Atom XML feed
222 |         :type raw_data: str
223 |         :param encoding: Character encoding of raw_data
224 |         :type encoding: str
225 |         :param headers: Response headers
226 |         :return: Dict
227 |         """
228 |         if not encoding:
229 |             encoding = "utf-8"
230 | 
231 |         h = {}
232 |         if headers:
233 |             if isinstance(headers, dict):
234 |                 h = headers
235 |             else:
236 |                 try:
237 |                     h.update({k.lower(): v for (k, v) in headers.items()})
238 |                 except KeyError:
239 |                     pass
240 | 
241 |             h.pop("content-encoding", None)
242 | 
243 |         try:
244 |             start = time.perf_counter()
245 | 
246 |             if isinstance(raw_data, str):
247 |                 raw_data: bytes = raw_data.encode(encoding)
248 | 
249 |             raw_data = raw_data.strip()
250 |             content_length = len(raw_data)
251 | 
252 |             # We want to pass data into feedparser as bytes, otherwise if we accidentally pass a url string
253 |             # it will attempt a fetch
254 |             data = feedparser.parse(raw_data, response_headers=h)
255 | 
256 |             dur = int((time.perf_counter() - start) * 1000)
257 |             logger.debug("Feed Parse: size=%s dur=%sms", content_length, dur)
258 | 
259 |             return data
260 |         except Exception as e:
261 |             logger.exception("Could not parse RSS data: %s", e)
262 | 
263 |     def feed_title(self, feed: dict) -> str:
264 |         """
265 |         Get feed title
266 | 
267 |         :param feed: feed dict
268 |         :return: str
269 |         """
270 |         title = feed.get("title", None)
271 |         if not title:
272 |             return ""
273 |         return self.clean_title(title)
274 | 
275 |     def clean_title(self, title: str) -> str:
276 |         """
277 |         Cleans title string, and shortens if too long.
278 |         Have had issues with dodgy feed titles.
279 | 
280 |         :param title: Title string
281 |         :return: str
282 |         """
283 |         try:
284 |             title = BeautifulSoup(title, self.crawler.htmlparser).get_text()
285 |             if len(title) > 1024:
286 |                 title = title[:1020] + "..."
287 |             return title
288 |         except Exception as ex:
289 |             logger.exception("Failed to clean title: %s", ex)
290 |             return ""
291 | 
292 |     @staticmethod
293 |     def is_podcast(parsed: dict) -> bool:
294 |         """
295 |         Check if the feed is a Podcast.
296 | 
297 |         :param parsed: Feedparser dict
298 |         :return: bool
299 |         """
300 |         if not parsed:
301 |             return False
302 | 
303 |         has_itunes: bool = "itunes" in parsed.get("namespaces", {})
304 | 
305 |         has_enclosures = False
306 | 
307 |         for entry in parsed.get("entries", []):
308 |             for enclosure in entry.get("enclosures", []):
309 |                 if "audio" in enclosure.get("type", ""):
310 |                     has_enclosures = True
311 | 
312 |         return has_itunes and has_enclosures
313 | 
314 |     @staticmethod
315 |     def feed_description(feed: dict) -> str:
316 |         """
317 |         Get feed description.
318 | 
319 |         :param feed: feed dict
320 |         :return: str
321 |         """
322 |         subtitle = feed.get("subtitle", None)
323 |         if subtitle:
324 |             return subtitle
325 |         return feed.get("description", None)
326 | 
327 |     @staticmethod
328 |     def websub_links(feed: dict) -> Tuple[List[str], str]:
329 |         """
330 |         Returns a tuple containing the hub urls and the self url for
331 |         a parsed feed.
332 | 
333 |         :param feed: An RSS feed parsed by feedparser
334 |         :type feed: dict
335 |         :return: tuple
336 |         """
337 |         links = feed.get("links", [])
338 |         return FeedInfoParser.find_hubs_and_self_links(links)
339 | 
340 |     @staticmethod
341 |     def header_links(headers: dict) -> Tuple[List[str], str]:
342 |         """
343 |         Attempt to get self and hub links from HTTP headers
344 |         https://www.w3.org/TR/websub/#x4-discovery
345 | 
346 |         :param headers: Dict of HTTP headers
347 |         :return: Tuple of hub urls and the self url
348 |         """
349 |         link_header = headers.get("Link")
350 |         links: list = []
351 | 
352 |         if link_header:
353 |             links = parse_header_links(to_string(link_header))
354 | 
355 |         return FeedInfoParser.find_hubs_and_self_links(links)
356 | 
357 |     @staticmethod
358 |     def find_hubs_and_self_links(links: List[dict]) -> Tuple[List[str], str]:
359 |         """
360 |         Parses a list of links into self and hubs urls
361 | 
362 |         :param links: List of parsed HTTP Link Dicts
363 |         :return: Tuple
364 |         """
365 |         hub_urls: List[str] = []
366 |         self_url: str = ""
367 | 
368 |         if not links:
369 |             return [], ""
370 | 
371 |         for link in links:
372 |             try:
373 |                 if link["rel"] == "hub":
374 |                     href: str = link["href"]
375 |                     hub_urls.append(href)
376 |                 elif link["rel"] == "self":
377 |                     self_url = link["href"]
378 |             except KeyError:
379 |                 continue
380 | 
381 |         return hub_urls, self_url
382 | 
383 |     @staticmethod
384 |     def score_item(item: FeedInfo, original_url: URL):
385 |         score = 0
386 | 
387 |         url_str = str(item.url).lower()
388 | 
389 |         # -- Score Decrement --
390 | 
391 |         if original_url:
392 |             host = remove_www(original_url.host)
393 | 
394 |             if host not in item.url.host:
395 |                 score -= 20
396 | 
397 |         # Decrement the score by every extra path in the url
398 |         parts_len = len(item.url.parts)
399 |         if parts_len > 2:
400 |             score -= (parts_len - 2) * 2
401 | 
402 |         if item.bozo:
403 |             score -= 20
404 |         if not item.description:
405 |             score -= 10
406 |         if "georss" in url_str:
407 |             score -= 10
408 |         if "alt" in url_str:
409 |             score -= 7
410 |         if "comments" in url_str or "comments" in item.title.lower():
411 |             score -= 15
412 |         if "feedburner" in url_str:
413 |             score -= 10
414 | 
415 |         # -- Score Increment --
416 |         if item.url.scheme == "https":
417 |             score += 10
418 |         if item.is_push:
419 |             score += 10
420 |         if "index" in url_str:
421 |             score += 30
422 | 
423 |         if "comments" in url_str or "comments" in
item.title.lower(): 424 | score -= 15 425 | else: 426 | score += int(item.velocity) 427 | 428 | if any(map(url_str.count, ["/home", "/top", "/most", "/magazine"])): 429 | score += 10 430 | 431 | kw = ["atom", "rss", ".xml", "feed", "rdf"] 432 | for p, t in zip(range(len(kw) * 2, 0, -2), kw): 433 | if t in url_str: 434 | score += p 435 | 436 | item.score = score 437 | 438 | @staticmethod 439 | def entry_dates(entries: List[Dict], date_names: List[str], current_date: date): 440 | """ 441 | Return published or updated dates from feed entries. 442 | 443 | :param entries: List of feed entries as dicts. 444 | :param date_names: List of key names of entry published or updated values. 445 | :param current_date: The current date. 446 | :return: generator that returns datetimes. 447 | """ 448 | for entry in entries: 449 | for name in date_names: 450 | try: 451 | entry_date: datetime = datestring_to_utc_datetime(entry[name]) 452 | if entry_date.date() <= current_date: 453 | yield entry_date 454 | except (KeyError, ValueError): 455 | pass 456 | 457 | @staticmethod 458 | def entry_velocity(dates: List[datetime]) -> float: 459 | """ 460 | Calculate velocity of posted entries, returns a float of the average number of entries posted per day. 461 | 462 | :param dates: List of entry dates 463 | :return: Average entries per day 464 | """ 465 | if not dates or len(dates) < 3: 466 | return 0 467 | 468 | dates = sorted(dates) 469 | deltas = [] 470 | previous_date: datetime = dates[0] 471 | 472 | for current_date in dates[1:]: 473 | if current_date == previous_date: 474 | continue 475 | delta = current_date - previous_date 476 | deltas.append(delta.total_seconds()) 477 | previous_date = current_date 478 | 479 | if not deltas: 480 | return 0 481 | 482 | mean_seconds_delta = mean(deltas) 483 | 484 | result = round(86400 / mean_seconds_delta, 3) 485 | return result 486 | 487 | @staticmethod 488 | def validate_self_url(item: FeedInfo) -> None: 489 | """ 490 | Validate the self url 491 | 492 | :param item: FeedInfo item 493 | """ 494 | try: 495 | item.self_url = URL(item.self_url) 496 | except ValueError: 497 | item.self_url = "" 498 | return 499 | 500 | if item.self_url and item.self_url != item.url: 501 | # Handle a case where the item url contains a trailing slash and the self url doesn't. 502 | if str(item.url).strip("/") == str(item.self_url): 503 | item.url = URL(str(item.url).strip("/")) 504 | return 505 | 506 | # The self url should be an absolute url. 
507 | if not item.self_url.is_absolute(): 508 | if str(item.self_url) in str(item.url): 509 | item.self_url = item.url 510 | else: 511 | item.self_url = "" 512 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import inspect 4 | import logging 5 | from abc import ABC, abstractmethod 6 | from collections import OrderedDict 7 | from fnmatch import fnmatch 8 | from statistics import harmonic_mean, median 9 | from types import AsyncGeneratorType 10 | from typing import List, Any, Dict, Set 11 | from typing import Union 12 | 13 | import aiohttp 14 | import time 15 | from aiohttp import ClientTimeout 16 | from yarl import URL 17 | 18 | from feedsearch_crawler.crawler.duplicatefilter import DuplicateFilter 19 | from feedsearch_crawler.crawler.item import Item 20 | from feedsearch_crawler.crawler.lib import ( 21 | coerce_url, 22 | ignore_aiohttp_ssl_error, 23 | Stats, 24 | CallbackResult, 25 | CrawlerPriorityQueue, 26 | parse_href_to_url, 27 | ) 28 | from feedsearch_crawler.crawler.queueable import Queueable 29 | from feedsearch_crawler.crawler.request import Request 30 | from feedsearch_crawler.crawler.response import Response 31 | from feedsearch_crawler.crawler.trace import add_trace_config 32 | 33 | try: 34 | import uvloop 35 | 36 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 37 | except ImportError: 38 | uvloop = None 39 | pass 40 | 41 | 42 | logger = logging.getLogger(__name__) 43 | 44 | 45 | class Crawler(ABC): 46 | 47 | # Class Name of the Duplicate Filter. 48 | # May be overridden to use different Duplicate Filter. 49 | # Not an instantiation of the class. 50 | duplicate_filter_class = DuplicateFilter 51 | 52 | # Callback to be run after all workers are finished. 53 | post_crawl_callback = None 54 | 55 | # URLs to start the crawl. 56 | start_urls = [] 57 | # Domain patterns that are allowed to be crawled. 58 | allowed_domains = [] 59 | 60 | # Max number of concurrent http requests. 61 | concurrency: int = 10 62 | # Max size of incoming http response content. 63 | max_content_length = 1024 * 1024 * 10 64 | # Max crawl depth. i.e. The max length of the response history. 65 | max_depth: int = 4 66 | # Max callback recursion depth, to prevent accidental infinite recursion from AsyncGenerators. 67 | max_callback_recursion: int = 10 68 | # Time in seconds to delay each HTTP request. 69 | delay: float = 0 70 | 71 | # List of worker tasks. 72 | _workers = [] 73 | 74 | # ClientSession for requests. Created on Crawl start. 75 | _session: aiohttp.ClientSession 76 | # Task queue for Requests. Created on Crawl start. 77 | _request_queue: CrawlerPriorityQueue 78 | # Semaphore for controlling HTTP Request concurrency. 79 | _semaphore: asyncio.Semaphore 80 | 81 | def __init__( 82 | self, 83 | start_urls: List[str] = None, 84 | allowed_domains: List[str] = None, 85 | concurrency: int = 10, 86 | total_timeout: Union[float, ClientTimeout] = 30, 87 | request_timeout: Union[float, ClientTimeout] = 5, 88 | user_agent: str = "", 89 | max_content_length: int = 1024 * 1024 * 10, 90 | max_depth: int = 10, 91 | headers: dict = None, 92 | allowed_schemes: List[str] = None, 93 | delay: float = 0.5, 94 | max_retries: int = 3, 95 | ssl: bool = False, 96 | trace: bool = False, 97 | *args, 98 | **kwargs, 99 | ): 100 | """ 101 | Base class for a WebCrawler implementation. 
102 | 
103 |         :param allowed_schemes: List of strings of allowed Request URI schemes. e.g. ["http", "https"]
104 |         :param start_urls: List of initial URLs to crawl.
105 |         :param allowed_domains: List of domain patterns that are allowed. Uses Unix shell-style wildcards.
106 |         :param concurrency: Max number of workers and concurrent HTTP requests.
107 |         :param total_timeout: Total aiohttp ClientSession timeout in seconds.
108 |             Crawl will end if this timeout is triggered.
109 |         :param request_timeout: Total timeout in seconds for each individual HTTP request.
110 |         :param user_agent: Default User-Agent for HTTP requests.
111 |         :param max_content_length: Max size in bytes of incoming http response content.
112 |         :param max_depth: Max crawl depth. i.e. The max length of the response history.
113 |         :param headers: Default HTTP headers to be included in each request.
114 |         :param delay: Time in seconds to delay each HTTP request.
115 |         :param max_retries: Maximum number of retries for each failed HTTP request.
116 |         :param ssl: Enables strict SSL checking.
117 |         :param trace: Enables aiohttp trace debugging.
118 |         :param args: Additional positional arguments for subclasses.
119 |         :param kwargs: Additional keyword arguments for subclasses.
120 |         """
121 |         self.start_urls = start_urls or []
122 |         self.allowed_domains = allowed_domains or []
123 | 
124 |         self.concurrency = concurrency
125 | 
126 |         if not isinstance(total_timeout, ClientTimeout):
127 |             total_timeout = aiohttp.ClientTimeout(total=total_timeout)
128 |         if not isinstance(request_timeout, ClientTimeout):
129 |             request_timeout = aiohttp.ClientTimeout(total=request_timeout)
130 | 
131 |         self.total_timeout: ClientTimeout = total_timeout
132 |         self.request_timeout: ClientTimeout = request_timeout
133 | 
134 |         self.max_content_length = max_content_length
135 |         self.max_depth = max_depth
136 | 
137 |         self.user_agent = user_agent or (
138 |             "Mozilla/5.0 (compatible; Feedsearch-Crawler; +https://pypi.org/project/feedsearch-crawler)"
139 |         )
140 | 
141 |         self.headers = {"User-Agent": self.user_agent, "Upgrade-Insecure-Requests": "1"}
142 | 
143 |         if headers:
144 |             self.headers = {**self.headers, **headers}
145 | 
146 |         self.allowed_schemes = allowed_schemes
147 |         self.delay = delay
148 |         self.max_retries = max_retries
149 |         self._ssl = ssl
150 |         self._trace = trace
151 | 
152 |         # Default set for parsed items.
153 |         self.items: set = set()
154 | 
155 |         # URL Duplicate Filter instance.
156 |         self._duplicate_filter = self.duplicate_filter_class()
157 | 
158 |         # List of total durations in Milliseconds for the total handling time of all Requests.
159 |         self._stats_request_durations = []
160 |         # List of latencies in Milliseconds of all HTTP requests.
161 |         self._stats_request_latencies = []
162 |         # List of Content Length in bytes of all Responses.
163 |         self._stats_response_content_lengths = []
164 |         # List of time in Milliseconds that each item spent on the queue.
165 |         self._stats_queue_wait_times = []
166 |         # List of the size of the queue each time an item was popped off the queue.
167 |         self._stats_queue_sizes = []
168 | 
169 |         # Initialise Crawl Statistics.
170 |         self.stats: dict = {
171 |             Stats.REQUESTS_QUEUED: 0,
172 |             Stats.REQUESTS_SUCCESSFUL: 0,
173 |             Stats.REQUESTS_FAILED: 0,
174 |             Stats.CONTENT_LENGTH_TOTAL: 0,
175 |             Stats.CONTENT_LENGTH_AVG: 0,
176 |             Stats.CONTENT_LENGTH_MIN: 0,
177 |             Stats.CONTENT_LENGTH_MAX: 0,
178 |             Stats.CONTENT_LENGTH_MEDIAN: 0,
179 |             Stats.ITEMS_PROCESSED: 0,
180 |             Stats.URLS_SEEN: 0,
181 |             Stats.REQUESTS_DURATION_AVG: 0,
182 |             Stats.REQUESTS_DURATION_MAX: 0,
183 |             Stats.REQUESTS_DURATION_MIN: 0,
184 |             Stats.REQUESTS_DURATION_TOTAL: 0,
185 |             Stats.REQUESTS_DURATION_MEDIAN: 0,
186 |             Stats.TOTAL_DURATION: 0,
187 |             Stats.STATUS_CODES: {},
188 |             Stats.QUEUE_WAIT_MAX: 0,
189 |             Stats.QUEUE_WAIT_MIN: 0,
190 |             Stats.QUEUE_WAIT_AVG: 0,
191 |             Stats.QUEUE_WAIT_MEDIAN: 0,
192 |             Stats.QUEUE_SIZE_MAX: 0,
193 |             Stats.QUEUE_SIZE_AVG: 0,
194 |             Stats.QUEUE_SIZE_MEDIAN: 0,
195 |             Stats.QUEUED_TOTAL: 0,
196 |             Stats.REQUESTS_RETRIED: 0,
197 |         }
198 | 
199 |     async def _handle_request(self, request: Request) -> None:
200 |         """
201 |         Handle fetching of Requests and processing of Request callbacks.
202 | 
203 |         :param request: Request
204 |         :return: None
205 |         """
206 |         try:
207 |             if request.has_run and not request.should_retry:
208 |                 logger.warning("%s has already run", request)
209 |                 return
210 | 
211 |             start = time.perf_counter()
212 | 
213 |             # Fetch the request and run its callback
214 |             # results, response = await request.fetch_callback(self._semaphore)
215 |             results, response = await request.fetch_callback()
216 | 
217 |             dur = int((time.perf_counter() - start) * 1000)
218 |             self._stats_request_durations.append(dur)
219 |             self._stats_request_latencies.append(request.req_latency)
220 |             logger.debug(
221 |                 "Fetched: url=%s dur=%dms latency=%dms read=%dms status=%s prev=%s",
222 |                 response.url,
223 |                 dur,
224 |                 request.req_latency,
225 |                 request.content_read,
226 |                 response.status_code,
227 |                 response.originator_url,
228 |             )
229 | 
230 |             if response.ok:
231 |                 self.stats[Stats.REQUESTS_SUCCESSFUL] += 1
232 |             else:
233 |                 self.stats[Stats.REQUESTS_FAILED] += 1
234 | 
235 |             if response.status_code in self.stats[Stats.STATUS_CODES]:
236 |                 self.stats[Stats.STATUS_CODES][response.status_code] += 1
237 |             else:
238 |                 self.stats[Stats.STATUS_CODES][response.status_code] = 1
239 | 
240 |             self._stats_response_content_lengths.append(response.content_length)
241 | 
242 |             # Mark the Response URL as seen in the duplicate filter, as it may be different from the Request URL
243 |             # due to redirects.
244 |             await self._duplicate_filter.url_seen(response.url, response.method)
245 | 
246 |             # Add callback results to the queue for processing.
247 |             if results:
248 |                 self._put_queue(CallbackResult(results, 0))
249 | 
250 |             # Add Request back to the queue for retrying.
251 |             if request.should_retry:
252 |                 self.stats[Stats.REQUESTS_RETRIED] += 1
253 |                 self._put_queue(request)
254 | 
255 |         except asyncio.CancelledError as e:
256 |             logger.debug("Cancelled: %s, %s", request, e)
257 |         except Exception as e:
258 |             logger.exception("Exception during %s: %s", request, e)
259 |         finally:
260 |             return
261 | 
262 |     async def _process_request_callback_result(
263 |         self, result: Any, callback_recursion: int = 0
264 |     ) -> None:
265 |         """
266 |         Process the Request callback result depending on the result type.
267 |         Request callbacks may contain nested iterators.
268 | 
269 |         :param result: Callback Result. May be a CallbackResult class, AsyncGenerator, Coroutine, Request, or Item.
270 |         :param callback_recursion: Incremented counter to limit this method's recursion.
271 | :return: None 272 | """ 273 | if callback_recursion >= self.max_callback_recursion: 274 | logger.warning( 275 | "Max callback recursion of %d reached", self.max_callback_recursion 276 | ) 277 | return 278 | 279 | try: 280 | # If a CallbackResult class is passed, process the result values from within the class. 281 | if isinstance(result, CallbackResult): 282 | await self._process_request_callback_result( 283 | result.result, result.callback_recursion 284 | ) 285 | # For async generators, put each value back on the queue for processing. 286 | # This will happen recursively until the end of the recursion chain or max_callback_recursion is reached. 287 | elif inspect.isasyncgen(result): 288 | async for value in result: 289 | if value: 290 | self._put_queue(CallbackResult(value, callback_recursion + 1)) 291 | # For coroutines, await the result then put the value back on the queue for further processing. 292 | elif inspect.iscoroutine(result): 293 | value = await result 294 | self._put_queue(CallbackResult(value, callback_recursion + 1)) 295 | # Requests are put onto the queue to be fetched. 296 | elif isinstance(result, Request): 297 | self._process_request(result) 298 | 299 | # Items are handled by the implementing Class. 300 | elif isinstance(result, Item): 301 | await self.process_item(result) 302 | self.stats[Stats.ITEMS_PROCESSED] += 1 303 | except Exception as e: 304 | logger.exception(e) 305 | 306 | def _process_request(self, request: Request) -> None: 307 | """ 308 | Process a Request onto the Request Queue. 309 | 310 | :param request: HTTP Request 311 | :return: None 312 | """ 313 | if not request: 314 | return 315 | 316 | self.stats[Stats.REQUESTS_QUEUED] += 1 317 | logger.debug("Queue Add: %s", request) 318 | # Add the Request to the queue for processing. 319 | self._put_queue(request) 320 | 321 | def is_allowed_domain(self, url: URL) -> bool: 322 | """ 323 | Check that the URL host is in the list of allowed domain patterns. 324 | Domain patterns are Unix shell-style wildcards. 325 | https://docs.python.org/3/library/fnmatch.html 326 | 327 | :param url: URL object 328 | :return: boolean 329 | """ 330 | if not self.allowed_domains: 331 | return True 332 | 333 | try: 334 | if not url or not url.host: 335 | return False 336 | host = url.host 337 | for domain_pattern in self.allowed_domains: 338 | if fnmatch(host, domain_pattern): 339 | return True 340 | except Exception as e: 341 | logger.warning(e) 342 | return False 343 | 344 | async def follow( 345 | self, 346 | url: Union[str, URL], 347 | callback=None, 348 | response: Response = None, 349 | method: str = "GET", 350 | delay: Union[float, None] = None, 351 | priority: int = 0, 352 | allow_domain: bool = False, 353 | cb_kwargs: Dict = None, 354 | max_content_length: int = None, 355 | timeout: float = None, 356 | retries: int = None, 357 | **kwargs, 358 | ) -> Union[Request, None]: 359 | """ 360 | Follow a URL by creating an HTTP Request. 361 | 362 | If the URL is not absolute then it is joined with the previous Response URL. 363 | The previous Response history is copied to the Request. 364 | 365 | Before a Request is followed, first check that the Request URL has not already been seen, 366 | that the max URL depth has not been reached, and that the URI scheme is allowed. 367 | 368 | These checks are performed before the Request is created so that we don't yield multiple requests 369 | to the same URL to the queue for further processing. We want to stop duplicates and invalid 370 | requests as early as possible. 
371 | 
372 |         :param url: URL to follow.
373 |         :param callback: Callback method to run if the Request is successful.
374 |         :param response: Previous Response that contained the Request URL.
375 |         :param kwargs: Optional Request keyword arguments. See Request for details.
376 |         :param method: HTTP method for Request.
377 |         :param delay: Optionally override the default delay for the Request.
378 |         :param priority: Optionally override the default priority of the Request.
379 |         :param allow_domain: Optionally override the allowed domains check.
380 |         :param max_content_length: Optionally override the maximum allowed size in bytes of Response body.
381 |         :param retries: Optionally override the number of Request retries.
382 |         :param timeout: Optionally override the Request timeout.
383 |         :param cb_kwargs: Optional Dictionary of keyword arguments to be passed to the callback function.
384 |         :return: Request, or None if the URL is invalid or filtered.
385 |         """
386 |         original_url = copy.copy(url)
387 |         if isinstance(url, str):
388 |             url = parse_href_to_url(url)
389 | 
390 |         if not url:
391 |             logger.warning("Attempted to follow invalid URL: %s", original_url)
392 |             return
393 | 
394 |         history = []
395 |         if response:
396 |             # Join the URL to the Response URL if it doesn't contain a domain.
397 |             if not url.is_absolute() or not url.scheme:
398 |                 url = coerce_url(
399 |                     response.origin.join(url), default_scheme=response.scheme
400 |                 )
401 | 
402 |             # Restrict the depth of the Request chain to the maximum depth.
403 |             # This test happens before the URL duplicate check so that the URL might still be reachable by another path.
404 |             if self.max_depth and len(response.history) >= self.max_depth:
405 |                 logger.debug("Max Depth of '%d' reached: %s", self.max_depth, url)
406 |                 return
407 | 
408 |             # Copy the Response history so that it isn't a reference to a mutable object.
409 |             history = copy.deepcopy(response.history)
410 |         else:
411 |             if not url.is_absolute():
412 |                 logger.debug("URL should have domain: %s", url)
413 |                 return
414 | 
415 |             if not url.scheme:
416 |                 url = coerce_url(url)
417 | 
418 |         # The URL scheme must be in the list of allowed schemes.
419 |         if self.allowed_schemes and url.scheme not in self.allowed_schemes:
420 |             logger.debug("URI Scheme '%s' not allowed: %s", url.scheme, url)
421 |             return
422 | 
423 |         # The URL host must be in the list of allowed domains.
424 |         if not allow_domain and not self.is_allowed_domain(url):
425 |             logger.debug("Domain '%s' not allowed: %s", url.host, url)
426 |             return
427 | 
428 |         # Check if URL is not already seen, and add it to the duplicate filter seen list.
429 |         if await self._duplicate_filter.url_seen(url, method):
430 |             return
431 | 
432 |         request = Request(
433 |             url=url,
434 |             request_session=self._session,
435 |             history=history,
436 |             callback=callback,
437 |             xml_parser=self.parse_xml,
438 |             max_content_length=max_content_length or self.max_content_length,
439 |             timeout=timeout or self.request_timeout,
440 |             method=method,
441 |             delay=delay if isinstance(delay, float) else self.delay,
442 |             retries=retries or self.max_retries,
443 |             cb_kwargs=cb_kwargs,
444 |             **kwargs,
445 |         )
446 | 
447 |         # Override the Request priority only if the kwarg is provided.
448 |         if priority:
449 |             request.priority = priority
450 | 
451 |         return request
452 | 
453 |     @abstractmethod
454 |     async def process_item(self, item: Item) -> None:
455 |         """
456 |         Process a parsed Item in some way. e.g. Add it to the Item set, or database, or send a signal.
457 | 
458 |         :param item: A parsed Item.
459 | """ 460 | self.items.add(item) 461 | 462 | @abstractmethod 463 | async def parse_xml(self, response_text: str) -> Any: 464 | """ 465 | Parse Response text as XML. 466 | Used to allow implementations to provide their own XML parser. 467 | 468 | :param response_text: Response text as string. 469 | """ 470 | raise NotImplementedError("Not Implemented") 471 | 472 | @abstractmethod 473 | async def parse(self, request: Request, response: Response) -> AsyncGeneratorType: 474 | """ 475 | Parse an HTTP Response. Must yield Items, Requests, AsyncGenerators, or Coroutines. 476 | 477 | :param request: HTTP Request that created the Response. 478 | :param response: HTTP Response. 479 | """ 480 | raise NotImplementedError("Not Implemented") 481 | 482 | def _put_queue(self, queueable: Queueable) -> None: 483 | """ 484 | Put an object that inherits from Queueable onto the Request Queue. 485 | 486 | :param queueable: An object that inherits from Queueable. 487 | """ 488 | if not isinstance(queueable, Queueable): 489 | raise ValueError("Object must inherit from Queueable Class") 490 | 491 | queueable.add_to_queue(self._request_queue) 492 | self.stats[Stats.QUEUED_TOTAL] += 1 493 | 494 | async def _work(self, task_num): 495 | """ 496 | Worker function for handling request queue items. 497 | """ 498 | try: 499 | while True: 500 | self._stats_queue_sizes.append(self._request_queue.qsize()) 501 | item: Queueable = await self._request_queue.get() 502 | # logger.debug("Priority: %s Item: %s", item.priority, item) 503 | if item.get_queue_wait_time(): 504 | # logger.debug( 505 | # "Waited: %sms Item: %s", item.get_queue_wait_time(), item 506 | # ) 507 | self._stats_queue_wait_times.append(item.get_queue_wait_time()) 508 | 509 | if self._session.closed: 510 | logger.debug("Session is closed. Cannot run %s", item) 511 | continue 512 | 513 | try: 514 | # Fetch Request and handle callbacks 515 | if isinstance(item, Request): 516 | await self._handle_request(item) 517 | # Process Callback results 518 | elif isinstance(item, CallbackResult): 519 | await self._process_request_callback_result( 520 | item.result, item.callback_recursion 521 | ) 522 | except Exception as e: 523 | logger.exception("Error handling item: %s : %s", item, e) 524 | finally: 525 | self._request_queue.task_done() 526 | except asyncio.CancelledError: 527 | logger.debug("Cancelled Worker: %s", task_num) 528 | 529 | @staticmethod 530 | async def _run_callback(callback, *args, **kwargs) -> None: 531 | """ 532 | Runs a callback function. 533 | 534 | :param callback: Function to run. May be async. 535 | :param args: Positional arguments to pass to the function. 536 | :param kwargs: Keyword arguments to pass to the function. 537 | :return: None 538 | """ 539 | if not callback: 540 | return 541 | if inspect.iscoroutinefunction(callback): 542 | await callback(*args, **kwargs) 543 | elif inspect.isfunction(callback): 544 | callback(*args, **kwargs) 545 | else: 546 | logger.warning("Callback %s must be a coroutine or function", callback) 547 | 548 | def create_start_urls(self, urls: List[Union[URL, str]]) -> List[URL]: 549 | """ 550 | Create the start URLs for the crawl from an initial URL. May be overridden. 
551 | 552 | :param urls: Initial URLs 553 | """ 554 | crawl_start_urls: Set[URL] = set() 555 | 556 | for url in urls + self.start_urls: 557 | if isinstance(url, str): 558 | if "//" not in url: 559 | url = f"//{url}" 560 | url = URL(url) 561 | 562 | if url.scheme.lower() not in ["http", "https"]: 563 | url = url.with_scheme("http") 564 | 565 | crawl_start_urls.add(url) 566 | 567 | return list(crawl_start_urls) 568 | 569 | def record_statistics(self) -> None: 570 | """ 571 | Record statistics. 572 | """ 573 | self.stats[Stats.REQUESTS_DURATION_TOTAL] = int( 574 | sum(self._stats_request_durations) 575 | ) 576 | self.stats[Stats.REQUESTS_DURATION_AVG] = int( 577 | harmonic_mean(self._stats_request_durations) 578 | ) 579 | self.stats[Stats.REQUESTS_DURATION_MAX] = int( 580 | max(self._stats_request_durations) 581 | ) 582 | self.stats[Stats.REQUESTS_DURATION_MIN] = int( 583 | min(self._stats_request_durations) 584 | ) 585 | self.stats[Stats.REQUESTS_DURATION_MEDIAN] = int( 586 | median(self._stats_request_durations) 587 | ) 588 | 589 | self.stats[Stats.CONTENT_LENGTH_TOTAL] = int( 590 | sum(self._stats_response_content_lengths) 591 | ) 592 | self.stats[Stats.CONTENT_LENGTH_AVG] = int( 593 | harmonic_mean(self._stats_response_content_lengths) 594 | ) 595 | self.stats[Stats.CONTENT_LENGTH_MAX] = int( 596 | max(self._stats_response_content_lengths) 597 | ) 598 | self.stats[Stats.CONTENT_LENGTH_MIN] = int( 599 | min(self._stats_response_content_lengths) 600 | ) 601 | self.stats[Stats.CONTENT_LENGTH_MEDIAN] = int( 602 | median(self._stats_response_content_lengths) 603 | ) 604 | 605 | self.stats[Stats.URLS_SEEN] = len(self._duplicate_filter.fingerprints) 606 | 607 | self.stats[Stats.QUEUE_WAIT_AVG] = harmonic_mean(self._stats_queue_wait_times) 608 | self.stats[Stats.QUEUE_WAIT_MIN] = min(self._stats_queue_wait_times) 609 | self.stats[Stats.QUEUE_WAIT_MAX] = max(self._stats_queue_wait_times) 610 | self.stats[Stats.QUEUE_WAIT_MEDIAN] = median(self._stats_queue_wait_times) 611 | 612 | self.stats[Stats.QUEUE_SIZE_MAX] = max(self._stats_queue_sizes) 613 | self.stats[Stats.QUEUE_SIZE_AVG] = int(harmonic_mean(self._stats_queue_sizes)) 614 | self.stats[Stats.QUEUE_SIZE_MEDIAN] = int(median(self._stats_queue_sizes)) 615 | 616 | self.stats[Stats.REQUESTS_LATENCY_AVG] = harmonic_mean( 617 | self._stats_request_latencies 618 | ) 619 | self.stats[Stats.REQUESTS_LATENCY_MAX] = int(max(self._stats_request_latencies)) 620 | self.stats[Stats.REQUESTS_LATENCY_MIN] = int(min(self._stats_request_latencies)) 621 | self.stats[Stats.REQUESTS_LATENCY_MEDIAN] = int( 622 | median(self._stats_request_latencies) 623 | ) 624 | self.stats[Stats.REQUESTS_LATENCY_TOTAL] = int( 625 | sum(self._stats_request_latencies) 626 | ) 627 | 628 | def get_stats(self) -> dict: 629 | """ 630 | Return crawl statistics as a sorted dictionary. 631 | """ 632 | stats = {str(k): v for k, v in self.stats.items()} 633 | return dict(OrderedDict(sorted(stats.items())).items()) 634 | 635 | async def crawl(self, urls: Union[URL, str, List[Union[URL, str]]] = None) -> None: 636 | """ 637 | Start the web crawler. 638 | 639 | :param urls: An optional URL or List of URLS to start the crawl, in addition to start_urls. 640 | """ 641 | 642 | # Fix for ssl errors 643 | ignore_aiohttp_ssl_error(asyncio.get_running_loop()) 644 | 645 | start = time.perf_counter() 646 | 647 | # Create start urls from the initial URL if provided. 
648 | if not urls: 649 | urls = [] 650 | if isinstance(urls, (URL, str)): 651 | urls = [urls] 652 | self.start_urls = self.create_start_urls(urls) 653 | 654 | if not self.start_urls: 655 | raise ValueError("crawler.start_urls are required") 656 | 657 | # Create the Request Queue within the asyncio loop. 658 | self._request_queue = CrawlerPriorityQueue() 659 | 660 | # Create the Semaphore for controlling HTTP Request concurrency within the asyncio loop. 661 | self._semaphore = asyncio.Semaphore(self.concurrency) 662 | 663 | trace_configs = [] 664 | if self._trace: 665 | trace_configs.append(add_trace_config()) 666 | 667 | conn = aiohttp.TCPConnector( 668 | limit=0, ssl=self._ssl, ttl_dns_cache=self.total_timeout.total 669 | ) 670 | # Create the ClientSession for HTTP Requests within the asyncio loop. 671 | self._session = aiohttp.ClientSession( 672 | timeout=self.total_timeout, 673 | headers=self.headers, 674 | connector=conn, 675 | trace_configs=trace_configs, 676 | ) 677 | 678 | # Create a Request for each start URL and add it to the Request Queue. 679 | for url in self.start_urls: 680 | req = await self.follow(coerce_url(url), self.parse, delay=0) 681 | if req: 682 | self._process_request(req) 683 | 684 | # Create workers to process the Request Queue. 685 | # Create twice as many workers as potential concurrent requests, to help handle request callbacks without 686 | # delay while other workers may be locked by the Semaphore. 687 | self._workers = [ 688 | asyncio.create_task(self._work(i)) for i in range(self.concurrency * 2) 689 | ] 690 | 691 | try: 692 | # Run workers within the ClientSession. 693 | async with self._session: 694 | await asyncio.wait_for( 695 | self._request_queue.join(), timeout=self.total_timeout.total 696 | ) 697 | except asyncio.TimeoutError: 698 | logger.debug("Timed out after %s seconds", self.total_timeout.total) 699 | self._request_queue.clear() 700 | finally: 701 | # Make sure all workers are cancelled. 702 | for w in self._workers: 703 | w.cancel() 704 | # Wait until all worker tasks are cancelled. 705 | await asyncio.gather(*self._workers, return_exceptions=True) 706 | 707 | # Run the post crawl callback if it exists. 708 | await self._run_callback(self.post_crawl_callback) 709 | 710 | # The ClientSession is closed only after all work is completed. 711 | await self._session.close() 712 | 713 | duration = int((time.perf_counter() - start) * 1000) 714 | self.stats[Stats.TOTAL_DURATION] = duration 715 | 716 | self.record_statistics() 717 | 718 | logger.info( 719 | "Crawl finished: requests=%s time=%dms", 720 | self.stats[Stats.REQUESTS_QUEUED], 721 | duration, 722 | ) 723 | logger.debug("Stats: %s", self.stats) 724 | --------------------------------------------------------------------------------
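Example usage (illustrative sketch, not a file in this repository): the Crawler class above is abstract, so `parse`, `parse_xml`, and `process_item` must be implemented by a subclass, and `crawl()` is the async entry point. The package's real implementation is FeedsearchSpider in feedsearch_crawler/feed_spider/spider.py; the `UrlItem` and `ExampleSpider` names below, the BeautifulSoup-based `parse_xml`, the "/about" link that is followed, and the example.com URLs are assumptions made purely for illustration. Only the Crawler, Item, Request, and Response behaviour shown in the code above is relied upon.

import asyncio

from bs4 import BeautifulSoup

from feedsearch_crawler.crawler import Crawler, Item, Request, Response


class UrlItem(Item):
    # Hypothetical Item subclass: records the URL of each crawled page.
    url: str = ""


class ExampleSpider(Crawler):
    async def parse_xml(self, response_text: str):
        # Delegate XML/HTML parsing to BeautifulSoup, which is already a dependency of the feed spider.
        return BeautifulSoup(response_text, "html.parser")

    async def parse(self, request: Request, response: Response):
        # parse() is an async generator: it may yield Items and further Requests.
        if not response.ok:
            return

        # Record the successfully fetched URL as an Item.
        yield UrlItem(url=str(response.url))

        # Follow a further (hypothetical) URL relative to this Response.
        # follow() returns None if the URL is invalid, filtered, a duplicate, or too deep.
        next_request = await self.follow("/about", self.parse, response)
        if next_request:
            yield next_request

    async def process_item(self, item: Item) -> None:
        # Collect parsed Items in the default Crawler item set.
        self.items.add(item)


async def main() -> None:
    spider = ExampleSpider(
        allowed_domains=["*example.com"],
        concurrency=5,
        total_timeout=20,
    )
    await spider.crawl("https://example.com")

    print(spider.get_stats())
    for item in spider.items:
        print(item.url)


if __name__ == "__main__":
    asyncio.run(main())

Note that crawl() raises ValueError if neither start_urls nor a urls argument is provided, and the crawl ends when the request queue is exhausted or the total_timeout expires, after which get_stats() returns the recorded crawl statistics.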