├── tests ├── __init__.py ├── crawler │ ├── __init__.py │ └── lib_test.py └── feed_spider │ ├── __init__.py │ ├── favicon_test.py │ ├── link_filter_test.py │ └── feed_info_parser_test.py ├── feedsearch_crawler ├── crawler │ ├── item.py │ ├── item_parser.py │ ├── __init__.py │ ├── duplicatefilter.py │ ├── queueable.py │ ├── trace.py │ ├── response.py │ ├── lib.py │ ├── request.py │ └── crawler.py ├── feed_spider │ ├── __init__.py │ ├── dupefilter.py │ ├── site_meta.py │ ├── regexes.py │ ├── favicon.py │ ├── feed_info.py │ ├── lib.py │ ├── site_meta_parser.py │ ├── link_filter.py │ ├── spider.py │ └── feed_info_parser.py └── __init__.py ├── LICENSE ├── pyproject.toml ├── .gitignore ├── .github └── workflows │ └── codeql-analysis.yml ├── app.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/feed_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/item.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | 4 | class Item(ABC): 5 | ignore_item = False 6 | 7 | def __init__(self, **kwargs): 8 | for k in kwargs.keys(): 9 | if hasattr(self, k): 10 | self.__setattr__(k, kwargs[k]) 11 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/__init__.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 2 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 3 | from feedsearch_crawler.feed_spider.spider import FeedsearchSpider 4 | 5 | __all__ = ["FeedsearchSpider", "FeedInfo", "SiteMeta"] 6 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/item_parser.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from types import AsyncGeneratorType 3 | from typing import Union 4 | 5 | from feedsearch_crawler.crawler.item import Item 6 | from feedsearch_crawler.crawler.request import Request 7 | from feedsearch_crawler.crawler.response import Response 8 | 9 | 10 | class ItemParser(ABC): 11 | def __init__(self, crawler): 12 | self.crawler = crawler 13 | self.follow = crawler.follow 14 | 15 | @abstractmethod 16 | async def parse_item( 17 | self, request: Request, response: Response, *args, **kwargs 18 | ) -> Union[Item, AsyncGeneratorType]: 19 | raise NotImplementedError("Not Implemented") 20 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.crawler.crawler import Crawler 2 | from feedsearch_crawler.crawler.duplicatefilter import DuplicateFilter 3 | from feedsearch_crawler.crawler.item import Item 4 | from feedsearch_crawler.crawler.item_parser import ItemParser 5 | from feedsearch_crawler.crawler.lib import ( 6 | to_string, 7 | to_bytes, 8 | coerce_url, 9 | 
CallbackResult, 10 | ) 11 | from feedsearch_crawler.crawler.request import Request 12 | from feedsearch_crawler.crawler.response import Response 13 | 14 | __all__ = [ 15 | "Crawler", 16 | "Item", 17 | "ItemParser", 18 | "DuplicateFilter", 19 | "Request", 20 | "Response", 21 | "to_bytes", 22 | "to_string", 23 | "coerce_url", 24 | "CallbackResult", 25 | ] 26 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/dupefilter.py: -------------------------------------------------------------------------------- 1 | from w3lib.url import url_query_cleaner, canonicalize_url 2 | from yarl import URL 3 | 4 | from feedsearch_crawler.crawler import DuplicateFilter 5 | 6 | 7 | class NoQueryDupeFilter(DuplicateFilter): 8 | valid_keys = ["feedformat", "feed", "rss", "atom", "jsonfeed", "format", "podcast"] 9 | 10 | def parse_url(self, url: URL) -> str: 11 | # Keep the query strings if they might be feed strings. 12 | # Wikipedia for example uses query strings to differentiate feeds. 13 | if any(key in url.query for key in self.valid_keys): 14 | return canonicalize_url(str(url)) 15 | 16 | # Canonicalizing the URL is about 4x slower, but worth it to prevent duplicate requests. 17 | return canonicalize_url(url_query_cleaner(str(url))) 18 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/site_meta.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from yarl import URL 4 | 5 | from feedsearch_crawler.crawler import Item 6 | 7 | 8 | class SiteMeta(Item): 9 | url: URL = None 10 | site_url: str = "" 11 | site_name: str = "" 12 | icon_url: URL = None 13 | icon_data_uri: str = "" 14 | possible_icons: List = [] 15 | host: str = "" 16 | 17 | def __init__(self, url: URL, **kwargs) -> None: 18 | super().__init__(**kwargs) 19 | self.url = url 20 | 21 | def serialize(self): 22 | return dict( 23 | url=str(self.url), site_name=self.site_name, icon_url=str(self.icon_url) 24 | ) 25 | 26 | def __eq__(self, other): 27 | return isinstance(other, self.__class__) and self.url == other.url 28 | 29 | def __hash__(self): 30 | return hash(self.url) 31 | 32 | def __repr__(self): 33 | return f"{self.__class__.__name__}({str(self.url)})" 34 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/regexes.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # Regex to check if possible RSS data. 4 | rss_regex = re.compile("( bool: 24 | """ 25 | Check that the Favicon site_host is a match for the host. 
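        A match requires that both url and site_host are set and that site_host is contained within the given host string; if requires_data_uri is True the Favicon must also have a data_uri.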
26 | 27 | :param host: domain host url string 28 | :param requires_data_uri: Whether the Favicon is required to have a data_uri 29 | :return: bool 30 | """ 31 | return ( 32 | self.url 33 | and self.site_host 34 | and self.site_host in host 35 | and (self.data_uri if requires_data_uri else True) 36 | ) 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feedsearch-crawler" 3 | version = "1.0.3" 4 | description = "Search sites for RSS, Atom, and JSON feeds" 5 | authors = ["David Beath "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/DBeath/feedsearch-crawler" 9 | keywords = [ 10 | "RSS", 11 | "Search", 12 | "Crawler", 13 | "Feeds", 14 | "Atom" 15 | ] 16 | classifiers = [ 17 | "License :: OSI Approved :: MIT License", 18 | "Intended Audience :: Developers", 19 | "Development Status :: 5 - Production/Stable", 20 | "Natural Language :: English", 21 | "Operating System :: POSIX", 22 | "Operating System :: Microsoft :: Windows", 23 | "Operating System :: MacOS :: MacOS X", 24 | "Programming Language :: Python :: 3.7", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 28 | "Typing :: Typed", 29 | "Framework :: AsyncIO", 30 | ] 31 | packages = [ 32 | { include = "feedsearch_crawler" }, 33 | ] 34 | 35 | [tool.poetry.dependencies] 36 | python = "^3.8" 37 | aiohttp = "^3.7.4" 38 | beautifulsoup4 = "^4.9.3" 39 | cchardet = "^2.1.7" 40 | aiodns = "^2.0.0" 41 | uvloop = "^0.15.2" 42 | w3lib = "^1.22.0" 43 | feedparser = "^6.0.10" 44 | brotlipy = "^0.7.0" 45 | python-dateutil = "^2.8.1" 46 | yarl = "^1.6.3" 47 | 48 | [tool.poetry.dev-dependencies] 49 | twine = "*" 50 | pytest = "*" 51 | 52 | [build-system] 53 | requires = ["poetry-core>=1.0.0"] 54 | build-backend = "poetry.core.masonry.api" 55 | -------------------------------------------------------------------------------- /tests/feed_spider/favicon_test.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.feed_spider.favicon import Favicon 2 | 3 | 4 | def test_matches_host(): 5 | favicon = Favicon( 6 | site_host="test.com", 7 | url="test.com/favicon.ico", 8 | priority=1, 9 | data_uri="data_uri", 10 | ) 11 | assert favicon.matches_host("test.com") 12 | 13 | 14 | def test_matches_host_no_match(): 15 | favicon = Favicon( 16 | site_host="test.com", 17 | url="test.com/favicon.ico", 18 | priority=1, 19 | data_uri="data_uri", 20 | ) 21 | assert not favicon.matches_host("test2.com") 22 | 23 | 24 | def test_matches_host_no_site_host(): 25 | favicon = Favicon( 26 | site_host="", 27 | url="test.com/favicon.ico", 28 | priority=1, 29 | data_uri="data_uri", 30 | ) 31 | assert not favicon.matches_host("test2.com") 32 | 33 | 34 | def test_matches_host_data_uri(): 35 | favicon = Favicon( 36 | site_host="test.com", 37 | url="test.com/favicon.ico", 38 | priority=1, 39 | data_uri="data_uri", 40 | ) 41 | assert favicon.matches_host("test.com", requires_data_uri=True) 42 | 43 | 44 | def test_matches_host_no_data_uri(): 45 | favicon = Favicon( 46 | site_host="test.com", 47 | url="test.com/favicon.ico", 48 | priority=1, 49 | ) 50 | assert not favicon.matches_host("test.com", requires_data_uri=True) 51 | 52 | 53 | def test_matches_host_no_url(): 54 | favicon = Favicon(site_host="test.com", priority=1, data_uri="data_uri") 55 | 
assert not favicon.matches_host("test.com", requires_data_uri=True) 56 | -------------------------------------------------------------------------------- /tests/crawler/lib_test.py: -------------------------------------------------------------------------------- 1 | from feedsearch_crawler.crawler.lib import coerce_url, is_same_domain 2 | from yarl import URL 3 | 4 | 5 | def test_coerce_url(): 6 | assert coerce_url("test.com") == URL("http://test.com") 7 | assert coerce_url("https://test.com") == URL("https://test.com") 8 | assert coerce_url(" https://test.com") == URL("https://test.com") 9 | assert coerce_url("test.com/path/path2") == URL("http://test.com/path/path2") 10 | 11 | assert coerce_url("test.com", https=True) == URL("https://test.com") 12 | assert coerce_url("https://test.com", https=True) == URL("https://test.com") 13 | assert coerce_url(" https://test.com", https=True) == URL("https://test.com") 14 | assert coerce_url("http://test.com", https=True) == URL("https://test.com") 15 | assert coerce_url("test.com/path/path2", https=True) == URL( 16 | "https://test.com/path/path2" 17 | ) 18 | assert coerce_url("//test.com") == URL("http://test.com") 19 | assert coerce_url("feed://test.com") == URL("feed://test.com") 20 | assert coerce_url("feed://www.internet-law.de/?feed=/feed/") == URL( 21 | "feed://www.internet-law.de/?feed=/feed/" 22 | ) 23 | 24 | 25 | def test_is_same_domain(): 26 | assert is_same_domain("test.com", "test.com") is True 27 | assert is_same_domain("example.com", "test.com") is False 28 | assert is_same_domain("feeds.test.com", "test.com") is False 29 | assert is_same_domain("test.com", "feeds.test.com") is True 30 | assert is_same_domain("test.com", "test.feeds.test.com") is True 31 | assert is_same_domain("www.test.com", "test.com") is True 32 | assert is_same_domain("www.test.com", "feed.test.com") is True 33 | assert is_same_domain("test.www.test.com", "test.com") is False 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea/ 107 | .vscode/ 108 | 109 | logs/ 110 | experiments/ 111 | timed.sh 112 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/duplicatefilter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler.lib import to_bytes 7 | 8 | 9 | class DuplicateFilter: 10 | """ 11 | Filters duplicate URLs. 12 | """ 13 | 14 | def __init__(self): 15 | # Dictionary whose keys are the hashed fingerprints of the URLs 16 | self.fingerprints = dict() 17 | # Locks the fingerprints dict when accessing keys. 18 | self._seen_lock = asyncio.Lock() 19 | 20 | async def url_seen(self, url: URL, method: str = "") -> bool: 21 | """ 22 | Checks if the URL has already been seen, and adds the URL fingerprint if not. 23 | 24 | :param url: URL object 25 | :param method: Optional HTTP method to use for hashing 26 | :return: True if URL already seen 27 | """ 28 | url_str: str = self.parse_url(url) 29 | fp = self.url_fingerprint_hash(url_str, method) 30 | async with self._seen_lock: 31 | if fp in self.fingerprints: 32 | return True 33 | self.fingerprints[fp] = url_str 34 | return False 35 | 36 | def parse_url(self, url: URL) -> str: 37 | """ 38 | Parse the URL object to a string. Used for functionality such as filtering query strings. 39 | 40 | :param url: URL object 41 | :return: URL as string 42 | """ 43 | return str(url) 44 | 45 | @staticmethod 46 | def url_fingerprint_hash(url: str, method: str = "") -> str: 47 | """ 48 | Create a fingerprint hash of a URL string along with the method if provided. 
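        The fingerprint is the SHA1 hex digest of the URL bytes, updated with the method bytes when a method is given, so the same URL requested with different HTTP methods is not treated as a duplicate.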
49 | 50 | :param url: URL as string 51 | :param method: Optional HTTP method 52 | :return: Hashed string 53 | """ 54 | # noinspection InsecureHash 55 | fp = hashlib.sha1() 56 | fp.update(to_bytes(url)) 57 | if method: 58 | fp.update(to_bytes(method)) 59 | return fp.hexdigest() 60 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/feed_info.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler import Item, to_string 7 | 8 | 9 | class FeedInfo(Item): 10 | bozo: int = 0 11 | content_length: int = 0 12 | content_type: str = "" 13 | description: str = "" 14 | favicon: URL = "" 15 | favicon_data_uri: str = "" 16 | hubs: List[str] = [] 17 | is_podcast: bool = False 18 | is_push: bool = False 19 | item_count: int = 0 20 | last_updated: datetime = None 21 | score: int = 0 22 | self_url: URL = "" 23 | site_name: str = "" 24 | site_url: URL = "" 25 | title: str = "" 26 | url: URL = "" 27 | velocity: float = 0 28 | version: str = "" 29 | 30 | def serialize(self): 31 | last_updated = self.last_updated.isoformat() if self.last_updated else "" 32 | 33 | return dict( 34 | bozo=self.bozo, 35 | description=self.description, 36 | content_length=self.content_length, 37 | content_type=self.content_type, 38 | favicon=to_string(self.favicon), 39 | favicon_data_uri=self.favicon_data_uri, 40 | hubs=self.hubs, 41 | is_podcast=self.is_podcast, 42 | is_push=self.is_push, 43 | item_count=self.item_count, 44 | last_updated=last_updated, 45 | score=self.score, 46 | self_url=to_string(self.self_url), 47 | site_name=self.site_name, 48 | site_url=to_string(self.site_url), 49 | title=self.title, 50 | url=to_string(self.url), 51 | velocity=self.velocity, 52 | version=self.version, 53 | ) 54 | 55 | def __eq__(self, other): 56 | return isinstance(other, self.__class__) and self.url == other.url 57 | 58 | def __hash__(self): 59 | return hash(self.url) 60 | 61 | def __repr__(self): 62 | return f"{self.__class__.__name__}({str(self.url)})" 63 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/queueable.py: -------------------------------------------------------------------------------- 1 | from asyncio import Queue 2 | from typing import Union 3 | 4 | import time 5 | 6 | 7 | class Queueable: 8 | queue_put_time = None 9 | queue_get_time = None 10 | # Default lowest queue priority is 100 (higher number means lower priority) 11 | priority = 100 12 | 13 | def get_queue_wait_time(self) -> Union[float, None]: 14 | """ 15 | Get the time in Milliseconds that this object has been on the queue. 16 | 17 | :return: Queue wait time in Milliseconds as float 18 | """ 19 | # Only set queue_get_time if not already set, so that the value of this method doesn't change each time 20 | # it's called. 21 | if not self.queue_get_time: 22 | self.queue_get_time = time.perf_counter() 23 | if self.queue_put_time: 24 | return (self.queue_get_time - self.queue_put_time) * 1000 25 | return None 26 | 27 | def set_queue_put_time(self) -> None: 28 | """ 29 | Set the time that this object was put onto the queue. 30 | """ 31 | # Set queue_get_time to None, because this method is called whenever a Queueable is added to the queue 32 | # and it may be added to a queue multiple times in it's life. 
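        # Resetting queue_get_time ensures that get_queue_wait_time() measures only the most recent put/get cycle.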
33 | self.queue_get_time = None 34 | self.queue_put_time = time.perf_counter() 35 | 36 | def add_to_queue(self, queue: Queue) -> None: 37 | """ 38 | Add the Queueable to the queue and set the queue put time. 39 | 40 | :param queue: An Queue instance 41 | """ 42 | self.set_queue_put_time() 43 | queue.put_nowait(self) 44 | 45 | def __lt__(self, other) -> bool: 46 | """ 47 | Compare Queueable priority for Queue ordering. 48 | Lower priority has precedence in the Queue. 49 | 50 | :param other: Another Queueable object 51 | :return: boolean 52 | """ 53 | if not isinstance(other, Queueable): 54 | return True 55 | return self.priority < other.priority 56 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [master] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [master] 14 | schedule: 15 | - cron: '0 23 * * 6' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v2 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # If this run was triggered by a pull request event, then checkout 40 | # the head of the pull request instead of the merge commit. 41 | - run: git checkout HEAD^2 42 | if: ${{ github.event_name == 'pull_request' }} 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /feedsearch_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from xml.etree import ElementTree 4 | from typing import List, Union 5 | 6 | from yarl import URL 7 | 8 | from feedsearch_crawler.feed_spider import FeedsearchSpider, FeedInfo 9 | 10 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 11 | 12 | name = "Feedsearch Crawler" 13 | 14 | 15 | def search( 16 | url: Union[URL, str, List[Union[URL, str]]], 17 | try_urls: Union[List[str], bool] = False, 18 | *args, 19 | **kwargs 20 | ) -> List[FeedInfo]: 21 | """ 22 | Search for feeds at a URL. 23 | 24 | :param url: URL or list of URLs to search 25 | :param try_urls: Tries different paths that may contain feeds. 26 | :return: List of FeedInfo objects 27 | """ 28 | results = asyncio.run(search_async(url, try_urls=try_urls, *args, **kwargs)) 29 | return results 30 | 31 | 32 | async def search_async( 33 | url: Union[URL, str, List[Union[URL, str]]], 34 | try_urls: Union[List[str], bool] = False, 35 | *args, 36 | **kwargs 37 | ) -> List[FeedInfo]: 38 | """ 39 | Search asynchronously for feeds at a URL. 40 | 41 | :param url: URL or list of URLs to search 42 | :param try_urls: Tries different paths that may contain feeds. 43 | :return: List of FeedInfo objects 44 | """ 45 | crawler = FeedsearchSpider(try_urls=try_urls, *args, **kwargs) 46 | await crawler.crawl(url) 47 | 48 | return sort_urls(list(crawler.items)) 49 | 50 | 51 | def sort_urls(feeds: List[FeedInfo]) -> List[FeedInfo]: 52 | """ 53 | Sort list of feeds based on Url score 54 | 55 | :param feeds: List of FeedInfo objects 56 | :return: List of FeedInfo objects sorted by score 57 | """ 58 | feeds = [f for f in feeds if isinstance(f, FeedInfo)] 59 | sorted_urls = sorted(list(set(feeds)), key=lambda x: x.score, reverse=True) 60 | return sorted_urls 61 | 62 | 63 | def output_opml(feeds: List[FeedInfo]) -> bytes: 64 | """ 65 | Return feeds as a subscriptionlist OPML file. 
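    Each feed is written as an outline element of type "rss" with an xmlUrl attribute, plus text, title, htmlUrl, description, and version attributes when available.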
66 | http://dev.opml.org/spec2.html#subscriptionLists 67 | 68 | :param feeds: List of FeedInfo objects 69 | :return: OPML file as XML bytestring 70 | """ 71 | root = ElementTree.Element("opml", version="2.0") 72 | head = ElementTree.SubElement(root, "head") 73 | title = ElementTree.SubElement(head, "title") 74 | title.text = "Feeds" 75 | body = ElementTree.SubElement(root, "body") 76 | 77 | for feed in feeds: 78 | if not feed.url: 79 | continue 80 | 81 | fe = ElementTree.SubElement(body, "outline", type="rss", xmlUrl=str(feed.url)) 82 | 83 | if feed.title: 84 | fe.set("text", feed.title) 85 | fe.set("title", feed.title) 86 | if feed.site_url: 87 | fe.set("htmlUrl", str(feed.site_url)) 88 | if feed.description: 89 | fe.set("description", feed.description) 90 | if feed.version: 91 | fe.set("version", feed.version) 92 | 93 | return ElementTree.tostring(root, encoding="utf8", method="xml") 94 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/lib.py: -------------------------------------------------------------------------------- 1 | import cgi 2 | from datetime import datetime 3 | from typing import Union, List 4 | 5 | from dateutil import tz, parser 6 | from yarl import URL 7 | 8 | 9 | class ParseTypes: 10 | JSON = "json" 11 | XML = "xml" 12 | 13 | 14 | def get_site_root(url: Union[str, URL]) -> str: 15 | """ 16 | Find the root domain of a url 17 | """ 18 | if isinstance(url, URL): 19 | return url.host 20 | return URL(url).host 21 | 22 | 23 | def create_allowed_domains(url: Union[str, URL]) -> List[str]: 24 | if isinstance(url, URL): 25 | return [url.host] 26 | return [URL(url).host] 27 | 28 | 29 | def parse_header_links(value): 30 | """ 31 | Return a list of Dicts of parsed link headers proxies. 32 | i.e. Link: ; rel=front; type="image/jpeg", 33 | ; rel=back;type="image/jpeg" 34 | 35 | :param value: HTTP Link header to parse 36 | :return: List of Dicts 37 | """ 38 | 39 | links = [] 40 | 41 | replace_chars = " '\"" 42 | 43 | for val in value.split(","): 44 | try: 45 | url, params = val.split(";", 1) 46 | except ValueError: 47 | url, params = val, "" 48 | 49 | link = {"url": url.strip("<> '\"")} 50 | 51 | for param in params.split(";"): 52 | try: 53 | key, value = param.split("=") 54 | except ValueError: 55 | break 56 | 57 | link[key.strip(replace_chars)] = value.strip(replace_chars) 58 | 59 | links.append(link) 60 | 61 | return links 62 | 63 | 64 | def force_utc(dt: datetime) -> datetime: 65 | """ 66 | Change a datetime to UTC, and convert naive datetimes to tz-aware UTC. 67 | 68 | :param dt: datetime to change to UTC 69 | :return: tz-aware UTC datetime 70 | """ 71 | if dt.tzinfo is None: 72 | dt = dt.replace(tzinfo=tz.tzutc()) 73 | return dt.astimezone(tz.tzutc()) 74 | 75 | 76 | def datestring_to_utc_datetime(date_string: str) -> datetime: 77 | """ 78 | Convert a date string to a tz-aware UTC datetime. 79 | 80 | :param date_string: A datetime as a string in almost any format. 81 | :return: tz-aware UTC datetime 82 | """ 83 | dt = parser.parse(date_string) 84 | return force_utc(dt) 85 | 86 | 87 | def create_content_type(parse_type: str, encoding: str, content_type: str) -> str: 88 | """ 89 | Create the actual content type of the feed. 90 | 91 | :param parse_type: How the feed is being parsed. 
XML or JSON 92 | :param encoding: Charset encoding of the response 93 | :param content_type: Content-Type header string of the response 94 | :return: Content-Type string 95 | """ 96 | ctype, pdict = cgi.parse_header(content_type) 97 | 98 | if parse_type == ParseTypes.JSON and ParseTypes.JSON not in ctype.lower(): 99 | ctype = "application/json" 100 | elif parse_type == ParseTypes.XML and ParseTypes.XML not in ctype.lower(): 101 | ctype = "application/xml" 102 | 103 | return f"{ctype}; charset={encoding}".lower() 104 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/trace.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | 4 | import aiohttp 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | async def on_request_start(session, trace_config_ctx, params): 10 | loop = asyncio.get_event_loop() 11 | trace_config_ctx.start = loop.time() 12 | logger.debug("Request Start: %s", params.url) 13 | 14 | 15 | async def on_request_end(session, trace_config_ctx, params): 16 | loop = asyncio.get_event_loop() 17 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 18 | logger.debug("Request END: %s %s %dms", params.url, params.response.url, elapsed) 19 | 20 | 21 | async def on_connection_create_start(session, trace_config_ctx, params): 22 | loop = asyncio.get_event_loop() 23 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 24 | logger.debug("Connection create Start: %dms", elapsed) 25 | 26 | 27 | async def on_connection_create_end(session, trace_config_ctx, params): 28 | loop = asyncio.get_event_loop() 29 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 30 | logger.debug("Connection create END: %dms", elapsed) 31 | 32 | 33 | async def on_dns_resolvehost_start(session, trace_config_ctx, params): 34 | loop = asyncio.get_event_loop() 35 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 36 | logger.debug("DNS Resolve Host Start: %s %dms", params.host, elapsed) 37 | 38 | 39 | async def on_dns_resolvehost_end(session, trace_config_ctx, params): 40 | loop = asyncio.get_event_loop() 41 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 42 | logger.debug("DNS Resolve Host END: %s %dms", params.host, elapsed) 43 | 44 | 45 | async def on_dns_cache_hit(session, trace_config_ctx, params): 46 | loop = asyncio.get_event_loop() 47 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 48 | logger.debug("DNS Cache Hit: %s %dms", params.host, elapsed) 49 | 50 | 51 | async def on_dns_cache_miss(session, trace_config_ctx, params): 52 | loop = asyncio.get_event_loop() 53 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 54 | logger.debug("DNS Cache Miss: %s %dms", params.host, elapsed) 55 | 56 | 57 | async def on_request_redirect(session, trace_config_ctx, params): 58 | loop = asyncio.get_event_loop() 59 | elapsed = int((loop.time() - trace_config_ctx.start) * 1000) 60 | logger.debug( 61 | "Request redirect: %s %s %dms", params.url, params.response.url, elapsed 62 | ) 63 | 64 | 65 | def add_trace_config(): 66 | trace_config = aiohttp.TraceConfig() 67 | trace_config.on_request_start.append(on_request_start) 68 | trace_config.on_dns_resolvehost_start.append(on_dns_resolvehost_start) 69 | trace_config.on_dns_cache_hit.append(on_dns_cache_hit) 70 | trace_config.on_dns_cache_miss.append(on_dns_cache_miss) 71 | trace_config.on_dns_resolvehost_end.append(on_dns_resolvehost_end) 72 | 
trace_config.on_request_end.append(on_request_end) 73 | trace_config.on_request_redirect.append(on_request_redirect) 74 | trace_config.on_connection_create_start.append(on_connection_create_start) 75 | trace_config.on_connection_create_end.append(on_connection_create_end) 76 | return trace_config 77 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/response.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import List, Dict, Any, Optional 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler.lib import is_same_domain 7 | 8 | 9 | class Response: 10 | _xml = None 11 | 12 | def __init__( 13 | self, 14 | url: URL, 15 | method: str, 16 | encoding: str = "", 17 | text: str = "", 18 | json: Dict = None, 19 | data: bytes = b"", 20 | history: List[URL] = None, 21 | headers=None, 22 | status_code: int = -1, 23 | cookies=None, 24 | xml_parser=None, 25 | redirect_history=None, 26 | content_length: int = 0, 27 | meta: Dict = None, 28 | ): 29 | self.url = url 30 | self.encoding = encoding 31 | self.method = method 32 | self.text = text 33 | self.json = json 34 | self.data = data 35 | self.history = history or [] 36 | self.headers = headers or {} 37 | self.status_code = status_code 38 | self.cookies = cookies 39 | self.id = uuid.uuid4() 40 | self._xml_parser = xml_parser 41 | self.redirect_history = redirect_history 42 | self.content_length = content_length 43 | self.meta = meta 44 | self.origin: URL = url.origin() 45 | 46 | @property 47 | def ok(self) -> bool: 48 | return self.status_code == 0 or 200 <= self.status_code <= 299 49 | 50 | @property 51 | def domain(self) -> str: 52 | return self.url.host 53 | 54 | @property 55 | def scheme(self) -> str: 56 | return self.url.scheme 57 | 58 | @property 59 | def previous_domain(self) -> str: 60 | if not self.history: 61 | return "" 62 | return self.history[-1].host 63 | 64 | @property 65 | def originator_url(self) -> Optional[URL]: 66 | if not self.history or len(self.history) == 1: 67 | return None 68 | return self.history[-2] 69 | 70 | @property 71 | async def xml(self) -> Any: 72 | if self._xml: 73 | return self._xml 74 | 75 | if not self._xml_parser: 76 | return None 77 | 78 | if not self.text and self.data and self.encoding: 79 | self.text = self.data.decode(self.encoding) 80 | 81 | self._xml = await self._xml_parser(self.text) 82 | return self._xml 83 | 84 | def is_max_depth_reached(self, max_depth: int) -> bool: 85 | """ 86 | Check if the max response depth has been reached. 87 | 88 | :param max_depth: Max length of response history 89 | :return: boolean 90 | """ 91 | if max_depth and len(self.history) >= max_depth: 92 | return True 93 | return False 94 | 95 | def is_original_domain(self) -> bool: 96 | """ 97 | Check if this response is still at the original domain in the response chain. 
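        The first Response in the chain always counts as original, as do sub-domains of the first requested domain (checked via is_same_domain).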
98 | 99 | :return: boolean 100 | """ 101 | # This is the first Response in the chain 102 | if len(self.history) < 2: 103 | return True 104 | # URL is same domain or sub-domain 105 | if is_same_domain(self.history[0].host, self.url.host): 106 | return True 107 | 108 | return False 109 | 110 | def __repr__(self): 111 | return f"{self.__class__.__name__}({str(self.url)})" 112 | -------------------------------------------------------------------------------- /tests/feed_spider/link_filter_test.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | 3 | from feedsearch_crawler.feed_spider.link_filter import ( 4 | LinkFilter as lf, 5 | ) 6 | from feedsearch_crawler.feed_spider.regexes import feedlike_regex, podcast_regex 7 | 8 | 9 | def test_feedlike_regex(): 10 | valid = [ 11 | "rss", 12 | "testing/rss", 13 | "testing/rss-test", 14 | "test-rss-test", 15 | "test.rss.test", 16 | "RSS", 17 | "test/RSS/test", 18 | "feed", 19 | "testing/feed/", 20 | "test-feed-test", 21 | "test.feed.test", 22 | "FEED", 23 | "FeeD", 24 | "test/FEED/test", 25 | "feeds", 26 | "testing/feeds", 27 | "test-feeds-test", 28 | "test.feeds.test", 29 | "FEEDS", 30 | "FeedS", 31 | "test/FEEDS/test", 32 | "atom", 33 | "json", 34 | "xml", 35 | "rdf", 36 | "blog", 37 | "blogs", 38 | "test/subscribe/testing" 39 | ] 40 | for value in valid: 41 | assert feedlike_regex.search(value) 42 | 43 | 44 | def test_feedlike_regex_invalid(): 45 | invalid = ["rsss", "rs-s", "feedss", "tfeed", "fee-d", "fee.d"] 46 | for value in invalid: 47 | assert not feedlike_regex.search(value) 48 | 49 | 50 | def test_podcast_regex(): 51 | pass 52 | 53 | 54 | def test_is_feedlike_href(): 55 | assert lf.is_href_matching("test.com/feed", feedlike_regex) is True 56 | assert lf.is_href_matching("feed", feedlike_regex) is True 57 | assert lf.is_href_matching("feeds", feedlike_regex) is True 58 | assert lf.is_href_matching("test.com/feeds", feedlike_regex) is True 59 | assert lf.is_href_matching("test.com/feeds/test", feedlike_regex) is True 60 | assert lf.is_href_matching("test.com/podcasts/test", feedlike_regex) is False 61 | assert lf.is_href_matching("test.com/podcast/test", feedlike_regex) is False 62 | assert lf.is_href_matching("test.com/podcasts", feedlike_regex) is False 63 | assert lf.is_href_matching("test.com/podcast", feedlike_regex) is False 64 | 65 | 66 | def test_is_feedlike_querystring(): 67 | assert lf.is_querystring_matching(URL("test.com?feed"), feedlike_regex) is True 68 | assert lf.is_querystring_matching(URL("test.com/test?feed"), feedlike_regex) is True 69 | assert ( 70 | lf.is_querystring_matching( 71 | URL("test.com/test?url=feed&test=true"), feedlike_regex 72 | ) 73 | is False 74 | ) 75 | assert ( 76 | lf.is_querystring_matching(URL("test.com/test?url=feed"), feedlike_regex) 77 | is False 78 | ) 79 | assert ( 80 | lf.is_querystring_matching(URL("test.com/feed?url=test"), feedlike_regex) 81 | is False 82 | ) 83 | assert ( 84 | lf.is_querystring_matching(URL("test.com/test?feed=test"), feedlike_regex) 85 | is True 86 | ) 87 | assert ( 88 | lf.is_querystring_matching(URL("test.com?podcast=test"), feedlike_regex) 89 | is False 90 | ) 91 | assert ( 92 | lf.is_querystring_matching(URL("test.com?feeds=test"), feedlike_regex) is True 93 | ) 94 | assert ( 95 | lf.is_querystring_matching(URL("test.com?podcasts=test"), feedlike_regex) 96 | is False 97 | ) 98 | 99 | 100 | def test_is_podcast_href(): 101 | assert lf.is_href_matching("test.com/podcasts/test", podcast_regex) is True 102 | assert 
lf.is_href_matching("test.com/podcast/test", podcast_regex) is True 103 | assert lf.is_href_matching("test.com/podcasts", podcast_regex) is True 104 | assert lf.is_href_matching("test.com/podcast", podcast_regex) is True 105 | 106 | 107 | def test_is_podcast_querystring(): 108 | assert ( 109 | lf.is_querystring_matching(URL("test.com?podcast=test"), podcast_regex) is True 110 | ) 111 | assert ( 112 | lf.is_querystring_matching(URL("test.com?podcasts=test"), podcast_regex) is True 113 | ) 114 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/site_meta_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from yarl import URL 5 | 6 | from feedsearch_crawler.crawler import ItemParser, Request, Response 7 | from feedsearch_crawler.crawler.lib import remove_www 8 | from feedsearch_crawler.feed_spider.favicon import Favicon 9 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class SiteMetaParser(ItemParser): 15 | async def parse_item(self, request: Request, response: Response, *args, **kwargs): 16 | logger.info("Parsing: SiteMeta %s", response.url) 17 | url = response.url 18 | site_meta: SiteMeta = SiteMeta(url) 19 | 20 | xml = await response.xml 21 | if not xml: 22 | return 23 | 24 | site_meta.url = self.find_site_url(xml, url) 25 | site_meta.host = remove_www(site_meta.url.host) 26 | site_meta.site_name = self.find_site_name(xml) 27 | site_meta.possible_icons = self.find_site_icon_urls(xml, url, site_meta.host) 28 | 29 | for icon in site_meta.possible_icons: 30 | if icon.url: 31 | # Only follow favicon urls if we want to create a data uri 32 | if self.crawler.favicon_data_uri: 33 | yield self.follow( 34 | icon.url, 35 | self.crawler.parse_favicon_data_uri, 36 | cb_kwargs=dict(favicon=icon), 37 | allow_domain=True, 38 | max_content_length=51200, 39 | ) 40 | else: 41 | yield icon 42 | 43 | yield site_meta 44 | 45 | @staticmethod 46 | def find_site_icon_urls(soup, url, host) -> List[Favicon]: 47 | search_icons = [ 48 | Favicon( 49 | url=url.join(URL("favicon.ico")), 50 | rel="favicon", 51 | priority=3, 52 | site_host=host, 53 | ), 54 | Favicon(url="", rel="shortcut icon", priority=1, site_host=host), 55 | Favicon(url="", rel="icon", priority=2, site_host=host), 56 | ] 57 | 58 | possible_icons = [] 59 | for icon in search_icons: 60 | link = soup.find(name="link", rel=icon.rel) 61 | if link: 62 | href = link.get("href", None) 63 | if href: 64 | icon.url = url.join(URL(href)) 65 | if icon.url: 66 | possible_icons.append(icon) 67 | 68 | return sorted(possible_icons, key=lambda x: x.priority) 69 | 70 | @staticmethod 71 | def find_site_url(soup, url: URL) -> URL: 72 | """ 73 | Attempts to find the canonical Url of the Site 74 | 75 | :param soup: BeautifulSoup of site 76 | :param url: Current Url of site 77 | :return: str 78 | """ 79 | try: 80 | canonical = soup.find(name="link", rel="canonical") 81 | site = canonical.get("href") 82 | if site: 83 | if site.strip() == "/": 84 | return url 85 | return URL(site).origin() 86 | except (AttributeError, ValueError): 87 | pass 88 | 89 | try: 90 | meta = soup.find(name="meta", property="og:url") 91 | site = meta.get("content") 92 | if site: 93 | if site.strip() == "/": 94 | return url 95 | return URL(site).origin() 96 | except (AttributeError, ValueError): 97 | pass 98 | 99 | return url.origin() 100 | 101 | @staticmethod 102 | def 
find_site_name(soup) -> str: 103 | """ 104 | Attempts to find Site Name 105 | 106 | :param soup: BeautifulSoup of site 107 | :return: str 108 | """ 109 | site_name_meta = [ 110 | "og:site_name", 111 | "og:title", 112 | "application:name", 113 | "twitter:app:name:iphone", 114 | ] 115 | 116 | for p in site_name_meta: 117 | try: 118 | name = soup.find(name="meta", property=p).get("content") 119 | if name: 120 | return name 121 | except AttributeError: 122 | pass 123 | 124 | try: 125 | title = soup.find(name="title").text 126 | if title: 127 | return title 128 | except AttributeError: 129 | pass 130 | 131 | return "" 132 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import json 4 | import time 5 | from pprint import pprint 6 | from feedsearch_crawler import search, FeedsearchSpider, output_opml, sort_urls 7 | from feedsearch_crawler.crawler import coerce_url 8 | from datetime import datetime 9 | import collections 10 | 11 | urls = [ 12 | # "arstechnica.com", 13 | # "https://davidbeath.com", 14 | # "http://xkcd.com", 15 | # "http://jsonfeed.org", 16 | # "en.wikipedia.com", 17 | # "scientificamerican.com", 18 | # "newyorktimes.com", 19 | # "https://www.dancarlin.com", 20 | # "https://www.hanselminutes.com/", 21 | # "nytimes.com", 22 | # "https://www.jeremydaly.com/serverless-microservice-patterns-for-aws/", 23 | # "feedhandbook.com", 24 | # "https://americanaffairsjournal.org/2019/05/ubers-path-of-destruction/", 25 | # "localhost:8080/test", 26 | # "theatlantic.com", 27 | # "nypost.com", 28 | # "https://www.washingtonpost.com", 29 | # "localhost:5000", 30 | # "latimes.com", 31 | # "http://feeds.washingtonpost.com/rss/rss_fact-checker?noredirect=on", 32 | # "http://tabletopwhale.com/index.html" 33 | # "www.vanityfair.com", 34 | # "bloomberg.com", 35 | # "http://www.bloomberg.com/politics/feeds/site.xml", 36 | # "propublica.org" 37 | # "npr.org", 38 | # "rifters.com", 39 | # "https://www.bbc.co.uk/podcasts" 40 | # "https://www.bbc.co.uk/programmes/p02nrsln/episodes/downloads", 41 | # "https://breebird33.tumblr.com/", 42 | # "https://neurocorp.tumblr.com/", 43 | # "https://breebird33.tumblr.com/rss" 44 | # "https://resel.fr/rss-news" 45 | # "https://muhammadraza.me" 46 | # "https://www.franceinter.fr/rss/a-la-une.xml", 47 | # "harpers.org", 48 | # "slashdot.com", 49 | # "https://bearblog.dev", 50 | # "aeon.co", 51 | # "https://davidgerard.co.uk/blockchain/" 52 | # "raymii.org/s/" 53 | # "stratechery.com", 54 | # "www.internet-law.de", 55 | # "https://medium.com/zendesk-engineering/the-joys-of-story-estimation-cda0cd807903", 56 | # "https://danwang.co/", 57 | "http://matthewdickens.me/podcasts/TWIS-feed.xml" 58 | ] 59 | 60 | 61 | def get_pretty_print(json_object: object): 62 | return json.dumps(json_object, sort_keys=True, indent=2, separators=(",", ": ")) 63 | 64 | 65 | # @profile() 66 | def run_crawl(): 67 | # user_agent = "Mozilla/5.0 (Compatible; Bot)" 68 | user_agent = "Mozilla/5.0 (Compatible; Feedsearch Bot)" 69 | # user_agent = "curl/7.58.0" 70 | # user_agent = ( 71 | # "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0" 72 | # ) 73 | # user_agent = ( 74 | # "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 75 | # ) 76 | 77 | # headers = { 78 | # "User-Agent": user_agent, 79 | # "DNT": "1", 80 | # "Upgrade-Insecure-Requests": "1", 81 | # "Accept-Language": 
"en-US,en;q=0.5", 82 | # "Accept-Encoding": "gzip, deflate, br", 83 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 84 | # "Referrer": "https://www.google.com/", 85 | # } 86 | 87 | crawler = FeedsearchSpider( 88 | concurrency=10, 89 | total_timeout=30, 90 | request_timeout=30, 91 | user_agent=user_agent, 92 | # headers=headers, 93 | favicon_data_uri=False, 94 | max_depth=5, 95 | max_retries=3, 96 | ssl=True, 97 | full_crawl=False, 98 | delay=0, 99 | try_urls=True, 100 | ) 101 | crawler.start_urls = urls 102 | # crawler.allowed_domains = create_allowed_domains(urls) 103 | asyncio.run(crawler.crawl()) 104 | # asyncio.run(crawler.crawl(urls[0])) 105 | # items = search(urls, crawl_hosts=True) 106 | 107 | items = sort_urls(list(crawler.items)) 108 | 109 | serialized = [item.serialize() for item in items] 110 | 111 | # items = search(urls[0], concurrency=40, try_urls=False, favicon_data_uri=False) 112 | # serialized = [item.serialize() for item in items] 113 | 114 | results = get_pretty_print(serialized) 115 | print(results) 116 | 117 | site_metas = [item.serialize() for item in crawler.site_metas] 118 | metas = get_pretty_print(site_metas) 119 | print(metas) 120 | # pprint(site_metas) 121 | 122 | pprint(crawler.favicons) 123 | pprint(crawler._duplicate_filter.fingerprints) 124 | 125 | print(output_opml(items).decode()) 126 | 127 | pprint([result["url"] for result in serialized]) 128 | pprint(crawler.get_stats()) 129 | 130 | print(f"Feeds found: {len(items)}") 131 | print(f"SiteMetas: {len(crawler.site_metas)}") 132 | print(f"Favicons fetched: {len(crawler.favicons)}") 133 | # pprint(crawler.queue_wait_times) 134 | pprint(list((x.score, x.url) for x in items)) 135 | 136 | 137 | def create_allowed_domains(urls): 138 | domain_patterns = [] 139 | for url in urls: 140 | url = coerce_url(url) 141 | host = url.host 142 | pattern = f"*.{host}" 143 | domain_patterns.append(host) 144 | domain_patterns.append(pattern) 145 | return domain_patterns 146 | 147 | 148 | if __name__ == "__main__": 149 | logger = logging.getLogger("feedsearch_crawler") 150 | logger.setLevel(logging.DEBUG) 151 | ch = logging.StreamHandler() 152 | ch.setLevel(logging.DEBUG) 153 | formatter = logging.Formatter( 154 | "%(asctime)s - %(levelname)s - %(name)s - %(message)s [in %(pathname)s:%(lineno)d]" 155 | ) 156 | ch.setFormatter(formatter) 157 | fl = logging.FileHandler( 158 | f"/home/dbeath/code/feedsearch-crawler/logs/feedsearch_crawl_{datetime.utcnow().isoformat()}" 159 | ) 160 | fl.setLevel((logging.DEBUG)) 161 | fl.setFormatter(formatter) 162 | logger.addHandler(ch) 163 | logger.addHandler(fl) 164 | 165 | start = time.perf_counter() 166 | run_crawl() 167 | duration = int((time.perf_counter() - start) * 1000) 168 | print(f"Entire process ran in {duration}ms") 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feedsearch Crawler 2 | [![PyPI](https://img.shields.io/pypi/v/feedsearch-crawler.svg)](https://pypi.org/project/feedsearch-crawler/) 3 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/feedsearch-crawler.svg) 4 | ![PyPI - License](https://img.shields.io/pypi/l/feedsearch-crawler.svg) 5 | 6 | Feedsearch Crawler is a Python library for searching websites for [RSS](https://en.wikipedia.org/wiki/RSS), [Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)), and [JSON](https://jsonfeed.org/) feeds. 
7 | 8 | It is a continuation of my work on [Feedsearch](https://github.com/DBeath/feedsearch), which is itself a continuation of the work done by [Dan Foreman-Mackey](http://dfm.io/) on [Feedfinder2](https://github.com/dfm/feedfinder2), which in turn is based on [feedfinder](http://www.aaronsw.com/2002/feedfinder/) - originally written by [Mark Pilgrim](http://en.wikipedia.org/wiki/Mark_Pilgrim_(software_developer)) and subsequently maintained by 9 | [Aaron Swartz](http://en.wikipedia.org/wiki/Aaron_Swartz) until his untimely death. 10 | 11 | Feedsearch Crawler differs with all of the above in that it is now built as an asynchronous [Web crawler](https://en.wikipedia.org/wiki/Web_crawler) for [Python 3.7](https://www.python.org/downloads/release/python-370/) and above, using [asyncio](https://docs.python.org/3/library/asyncio.html) and [aiohttp](https://aiohttp.readthedocs.io/en/stable/), to allow much more rapid scanning of possible feed urls. 12 | 13 | An implementation using this library to provide a public Feed Search API is available at https://feedsearch.dev 14 | 15 | Pull requests and suggestions are welcome. 16 | 17 | ## Installation 18 | The library is available on [PyPI](https://pypi.org/project/feedsearch-crawler/): 19 | 20 | ``` 21 | pip install feedsearch-crawler 22 | ``` 23 | 24 | The library requires Python 3.7+. 25 | 26 | ## Usage 27 | Feedsearch Crawler is called with the single function ``search``: 28 | 29 | ``` python 30 | >>> from feedsearch_crawler import search 31 | >>> feeds = search('xkcd.com') 32 | >>> feeds 33 | [FeedInfo('https://xkcd.com/rss.xml'), FeedInfo('https://xkcd.com/atom.xml')] 34 | >>> feeds[0].url 35 | URL('https://xkcd.com/rss.xml') 36 | >>> str(feeds[0].url) 37 | 'https://xkcd.com/rss.xml' 38 | >>> feeds[0].serialize() 39 | {'url': 'https://xkcd.com/rss.xml', 'title': 'xkcd.com', 'version': 'rss20', 'score': 24, 'hubs': [], 'description': 'xkcd.com: A webcomic of romance and math humor.', 'is_push': False, 'self_url': '', 'favicon': 'https://xkcd.com/s/919f27.ico', 'content_type': 'text/xml; charset=UTF-8', 'bozo': 0, 'site_url': 'https://xkcd.com/', 'site_name': 'xkcd: Chernobyl', 'favicon_data_uri': '', 'content_length': 2847} 40 | ``` 41 | 42 | If you are already running in an [asyncio event loop](https://docs.python.org/3/library/asyncio-eventloop.html), then you can import and await ``search_async`` instead. The ``search`` function is only a wrapper that runs ``search_async`` in a new asyncio event loop. 43 | 44 | ``` python 45 | from feedsearch_crawler import search_async 46 | 47 | feeds = await search_async('xkcd.com') 48 | ``` 49 | 50 | A search will always return a list of *FeedInfo* objects, each of which will always have a *url* property, which is a [URL](https://yarl.readthedocs.io/en/latest/api.html) object that can be decoded to a string with ``str(url)``. 51 | The returned *FeedInfo* are sorted by the *score* value from highest to lowest, with a higher score theoretically indicating a more relevant feed compared to the original URL provided. A *FeedInfo* can also be serialized to a JSON compatible dictionary by calling it's ``.serialize()`` method. 52 | 53 | The crawl logs can be accessed with: 54 | 55 | ``` python 56 | import logging 57 | 58 | logger = logging.getLogger("feedsearch_crawler") 59 | ``` 60 | 61 | Feedsearch Crawler also provides a handy function to output the returned feeds as an [OPML](https://en.wikipedia.org/wiki/OPML) subscription list, encoded as a UTF-8 bytestring. 
62 | 63 | ``` python 64 | from feedsearch_crawler import output_opml 65 | 66 | output_opml(feeds).decode() 67 | ``` 68 | 69 | ## Search Arguments 70 | ``search`` and ``search_async`` take the following arguments: 71 | 72 | ``` python 73 | search( 74 | url: Union[URL, str, List[Union[URL, str]]], 75 | crawl_hosts: bool=True, 76 | try_urls: Union[List[str], bool]=False, 77 | concurrency: int=10, 78 | total_timeout: Union[float, aiohttp.ClientTimeout]=10, 79 | request_timeout: Union[float, aiohttp.ClientTimeout]=3, 80 | user_agent: str="Feedsearch Bot", 81 | max_content_length: int=1024 * 1024 * 10, 82 | max_depth: int=10, 83 | headers: dict={"X-Custom-Header": "Custom Header"}, 84 | favicon_data_uri: bool=True, 85 | delay: float=0 86 | ) 87 | ``` 88 | 89 | - **url**: *Union[str, List[str]]*: The initial URL or list of URLs at which to search for feeds. You may also provide [URL](https://yarl.readthedocs.io/en/latest/api.html) objects. 90 | - **crawl_hosts**: *bool*: (default True): An optional argument to add the site host origin URL to the list of initial crawl URLs. (e.g. add "example.com" if crawling "example.com/path/rss.xml"). If **False**, site metadata and favicon data may not be found. 91 | - **try_urls**: *Union[List[str], bool]*: (default False): An optional list of URL paths to query for feeds. Takes the origins of the *url* parameter and appends the provided paths. If no list is provided, but *try_urls* is **True**, then a list of common feed locations will be used. 92 | - **concurrency**: *int*: (default 10): An optional argument to specify the maximum number of concurrent HTTP requests. 93 | - **total_timeout**: *float*: (default 30.0): An optional argument to specify the time this function may run before timing out. 94 | - **request_timeout**: *float*: (default 3.0): An optional argument that controls how long before each individual HTTP request times out. 95 | - **user_agent**: *str*: An optional argument to override the default User-Agent header. 96 | - **max_content_length**: *int*: (default 10Mb): An optional argument to specify the maximum size in bytes of each HTTP Response. 97 | - **max_depth**: *int*: (default 10): An optional argument to limit the maximum depth of requests while following urls. 98 | - **headers**: *dict*: An optional dictionary of headers to pass to each HTTP request. 99 | - **favicon_data_uri**: *bool*: (default True): Optionally control whether to fetch found favicons and return them as a Data Uri. 100 | - **delay**: *float*: (default 0.0): An optional argument to delay each HTTP request by the specified time in seconds. Used in conjunction with the concurrency setting to avoid overloading sites. 101 | 102 | ## FeedInfo Values 103 | In addition to the *url*, FeedInfo objects may have the following values: 104 | 105 | - **bozo**: *int*: Set to 1 when feed data is not well formed or may not be a feed. Defaults 0. 106 | - **content_length**: *int*: Current length of the feed in bytes. 107 | - **content_type**: *str*: [Content-Type](https://en.wikipedia.org/wiki/Media_type) value of the returned feed. 108 | - **description**: *str*: Feed description. 109 | - **favicon**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) of feed or site [Favicon](https://en.wikipedia.org/wiki/Favicon). 110 | - **favicon_data_uri**: *str*: [Data Uri](https://en.wikipedia.org/wiki/Data_URI_scheme) of Favicon. 111 | - **hubs**: *List[str]*: List of [Websub](https://en.wikipedia.org/wiki/WebSub) hubs of feed if available. 
112 | - **is_podcast**: *bool*: True if the feed contains valid [podcast](https://en.wikipedia.org/wiki/Podcast) elements and enclosures. 113 | - **is_push**: *bool*: True if feed contains valid Websub data. 114 | - **item_count**: *int*: Number of items currently in the feed. 115 | - **last_updated**: *datetime*: Date of the latest published entry. 116 | - **score**: *int*: Computed relevance of feed url value to provided URL. May be safely ignored. 117 | - **self_url**: *URL*: *ref="self"* value returned from feed links. In some cases may be different from feed url. 118 | - **site_name**: *str*: Name of feed's website. 119 | - **site_url**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) of feed's website. 120 | - **title**: *str*: Feed Title. 121 | - **url**: *URL*: [URL](https://yarl.readthedocs.io/en/latest/api.html) location of feed. 122 | - **velocity**: *float*: Mean number of items per day in the feed at the current time. 123 | - **version**: *str*: Feed version [XML values](https://pythonhosted.org/feedparser/version-detection.html), 124 | or [JSON feed](https://jsonfeed.org/version/1). 125 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/link_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import re 4 | from typing import Optional, Tuple, List 5 | 6 | import bs4 7 | from w3lib.url import url_query_cleaner 8 | from yarl import URL 9 | 10 | from feedsearch_crawler.crawler import Response, Request 11 | from feedsearch_crawler.crawler.lib import parse_href_to_url 12 | from feedsearch_crawler.feed_spider.regexes import ( 13 | feedlike_regex, 14 | podcast_regex, 15 | author_regex, 16 | date_regex, 17 | ) 18 | 19 | # List of invalid filetypes 20 | invalid_filetypes: List[str] = [ 21 | "jpeg", 22 | "jpg", 23 | "png", 24 | "gif", 25 | "bmp", 26 | "mp4", 27 | "mp3", 28 | "mkv", 29 | "md", 30 | "css", 31 | "avi", 32 | "pdf", 33 | "js", 34 | "woff", 35 | "woff2", 36 | "svg", 37 | "ttf", 38 | ] 39 | 40 | # List of strings that are invalid as querystring keys 41 | invalid_querystring_keys: List[str] = ["comment", "comments", "post", "view", "theme"] 42 | 43 | # List of strings that indicate a URL is invalid for crawling 44 | invalid_url_contents: List[str] = [ 45 | "wp-includes", 46 | "wp-content", 47 | "wp-json", 48 | "xmlrpc", 49 | "wp-admin", 50 | "/amp/", # Theoretically there could be a feed at an AMP url, but not worth checking. 51 | "mailto:", 52 | "//font.", 53 | ] 54 | 55 | # List of strings that indicate a URL should be low priority 56 | low_priority_urls: List[str] = [ 57 | "/archive/", # Archives are less likely to contain feeds. 58 | "/page/", # Articles pages are less likely to contain feeds. 59 | "forum", # Forums are not likely to contain interesting feeds. 60 | "//cdn.", # Can't guarantee that someone won't put a feed at a CDN url, so we can't outright ignore it. 
61 | "video", 62 | ] 63 | 64 | # Link Types that should always be searched for feeds 65 | feed_link_types: List[str] = ["application/json", "rss", "atom", "rdf"] 66 | 67 | 68 | logger = logging.getLogger(__name__) 69 | 70 | 71 | class LinkFilter: 72 | def __init__(self, response: Response, request: Request, full_crawl: bool = False): 73 | self.response = response 74 | self.request = request 75 | self.full_crawl = full_crawl 76 | 77 | def should_follow_link(self, link: bs4.Tag) -> Optional[Tuple[URL, int]]: 78 | """ 79 | Check that the link should be followed if it may contain feed information. 80 | 81 | :param link: Link tag 82 | :return: boolean 83 | """ 84 | href: str = link.get("href") 85 | link_type: str = link.get("type") 86 | 87 | url: URL = parse_href_to_url(href) 88 | if not url: 89 | return None 90 | 91 | # If the link may have a valid feed type then follow it regardless of the url text. 92 | if ( 93 | link_type 94 | and any(map(link_type.lower().count, feed_link_types)) 95 | and "json+oembed" not in link_type 96 | ): 97 | # A link with a possible feed type has the highest priority after callbacks. 98 | return url, 2 99 | 100 | is_feedlike_href: bool = self.is_href_matching(str(url), feedlike_regex) 101 | is_feedlike_querystring: bool = self.is_querystring_matching( 102 | url, feedlike_regex 103 | ) 104 | 105 | is_podcast_href: bool = self.is_href_matching(str(url), podcast_regex) 106 | is_podcast_querystring: bool = self.is_querystring_matching(url, podcast_regex) 107 | 108 | is_feedlike_url = is_feedlike_querystring or is_feedlike_href 109 | is_podcast_url = is_podcast_href or is_podcast_querystring 110 | 111 | if not self.full_crawl and not is_feedlike_url and not is_podcast_url: 112 | return 113 | 114 | # This check is deprecated, as it has been moved to the spider to prevent the crawling of any links 115 | # from responses that are not the same as the original domain 116 | # 117 | # is_one_jump: bool = self.is_one_jump_from_original_domain(url, self.response) 118 | # if not is_one_jump: 119 | # return 120 | 121 | has_author_info: bool = self.is_href_matching(href, author_regex) 122 | is_low_priority: bool = self.is_low_priority(href) 123 | 124 | priority: int = Request.priority 125 | # A low priority url should be fetched last. 126 | if is_low_priority: 127 | priority = Request.priority + 2 128 | # Podcast pages are lower priority than authors or feeds. 129 | if is_podcast_url: 130 | priority = 5 131 | # Potential author info has a medium priority. 132 | if has_author_info: 133 | priority = 4 134 | # A feedlike url has high priority. 135 | if is_feedlike_url: 136 | priority = 3 137 | 138 | # Validate the actual URL string. 139 | follow = ( 140 | # is_one_jump 141 | not self.has_invalid_contents(href) 142 | and self.is_valid_filetype(href) 143 | and not self.has_invalid_querystring(url) 144 | ) 145 | # If full_crawl then follow all valid URLs regardless of the feedlike quality of the URL. 146 | # Otherwise only follow URLs if they look like they might contain feed information. 147 | if follow and (self.full_crawl or is_feedlike_url or is_podcast_href): 148 | 149 | # Remove the querystring unless it may point to a feed. 150 | if not is_feedlike_querystring: 151 | url = url.with_query(None) 152 | 153 | return url, priority 154 | 155 | @staticmethod 156 | def is_one_jump_from_original_domain(url: URL, response: Response) -> bool: 157 | """ 158 | Check that the current URL is only one response away from the originally queried domain. 
159 | 160 | We want to be able to follow potential feed links that point to a different domain than 161 | the originally queried domain, but not to follow any deeper than that. 162 | 163 | Sub-domains of the original domain are ok. 164 | 165 | i.e: the following are ok 166 | "test.com" -> "feedhost.com" 167 | "test.com/feeds" -> "example.com/feed.xml" 168 | "test.com" -> "feeds.test.com" 169 | 170 | not ok: 171 | "test.com" -> "feedhost.com" (we stop here) -> "feedhost.com/feeds" 172 | 173 | :param url: URL object or string 174 | :param response: Response object 175 | :return: boolean 176 | """ 177 | 178 | # This is the first Response in the chain 179 | if len(response.history) < 2: 180 | return True 181 | 182 | # The URL is relative, so on the same domain 183 | if not url.is_absolute(): 184 | return True 185 | 186 | # URL is same domain 187 | if url.host == response.history[0].host: 188 | return True 189 | 190 | # URL is sub-domain 191 | if response.history[0].host in url.host: 192 | return True 193 | 194 | # URL domain and current Response domain are different from original domain 195 | if ( 196 | response.history[-1].host != response.history[0].host 197 | and url.host != response.history[0].host 198 | ): 199 | return False 200 | 201 | return True 202 | 203 | @staticmethod 204 | def is_valid_filetype(url: str) -> bool: 205 | """ 206 | Check if url string has an invalid filetype extension. 207 | 208 | :param url: URL string 209 | :return: boolean 210 | """ 211 | # if file_regex.search(url.strip()): 212 | # return False 213 | # return True 214 | suffix = pathlib.Path(url_query_cleaner(url)).suffix.strip(".").lower() 215 | if suffix in invalid_filetypes: 216 | return False 217 | return True 218 | 219 | @staticmethod 220 | def has_invalid_querystring(url: URL) -> bool: 221 | """ 222 | Check if URL querystring contains invalid keys. 223 | 224 | :param url: URL object 225 | :return: boolean 226 | """ 227 | return any(key in url.query for key in invalid_querystring_keys) 228 | 229 | @staticmethod 230 | def is_href_matching(url_string: str, regex: re) -> bool: 231 | """ 232 | Check if the regex has any match in the url string. 233 | 234 | :param url_string: URL as string 235 | :param regex: Regex used to search URL 236 | :return: boolean 237 | """ 238 | if regex.search(url_query_cleaner(url_string)): 239 | return True 240 | return False 241 | 242 | @staticmethod 243 | def is_querystring_matching(url: URL, regex: re) -> bool: 244 | """ 245 | Check if the regex has any match in the URL query parameters. 246 | 247 | :param url: URL object 248 | :param regex: Regex used to search query 249 | :return: boolean 250 | """ 251 | for key in url.query: 252 | if regex.search(key): 253 | return True 254 | return False 255 | 256 | @staticmethod 257 | def has_invalid_contents(string: str) -> bool: 258 | """ 259 | Ignore any string containing the following strings. 260 | 261 | :param string: String to check 262 | :return: boolean 263 | """ 264 | return any(value in string.lower() for value in invalid_url_contents) 265 | 266 | @staticmethod 267 | def is_low_priority(url_string: str) -> bool: 268 | """ 269 | Check if the url contains any strings that indicate the url should be low priority. 270 | 271 | :param url_string: URL string 272 | :return: boolean 273 | """ 274 | if any(value in url_string.lower() for value in low_priority_urls): 275 | return True 276 | 277 | # Search for dates in url, this generally indicates an article page. 
278 | if date_regex.search(url_string): 279 | return True 280 | return False 281 | 282 | @staticmethod 283 | def is_subdomain_matching(url: URL, regex: re) -> bool: 284 | """ 285 | Check if the url subdomain matches the regex 286 | 287 | :param url: URL object 288 | :param regex: regex object 289 | :return: boolean 290 | """ 291 | if not url.host: 292 | return False 293 | 294 | split = url.host.split(".") 295 | if len(split) <= 2: 296 | return False 297 | 298 | sub_domains = ".".join(split[:-2]) 299 | if regex.search(sub_domains): 300 | return True 301 | return False 302 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/lib.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import PriorityQueue 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import Any, Union, Dict 6 | 7 | from yarl import URL 8 | 9 | from feedsearch_crawler.crawler.queueable import Queueable 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # noinspection PyUnresolvedReferences 15 | class CrawlerPriorityQueue(PriorityQueue): 16 | _unfinished_tasks: int 17 | 18 | def clear(self): 19 | """ 20 | Clear the Queue of any unfinished tasks. 21 | """ 22 | self._queue.clear() 23 | self._unfinished_tasks = 0 24 | self._finished.set() 25 | 26 | 27 | @dataclass 28 | class CallbackResult(Queueable): 29 | """Dataclass for holding callback results and recording recursion""" 30 | 31 | result: Any 32 | callback_recursion: int 33 | # CallbackResult priority is high so that we clear Callbacks off the queue and process them as fast as possible. 34 | # Otherwise the workers always process Requests and don't often process the Request results. 35 | priority = 1 36 | 37 | def __repr__(self): 38 | return f"{self.__class__.__name__}({self.result.__class__.__name__})" 39 | 40 | 41 | class Stats(Enum): 42 | # Number of Requests added to the queue. 43 | REQUESTS_QUEUED = "requests_queued" 44 | # Number of HTTP Requests that were successful (HTTP Status code 200-299). 45 | REQUESTS_SUCCESSFUL = "requests_successful" 46 | # Number of HTTP Requests that were unsuccessful (HTTP Status code not in 200s). 47 | REQUESTS_FAILED = "requests_failed" 48 | # Total size in bytes of all HTTP Responses. 49 | CONTENT_LENGTH_TOTAL = "content_length_total" 50 | # Harmonic mean of total HTTP Response content length in bytes. 51 | CONTENT_LENGTH_AVG = "content_length_avg" 52 | # Highest HTTP Response content length in bytes. 53 | CONTENT_LENGTH_MAX = "content_length_max" 54 | # Lowest HTTP Response content length in bytes. 55 | CONTENT_LENGTH_MIN = "content_length_min" 56 | # Median HTTP Response content length in bytes. 57 | CONTENT_LENGTH_MEDIAN = "content_length_med" 58 | # Number of Items processed. 59 | ITEMS_PROCESSED = "items_processed" 60 | # Number of URls seen and added to duplicate filter. 61 | URLS_SEEN = "urls_seen" 62 | # Harmonic mean of Request duration in Milliseconds. 63 | REQUESTS_DURATION_AVG = "requests_duration_avg" 64 | # Highest Request duration in Milliseconds. 65 | REQUESTS_DURATION_MAX = "requests_duration_max" 66 | # Lowest Request duration in Milliseconds. 67 | REQUESTS_DURATION_MIN = "requests_duration_min" 68 | # Total Request duration in Milliseconds. 69 | REQUESTS_DURATION_TOTAL = "requests_duration_total" 70 | # Median Request duration in Milliseconds. 
71 | REQUESTS_DURATION_MEDIAN = "requests_duration_med" 72 | # Harmonic mean of HTTP request latency in Milliseconds. 73 | REQUESTS_LATENCY_AVG = "requests_latency_avg" 74 | # Highest HTTP Request latency in Milliseconds. 75 | REQUESTS_LATENCY_MAX = "requests_latency_max" 76 | # Lowest HTTP Request latency in Milliseconds. 77 | REQUESTS_LATENCY_MIN = "requests_latency_min" 78 | # Median HTTP Request latency in Milliseconds. 79 | REQUESTS_LATENCY_MEDIAN = "requests_latency_med" 80 | # Total HTTP Request latency in Milliseconds. 81 | REQUESTS_LATENCY_TOTAL = "requests_latency_total" 82 | # Total duration of crawl in Milliseconds. 83 | TOTAL_DURATION = "total_duration" 84 | # Response status codes. 85 | STATUS_CODES = "status_codes" 86 | # Highest queue wait time in Milliseconds. 87 | QUEUE_WAIT_MAX = "queue_wait_max" 88 | # Lowest queue wait time in Milliseconds. 89 | QUEUE_WAIT_MIN = "queue_wait_min" 90 | # Harmonic mean of queue wait time in Milliseconds. 91 | QUEUE_WAIT_AVG = "queue_wait_avg" 92 | # Median queue wait time in Milliseconds. 93 | QUEUE_WAIT_MEDIAN = "queue_wait_med" 94 | # Highest queue size. 95 | QUEUE_SIZE_MAX = "queue_size_max" 96 | # Harmonic mean of queue size. 97 | QUEUE_SIZE_AVG = "queue_size_avg" 98 | # Median queue size. 99 | QUEUE_SIZE_MEDIAN = "queue_size_med" 100 | # Total objects put on queue. 101 | QUEUED_TOTAL = "queued_total" 102 | # Total number of retried Requests 103 | REQUESTS_RETRIED = "requests_retried" 104 | 105 | def __repr__(self): 106 | return self.value 107 | 108 | def __str__(self): 109 | return str(self.value) 110 | 111 | def __lt__(self, other): 112 | if not isinstance(other, Stats): 113 | return False 114 | return self.value < other.value 115 | 116 | 117 | def coerce_url( 118 | url: Union[URL, str], https: bool = False, default_scheme: str = "http" 119 | ) -> URL: 120 | """ 121 | Coerce URL to valid format 122 | 123 | :param url: URL 124 | :param https: Force https if no scheme in url 125 | :param default_scheme: Default scheme if not forcing https 126 | :return: str 127 | """ 128 | if isinstance(url, str): 129 | url = URL(url.strip()) 130 | 131 | scheme = "https" if https else default_scheme 132 | 133 | if not url.is_absolute(): 134 | url_string = str(url) 135 | split = url_string.split("/", 1) 136 | url = URL.build(scheme=scheme, host=split[0]) 137 | if len(split) > 1: 138 | url = url.with_path(split[1]) 139 | 140 | if (url.scheme == "http" and https) or not url.scheme: 141 | url = url.with_scheme(scheme) 142 | 143 | return url 144 | 145 | 146 | def to_bytes(text, encoding: str = "utf-8", errors: str = "strict"): 147 | """Return the binary representation of `text`. If `text` 148 | is already a bytes object, return it as-is.""" 149 | if not text: 150 | return b"" 151 | if isinstance(text, bytes): 152 | return text 153 | return text.encode(encoding, errors) 154 | 155 | 156 | def to_string(item: Any, encoding: str = "utf-8", errors: str = "strict") -> str: 157 | """ 158 | Return the string representation of 'item'. 159 | """ 160 | if item is None: 161 | return "" 162 | if isinstance(item, bytes): 163 | return item.decode(encoding, errors) 164 | return str(item) 165 | 166 | 167 | def case_insensitive_key(key: str, dictionary: Dict) -> bool: 168 | """ 169 | Check if a case-insensitive key is in a dictionary. 
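For example, case_insensitive_key("content-type", {"Content-Type": "text/html"}) returns True.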
170 | """ 171 | k = key.lower() 172 | for key in dictionary.keys(): 173 | if key.lower() == k: 174 | return True 175 | 176 | 177 | def headers_to_dict(headers: Any) -> Dict[str, str]: 178 | """ 179 | Convert various header classes to a simple dictionary 180 | 181 | :param headers: Dict subclass of HTTP headers 182 | :return: Dict of HTTP headers 183 | """ 184 | if isinstance(headers, dict): 185 | return headers 186 | 187 | new_headers = {} 188 | try: 189 | new_headers.update({k.lower(): v for (k, v) in headers.items()}) 190 | except Exception as e: 191 | logger.warning("Exception parsing headers to dict: %s", e) 192 | pass 193 | return new_headers 194 | 195 | 196 | def ignore_aiohttp_ssl_error(loop, aiohttpversion="3.5.4"): 197 | """Ignore aiohttp #3535 issue with SSL data after close 198 | There appears to be an issue on Python 3.7 and aiohttp SSL that throws a 199 | ssl.SSLError fatal error (ssl.SSLError: [SSL: KRB5_S_INIT] application data 200 | after close notify (_ssl.c:2609)) after we are already done with the 201 | connection. See GitHub issue aio-libs/aiohttp#3535 202 | Given a loop, this sets up a exception handler that ignores this specific 203 | exception, but passes everything else on to the previous exception handler 204 | this one replaces. 205 | If the current aiohttp version is not exactly equal to aiohttpversion 206 | nothing is done, assuming that the next version will have this bug fixed. 207 | This can be disabled by setting this parameter to None 208 | """ 209 | import ssl 210 | import aiohttp 211 | import asyncio 212 | 213 | try: 214 | # noinspection PyUnresolvedReferences 215 | import uvloop 216 | 217 | protocol_class = uvloop.loop.SSLProtocol 218 | except ImportError: 219 | protocol_class = asyncio.sslproto.SSLProtocol 220 | pass 221 | 222 | if aiohttpversion is not None and aiohttp.__version__ != aiohttpversion: 223 | return 224 | 225 | orig_handler = loop.get_exception_handler() 226 | 227 | # noinspection PyUnresolvedReferences 228 | def ignore_ssl_error(this_loop, context): 229 | errors = ["SSL error", "Fatal error"] 230 | if any(x in context.get("message") for x in errors): 231 | # validate we have the right exception, transport and protocol 232 | exception = context.get("exception") 233 | protocol = context.get("protocol") 234 | if ( 235 | isinstance(exception, ssl.SSLError) 236 | and exception.reason == "KRB5_S_INIT" 237 | and isinstance(protocol, protocol_class) 238 | ): 239 | if this_loop.get_debug(): 240 | asyncio.log.logger.debug("Ignoring aiohttp SSL KRB5_S_INIT error") 241 | return 242 | if orig_handler is not None: 243 | orig_handler(this_loop, context) 244 | else: 245 | this_loop.default_exception_handler(context) 246 | 247 | loop.set_exception_handler(ignore_ssl_error) 248 | 249 | 250 | def parse_href_to_url(href: str) -> Union[URL, None]: 251 | """ 252 | Parse an href string to a URL object. 253 | 254 | :param href: An href string that may be a valid url. 255 | :return: URL or None. 256 | """ 257 | if not href: 258 | return None 259 | 260 | if not isinstance(href, str): 261 | raise TypeError("href must be string") 262 | 263 | try: 264 | return URL(href) 265 | except (UnicodeError, ValueError) as e: 266 | logger.warning("Failed to encode href: %s : %s", href, e) 267 | return None 268 | 269 | 270 | def remove_www(host: str) -> str: 271 | """ 272 | Remove www. subdomain from URL host strings. 273 | 274 | :param host: URL host without scheme or path. e.g. www.test.com 275 | :return: URL host string. 
276 | """ 277 | if host.startswith("www."): 278 | return host[4:] 279 | return host 280 | 281 | 282 | def is_same_domain(root_domain: str, url_domain: str) -> bool: 283 | """ 284 | Check if the url domain is the same or a subdomain of the root domain. 285 | 286 | :param root_domain: Original root domain of this crawl 287 | :param url_domain: Domain of the url to filter 288 | :return: boolean 289 | """ 290 | return remove_www(root_domain) in url_domain 291 | -------------------------------------------------------------------------------- /tests/feed_spider/feed_info_parser_test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from dateutil.tz import tzutc 4 | 5 | from feedsearch_crawler.feed_spider.feed_info_parser import FeedInfoParser 6 | 7 | 8 | def test_entry_velocity_no_dates(): 9 | dates = [] 10 | result = FeedInfoParser.entry_velocity(dates) 11 | assert result == 0 12 | 13 | 14 | def test_entry_velocity_identical_dates(): 15 | dates = [datetime(2020, 1, 1), datetime(2020, 1, 1), datetime(2020, 1, 1)] 16 | result = FeedInfoParser.entry_velocity(dates) 17 | assert result == 0 18 | 19 | 20 | def test_entry_velocity(): 21 | dates = [ 22 | datetime(2019, 1, 1), 23 | datetime(2019, 1, 2), 24 | datetime(2019, 1, 3), 25 | datetime(2019, 1, 4), 26 | datetime(2019, 1, 5), 27 | ] 28 | result = FeedInfoParser.entry_velocity(dates) 29 | assert result == 1.0 30 | 31 | dates = [ 32 | datetime(2019, 1, 1, 1), 33 | datetime(2019, 1, 1, 2), 34 | datetime(2019, 1, 1, 3), 35 | datetime(2019, 1, 1, 4), 36 | datetime(2019, 1, 1, 5), 37 | ] 38 | result = FeedInfoParser.entry_velocity(dates) 39 | assert result == 24 40 | 41 | dates = [ 42 | datetime(2019, 1, 1), 43 | datetime(2019, 1, 7), 44 | datetime(2019, 1, 14), 45 | datetime(2019, 1, 21), 46 | datetime(2019, 1, 27), 47 | ] 48 | dates = sorted(dates, reverse=True) 49 | result = FeedInfoParser.entry_velocity(dates) 50 | assert result == 0.154 51 | 52 | dates = [ 53 | datetime(2019, 9, 21, 13, 10, 3, tzinfo=tzutc()), 54 | datetime(2019, 9, 21, 13, 10, 3, tzinfo=tzutc()), 55 | datetime(2019, 9, 21, 2, 20, 32, tzinfo=tzutc()), 56 | datetime(2019, 9, 21, 2, 20, 32, tzinfo=tzutc()), 57 | datetime(2019, 9, 20, 20, 15, 45, tzinfo=tzutc()), 58 | datetime(2019, 9, 20, 20, 15, 45, tzinfo=tzutc()), 59 | datetime(2019, 9, 20, 19, 40, 40, tzinfo=tzutc()), 60 | datetime(2019, 9, 20, 19, 40, 40, tzinfo=tzutc()), 61 | datetime(2019, 9, 20, 19, 38, 23, tzinfo=tzutc()), 62 | datetime(2019, 9, 20, 19, 38, 23, tzinfo=tzutc()), 63 | datetime(2019, 9, 20, 19, 23, 20, tzinfo=tzutc()), 64 | datetime(2019, 9, 20, 19, 23, 20, tzinfo=tzutc()), 65 | datetime(2019, 9, 20, 19, 8, tzinfo=tzutc()), 66 | datetime(2019, 9, 20, 19, 8, tzinfo=tzutc()), 67 | datetime(2019, 9, 20, 18, 41, 57, tzinfo=tzutc()), 68 | datetime(2019, 9, 20, 18, 41, 57, tzinfo=tzutc()), 69 | datetime(2019, 9, 20, 17, 36, 30, tzinfo=tzutc()), 70 | datetime(2019, 9, 20, 17, 36, 30, tzinfo=tzutc()), 71 | datetime(2019, 9, 20, 17, 18, 2, tzinfo=tzutc()), 72 | datetime(2019, 9, 20, 17, 18, 2, tzinfo=tzutc()), 73 | datetime(2019, 9, 20, 16, 35, 53, tzinfo=tzutc()), 74 | datetime(2019, 9, 20, 16, 35, 53, tzinfo=tzutc()), 75 | datetime(2019, 9, 20, 16, 25, 13, tzinfo=tzutc()), 76 | datetime(2019, 9, 20, 16, 25, 13, tzinfo=tzutc()), 77 | datetime(2019, 9, 20, 16, 0, 49, tzinfo=tzutc()), 78 | datetime(2019, 9, 20, 16, 0, 49, tzinfo=tzutc()), 79 | datetime(2019, 9, 20, 15, 35, 50, tzinfo=tzutc()), 80 | datetime(2019, 9, 20, 15, 35, 50, 
tzinfo=tzutc()), 81 | datetime(2019, 9, 20, 15, 31, 35, tzinfo=tzutc()), 82 | datetime(2019, 9, 20, 15, 31, 35, tzinfo=tzutc()), 83 | datetime(2019, 9, 20, 15, 30, 48, tzinfo=tzutc()), 84 | datetime(2019, 9, 20, 15, 30, 48, tzinfo=tzutc()), 85 | datetime(2019, 9, 20, 11, 0, 53, tzinfo=tzutc()), 86 | datetime(2019, 9, 20, 11, 0, 53, tzinfo=tzutc()), 87 | datetime(2019, 9, 20, 10, 45, 16, tzinfo=tzutc()), 88 | datetime(2019, 9, 20, 10, 45, 16, tzinfo=tzutc()), 89 | datetime(2019, 9, 20, 10, 0, 49, tzinfo=tzutc()), 90 | datetime(2019, 9, 20, 10, 0, 49, tzinfo=tzutc()), 91 | datetime(2019, 9, 19, 22, 6, 47, tzinfo=tzutc()), 92 | datetime(2019, 9, 19, 22, 6, 47, tzinfo=tzutc()), 93 | ] 94 | result = FeedInfoParser.entry_velocity(dates) 95 | assert result == 11.676 96 | 97 | dates = [ 98 | datetime(2019, 9, 16, 14, 8, 51, tzinfo=tzutc()), 99 | datetime(2019, 9, 16, 14, 8, 51, tzinfo=tzutc()), 100 | datetime(2019, 9, 18, 4, 44, 14, tzinfo=tzutc()), 101 | datetime(2019, 9, 18, 4, 44, 14, tzinfo=tzutc()), 102 | datetime(2019, 9, 18, 9, 0, 16, tzinfo=tzutc()), 103 | datetime(2019, 9, 18, 9, 0, 16, tzinfo=tzutc()), 104 | datetime(2019, 9, 19, 14, 1, 56, tzinfo=tzutc()), 105 | datetime(2019, 9, 19, 14, 1, 56, tzinfo=tzutc()), 106 | datetime(2019, 9, 19, 20, 58, 52, tzinfo=tzutc()), 107 | datetime(2019, 9, 19, 20, 58, 52, tzinfo=tzutc()), 108 | datetime(2019, 9, 20, 19, 41, 7, tzinfo=tzutc()), 109 | datetime(2019, 9, 20, 19, 41, 7, tzinfo=tzutc()), 110 | datetime(2019, 9, 20, 23, 2, 15, tzinfo=tzutc()), 111 | datetime(2019, 9, 20, 23, 2, 15, tzinfo=tzutc()), 112 | datetime(2019, 9, 21, 2, 53, 22, tzinfo=tzutc()), 113 | datetime(2019, 9, 21, 2, 53, 22, tzinfo=tzutc()), 114 | datetime(2019, 9, 21, 5, 28, 43, tzinfo=tzutc()), 115 | datetime(2019, 9, 21, 5, 28, 43, tzinfo=tzutc()), 116 | datetime(2019, 9, 21, 5, 28, 44, tzinfo=tzutc()), 117 | datetime(2019, 9, 21, 5, 28, 44, tzinfo=tzutc()), 118 | datetime(2019, 9, 21, 5, 38, 3, tzinfo=tzutc()), 119 | datetime(2019, 9, 21, 5, 38, 3, tzinfo=tzutc()), 120 | datetime(2019, 9, 21, 9, 0, 1, tzinfo=tzutc()), 121 | datetime(2019, 9, 21, 9, 0, 1, tzinfo=tzutc()), 122 | datetime(2019, 9, 21, 9, 36, 12, tzinfo=tzutc()), 123 | datetime(2019, 9, 21, 9, 36, 12, tzinfo=tzutc()), 124 | datetime(2019, 9, 21, 9, 39, 2, tzinfo=tzutc()), 125 | datetime(2019, 9, 21, 9, 39, 2, tzinfo=tzutc()), 126 | datetime(2019, 9, 21, 9, 54, 27, tzinfo=tzutc()), 127 | datetime(2019, 9, 21, 9, 54, 27, tzinfo=tzutc()), 128 | datetime(2019, 9, 21, 11, 24, 14, tzinfo=tzutc()), 129 | datetime(2019, 9, 21, 11, 24, 14, tzinfo=tzutc()), 130 | datetime(2019, 9, 21, 11, 57, 54, tzinfo=tzutc()), 131 | datetime(2019, 9, 21, 11, 57, 54, tzinfo=tzutc()), 132 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 133 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 134 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 135 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 136 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 137 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 138 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 139 | datetime(2019, 9, 21, 11, 57, 55, tzinfo=tzutc()), 140 | datetime(2019, 9, 21, 12, 50, 21, tzinfo=tzutc()), 141 | datetime(2019, 9, 21, 12, 50, 21, tzinfo=tzutc()), 142 | datetime(2019, 9, 21, 13, 26, 45, tzinfo=tzutc()), 143 | datetime(2019, 9, 21, 13, 26, 45, tzinfo=tzutc()), 144 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 145 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 146 | datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 147 | 
datetime(2019, 9, 21, 17, 7, 9, tzinfo=tzutc()), 148 | datetime(2019, 9, 21, 17, 50, 13, tzinfo=tzutc()), 149 | datetime(2019, 9, 21, 17, 50, 13, tzinfo=tzutc()), 150 | datetime(2019, 9, 21, 18, 3, 9, tzinfo=tzutc()), 151 | datetime(2019, 9, 21, 18, 3, 9, tzinfo=tzutc()), 152 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 153 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 154 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 155 | datetime(2019, 9, 21, 18, 3, 10, tzinfo=tzutc()), 156 | datetime(2019, 9, 21, 18, 3, 12, tzinfo=tzutc()), 157 | datetime(2019, 9, 21, 18, 3, 12, tzinfo=tzutc()), 158 | datetime(2019, 9, 21, 18, 39, 5, tzinfo=tzutc()), 159 | datetime(2019, 9, 21, 18, 39, 5, tzinfo=tzutc()), 160 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 161 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 162 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 163 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 164 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 165 | datetime(2019, 9, 21, 18, 44, 2, tzinfo=tzutc()), 166 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 167 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 168 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 169 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 170 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 171 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 172 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 173 | datetime(2019, 9, 21, 18, 44, 4, tzinfo=tzutc()), 174 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 175 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 176 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 177 | datetime(2019, 9, 21, 18, 44, 5, tzinfo=tzutc()), 178 | datetime(2019, 9, 21, 18, 44, 6, tzinfo=tzutc()), 179 | datetime(2019, 9, 21, 18, 44, 6, tzinfo=tzutc()), 180 | datetime(2019, 9, 21, 18, 44, 7, tzinfo=tzutc()), 181 | datetime(2019, 9, 21, 18, 44, 7, tzinfo=tzutc()), 182 | datetime(2019, 9, 21, 18, 50, 33, tzinfo=tzutc()), 183 | datetime(2019, 9, 21, 18, 50, 33, tzinfo=tzutc()), 184 | datetime(2019, 9, 21, 19, 32, 23, tzinfo=tzutc()), 185 | datetime(2019, 9, 21, 19, 32, 23, tzinfo=tzutc()), 186 | datetime(2019, 9, 21, 19, 32, 25, tzinfo=tzutc()), 187 | datetime(2019, 9, 21, 19, 32, 25, tzinfo=tzutc()), 188 | datetime(2019, 9, 21, 19, 50, 21, tzinfo=tzutc()), 189 | datetime(2019, 9, 21, 19, 50, 21, tzinfo=tzutc()), 190 | datetime(2019, 9, 21, 19, 50, 22, tzinfo=tzutc()), 191 | datetime(2019, 9, 21, 19, 50, 22, tzinfo=tzutc()), 192 | datetime(2019, 9, 21, 19, 50, 42, tzinfo=tzutc()), 193 | datetime(2019, 9, 21, 19, 50, 42, tzinfo=tzutc()), 194 | datetime(2019, 9, 21, 19, 50, 44, tzinfo=tzutc()), 195 | datetime(2019, 9, 21, 19, 50, 44, tzinfo=tzutc()), 196 | datetime(2019, 9, 21, 19, 50, 45, tzinfo=tzutc()), 197 | datetime(2019, 9, 21, 19, 50, 45, tzinfo=tzutc()), 198 | ] 199 | result = FeedInfoParser.entry_velocity(dates) 200 | assert result == 7.255 201 | 202 | 203 | def test_is_podcast_no_data(): 204 | data = {} 205 | result = FeedInfoParser.is_podcast(data) 206 | assert result is False 207 | 208 | 209 | def test_is_podcast_not_podcast(): 210 | data = {"entries": [{}]} 211 | result = FeedInfoParser.is_podcast(data) 212 | assert result is False 213 | 214 | 215 | def test_is_podcast_no_namespace(): 216 | data = {"entries": [{"enclosures": [{"media": "file_url"}]}]} 217 | result = FeedInfoParser.is_podcast(data) 218 | assert result is False 219 | 220 | 221 | def test_is_podcast_is_true(): 222 | data = { 223 | "namespaces": {"itunes": 
"testing"}, 224 | "entries": [{"enclosures": [{"media": "file_url"}]}], 225 | } 226 | result = FeedInfoParser.is_podcast(data) 227 | assert result is True 228 | 229 | 230 | def test_is_podcast_no_enclosures(): 231 | data = {"namespaces": {"itunes": "testing"}, "entries": [{}]} 232 | result = FeedInfoParser.is_podcast(data) 233 | assert result is False 234 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/spider.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | from types import AsyncGeneratorType 4 | from typing import Union, Any, List, Set 5 | 6 | import bs4 7 | from yarl import URL 8 | 9 | from feedsearch_crawler.crawler import Crawler, Item, Request, Response 10 | from feedsearch_crawler.crawler.lib import parse_href_to_url 11 | from feedsearch_crawler.feed_spider.dupefilter import NoQueryDupeFilter 12 | from feedsearch_crawler.feed_spider.favicon import Favicon 13 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 14 | from feedsearch_crawler.feed_spider.feed_info_parser import FeedInfoParser 15 | from feedsearch_crawler.feed_spider.lib import ParseTypes 16 | from feedsearch_crawler.feed_spider.link_filter import LinkFilter 17 | from feedsearch_crawler.feed_spider.regexes import rss_regex 18 | from feedsearch_crawler.feed_spider.site_meta import SiteMeta 19 | from feedsearch_crawler.feed_spider.site_meta_parser import SiteMetaParser 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class FeedsearchSpider(Crawler): 25 | duplicate_filter_class = NoQueryDupeFilter 26 | htmlparser = "html.parser" 27 | favicon_data_uri = True 28 | try_urls: Union[List[str], bool] = False 29 | full_crawl: bool = False 30 | crawl_hosts: bool = True 31 | 32 | def __init__(self, *args, **kwargs): 33 | super().__init__(*args, **kwargs) 34 | self.site_meta_processor = SiteMetaParser(self) 35 | self.feed_info_parser = FeedInfoParser(self) 36 | self.site_metas = set() 37 | self.favicons = dict() 38 | self.feeds_seen = dict() 39 | self.post_crawl_callback = self.populate_feed_site_meta 40 | if "try_urls" in kwargs: 41 | self.try_urls = kwargs["try_urls"] 42 | if "favicon_data_uri" in kwargs: 43 | self.favicon_data_uri = kwargs["favicon_data_uri"] 44 | if "full_crawl" in kwargs: 45 | self.full_crawl = kwargs["full_crawl"] 46 | if "crawl_hosts" in kwargs: 47 | self.crawl_hosts = kwargs["crawl_hosts"] 48 | 49 | async def parse(self, request: Request, response: Response) -> AsyncGeneratorType: 50 | """ 51 | Parse a Response for feeds or site metadata. 52 | 53 | :param request: Request 54 | :param response: Response 55 | :return: AsyncGenerator yielding Items, Requests, or iterative AsyncGenerators 56 | """ 57 | 58 | # If the Response is not OK then there's no data to parse. 59 | if not response.ok: 60 | return 61 | 62 | # If the Response contains JSON then attempt to parse it as a JsonFeed. 63 | if response.json: 64 | if "version" and "jsonfeed" and "feed_url" in response.json: 65 | yield self.feed_info_parser.parse_item( 66 | request, response, parse_type=ParseTypes.JSON 67 | ) 68 | return 69 | 70 | if not isinstance(response.text, str): 71 | logger.debug("No text in %s", response) 72 | return 73 | 74 | yield self.parse_site_meta(request, response) 75 | 76 | # Restrict the RSS check to the first 1000 characters, otherwise it's almost definitely not an actual feed. 
77 | if rss_regex.search(response.text, endpos=1000): 78 | yield self.feed_info_parser.parse_item( 79 | request, response, parse_type=ParseTypes.XML 80 | ) 81 | return 82 | 83 | # Don't waste time trying to parse and follow urls if the max depth is already reached. 84 | if response.is_max_depth_reached(self.max_depth): 85 | logger.debug("Max depth %d reached: %s", self.max_depth, response) 86 | return 87 | 88 | # Make sure the Response XML has been parsed if it exists. 89 | soup = await response.xml 90 | if not soup: 91 | return 92 | 93 | # Don't crawl links from pages that are not from the original domain 94 | if not response.is_original_domain(): 95 | return 96 | 97 | link_filter = LinkFilter( 98 | request=request, response=response, full_crawl=self.full_crawl 99 | ) 100 | 101 | # Find all links in the Response. 102 | links = soup.find_all(self.tag_has_href) 103 | for link in links: 104 | # Check each href for validity and queue priority. 105 | values = link_filter.should_follow_link(link) 106 | if values: 107 | url, priority = values 108 | yield await self.follow( 109 | url, self.parse, response, priority=priority, allow_domain=True 110 | ) 111 | 112 | async def parse_site_meta( 113 | self, request: Request, response: Response 114 | ) -> AsyncGeneratorType: 115 | """ 116 | Parses site metadata if the returned URL is a site origin URL. 117 | 118 | If the returned url is an origin url, or the request url is an origin url (and there may have been a redirect) 119 | then parse the site meta. 120 | 121 | :param request: Request 122 | :param response: Response 123 | :return: AsyncGenerator yielding SiteMeta items 124 | """ 125 | url_origin = response.url.origin() 126 | request_url_origin = request.url.origin() 127 | 128 | if response.url == url_origin or request.url == request_url_origin: 129 | yield self.site_meta_processor.parse_item(request, response) 130 | 131 | async def parse_xml(self, response_text: str) -> Any: 132 | """ 133 | Parse Response text as XML. 134 | Used to allow implementations to provide their own XML parser. 135 | 136 | :param response_text: Response text as string. 137 | :return: None 138 | """ 139 | return bs4.BeautifulSoup(response_text, self.htmlparser) 140 | 141 | async def process_item(self, item: Item) -> None: 142 | """ 143 | Process parsed items. 144 | 145 | :param item: Item object 146 | :return: None 147 | """ 148 | if isinstance(item, FeedInfo): 149 | self.items.add(item) 150 | elif isinstance(item, SiteMeta): 151 | self.site_metas.add(item) 152 | elif isinstance(item, Favicon): 153 | self.add_favicon(item) 154 | 155 | def add_favicon(self, favicon: Favicon) -> None: 156 | """ 157 | Add a favicon to the spider's favicon dictionary. 
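A Favicon already stored with a data_uri is not overwritten by a new Favicon for the same URL that lacks one.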
158 | 159 | :param favicon: Favicon object 160 | """ 161 | existing: Favicon = self.favicons.get(favicon.url) 162 | if existing and existing.data_uri and not favicon.data_uri: 163 | return 164 | self.favicons[favicon.url] = favicon 165 | 166 | # noinspection PyPep8 167 | async def populate_feed_site_meta(self) -> None: 168 | """ 169 | Populate FeedInfo site information with data from the relevant SiteMeta item 170 | """ 171 | for feed in self.items: 172 | # Check each SiteMeta for a url host match 173 | site_meta = next( 174 | (x for x in self.site_metas if x.host in feed.url.host), None 175 | ) 176 | if site_meta: 177 | feed.site_url = site_meta.url 178 | feed.site_name = site_meta.site_name 179 | 180 | # Populate favicon directly if available 181 | if feed.favicon: 182 | favicon = self.favicons.get(feed.favicon) 183 | if favicon: 184 | feed.favicon_data_uri = favicon.data_uri 185 | feed.favicon = favicon.resp_url if favicon.resp_url else favicon.url 186 | 187 | # If a favicon hasn't been found yet or there is no data_uri then try and find a suitable favicon 188 | if not feed.favicon or ( 189 | self.favicon_data_uri and not feed.favicon_data_uri 190 | ): 191 | feed_host = feed.url.host 192 | favicons = list( 193 | x 194 | for x in self.favicons.values() 195 | if x.matches_host(feed_host, self.favicon_data_uri) 196 | ) 197 | 198 | if favicons: 199 | favicon = min(favicons, key=lambda x: x.priority) 200 | 201 | feed.favicon_data_uri = favicon.data_uri 202 | feed.favicon = favicon.resp_url if favicon.resp_url else favicon.url 203 | 204 | # noinspection PyUnusedLocal 205 | async def parse_favicon_data_uri( 206 | self, request: Request, response: Response, favicon: Favicon 207 | ) -> None: 208 | """ 209 | Create a data uri from a favicon image. 210 | 211 | :param request: Request 212 | :param response: Response 213 | :param favicon: Favicon object 214 | :return: None 215 | """ 216 | if not response.ok or not response.data or not isinstance(response.data, bytes): 217 | return 218 | 219 | def is_png(data: bytes) -> bool: 220 | return data[:8] in bytes.fromhex("89 50 4E 47 0D 0A 1A 0A") 221 | 222 | def is_ico(data: bytes) -> bool: 223 | return data[:4] in bytes.fromhex("00 00 01 00") 224 | 225 | try: 226 | if not is_png(response.data) and not is_ico(response.data): 227 | logger.debug("Response data is not a valid image type: %s", response) 228 | return 229 | except Exception as e: 230 | logger.exception("Failure validation image type: %s: %s", response, e) 231 | 232 | try: 233 | encoded = base64.b64encode(response.data) 234 | uri = "data:image/png;base64," + encoded.decode(response.encoding) 235 | favicon.resp_url = response.url 236 | favicon.data_uri = uri 237 | self.add_favicon(favicon) 238 | except Exception as e: 239 | logger.exception("Failure encoding image: %s: %s", response, e) 240 | 241 | def create_start_urls(self, urls: List[Union[URL, str]]) -> List[URL]: 242 | """ 243 | Create the start URLs for the crawl from an initial URL. May be overridden. 244 | 245 | :param urls: Initial URLs 246 | """ 247 | crawl_start_urls: Set[URL] = set() 248 | 249 | for url in urls + self.start_urls: 250 | if isinstance(url, str): 251 | if "//" not in url: 252 | url = f"//{url}" 253 | url = parse_href_to_url(url) 254 | if not url: 255 | continue 256 | 257 | if url.scheme.lower() not in ["http", "https"]: 258 | url = url.with_scheme("http") 259 | 260 | crawl_start_urls.add(url) 261 | 262 | origins = set(url.origin() for url in crawl_start_urls) 263 | 264 | if self.try_urls: 265 | # Common paths for feeds. 
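# These defaults apply only when try_urls is True; an explicit list passed as try_urls is used instead (see below).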
266 | suffixes = { 267 | "index.xml", 268 | "atom.xml", 269 | "feeds", 270 | "feeds/default", 271 | "feed", 272 | "feed/default", 273 | "feeds/posts/default", 274 | "?feed=rss", 275 | "?feed=atom", 276 | "?feed=rss2", 277 | "?feed=rdf", 278 | "rss", 279 | "atom", 280 | "rdf", 281 | "index.rss", 282 | "index.rdf", 283 | "index.atom", 284 | "data/rss", 285 | "rss.xml", 286 | "index.json", 287 | "about", 288 | "about/feeds", 289 | "rss-feeds", 290 | } 291 | 292 | for origin in origins: 293 | if isinstance(self.try_urls, list): 294 | crawl_start_urls.update( 295 | origin.join(URL(suffix)) for suffix in self.try_urls 296 | ) 297 | else: 298 | crawl_start_urls.update( 299 | origin.join(URL(suffix)) for suffix in suffixes 300 | ) 301 | 302 | # Crawl the origin urls of the start urls for Site metadata. 303 | if self.crawl_hosts: 304 | crawl_start_urls.update(origins) 305 | 306 | return list(crawl_start_urls) 307 | 308 | @staticmethod 309 | def tag_has_href(tag: bs4.Tag) -> bool: 310 | """ 311 | Find all tags that contain links. 312 | 313 | :param tag: XML tag 314 | :return: boolean 315 | """ 316 | return tag.has_attr("href") 317 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/request.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import json 4 | import logging 5 | import uuid 6 | from asyncio import Semaphore, IncompleteReadError, LimitOverrunError, CancelledError 7 | from random import random 8 | from typing import List, Tuple, Any, Union, Optional, Dict 9 | 10 | import aiohttp 11 | import time 12 | from aiohttp import ClientSession, ClientTimeout, hdrs 13 | from yarl import URL 14 | 15 | from feedsearch_crawler.crawler.queueable import Queueable 16 | from feedsearch_crawler.crawler.response import Response 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Request(Queueable): 22 | METHOD = ["GET", "POST"] 23 | 24 | def __init__( 25 | self, 26 | url: URL, 27 | request_session: ClientSession, 28 | params: Dict = None, 29 | data: Union[dict, bytes] = None, 30 | json_data: Dict = None, 31 | encoding: str = None, 32 | method: str = "GET", 33 | headers: Dict = None, 34 | timeout: Union[float, ClientTimeout] = 5.0, 35 | history: List = None, 36 | callback=None, 37 | xml_parser=None, 38 | failure_callback=None, 39 | max_content_length: int = 1024 * 1024 * 10, 40 | delay: float = 0, 41 | retries: int = 3, 42 | cb_kwargs: Dict = None, 43 | **kwargs, 44 | ): 45 | """ 46 | A pending HTTP request to a URL. Wraps an aiohttp ClientSession request. 47 | https://aiohttp.readthedocs.io/en/stable/client_reference.html 48 | 49 | :param params: Mapping of query string parameters 50 | :param data: Dictionary, bytes, or file-like object to send in the body of the request 51 | :param json_data: Json dict to send as body. 
Not compatible with data 52 | :param url: Request URL 53 | :param request_session: aiohttp ClientSession 54 | :param encoding: Default Response encoding 55 | :param method: HTTP method 56 | :param headers: HTTP headers for the request 57 | :param timeout: Seconds before Request times out 58 | :param history: Response history, list of previous URLs 59 | :param callback: Callback function to run after request is successful 60 | :param xml_parser: Function to parse Response XML 61 | :param failure_callback: Callback function to run if request is unsuccessful 62 | :param max_content_length: Maximum allowed size in bytes of Response content 63 | :param delay: Time in seconds to delay Request 64 | :param retries: Number of times to retry a failed Request 65 | :param cb_kwargs: Optional Dictionary of keyword arguments to be passed to the callback function. 66 | :param kwargs: Optional keyword arguments 67 | """ 68 | self.url = url 69 | self.method = method.upper() 70 | if self.method not in self.METHOD: 71 | raise ValueError(f"{self.method} is not supported") 72 | if not isinstance(request_session, ClientSession): 73 | raise ValueError(f"request_session must be of type ClientSession") 74 | self.request_session = request_session 75 | self.headers = headers 76 | if not isinstance(timeout, ClientTimeout): 77 | timeout = aiohttp.ClientTimeout(total=self.timeout) 78 | self.timeout = timeout 79 | self.history = history or [] 80 | self.encoding = encoding 81 | self._callback = callback 82 | self._failure_callback = failure_callback 83 | self.id = uuid.uuid4() 84 | self._xml_parser = xml_parser 85 | self.max_content_length = max_content_length 86 | self.json_data = json_data 87 | self.data = data 88 | self.params = params 89 | self.has_run: bool = False 90 | self.delay = delay 91 | self.cb_kwargs = cb_kwargs or {} 92 | 93 | self.should_retry: bool = False 94 | self._max_retries = retries 95 | # Number of times this request has been retried. 96 | self._num_retries: int = 0 97 | # Time in Milliseconds for the HTTP response to arrive. 98 | self.req_latency: int = 0 99 | # Time in Milliseconds for the HTTP response content to be read. 100 | self.content_read: int = 0 101 | 102 | for key, value in kwargs: 103 | if hasattr(self, key): 104 | setattr(self, key, value) 105 | 106 | async def fetch_callback(self, semaphore: Semaphore = None) -> Tuple[Any, Response]: 107 | """ 108 | Fetch HTTP Response and run Callbacks. 109 | 110 | :param semaphore: asyncio Semaphore 111 | :returns: Tuple of Callback result and Response object 112 | """ 113 | if semaphore: 114 | async with semaphore: 115 | response = await self._fetch() 116 | else: 117 | response = await self._fetch() 118 | 119 | callback_result = None 120 | 121 | if response.ok and self._callback: 122 | callback_result = self._callback( 123 | request=self, response=response, **self.cb_kwargs 124 | ) 125 | elif not response.ok and self._failure_callback: 126 | callback_result = self._failure_callback( 127 | request=self, response=response, **self.cb_kwargs 128 | ) 129 | 130 | return callback_result, response 131 | 132 | # noinspection PyProtectedMember 133 | async def _fetch(self) -> Response: 134 | """ 135 | Run HTTP Request and fetch HTTP Response. 136 | 137 | :return: Response object 138 | """ 139 | # Delay the request if self.delay is > 0 140 | await self.delay_request() 141 | 142 | # Copy the Request history so that it isn't a pointer. 143 | history = copy.deepcopy(self.history) 144 | 145 | # Make sure that retry is reset. 
146 | self.should_retry = False 147 | response = None 148 | start = time.perf_counter() 149 | 150 | try: 151 | async with self._create_request() as resp: 152 | resp_recieved = time.perf_counter() 153 | self.req_latency = int((resp_recieved - start) * 1000) 154 | history.append(resp.url) 155 | 156 | # Fail the response if the content length header is too large. 157 | content_length: int = int(resp.headers.get(hdrs.CONTENT_LENGTH, "0")) 158 | if content_length > self.max_content_length: 159 | logger.debug( 160 | "Content-Length of Response header %d greater than max %d: %s", 161 | content_length, 162 | self.max_content_length, 163 | self, 164 | ) 165 | return self._failed_response(413) 166 | 167 | # Read the response content, and fail the response if the actual content size is too large. 168 | content_read, actual_content_length = await self._read_response(resp) 169 | if not content_read: 170 | return self._failed_response(413) 171 | 172 | if content_length and content_length != actual_content_length: 173 | logger.debug( 174 | "Header Content-Length %d different from actual content-length %d: %s", 175 | content_length, 176 | actual_content_length, 177 | self, 178 | ) 179 | 180 | # Set encoding automatically from response if not specified. 181 | if not self.encoding: 182 | self.encoding = resp.get_encoding() 183 | 184 | # Read response content 185 | try: 186 | # Read response content as text 187 | resp_text = await resp.text(encoding=self.encoding) 188 | 189 | # Attempt to read response content as JSON 190 | resp_json = await self._read_json(resp_text) 191 | # If response content can't be decoded then neither text or JSON can be set. 192 | except UnicodeDecodeError: 193 | resp_text = None 194 | resp_json = None 195 | 196 | # Close the asyncio response 197 | if not resp.closed: 198 | resp.close() 199 | 200 | self.content_read = int((time.perf_counter() - resp_recieved) * 1000) 201 | 202 | response = Response( 203 | url=resp.url, 204 | method=resp.method, 205 | encoding=self.encoding, 206 | status_code=resp.status, 207 | history=history, 208 | text=resp_text, 209 | data=resp._body, 210 | json=resp_json, 211 | headers=resp.headers, 212 | xml_parser=self._parse_xml, 213 | cookies=resp.cookies, 214 | redirect_history=resp.history, 215 | content_length=actual_content_length, 216 | meta=copy.copy(self.cb_kwargs), 217 | ) 218 | 219 | # Raise exception after the Response object is created, because we only catch TimeoutErrors and 220 | # asyncio.ClientResponseErrors, and there may be valid data otherwise. 221 | resp.raise_for_status() 222 | 223 | except asyncio.TimeoutError: 224 | logger.debug("Failed fetch: url=%s reason=timeout", self.url) 225 | history.append(self.url) 226 | response = self._failed_response(408, history) 227 | except aiohttp.ClientResponseError as e: 228 | logger.debug("Failed fetch: url=%s reason=%s", self.url, e.message) 229 | if not response: 230 | response = self._failed_response(e.status, history) 231 | except Exception as e: 232 | logger.debug("Failed fetch: url=%s reason=%s", self.url, e) 233 | if isinstance(e, CancelledError) and not response: 234 | response = self._failed_response(499, history) 235 | finally: 236 | self.has_run = True 237 | # Make sure there is a valid Response object. 
238 | if not response: 239 | response = self._failed_response(500, history) 240 | 241 | # Tell the crawler to retry this Request 242 | if response.status_code in [429, 503, 408]: 243 | self.set_retry() 244 | 245 | return response 246 | 247 | def _create_request(self): 248 | """ 249 | Create an asyncio HTTP Request. 250 | 251 | :return: asyncio HTTP Request 252 | """ 253 | if self.method.upper() == "GET": 254 | return self.request_session.get( 255 | self.url, headers=self.headers, timeout=self.timeout, params=self.params 256 | ) 257 | elif self.method.upper() == "POST": 258 | return self.request_session.post( 259 | self.url, 260 | headers=self.headers, 261 | timeout=self.timeout, 262 | params=self.params, 263 | data=self.data, 264 | json=self.json_data, 265 | ) 266 | else: 267 | raise ValueError( 268 | "HTTP method %s is not valid. Must be GET or POST", self.method 269 | ) 270 | 271 | async def _read_response(self, resp) -> Tuple[bool, int]: 272 | """ 273 | Read HTTP Response content as bytes. 274 | 275 | :param resp: asyncio HTTP Response 276 | :return: Tuple (read status, content length in bytes) 277 | """ 278 | body: bytes = b"" 279 | try: 280 | async for chunk in resp.content.iter_chunked(1024): 281 | if not chunk: 282 | break 283 | body += chunk 284 | if len(body) > self.max_content_length: 285 | logger.debug( 286 | "Content Length of Response body greater than max %d: %s", 287 | self.max_content_length, 288 | self, 289 | ) 290 | return False, 0 291 | except (IncompleteReadError, LimitOverrunError) as e: 292 | logger.exception("Failed to read Response content: %s: %s", self, e) 293 | return False, 0 294 | resp._body = body 295 | return True, len(body) 296 | 297 | @staticmethod 298 | async def _read_json(resp_text: Union[str, None]) -> Optional[dict]: 299 | """ 300 | Attempt to read Response content as JSON. 301 | 302 | :param resp_text: HTTP response context as text string 303 | :return: JSON dict or None 304 | """ 305 | 306 | # If the text hasn't been parsed then we won't be able to parse JSON either. 307 | if not resp_text: 308 | return None 309 | 310 | stripped = resp_text.strip() # type: ignore 311 | if not stripped: 312 | return None 313 | 314 | try: 315 | return json.loads(stripped) 316 | except ValueError: 317 | return None 318 | 319 | def _failed_response( 320 | self, status: int, history: List[URL] = None, headers=None 321 | ) -> Response: 322 | """ 323 | Create a failed Response object with the provided Status Code. 324 | 325 | :param status: HTTP Status Code 326 | :param history: Response History as list of URLs 327 | :param headers: Response Headers 328 | :return: Failed Response object 329 | """ 330 | return Response( 331 | url=self.url, 332 | method=self.method, 333 | encoding=self.encoding, 334 | history=history or [], 335 | status_code=status, 336 | headers=headers or {}, 337 | ) 338 | 339 | async def _parse_xml(self, response_text: str) -> Any: 340 | """ 341 | Use provided XML Parsers method to attempt to parse Response content as XML. 342 | 343 | :param response_text: Response content as text string. 344 | :return: Response content as parsed XML. Type depends on XML parser. 345 | """ 346 | try: 347 | return await self._xml_parser(response_text) 348 | except Exception as e: 349 | logger.exception("Error parsing response xml: %s", e) 350 | return None 351 | 352 | def set_retry(self) -> None: 353 | """ 354 | Set the Request to retry. 
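The retry is only scheduled while the retry count is below the configured maximum, and the request delay grows by one second per retry.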
355 | """ 356 | if self._num_retries < self._max_retries: 357 | self.should_retry = True 358 | self._num_retries += 1 359 | self.delay = self._num_retries * 1 360 | 361 | async def delay_request(self) -> None: 362 | """ 363 | Delay the request by sleeping. 364 | """ 365 | if self.delay > 0: 366 | # Sleep for the delay plus up to one extra second of random time, to spread out requests. 367 | await asyncio.sleep(self.delay + random()) 368 | 369 | def __repr__(self): 370 | return f"{self.__class__.__name__}({str(self.url)})" 371 | -------------------------------------------------------------------------------- /feedsearch_crawler/feed_spider/feed_info_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime, date 3 | from statistics import mean 4 | from types import AsyncGeneratorType 5 | from typing import Tuple, List, Union, Dict 6 | 7 | import feedparser 8 | import time 9 | from aiohttp import hdrs 10 | from bs4 import BeautifulSoup 11 | from yarl import URL 12 | 13 | from feedsearch_crawler.crawler import ItemParser, Request, Response, to_string 14 | from feedsearch_crawler.crawler.lib import headers_to_dict, remove_www 15 | from feedsearch_crawler.feed_spider.favicon import Favicon 16 | from feedsearch_crawler.feed_spider.feed_info import FeedInfo 17 | from feedsearch_crawler.feed_spider.lib import ( 18 | parse_header_links, 19 | datestring_to_utc_datetime, 20 | create_content_type, 21 | ParseTypes, 22 | ) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class FeedInfoParser(ItemParser): 28 | async def parse_item( 29 | self, request: Request, response: Response, *args, **kwargs 30 | ) -> AsyncGeneratorType: 31 | logger.info("Parsing: Feed %s", response.url) 32 | 33 | if "parse_type" not in kwargs: 34 | raise ValueError("type keyword argument is required") 35 | 36 | parse_type = kwargs["parse_type"] 37 | 38 | content_type = create_content_type( 39 | parse_type, 40 | response.encoding, 41 | response.headers.get(hdrs.CONTENT_TYPE, "").lower(), 42 | ) 43 | 44 | item = FeedInfo(url=response.url, content_type=content_type) 45 | 46 | # Check link headers first for WebSub content discovery 47 | # https://www.w3.org/TR/websub/#discovery 48 | if response.headers: 49 | item.hubs, item.self_url = self.header_links(response.headers) 50 | 51 | try: 52 | valid_feed = False 53 | 54 | if parse_type == ParseTypes.JSON: 55 | valid_feed = self.parse_json(item, response.json) 56 | elif parse_type == ParseTypes.XML: 57 | valid_feed = self.parse_xml( 58 | item, 59 | response.data, 60 | response.encoding, 61 | headers_to_dict(response.headers), 62 | ) 63 | 64 | if not valid_feed: 65 | logger.debug("Invalid Feed: %s", item) 66 | return 67 | except Exception as e: 68 | logger.exception("Failed to parse feed %s, Error: %s", item, e) 69 | return 70 | 71 | if item.favicon and self.crawler.favicon_data_uri: 72 | favicon = Favicon( 73 | url=item.favicon, 74 | priority=1, 75 | ) 76 | yield self.follow( 77 | item.favicon, 78 | self.crawler.parse_favicon_data_uri, 79 | cb_kwargs=dict(favicon=favicon), 80 | ) 81 | 82 | self.validate_self_url(item) 83 | 84 | item.content_length = response.content_length 85 | self.score_item(item, response.history[0]) 86 | yield item 87 | 88 | def parse_xml( 89 | self, item: FeedInfo, data: Union[str, bytes], encoding: str, headers: Dict 90 | ) -> bool: 91 | """ 92 | Get info from XML (RSS or ATOM) feed. 
93 | """ 94 | 95 | # Parse data with feedparser 96 | try: 97 | parsed: dict = self.parse_raw_data(data, encoding, headers) 98 | except Exception as e: 99 | logger.exception("Unable to parse feed %s: %s", item, e) 100 | return False 101 | 102 | if not parsed: 103 | logger.warning("No valid feed data for %s", item) 104 | return False 105 | 106 | if parsed.get("bozo") == 1: 107 | bozo_exception = parsed.get("bozo_exception", None) 108 | if isinstance(bozo_exception, feedparser.CharacterEncodingOverride): 109 | item.bozo = 1 110 | elif isinstance( 111 | bozo_exception, 112 | (feedparser.CharacterEncodingUnknown, feedparser.UndeclaredNamespace), 113 | ): 114 | logger.warning("No valid feed data for %s: %s", item, bozo_exception) 115 | return False 116 | 117 | feed = parsed.get("feed") 118 | if not feed: 119 | return False 120 | if not parsed.get("entries"): 121 | return False 122 | 123 | # Only search if no hubs already present from headers 124 | if not item.hubs: 125 | item.hubs, item.self_url = self.websub_links(feed) 126 | 127 | if item.hubs and item.self_url: 128 | item.is_push = True 129 | 130 | item.version = parsed.get("version") 131 | item.title = self.feed_title(feed) 132 | item.description = self.feed_description(feed) 133 | item.is_podcast = self.is_podcast(parsed) 134 | 135 | try: 136 | dates = [] 137 | now_date = datetime.utcnow().date() 138 | 139 | entries = parsed.get("entries", []) 140 | item.item_count = len(entries) 141 | 142 | dates.extend( 143 | FeedInfoParser.entry_dates(entries, ["updated", "published"], now_date) 144 | ) 145 | 146 | if dates: 147 | item.last_updated = sorted(dates, reverse=True)[0] 148 | item.velocity = self.entry_velocity(dates) 149 | elif feed.get("updated"): 150 | item.last_updated = datestring_to_utc_datetime(feed.get("updated")) 151 | except Exception as e: 152 | logger.exception("Unable to get feed published date: %s", e) 153 | pass 154 | 155 | return True 156 | 157 | def parse_json(self, item: FeedInfo, data: dict) -> bool: 158 | """ 159 | Get info from JSON feed. 
160 | 
161 |         :param item: FeedInfo object
162 |         :param data: JSON object
163 |         :return: bool
164 |         """
165 |         item.version = data.get("version")
166 |         if not item.version or "https://jsonfeed.org/version/" not in item.version:
167 |             item.bozo = 1
168 |             return False
169 | 
170 |         if not data.get("items"):
171 |             return False
172 | 
173 |         item.title = data.get("title")
174 |         item.description = data.get("description")
175 | 
176 |         favicon = data.get("favicon")
177 |         if favicon:
178 |             item.favicon = URL(favicon)
179 | 
180 |         # Only search if no hubs already present from headers
181 |         if not item.hubs:
182 |             try:
183 |                 item.hubs = list(hub.get("url") for hub in data.get("hubs", []))
184 |             except (IndexError, AttributeError):
185 |                 pass
186 | 
187 |         if item.hubs:
188 |             item.is_push = True
189 | 
190 |         try:
191 |             dates = []
192 |             now_date: date = datetime.utcnow().date()
193 | 
194 |             entries = data.get("items", [])
195 |             item.item_count = len(entries)
196 | 
197 |             dates.extend(
198 |                 FeedInfoParser.entry_dates(
199 |                     entries, ["date_modified", "date_published"], now_date
200 |                 )
201 |             )
202 | 
203 |             if dates:
204 |                 item.last_updated = sorted(dates, reverse=True)[0]
205 |                 item.velocity = self.entry_velocity(dates)
206 |         except Exception as e:
207 |             logger.exception("Unable to get feed published date: %s", e)
208 |             pass
209 | 
210 |         return True
211 | 
212 |     @staticmethod
213 |     def parse_raw_data(
214 |         raw_data: Union[str, bytes], encoding: str = "utf-8", headers: Dict = None
215 |     ) -> Dict:
216 |         """
217 |         Loads the raw RSS/Atom XML data.
218 |         Returns feedparser Dict.
219 |         https://pythonhosted.org/feedparser/
220 | 
221 |         :param raw_data: RSS/Atom XML feed
222 |         :type raw_data: str
223 |         :param encoding: Character encoding of raw_data
224 |         :type encoding: str
225 |         :param headers: Response headers
226 |         :return: Dict
227 |         """
228 |         if not encoding:
229 |             encoding = "utf-8"
230 | 
231 |         h = {}
232 |         if headers:
233 |             if isinstance(headers, dict):
234 |                 h = headers
235 |             else:
236 |                 try:
237 |                     h.update({k.lower(): v for (k, v) in headers.items()})
238 |                 except KeyError:
239 |                     pass
240 | 
241 |             h.pop("content-encoding", None)
242 | 
243 |         try:
244 |             start = time.perf_counter()
245 | 
246 |             if isinstance(raw_data, str):
247 |                 raw_data: bytes = raw_data.encode(encoding)
248 | 
249 |             raw_data = raw_data.strip()
250 |             content_length = len(raw_data)
251 | 
252 |             # We want to pass data into feedparser as bytes, otherwise if we accidentally pass a url string
253 |             # it will attempt a fetch
254 |             data = feedparser.parse(raw_data, response_headers=h)
255 | 
256 |             dur = int((time.perf_counter() - start) * 1000)
257 |             logger.debug("Feed Parse: size=%s dur=%sms", content_length, dur)
258 | 
259 |             return data
260 |         except Exception as e:
261 |             logger.exception("Could not parse RSS data: %s", e)
262 | 
263 |     def feed_title(self, feed: dict) -> str:
264 |         """
265 |         Get feed title
266 | 
267 |         :param feed: feed dict
268 |         :return: str
269 |         """
270 |         title = feed.get("title", None)
271 |         if not title:
272 |             return ""
273 |         return self.clean_title(title)
274 | 
275 |     def clean_title(self, title: str) -> str:
276 |         """
277 |         Cleans title string, and shortens if too long.
278 |         Have had issues with dodgy feed titles.
279 | 
280 |         :param title: Title string
281 |         :return: str
282 |         """
283 |         try:
284 |             title = BeautifulSoup(title, self.crawler.htmlparser).get_text()
285 |             if len(title) > 1024:
286 |                 title = title[:1020] + "..."
287 |             return title
288 |         except Exception as ex:
289 |             logger.exception("Failed to clean title: %s", ex)
290 |             return ""
291 | 
292 |     @staticmethod
293 |     def is_podcast(parsed: dict) -> bool:
294 |         """
295 |         Check if the feed is a Podcast.
296 | 
297 |         :param parsed: Feedparser dict
298 |         :return: bool
299 |         """
300 |         if not parsed:
301 |             return False
302 | 
303 |         has_itunes: bool = "itunes" in parsed.get("namespaces", {})
304 | 
305 |         has_enclosures = False
306 | 
307 |         for entry in parsed.get("entries", []):
308 |             for enclosure in entry.get("enclosures", []):
309 |                 if "audio" in enclosure.get("type", ""):
310 |                     has_enclosures = True
311 | 
312 |         return has_itunes and has_enclosures
313 | 
314 |     @staticmethod
315 |     def feed_description(feed: dict) -> str:
316 |         """
317 |         Get feed description.
318 | 
319 |         :param feed: feed dict
320 |         :return: str
321 |         """
322 |         subtitle = feed.get("subtitle", None)
323 |         if subtitle:
324 |             return subtitle
325 |         return feed.get("description", None)
326 | 
327 |     @staticmethod
328 |     def websub_links(feed: dict) -> Tuple[List[str], str]:
329 |         """
330 |         Returns a tuple containing the hub urls and the self url for
331 |         a parsed feed.
332 | 
333 |         :param feed: An RSS feed parsed by feedparser
334 |         :type feed: dict
335 |         :return: tuple
336 |         """
337 |         links = feed.get("links", [])
338 |         return FeedInfoParser.find_hubs_and_self_links(links)
339 | 
340 |     @staticmethod
341 |     def header_links(headers: dict) -> Tuple[List[str], str]:
342 |         """
343 |         Attempt to get self and hub links from HTTP headers
344 |         https://www.w3.org/TR/websub/#x4-discovery
345 | 
346 |         :param headers: Dict of HTTP headers
347 |         :return: Tuple of hub urls and the self url
348 |         """
349 |         link_header = headers.get("Link")
350 |         links: list = []
351 | 
352 |         if link_header:
353 |             links = parse_header_links(to_string(link_header))
354 | 
355 |         return FeedInfoParser.find_hubs_and_self_links(links)
356 | 
357 |     @staticmethod
358 |     def find_hubs_and_self_links(links: List[dict]) -> Tuple[List[str], str]:
359 |         """
360 |         Parses a list of links into self and hubs urls
361 | 
362 |         :param links: List of parsed HTTP Link Dicts
363 |         :return: Tuple
364 |         """
365 |         hub_urls: List[str] = []
366 |         self_url: str = ""
367 | 
368 |         if not links:
369 |             return [], ""
370 | 
371 |         for link in links:
372 |             try:
373 |                 if link["rel"] == "hub":
374 |                     href: str = link["href"]
375 |                     hub_urls.append(href)
376 |                 elif link["rel"] == "self":
377 |                     self_url = link["href"]
378 |             except KeyError:
379 |                 continue
380 | 
381 |         return hub_urls, self_url
382 | 
383 |     @staticmethod
384 |     def score_item(item: FeedInfo, original_url: URL):
385 |         score = 0
386 | 
387 |         url_str = str(item.url).lower()
388 | 
389 |         # -- Score Decrement --
390 | 
391 |         if original_url:
392 |             host = remove_www(original_url.host)
393 | 
394 |             if host not in item.url.host:
395 |                 score -= 20
396 | 
397 |         # Decrement the score by every extra path in the url
398 |         parts_len = len(item.url.parts)
399 |         if parts_len > 2:
400 |             score -= (parts_len - 2) * 2
401 | 
402 |         if item.bozo:
403 |             score -= 20
404 |         if not item.description:
405 |             score -= 10
406 |         if "georss" in url_str:
407 |             score -= 10
408 |         if "alt" in url_str:
409 |             score -= 7
410 |         if "comments" in url_str or "comments" in item.title.lower():
411 |             score -= 15
412 |         if "feedburner" in url_str:
413 |             score -= 10
414 | 
415 |         # -- Score Increment --
416 |         if item.url.scheme == "https":
417 |             score += 10
418 |         if item.is_push:
419 |             score += 10
420 |         if "index" in url_str:
421 |             score += 30
422 | 
423 |         if "comments" in url_str or "comments" in
item.title.lower(): 424 | score -= 15 425 | else: 426 | score += int(item.velocity) 427 | 428 | if any(map(url_str.count, ["/home", "/top", "/most", "/magazine"])): 429 | score += 10 430 | 431 | kw = ["atom", "rss", ".xml", "feed", "rdf"] 432 | for p, t in zip(range(len(kw) * 2, 0, -2), kw): 433 | if t in url_str: 434 | score += p 435 | 436 | item.score = score 437 | 438 | @staticmethod 439 | def entry_dates(entries: List[Dict], date_names: List[str], current_date: date): 440 | """ 441 | Return published or updated dates from feed entries. 442 | 443 | :param entries: List of feed entries as dicts. 444 | :param date_names: List of key names of entry published or updated values. 445 | :param current_date: The current date. 446 | :return: generator that returns datetimes. 447 | """ 448 | for entry in entries: 449 | for name in date_names: 450 | try: 451 | entry_date: datetime = datestring_to_utc_datetime(entry[name]) 452 | if entry_date.date() <= current_date: 453 | yield entry_date 454 | except (KeyError, ValueError): 455 | pass 456 | 457 | @staticmethod 458 | def entry_velocity(dates: List[datetime]) -> float: 459 | """ 460 | Calculate velocity of posted entries, returns a float of the average number of entries posted per day. 461 | 462 | :param dates: List of entry dates 463 | :return: Average entries per day 464 | """ 465 | if not dates or len(dates) < 3: 466 | return 0 467 | 468 | dates = sorted(dates) 469 | deltas = [] 470 | previous_date: datetime = dates[0] 471 | 472 | for current_date in dates[1:]: 473 | if current_date == previous_date: 474 | continue 475 | delta = current_date - previous_date 476 | deltas.append(delta.total_seconds()) 477 | previous_date = current_date 478 | 479 | if not deltas: 480 | return 0 481 | 482 | mean_seconds_delta = mean(deltas) 483 | 484 | result = round(86400 / mean_seconds_delta, 3) 485 | return result 486 | 487 | @staticmethod 488 | def validate_self_url(item: FeedInfo) -> None: 489 | """ 490 | Validate the self url 491 | 492 | :param item: FeedInfo item 493 | """ 494 | try: 495 | item.self_url = URL(item.self_url) 496 | except ValueError: 497 | item.self_url = "" 498 | return 499 | 500 | if item.self_url and item.self_url != item.url: 501 | # Handle a case where the item url contains a trailing slash and the self url doesn't. 502 | if str(item.url).strip("/") == str(item.self_url): 503 | item.url = URL(str(item.url).strip("/")) 504 | return 505 | 506 | # The self url should be an absolute url. 
507 | if not item.self_url.is_absolute(): 508 | if str(item.self_url) in str(item.url): 509 | item.self_url = item.url 510 | else: 511 | item.self_url = "" 512 | -------------------------------------------------------------------------------- /feedsearch_crawler/crawler/crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import inspect 4 | import logging 5 | from abc import ABC, abstractmethod 6 | from collections import OrderedDict 7 | from fnmatch import fnmatch 8 | from statistics import harmonic_mean, median 9 | from types import AsyncGeneratorType 10 | from typing import List, Any, Dict, Set 11 | from typing import Union 12 | 13 | import aiohttp 14 | import time 15 | from aiohttp import ClientTimeout 16 | from yarl import URL 17 | 18 | from feedsearch_crawler.crawler.duplicatefilter import DuplicateFilter 19 | from feedsearch_crawler.crawler.item import Item 20 | from feedsearch_crawler.crawler.lib import ( 21 | coerce_url, 22 | ignore_aiohttp_ssl_error, 23 | Stats, 24 | CallbackResult, 25 | CrawlerPriorityQueue, 26 | parse_href_to_url, 27 | ) 28 | from feedsearch_crawler.crawler.queueable import Queueable 29 | from feedsearch_crawler.crawler.request import Request 30 | from feedsearch_crawler.crawler.response import Response 31 | from feedsearch_crawler.crawler.trace import add_trace_config 32 | 33 | try: 34 | import uvloop 35 | 36 | asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) 37 | except ImportError: 38 | uvloop = None 39 | pass 40 | 41 | 42 | logger = logging.getLogger(__name__) 43 | 44 | 45 | class Crawler(ABC): 46 | 47 | # Class Name of the Duplicate Filter. 48 | # May be overridden to use different Duplicate Filter. 49 | # Not an instantiation of the class. 50 | duplicate_filter_class = DuplicateFilter 51 | 52 | # Callback to be run after all workers are finished. 53 | post_crawl_callback = None 54 | 55 | # URLs to start the crawl. 56 | start_urls = [] 57 | # Domain patterns that are allowed to be crawled. 58 | allowed_domains = [] 59 | 60 | # Max number of concurrent http requests. 61 | concurrency: int = 10 62 | # Max size of incoming http response content. 63 | max_content_length = 1024 * 1024 * 10 64 | # Max crawl depth. i.e. The max length of the response history. 65 | max_depth: int = 4 66 | # Max callback recursion depth, to prevent accidental infinite recursion from AsyncGenerators. 67 | max_callback_recursion: int = 10 68 | # Time in seconds to delay each HTTP request. 69 | delay: float = 0 70 | 71 | # List of worker tasks. 72 | _workers = [] 73 | 74 | # ClientSession for requests. Created on Crawl start. 75 | _session: aiohttp.ClientSession 76 | # Task queue for Requests. Created on Crawl start. 77 | _request_queue: CrawlerPriorityQueue 78 | # Semaphore for controlling HTTP Request concurrency. 79 | _semaphore: asyncio.Semaphore 80 | 81 | def __init__( 82 | self, 83 | start_urls: List[str] = None, 84 | allowed_domains: List[str] = None, 85 | concurrency: int = 10, 86 | total_timeout: Union[float, ClientTimeout] = 30, 87 | request_timeout: Union[float, ClientTimeout] = 5, 88 | user_agent: str = "", 89 | max_content_length: int = 1024 * 1024 * 10, 90 | max_depth: int = 10, 91 | headers: dict = None, 92 | allowed_schemes: List[str] = None, 93 | delay: float = 0.5, 94 | max_retries: int = 3, 95 | ssl: bool = False, 96 | trace: bool = False, 97 | *args, 98 | **kwargs, 99 | ): 100 | """ 101 | Base class for a WebCrawler implementation. 
102 | 
103 |         :param allowed_schemes: List of strings of allowed Request URI schemes. e.g. ["http", "https"]
104 |         :param start_urls: List of initial URLs to crawl.
105 |         :param allowed_domains: List of domain patterns that are allowed. Uses Unix shell-style wildcards.
106 |         :param concurrency: Max number of workers and concurrent HTTP requests.
107 |         :param total_timeout: Total aiohttp ClientSession timeout in seconds.
108 |             Crawl will end if this timeout is triggered.
109 |         :param request_timeout: Total timeout in seconds for each individual HTTP request.
110 |         :param user_agent: Default User-Agent for HTTP requests.
111 |         :param max_content_length: Max size in bytes of incoming http response content.
112 |         :param max_depth: Max crawl depth. i.e. The max length of the response history.
113 |         :param headers: Default HTTP headers to be included in each request.
114 |         :param delay: Time in seconds to delay each HTTP request.
115 |         :param max_retries: Maximum number of retries for each failed HTTP request.
116 |         :param ssl: Enables strict SSL checking.
117 |         :param trace: Enables aiohttp trace debugging.
118 |         :param args: Additional positional arguments for subclasses.
119 |         :param kwargs: Additional keyword arguments for subclasses.
120 |         """
121 |         self.start_urls = start_urls or []
122 |         self.allowed_domains = allowed_domains or []
123 | 
124 |         self.concurrency = concurrency
125 | 
126 |         if not isinstance(total_timeout, ClientTimeout):
127 |             total_timeout = aiohttp.ClientTimeout(total=total_timeout)
128 |         if not isinstance(request_timeout, ClientTimeout):
129 |             request_timeout = aiohttp.ClientTimeout(total=request_timeout)
130 | 
131 |         self.total_timeout: ClientTimeout = total_timeout
132 |         self.request_timeout: ClientTimeout = request_timeout
133 | 
134 |         self.max_content_length = max_content_length
135 |         self.max_depth = max_depth
136 | 
137 |         self.user_agent = user_agent or (
138 |             "Mozilla/5.0 (compatible; Feedsearch-Crawler; +https://pypi.org/project/feedsearch-crawler)"
139 |         )
140 | 
141 |         self.headers = {"User-Agent": self.user_agent, "Upgrade-Insecure-Requests": "1"}
142 | 
143 |         if headers:
144 |             self.headers = {**self.headers, **headers}
145 | 
146 |         self.allowed_schemes = allowed_schemes
147 |         self.delay = delay
148 |         self.max_retries = max_retries
149 |         self._ssl = ssl
150 |         self._trace = trace
151 | 
152 |         # Default set for parsed items.
153 |         self.items: set = set()
154 | 
155 |         # URL Duplicate Filter instance.
156 |         self._duplicate_filter = self.duplicate_filter_class()
157 | 
158 |         # List of total durations in Milliseconds for the total handling time of all Requests.
159 |         self._stats_request_durations = []
160 |         # List of latencies in Milliseconds of all HTTP requests.
161 |         self._stats_request_latencies = []
162 |         # List of Content Length in bytes of all Responses.
163 |         self._stats_response_content_lengths = []
164 |         # List of time in Milliseconds that each item spent on the queue.
165 |         self._stats_queue_wait_times = []
166 |         # List of the size of the queue each time an item was popped off the queue.
167 |         self._stats_queue_sizes = []
168 | 
169 |         # Initialise Crawl Statistics.
170 |         self.stats: dict = {
171 |             Stats.REQUESTS_QUEUED: 0,
172 |             Stats.REQUESTS_SUCCESSFUL: 0,
173 |             Stats.REQUESTS_FAILED: 0,
174 |             Stats.CONTENT_LENGTH_TOTAL: 0,
175 |             Stats.CONTENT_LENGTH_AVG: 0,
176 |             Stats.CONTENT_LENGTH_MIN: 0,
177 |             Stats.CONTENT_LENGTH_MAX: 0,
178 |             Stats.CONTENT_LENGTH_MEDIAN: 0,
179 |             Stats.ITEMS_PROCESSED: 0,
180 |             Stats.URLS_SEEN: 0,
181 |             Stats.REQUESTS_DURATION_AVG: 0,
182 |             Stats.REQUESTS_DURATION_MAX: 0,
183 |             Stats.REQUESTS_DURATION_MIN: 0,
184 |             Stats.REQUESTS_DURATION_TOTAL: 0,
185 |             Stats.REQUESTS_DURATION_MEDIAN: 0,
186 |             Stats.TOTAL_DURATION: 0,
187 |             Stats.STATUS_CODES: {},
188 |             Stats.QUEUE_WAIT_MAX: 0,
189 |             Stats.QUEUE_WAIT_MIN: 0,
190 |             Stats.QUEUE_WAIT_AVG: 0,
191 |             Stats.QUEUE_WAIT_MEDIAN: 0,
192 |             Stats.QUEUE_SIZE_MAX: 0,
193 |             Stats.QUEUE_SIZE_AVG: 0,
194 |             Stats.QUEUE_SIZE_MEDIAN: 0,
195 |             Stats.QUEUED_TOTAL: 0,
196 |             Stats.REQUESTS_RETRIED: 0,
197 |         }
198 | 
199 |     async def _handle_request(self, request: Request) -> None:
200 |         """
201 |         Handle fetching of Requests and processing of Request callbacks.
202 | 
203 |         :param request: Request
204 |         :return: None
205 |         """
206 |         try:
207 |             if request.has_run and not request.should_retry:
208 |                 logger.warning("%s has already run", request)
209 |                 return
210 | 
211 |             start = time.perf_counter()
212 | 
213 |             # Fetch the request and run its callback
214 |             # results, response = await request.fetch_callback(self._semaphore)
215 |             results, response = await request.fetch_callback()
216 | 
217 |             dur = int((time.perf_counter() - start) * 1000)
218 |             self._stats_request_durations.append(dur)
219 |             self._stats_request_latencies.append(request.req_latency)
220 |             logger.debug(
221 |                 "Fetched: url=%s dur=%dms latency=%dms read=%dms status=%s prev=%s",
222 |                 response.url,
223 |                 dur,
224 |                 request.req_latency,
225 |                 request.content_read,
226 |                 response.status_code,
227 |                 response.originator_url,
228 |             )
229 | 
230 |             if response.ok:
231 |                 self.stats[Stats.REQUESTS_SUCCESSFUL] += 1
232 |             else:
233 |                 self.stats[Stats.REQUESTS_FAILED] += 1
234 | 
235 |             if response.status_code in self.stats[Stats.STATUS_CODES]:
236 |                 self.stats[Stats.STATUS_CODES][response.status_code] += 1
237 |             else:
238 |                 self.stats[Stats.STATUS_CODES][response.status_code] = 1
239 | 
240 |             self._stats_response_content_lengths.append(response.content_length)
241 | 
242 |             # Mark the Response URL as seen in the duplicate filter, as it may be different from the Request URL
243 |             # due to redirects.
244 |             await self._duplicate_filter.url_seen(response.url, response.method)
245 | 
246 |             # Add callback results to the queue for processing.
247 |             if results:
248 |                 self._put_queue(CallbackResult(results, 0))
249 | 
250 |             # Add Request back to the queue for retrying.
251 |             if request.should_retry:
252 |                 self.stats[Stats.REQUESTS_RETRIED] += 1
253 |                 self._put_queue(request)
254 | 
255 |         except asyncio.CancelledError as e:
256 |             logger.debug("Cancelled: %s, %s", request, e)
257 |         except Exception as e:
258 |             logger.exception("Exception during %s: %s", request, e)
259 |         finally:
260 |             return
261 | 
262 |     async def _process_request_callback_result(
263 |         self, result: Any, callback_recursion: int = 0
264 |     ) -> None:
265 |         """
266 |         Process the Request callback result depending on the result type.
267 |         Request callbacks may contain nested iterators.
268 | 
269 |         :param result: Callback Result. May be a CallbackResult class, AsyncGenerator, Coroutine, Request, or Item.
270 |         :param callback_recursion: Incremented counter to limit this method's recursion.
271 | :return: None 272 | """ 273 | if callback_recursion >= self.max_callback_recursion: 274 | logger.warning( 275 | "Max callback recursion of %d reached", self.max_callback_recursion 276 | ) 277 | return 278 | 279 | try: 280 | # If a CallbackResult class is passed, process the result values from within the class. 281 | if isinstance(result, CallbackResult): 282 | await self._process_request_callback_result( 283 | result.result, result.callback_recursion 284 | ) 285 | # For async generators, put each value back on the queue for processing. 286 | # This will happen recursively until the end of the recursion chain or max_callback_recursion is reached. 287 | elif inspect.isasyncgen(result): 288 | async for value in result: 289 | if value: 290 | self._put_queue(CallbackResult(value, callback_recursion + 1)) 291 | # For coroutines, await the result then put the value back on the queue for further processing. 292 | elif inspect.iscoroutine(result): 293 | value = await result 294 | self._put_queue(CallbackResult(value, callback_recursion + 1)) 295 | # Requests are put onto the queue to be fetched. 296 | elif isinstance(result, Request): 297 | self._process_request(result) 298 | 299 | # Items are handled by the implementing Class. 300 | elif isinstance(result, Item): 301 | await self.process_item(result) 302 | self.stats[Stats.ITEMS_PROCESSED] += 1 303 | except Exception as e: 304 | logger.exception(e) 305 | 306 | def _process_request(self, request: Request) -> None: 307 | """ 308 | Process a Request onto the Request Queue. 309 | 310 | :param request: HTTP Request 311 | :return: None 312 | """ 313 | if not request: 314 | return 315 | 316 | self.stats[Stats.REQUESTS_QUEUED] += 1 317 | logger.debug("Queue Add: %s", request) 318 | # Add the Request to the queue for processing. 319 | self._put_queue(request) 320 | 321 | def is_allowed_domain(self, url: URL) -> bool: 322 | """ 323 | Check that the URL host is in the list of allowed domain patterns. 324 | Domain patterns are Unix shell-style wildcards. 325 | https://docs.python.org/3/library/fnmatch.html 326 | 327 | :param url: URL object 328 | :return: boolean 329 | """ 330 | if not self.allowed_domains: 331 | return True 332 | 333 | try: 334 | if not url or not url.host: 335 | return False 336 | host = url.host 337 | for domain_pattern in self.allowed_domains: 338 | if fnmatch(host, domain_pattern): 339 | return True 340 | except Exception as e: 341 | logger.warning(e) 342 | return False 343 | 344 | async def follow( 345 | self, 346 | url: Union[str, URL], 347 | callback=None, 348 | response: Response = None, 349 | method: str = "GET", 350 | delay: Union[float, None] = None, 351 | priority: int = 0, 352 | allow_domain: bool = False, 353 | cb_kwargs: Dict = None, 354 | max_content_length: int = None, 355 | timeout: float = None, 356 | retries: int = None, 357 | **kwargs, 358 | ) -> Union[Request, None]: 359 | """ 360 | Follow a URL by creating an HTTP Request. 361 | 362 | If the URL is not absolute then it is joined with the previous Response URL. 363 | The previous Response history is copied to the Request. 364 | 365 | Before a Request is followed, first check that the Request URL has not already been seen, 366 | that the max URL depth has not been reached, and that the URI scheme is allowed. 367 | 368 | These checks are performed before the Request is created so that we don't yield multiple requests 369 | to the same URL to the queue for further processing. We want to stop duplicates and invalid 370 | requests as early as possible. 
371 | 
372 |         :param url: URL to follow.
373 |         :param callback: Callback method to run if the Request is successful.
374 |         :param response: Previous Response that contained the Request URL.
375 |         :param kwargs: Optional Request keyword arguments. See Request for details.
376 |         :param method: HTTP method for Request.
377 |         :param delay: Optionally override the default delay for the Request.
378 |         :param priority: Optionally override the default priority of the Request.
379 |         :param allow_domain: Optionally override the allowed domains check.
380 |         :param max_content_length: Optionally override the maximum allowed size in bytes of Response body.
381 |         :param retries: Optionally override the number of Request retries.
382 |         :param timeout: Optionally override the Request timeout.
383 |         :param cb_kwargs: Optional Dictionary of keyword arguments to be passed to the callback function.
384 |         :return: Request, or None if the URL is invalid or filtered.
385 |         """
386 |         original_url = copy.copy(url)
387 |         if isinstance(url, str):
388 |             url = parse_href_to_url(url)
389 | 
390 |         if not url:
391 |             logger.warning("Attempted to follow invalid URL: %s", original_url)
392 |             return
393 | 
394 |         history = []
395 |         if response:
396 |             # Join the URL to the Response URL if it doesn't contain a domain.
397 |             if not url.is_absolute() or not url.scheme:
398 |                 url = coerce_url(
399 |                     response.origin.join(url), default_scheme=response.scheme
400 |                 )
401 | 
402 |             # Restrict the depth of the Request chain to the maximum depth.
403 |             # This test happens before the URL duplicate check so that the URL might still be reachable by another path.
404 |             if self.max_depth and len(response.history) >= self.max_depth:
405 |                 logger.debug("Max Depth of '%d' reached: %s", self.max_depth, url)
406 |                 return
407 | 
408 |             # Copy the Response history so that it isn't a reference to a mutable object.
409 |             history = copy.deepcopy(response.history)
410 |         else:
411 |             if not url.is_absolute():
412 |                 logger.debug("URL should have domain: %s", url)
413 |                 return
414 | 
415 |             if not url.scheme:
416 |                 url = coerce_url(url)
417 | 
418 |         # The URL scheme must be in the list of allowed schemes.
419 |         if self.allowed_schemes and url.scheme not in self.allowed_schemes:
420 |             logger.debug("URI Scheme '%s' not allowed: %s", url.scheme, url)
421 |             return
422 | 
423 |         # The URL host must be in the list of allowed domains.
424 |         if not allow_domain and not self.is_allowed_domain(url):
425 |             logger.debug("Domain '%s' not allowed: %s", url.host, url)
426 |             return
427 | 
428 |         # Check if URL is not already seen, and add it to the duplicate filter seen list.
429 |         if await self._duplicate_filter.url_seen(url, method):
430 |             return
431 | 
432 |         request = Request(
433 |             url=url,
434 |             request_session=self._session,
435 |             history=history,
436 |             callback=callback,
437 |             xml_parser=self.parse_xml,
438 |             max_content_length=max_content_length or self.max_content_length,
439 |             timeout=timeout or self.request_timeout,
440 |             method=method,
441 |             delay=delay if isinstance(delay, float) else self.delay,
442 |             retries=retries or self.max_retries,
443 |             cb_kwargs=cb_kwargs,
444 |             **kwargs,
445 |         )
446 | 
447 |         # Override the Request priority only if the kwarg is provided.
448 |         if priority:
449 |             request.priority = priority
450 | 
451 |         return request
452 | 
453 |     @abstractmethod
454 |     async def process_item(self, item: Item) -> None:
455 |         """
456 |         Process a parsed Item in some way. e.g. Add it to the Item set, or database, or send a signal.
457 | 
458 |         :param item: A parsed Item.
459 | """ 460 | self.items.add(item) 461 | 462 | @abstractmethod 463 | async def parse_xml(self, response_text: str) -> Any: 464 | """ 465 | Parse Response text as XML. 466 | Used to allow implementations to provide their own XML parser. 467 | 468 | :param response_text: Response text as string. 469 | """ 470 | raise NotImplementedError("Not Implemented") 471 | 472 | @abstractmethod 473 | async def parse(self, request: Request, response: Response) -> AsyncGeneratorType: 474 | """ 475 | Parse an HTTP Response. Must yield Items, Requests, AsyncGenerators, or Coroutines. 476 | 477 | :param request: HTTP Request that created the Response. 478 | :param response: HTTP Response. 479 | """ 480 | raise NotImplementedError("Not Implemented") 481 | 482 | def _put_queue(self, queueable: Queueable) -> None: 483 | """ 484 | Put an object that inherits from Queueable onto the Request Queue. 485 | 486 | :param queueable: An object that inherits from Queueable. 487 | """ 488 | if not isinstance(queueable, Queueable): 489 | raise ValueError("Object must inherit from Queueable Class") 490 | 491 | queueable.add_to_queue(self._request_queue) 492 | self.stats[Stats.QUEUED_TOTAL] += 1 493 | 494 | async def _work(self, task_num): 495 | """ 496 | Worker function for handling request queue items. 497 | """ 498 | try: 499 | while True: 500 | self._stats_queue_sizes.append(self._request_queue.qsize()) 501 | item: Queueable = await self._request_queue.get() 502 | # logger.debug("Priority: %s Item: %s", item.priority, item) 503 | if item.get_queue_wait_time(): 504 | # logger.debug( 505 | # "Waited: %sms Item: %s", item.get_queue_wait_time(), item 506 | # ) 507 | self._stats_queue_wait_times.append(item.get_queue_wait_time()) 508 | 509 | if self._session.closed: 510 | logger.debug("Session is closed. Cannot run %s", item) 511 | continue 512 | 513 | try: 514 | # Fetch Request and handle callbacks 515 | if isinstance(item, Request): 516 | await self._handle_request(item) 517 | # Process Callback results 518 | elif isinstance(item, CallbackResult): 519 | await self._process_request_callback_result( 520 | item.result, item.callback_recursion 521 | ) 522 | except Exception as e: 523 | logger.exception("Error handling item: %s : %s", item, e) 524 | finally: 525 | self._request_queue.task_done() 526 | except asyncio.CancelledError: 527 | logger.debug("Cancelled Worker: %s", task_num) 528 | 529 | @staticmethod 530 | async def _run_callback(callback, *args, **kwargs) -> None: 531 | """ 532 | Runs a callback function. 533 | 534 | :param callback: Function to run. May be async. 535 | :param args: Positional arguments to pass to the function. 536 | :param kwargs: Keyword arguments to pass to the function. 537 | :return: None 538 | """ 539 | if not callback: 540 | return 541 | if inspect.iscoroutinefunction(callback): 542 | await callback(*args, **kwargs) 543 | elif inspect.isfunction(callback): 544 | callback(*args, **kwargs) 545 | else: 546 | logger.warning("Callback %s must be a coroutine or function", callback) 547 | 548 | def create_start_urls(self, urls: List[Union[URL, str]]) -> List[URL]: 549 | """ 550 | Create the start URLs for the crawl from an initial URL. May be overridden. 
551 | 552 | :param urls: Initial URLs 553 | """ 554 | crawl_start_urls: Set[URL] = set() 555 | 556 | for url in urls + self.start_urls: 557 | if isinstance(url, str): 558 | if "//" not in url: 559 | url = f"//{url}" 560 | url = URL(url) 561 | 562 | if url.scheme.lower() not in ["http", "https"]: 563 | url = url.with_scheme("http") 564 | 565 | crawl_start_urls.add(url) 566 | 567 | return list(crawl_start_urls) 568 | 569 | def record_statistics(self) -> None: 570 | """ 571 | Record statistics. 572 | """ 573 | self.stats[Stats.REQUESTS_DURATION_TOTAL] = int( 574 | sum(self._stats_request_durations) 575 | ) 576 | self.stats[Stats.REQUESTS_DURATION_AVG] = int( 577 | harmonic_mean(self._stats_request_durations) 578 | ) 579 | self.stats[Stats.REQUESTS_DURATION_MAX] = int( 580 | max(self._stats_request_durations) 581 | ) 582 | self.stats[Stats.REQUESTS_DURATION_MIN] = int( 583 | min(self._stats_request_durations) 584 | ) 585 | self.stats[Stats.REQUESTS_DURATION_MEDIAN] = int( 586 | median(self._stats_request_durations) 587 | ) 588 | 589 | self.stats[Stats.CONTENT_LENGTH_TOTAL] = int( 590 | sum(self._stats_response_content_lengths) 591 | ) 592 | self.stats[Stats.CONTENT_LENGTH_AVG] = int( 593 | harmonic_mean(self._stats_response_content_lengths) 594 | ) 595 | self.stats[Stats.CONTENT_LENGTH_MAX] = int( 596 | max(self._stats_response_content_lengths) 597 | ) 598 | self.stats[Stats.CONTENT_LENGTH_MIN] = int( 599 | min(self._stats_response_content_lengths) 600 | ) 601 | self.stats[Stats.CONTENT_LENGTH_MEDIAN] = int( 602 | median(self._stats_response_content_lengths) 603 | ) 604 | 605 | self.stats[Stats.URLS_SEEN] = len(self._duplicate_filter.fingerprints) 606 | 607 | self.stats[Stats.QUEUE_WAIT_AVG] = harmonic_mean(self._stats_queue_wait_times) 608 | self.stats[Stats.QUEUE_WAIT_MIN] = min(self._stats_queue_wait_times) 609 | self.stats[Stats.QUEUE_WAIT_MAX] = max(self._stats_queue_wait_times) 610 | self.stats[Stats.QUEUE_WAIT_MEDIAN] = median(self._stats_queue_wait_times) 611 | 612 | self.stats[Stats.QUEUE_SIZE_MAX] = max(self._stats_queue_sizes) 613 | self.stats[Stats.QUEUE_SIZE_AVG] = int(harmonic_mean(self._stats_queue_sizes)) 614 | self.stats[Stats.QUEUE_SIZE_MEDIAN] = int(median(self._stats_queue_sizes)) 615 | 616 | self.stats[Stats.REQUESTS_LATENCY_AVG] = harmonic_mean( 617 | self._stats_request_latencies 618 | ) 619 | self.stats[Stats.REQUESTS_LATENCY_MAX] = int(max(self._stats_request_latencies)) 620 | self.stats[Stats.REQUESTS_LATENCY_MIN] = int(min(self._stats_request_latencies)) 621 | self.stats[Stats.REQUESTS_LATENCY_MEDIAN] = int( 622 | median(self._stats_request_latencies) 623 | ) 624 | self.stats[Stats.REQUESTS_LATENCY_TOTAL] = int( 625 | sum(self._stats_request_latencies) 626 | ) 627 | 628 | def get_stats(self) -> dict: 629 | """ 630 | Return crawl statistics as a sorted dictionary. 631 | """ 632 | stats = {str(k): v for k, v in self.stats.items()} 633 | return dict(OrderedDict(sorted(stats.items())).items()) 634 | 635 | async def crawl(self, urls: Union[URL, str, List[Union[URL, str]]] = None) -> None: 636 | """ 637 | Start the web crawler. 638 | 639 | :param urls: An optional URL or List of URLS to start the crawl, in addition to start_urls. 640 | """ 641 | 642 | # Fix for ssl errors 643 | ignore_aiohttp_ssl_error(asyncio.get_running_loop()) 644 | 645 | start = time.perf_counter() 646 | 647 | # Create start urls from the initial URL if provided. 
648 | if not urls: 649 | urls = [] 650 | if isinstance(urls, (URL, str)): 651 | urls = [urls] 652 | self.start_urls = self.create_start_urls(urls) 653 | 654 | if not self.start_urls: 655 | raise ValueError("crawler.start_urls are required") 656 | 657 | # Create the Request Queue within the asyncio loop. 658 | self._request_queue = CrawlerPriorityQueue() 659 | 660 | # Create the Semaphore for controlling HTTP Request concurrency within the asyncio loop. 661 | self._semaphore = asyncio.Semaphore(self.concurrency) 662 | 663 | trace_configs = [] 664 | if self._trace: 665 | trace_configs.append(add_trace_config()) 666 | 667 | conn = aiohttp.TCPConnector( 668 | limit=0, ssl=self._ssl, ttl_dns_cache=self.total_timeout.total 669 | ) 670 | # Create the ClientSession for HTTP Requests within the asyncio loop. 671 | self._session = aiohttp.ClientSession( 672 | timeout=self.total_timeout, 673 | headers=self.headers, 674 | connector=conn, 675 | trace_configs=trace_configs, 676 | ) 677 | 678 | # Create a Request for each start URL and add it to the Request Queue. 679 | for url in self.start_urls: 680 | req = await self.follow(coerce_url(url), self.parse, delay=0) 681 | if req: 682 | self._process_request(req) 683 | 684 | # Create workers to process the Request Queue. 685 | # Create twice as many workers as potential concurrent requests, to help handle request callbacks without 686 | # delay while other workers may be locked by the Semaphore. 687 | self._workers = [ 688 | asyncio.create_task(self._work(i)) for i in range(self.concurrency * 2) 689 | ] 690 | 691 | try: 692 | # Run workers within the ClientSession. 693 | async with self._session: 694 | await asyncio.wait_for( 695 | self._request_queue.join(), timeout=self.total_timeout.total 696 | ) 697 | except asyncio.TimeoutError: 698 | logger.debug("Timed out after %s seconds", self.total_timeout.total) 699 | self._request_queue.clear() 700 | finally: 701 | # Make sure all workers are cancelled. 702 | for w in self._workers: 703 | w.cancel() 704 | # Wait until all worker tasks are cancelled. 705 | await asyncio.gather(*self._workers, return_exceptions=True) 706 | 707 | # Run the post crawl callback if it exists. 708 | await self._run_callback(self.post_crawl_callback) 709 | 710 | # The ClientSession is closed only after all work is completed. 711 | await self._session.close() 712 | 713 | duration = int((time.perf_counter() - start) * 1000) 714 | self.stats[Stats.TOTAL_DURATION] = duration 715 | 716 | self.record_statistics() 717 | 718 | logger.info( 719 | "Crawl finished: requests=%s time=%dms", 720 | self.stats[Stats.REQUESTS_QUEUED], 721 | duration, 722 | ) 723 | logger.debug("Stats: %s", self.stats) 724 | --------------------------------------------------------------------------------
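Example usage (illustrative sketch, not a file in this repository): the Crawler class above is abstract, so `parse`, `parse_xml`, and `process_item` must be implemented by a subclass, and `crawl()` is the async entry point. The package's real implementation is FeedsearchSpider in feedsearch_crawler/feed_spider/spider.py; the `UrlItem` and `ExampleSpider` names below, the BeautifulSoup-based `parse_xml`, the "/about" link that is followed, and the example.com URLs are assumptions made purely for illustration. Only the Crawler, Item, Request, and Response behaviour shown in the code above is relied upon.

import asyncio

from bs4 import BeautifulSoup

from feedsearch_crawler.crawler import Crawler, Item, Request, Response


class UrlItem(Item):
    # Hypothetical Item subclass: records the URL of each crawled page.
    url: str = ""


class ExampleSpider(Crawler):
    async def parse_xml(self, response_text: str):
        # Delegate XML/HTML parsing to BeautifulSoup, which is already a dependency of the feed spider.
        return BeautifulSoup(response_text, "html.parser")

    async def parse(self, request: Request, response: Response):
        # parse() is an async generator: it may yield Items and further Requests.
        if not response.ok:
            return

        # Record the successfully fetched URL as an Item.
        yield UrlItem(url=str(response.url))

        # Follow a further (hypothetical) URL relative to this Response.
        # follow() returns None if the URL is invalid, filtered, a duplicate, or too deep.
        next_request = await self.follow("/about", self.parse, response)
        if next_request:
            yield next_request

    async def process_item(self, item: Item) -> None:
        # Collect parsed Items in the default Crawler item set.
        self.items.add(item)


async def main() -> None:
    spider = ExampleSpider(
        allowed_domains=["*example.com"],
        concurrency=5,
        total_timeout=20,
    )
    await spider.crawl("https://example.com")

    print(spider.get_stats())
    for item in spider.items:
        print(item.url)


if __name__ == "__main__":
    asyncio.run(main())

Note that crawl() raises ValueError if neither start_urls nor a urls argument is provided, and the crawl ends when the request queue is exhausted or the total_timeout expires, after which get_stats() returns the recorded crawl statistics.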