├── .gitignore ├── LICENSE ├── README.rst ├── feedfinder2.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .DS_Store 3 | dist 4 | feedfinder2.egg-info 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Daniel Foreman-Mackey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Feedfinder2 2 | =========== 3 | 4 | This is a Python library for finding links to feeds on a website.
It is based on 5 | `feedfinder `_ - originally 6 | written by `Mark 7 | Pilgrim `_ and 8 | subsequently maintained by `Aaron 9 | Swartz `_ until his untimely death. 10 | 11 | Usage 12 | ----- 13 | 14 | Feedfinder2 offers a single public function: ``find_feeds``. You would use it 15 | as follows: 16 | 17 | :: 18 | 19 | from feedfinder2 import find_feeds 20 | feeds = find_feeds("xkcd.com") 21 | 22 | Now, ``feeds`` is the list: ``['http://xkcd.com/atom.xml', 23 | 'http://xkcd.com/rss.xml']``. There is some attempt made to rank feeds from 24 | best candidate to worst but... well... you never know. 25 | 26 | License 27 | ------- 28 | 29 | Feedfinder2 is licensed under the MIT license (see LICENSE). 30 | -------------------------------------------------------------------------------- /feedfinder2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | __version__ = "0.0.4" 7 | 8 | try: 9 | __FEEDFINDER2_SETUP__ 10 | except NameError: 11 | __FEEDFINDER2_SETUP__ = False 12 | 13 | if not __FEEDFINDER2_SETUP__: 14 | __all__ = ["find_feeds"] 15 | 16 | import logging 17 | import requests 18 | from bs4 import BeautifulSoup 19 | from six.moves.urllib import parse as urlparse 20 | 21 | 22 | def coerce_url(url): 23 | url = url.strip() 24 | if url.startswith("feed://"): 25 | return "http://{0}".format(url[7:]) 26 | for proto in ["http://", "https://"]: 27 | if url.startswith(proto): 28 | return url 29 | return "http://{0}".format(url) 30 | 31 | 32 | class FeedFinder(object): 33 | 34 | def __init__(self, user_agent=None, timeout=None): 35 | if user_agent is None: 36 | user_agent = "feedfinder2/{0}".format(__version__) 37 | self.user_agent = user_agent 38 | self.timeout = timeout 39 | 40 | def get_feed(self, url): 41 | try: 42 | r = requests.get(url, headers={"User-Agent": self.user_agent}, timeout=self.timeout) 43 | except Exception as e: 44 | 
logging.warning("Error while getting '{0}'".format(url)) 45 | logging.warning("{0}".format(e)) 46 | return None 47 | return r.text 48 | 49 | def is_feed_data(self, text): 50 | data = text.lower() 51 | if data.count(" tags. 86 | logging.info("Looking for tags.") 87 | tree = BeautifulSoup(text, "html.parser") 88 | links = [] 89 | for link in tree.find_all("link"): 90 | if link.get("type") in ["application/rss+xml", 91 | "text/xml", 92 | "application/atom+xml", 93 | "application/x.atom+xml", 94 | "application/x-atom+xml"]: 95 | links.append(urlparse.urljoin(url, link.get("href", ""))) 96 | 97 | # Check the detected links. 98 | urls = list(filter(finder.is_feed, links)) 99 | logging.info("Found {0} feed tags.".format(len(urls))) 100 | if len(urls) and not check_all: 101 | return sort_urls(urls) 102 | 103 | # Look for tags. 104 | logging.info("Looking for tags.") 105 | local, remote = [], [] 106 | for a in tree.find_all("a"): 107 | href = a.get("href", None) 108 | if href is None: 109 | continue 110 | if "://" not in href and finder.is_feed_url(href): 111 | local.append(href) 112 | if finder.is_feedlike_url(href): 113 | remote.append(href) 114 | 115 | # Check the local URLs. 116 | local = [urlparse.urljoin(url, l) for l in local] 117 | urls += list(filter(finder.is_feed, local)) 118 | logging.info("Found {0} local links to feeds.".format(len(urls))) 119 | if len(urls) and not check_all: 120 | return sort_urls(urls) 121 | 122 | # Check the remote URLs. 123 | remote = [urlparse.urljoin(url, l) for l in remote] 124 | urls += list(filter(finder.is_feed, remote)) 125 | logging.info("Found {0} remote links to feeds.".format(len(urls))) 126 | if len(urls) and not check_all: 127 | return sort_urls(urls) 128 | 129 | # Guessing potential URLs. 
130 | fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml", 131 | "index.rss"] 132 | urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f) 133 | for f in fns])) 134 | return sort_urls(urls) 135 | 136 | 137 | def url_feed_prob(url): 138 | if "comments" in url: 139 | return -2 140 | if "georss" in url: 141 | return -1 142 | kw = ["atom", "rss", "rdf", ".xml", "feed"] 143 | for p, t in zip(range(len(kw), 0, -1), kw): 144 | if t in url: 145 | return p 146 | return 0 147 | 148 | 149 | def sort_urls(feeds): 150 | return sorted(list(set(feeds)), key=url_feed_prob, reverse=True) 151 | 152 | 153 | if __name__ == "__main__": 154 | print(find_feeds("www.preposterousuniverse.com/blog/", timeout = 1)) 155 | print(find_feeds("www.preposterousuniverse.com/blog/")) 156 | print(find_feeds("http://xkcd.com")) 157 | print(find_feeds("dan.iel.fm/atom.xml")) 158 | print(find_feeds("dan.iel.fm", check_all=True)) 159 | print(find_feeds("kapadia.github.io")) 160 | print(find_feeds("blog.jonathansick.ca")) 161 | print(find_feeds("asdasd")) 162 | 163 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | if sys.argv[-1] == "publish": 5 | os.system("python setup.py sdist upload") 6 | sys.exit() 7 | 8 | # Hackishly inject a constant into builtins to enable importing of the 9 | # package before the library is built. 
10 | if sys.version_info[0] < 3: 11 | import __builtin__ as builtins 12 | else: 13 | import builtins 14 | builtins.__FEEDFINDER2_SETUP__ = True 15 | import feedfinder2 16 | from setuptools import setup 17 | 18 | setup( 19 | name="feedfinder2", 20 | version=feedfinder2.__version__, 21 | url="https://github.com/dfm/feedfinder2", 22 | license="MIT", 23 | author="Dan Foreman-Mackey", 24 | author_email="foreman.mackey@gmail.com", 25 | install_requires=[ 26 | "six", 27 | "requests", 28 | "beautifulsoup4", 29 | ], 30 | description="Find the feed URLs for a website.", 31 | long_description=open("README.rst").read(), 32 | py_modules=["feedfinder2"], 33 | classifiers=[ 34 | "Programming Language :: Python", 35 | "Development Status :: 4 - Beta", 36 | "Natural Language :: English", 37 | "Environment :: Web Environment", 38 | "Intended Audience :: Developers", 39 | "License :: OSI Approved :: MIT License", 40 | "Operating System :: OS Independent", 41 | ], 42 | ) 43 | --------------------------------------------------------------------------------