├── .gitignore ├── LICENSE ├── README.rst ├── feedfinder2.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .DS_Store 3 | dist 4 | feedfinder2.egg-info 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Daniel Foreman-Mackey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Feedfinder2 2 | =========== 3 | 4 | This is a Python library for finding links to feeds on a website.
It is based on 5 | `feedfinder `_ - originally 6 | written by `Mark 7 | Pilgrim `_ and 8 | subsequently maintained by `Aaron 9 | Swartz `_ until his untimely death. 10 | 11 | Usage 12 | ----- 13 | 14 | Feedfinder2 offers a single public function: ``find_feeds``. You would use it 15 | as follows: 16 | 17 | :: 18 | 19 | from feedfinder2 import find_feeds 20 | feeds = find_feeds("xkcd.com") 21 | 22 | Now, ``feeds`` is the list: ``['http://xkcd.com/atom.xml', 23 | 'http://xkcd.com/rss.xml']``. There is some attempt made to rank feeds from 24 | best candidate to worst but... well... you never know. 25 | 26 | License 27 | ------- 28 | 29 | Feedfinder2 is licensed under the MIT license (see LICENSE). 30 | -------------------------------------------------------------------------------- /feedfinder2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | __version__ = "0.0.4" 7 | 8 | try: 9 | __FEEDFINDER2_SETUP__ 10 | except NameError: 11 | __FEEDFINDER2_SETUP__ = False 12 | 13 | if not __FEEDFINDER2_SETUP__: 14 | __all__ = ["find_feeds"] 15 | 16 | import logging 17 | import requests 18 | from bs4 import BeautifulSoup 19 | from six.moves.urllib import parse as urlparse 20 | 21 | 22 | def coerce_url(url): 23 | url = url.strip() 24 | if url.startswith("feed://"): 25 | return "http://{0}".format(url[7:]) 26 | for proto in ["http://", "https://"]: 27 | if url.startswith(proto): 28 | return url 29 | return "http://{0}".format(url) 30 | 31 | 32 | class FeedFinder(object): 33 | 34 | def __init__(self, user_agent=None, timeout=None): 35 | if user_agent is None: 36 | user_agent = "feedfinder2/{0}".format(__version__) 37 | self.user_agent = user_agent 38 | self.timeout = timeout 39 | 40 | def get_feed(self, url): 41 | try: 42 | r = requests.get(url, headers={"User-Agent": self.user_agent}, timeout=self.timeout) 43 | except Exception as e: 44 | 
logging.warning("Error while getting '{0}'".format(url)) 45 | logging.warning("{0}".format(e)) 46 | return None 47 | return r.text 48 | 49 | def is_feed_data(self, text): 50 | data = text.lower() 51 | if data.count(" tags. 86 | logging.info("Looking for tags.") 87 | tree = BeautifulSoup(text, "html.parser") 88 | links = [] 89 | for link in tree.find_all("link"): 90 | if link.get("type") in ["application/rss+xml", 91 | "text/xml", 92 | "application/atom+xml", 93 | "application/x.atom+xml", 94 | "application/x-atom+xml"]: 95 | links.append(urlparse.urljoin(url, link.get("href", ""))) 96 | 97 | # Check the detected links. 98 | urls = list(filter(finder.is_feed, links)) 99 | logging.info("Found {0} feed tags.".format(len(urls))) 100 | if len(urls) and not check_all: 101 | return sort_urls(urls) 102 | 103 | # Look for tags. 104 | logging.info("Looking for tags.") 105 | local, remote = [], [] 106 | for a in tree.find_all("a"): 107 | href = a.get("href", None) 108 | if href is None: 109 | continue 110 | if "://" not in href and finder.is_feed_url(href): 111 | local.append(href) 112 | if finder.is_feedlike_url(href): 113 | remote.append(href) 114 | 115 | # Check the local URLs. 116 | local = [urlparse.urljoin(url, l) for l in local] 117 | urls += list(filter(finder.is_feed, local)) 118 | logging.info("Found {0} local links to feeds.".format(len(urls))) 119 | if len(urls) and not check_all: 120 | return sort_urls(urls) 121 | 122 | # Check the remote URLs. 123 | remote = [urlparse.urljoin(url, l) for l in remote] 124 | urls += list(filter(finder.is_feed, remote)) 125 | logging.info("Found {0} remote links to feeds.".format(len(urls))) 126 | if len(urls) and not check_all: 127 | return sort_urls(urls) 128 | 129 | # Guessing potential URLs. 
130 | fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml", 131 | "index.rss"] 132 | urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f) 133 | for f in fns])) 134 | return sort_urls(urls) 135 | 136 | 137 | def url_feed_prob(url): 138 | if "comments" in url: 139 | return -2 140 | if "georss" in url: 141 | return -1 142 | kw = ["atom", "rss", "rdf", ".xml", "feed"] 143 | for p, t in zip(range(len(kw), 0, -1), kw): 144 | if t in url: 145 | return p 146 | return 0 147 | 148 | 149 | def sort_urls(feeds): 150 | return sorted(list(set(feeds)), key=url_feed_prob, reverse=True) 151 | 152 | 153 | if __name__ == "__main__": 154 | print(find_feeds("www.preposterousuniverse.com/blog/", timeout = 1)) 155 | print(find_feeds("www.preposterousuniverse.com/blog/")) 156 | print(find_feeds("http://xkcd.com")) 157 | print(find_feeds("dan.iel.fm/atom.xml")) 158 | print(find_feeds("dan.iel.fm", check_all=True)) 159 | print(find_feeds("kapadia.github.io")) 160 | print(find_feeds("blog.jonathansick.ca")) 161 | print(find_feeds("asdasd")) 162 | 163 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | if sys.argv[-1] == "publish": 5 | os.system("python setup.py sdist upload") 6 | sys.exit() 7 | 8 | # Hackishly inject a constant into builtins to enable importing of the 9 | # package before the library is built. 
10 | if sys.version_info[0] < 3: 11 | import __builtin__ as builtins 12 | else: 13 | import builtins 14 | builtins.__FEEDFINDER2_SETUP__ = True 15 | import feedfinder2 16 | from setuptools import setup 17 | 18 | setup( 19 | name="feedfinder2", 20 | version=feedfinder2.__version__, 21 | url="https://github.com/dfm/feedfinder2", 22 | license="MIT", 23 | author="Dan Foreman-Mackey", 24 | author_email="foreman.mackey@gmail.com", 25 | install_requires=[ 26 | "six", 27 | "requests", 28 | "beautifulsoup4", 29 | ], 30 | description="Find the feed URLs for a website.", 31 | long_description=open("README.rst").read(), 32 | py_modules=["feedfinder2"], 33 | classifiers=[ 34 | "Programming Language :: Python", 35 | "Development Status :: 4 - Beta", 36 | "Natural Language :: English", 37 | "Environment :: Web Environment", 38 | "Intended Audience :: Developers", 39 | "License :: OSI Approved :: MIT License", 40 | "Operating System :: OS Independent", 41 | ], 42 | ) 43 | --------------------------------------------------------------------------------