├── .gitignore
├── LICENSE
├── README.rst
├── feedfinder2.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .DS_Store
3 | dist
4 | feedfinder2.egg-info
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Daniel Foreman-Mackey
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Feedfinder2
2 | ===========
3 |
4 | This is a Python library for finding links to feeds on a website. It is
5 | based on `feedfinder <http://www.aaronsw.com/2002/feedfinder/>`_ —
6 | originally written by
7 | `Mark Pilgrim <https://en.wikipedia.org/wiki/Mark_Pilgrim>`_ and
8 | subsequently maintained by
9 | `Aaron Swartz <https://en.wikipedia.org/wiki/Aaron_Swartz>`_ until his untimely death.
10 |
11 | Usage
12 | -----
13 |
14 | Feedfinder2 offers a single public function: ``find_feeds``. You would use it
15 | as follows:
16 |
17 | ::
18 |
19 | from feedfinder2 import find_feeds
20 | feeds = find_feeds("xkcd.com")
21 |
22 | Now, ``feeds`` is the list: ``['http://xkcd.com/atom.xml',
23 | 'http://xkcd.com/rss.xml']``. There is some attempt made to rank feeds from
24 | best candidate to worst but... well... you never know.
25 |
26 | License
27 | -------
28 |
29 | Feedfinder2 is licensed under the MIT license (see LICENSE).
30 |
--------------------------------------------------------------------------------
/feedfinder2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import print_function
5 |
6 | __version__ = "0.0.4"
7 |
# setup.py injects __FEEDFINDER2_SETUP__ into builtins before importing this
# module, so that it can read __version__ without the third-party
# dependencies below being installed yet.
try:
    __FEEDFINDER2_SETUP__
except NameError:
    # Normal import: the flag was not injected, so load the full module.
    __FEEDFINDER2_SETUP__ = False

if not __FEEDFINDER2_SETUP__:
    # Public API: only find_feeds is meant for external use.
    __all__ = ["find_feeds"]

    import logging
    import requests
    from bs4 import BeautifulSoup
    from six.moves.urllib import parse as urlparse
20 |
21 |
def coerce_url(url):
    """Normalize a user-supplied URL into an absolute http(s) URL.

    - Strips surrounding whitespace.
    - Rewrites the legacy "feed://" scheme to "http://".
    - Leaves explicit "http://" / "https://" URLs unchanged.
    - Prefixes bare hostnames/paths (e.g. "xkcd.com") with "http://".

    :param url: the raw URL string to normalize.
    :returns: the normalized URL string.
    """
    url = url.strip()
    if url.startswith("feed://"):
        # "feed://" is an alias for an HTTP-served feed.
        return "http://{0}".format(url[7:])
    # Idiomatic: startswith accepts a tuple instead of a manual loop.
    if url.startswith(("http://", "https://")):
        return url
    return "http://{0}".format(url)
30 |
31 |
32 | class FeedFinder(object):
33 |
34 | def __init__(self, user_agent=None, timeout=None):
35 | if user_agent is None:
36 | user_agent = "feedfinder2/{0}".format(__version__)
37 | self.user_agent = user_agent
38 | self.timeout = timeout
39 |
40 | def get_feed(self, url):
41 | try:
42 | r = requests.get(url, headers={"User-Agent": self.user_agent}, timeout=self.timeout)
43 | except Exception as e:
44 | logging.warning("Error while getting '{0}'".format(url))
45 | logging.warning("{0}".format(e))
46 | return None
47 | return r.text
48 |
49 | def is_feed_data(self, text):
50 | data = text.lower()
51 | if data.count(" tags.
86 | logging.info("Looking for tags.")
87 | tree = BeautifulSoup(text, "html.parser")
88 | links = []
89 | for link in tree.find_all("link"):
90 | if link.get("type") in ["application/rss+xml",
91 | "text/xml",
92 | "application/atom+xml",
93 | "application/x.atom+xml",
94 | "application/x-atom+xml"]:
95 | links.append(urlparse.urljoin(url, link.get("href", "")))
96 |
97 | # Check the detected links.
98 | urls = list(filter(finder.is_feed, links))
99 | logging.info("Found {0} feed tags.".format(len(urls)))
100 | if len(urls) and not check_all:
101 | return sort_urls(urls)
102 |
103 | # Look for tags.
104 | logging.info("Looking for tags.")
105 | local, remote = [], []
106 | for a in tree.find_all("a"):
107 | href = a.get("href", None)
108 | if href is None:
109 | continue
110 | if "://" not in href and finder.is_feed_url(href):
111 | local.append(href)
112 | if finder.is_feedlike_url(href):
113 | remote.append(href)
114 |
115 | # Check the local URLs.
116 | local = [urlparse.urljoin(url, l) for l in local]
117 | urls += list(filter(finder.is_feed, local))
118 | logging.info("Found {0} local links to feeds.".format(len(urls)))
119 | if len(urls) and not check_all:
120 | return sort_urls(urls)
121 |
122 | # Check the remote URLs.
123 | remote = [urlparse.urljoin(url, l) for l in remote]
124 | urls += list(filter(finder.is_feed, remote))
125 | logging.info("Found {0} remote links to feeds.".format(len(urls)))
126 | if len(urls) and not check_all:
127 | return sort_urls(urls)
128 |
129 | # Guessing potential URLs.
130 | fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
131 | "index.rss"]
132 | urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
133 | for f in fns]))
134 | return sort_urls(urls)
135 |
136 |
def url_feed_prob(url):
    """Score how likely *url* is the site's primary feed; higher is better.

    Comment feeds and GeoRSS feeds rank below everything else; otherwise
    earlier keywords in the list score higher (len(keywords) down to 1),
    and URLs with no feed-like keyword score 0.
    """
    if "comments" in url:
        return -2
    if "georss" in url:
        return -1
    keywords = ["atom", "rss", "rdf", ".xml", "feed"]
    for rank, keyword in enumerate(keywords):
        if keyword in url:
            return len(keywords) - rank
    return 0
147 |
148 |
def sort_urls(feeds):
    """Deduplicate feed URLs and order them best candidate first.

    :param feeds: an iterable of feed URL strings (may contain duplicates).
    :returns: a list of unique URLs sorted by descending url_feed_prob score.
    """
    # sorted() accepts any iterable; wrapping the set in list() was redundant.
    return sorted(set(feeds), key=url_feed_prob, reverse=True)
151 |
152 |
if __name__ == "__main__":
    # Manual smoke test: print discovered feeds for a few live sites.
    # Requires network access; the last call exercises the failure path
    # (an unresolvable hostname should yield an empty list).
    print(find_feeds("www.preposterousuniverse.com/blog/", timeout=1))
    print(find_feeds("www.preposterousuniverse.com/blog/"))
    print(find_feeds("http://xkcd.com"))
    print(find_feeds("dan.iel.fm/atom.xml"))
    print(find_feeds("dan.iel.fm", check_all=True))
    print(find_feeds("kapadia.github.io"))
    print(find_feeds("blog.jonathansick.ca"))
    print(find_feeds("asdasd"))
162 |
163 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Packaging script for feedfinder2.
import os
import sys

# Release shortcut: "python setup.py publish" builds an sdist and uploads it.
if sys.argv[-1] == "publish":
    os.system("python setup.py sdist upload")
    sys.exit()

# Hackishly inject a constant into builtins to enable importing of the
# package before the library is built.
# NOTE: this must run BEFORE "import feedfinder2" below — the module checks
# the flag to skip importing its (possibly uninstalled) dependencies.
if sys.version_info[0] < 3:
    import __builtin__ as builtins
else:
    import builtins
builtins.__FEEDFINDER2_SETUP__ = True
import feedfinder2
from setuptools import setup

setup(
    name="feedfinder2",
    version=feedfinder2.__version__,
    url="https://github.com/dfm/feedfinder2",
    license="MIT",
    author="Dan Foreman-Mackey",
    author_email="foreman.mackey@gmail.com",
    install_requires=[
        "six",
        "requests",
        "beautifulsoup4",
    ],
    description="Find the feed URLs for a website.",
    long_description=open("README.rst").read(),
    py_modules=["feedfinder2"],
    classifiers=[
        "Programming Language :: Python",
        "Development Status :: 4 - Beta",
        "Natural Language :: English",
        "Environment :: Web Environment",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
43 |
--------------------------------------------------------------------------------