53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/tests/atom/test_rfc_minimal.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from dateutil.tz import tzutc
4 |
5 | from atoma.atom import (
6 | AtomFeed, AtomEntry, AtomTextConstruct, AtomTextType, AtomPerson, AtomLink,
7 | parse_atom_file
8 | )
9 |
10 |
def test_rfc_minimal():
    """Parse tests/atom/rfc-minimal.xml and compare the whole object tree."""
    def text(value):
        # Plain-text construct without a language.
        return AtomTextConstruct(text_type=AtomTextType.text, lang=None,
                                 value=value)

    def link(href):
        # Link that only carries an href; every other attribute is unset.
        return AtomLink(href=href, rel=None, type_=None, hreflang=None,
                        title=None, length=None)

    # Feed and entry are expected to share the same "updated" timestamp.
    stamp = datetime.datetime(2003, 12, 13, 18, 30, 2, tzinfo=tzutc())

    entry = AtomEntry(
        title=text('Atom-Powered Robots Run Amok'),
        id_='urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a',
        updated=stamp,
        authors=[AtomPerson(name='John Doe', uri=None, email=None)],
        contributors=[],
        links=[link('http://example.org/2003/12/13/atom03')],
        categories=[],
        published=None,
        rights=None,
        summary=text('Some text.'),
        content=None,
        source=None
    )
    feed = AtomFeed(
        title=text('Example Feed'),
        id_='urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6',
        updated=stamp,
        authors=[AtomPerson(name='John Doe', uri=None, email=None)],
        contributors=[],
        links=[link('http://example.org/')],
        categories=[],
        generator=None,
        subtitle=None,
        rights=None,
        icon=None,
        logo=None,
        entries=[entry]
    )
    assert parse_atom_file('tests/atom/rfc-minimal.xml') == feed
57 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from codecs import open
3 | from os import path
4 |
here = path.abspath(path.dirname(__file__))


def _read_text(*parts):
    """Return the UTF-8 decoded content of the file at ``here``/*parts*."""
    with open(path.join(here, *parts), encoding='utf-8') as handle:
        return handle.read()


# The long description is the README immediately followed by the LICENSE text.
long_description = _read_text('README.rst') + _read_text('LICENSE')

# VERSION is obtained by executing atoma/const.py in a scratch namespace.
const_namespace = {}
exec(_read_text('atoma', 'const.py'), const_namespace)
version = const_namespace['VERSION']

setup(
    name='atoma',
    version=version,
    description='Atom, RSS and JSON feed parser for Python 3',
    long_description=long_description,
    long_description_content_type='text/x-rst',
    url='https://github.com/NicolasLM/atoma',
    author='Nicolas Le Manchet',
    author_email='nicolas@lemanchet.fr',
    license='MIT',
    python_requires=">=3.7",
    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3 :: Only',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Topic :: Text Processing :: Markup :: XML'
    ],
    keywords='atom rss json feed feeds syndication parser RFC4287',

    packages=find_packages(include=('atoma', 'atoma.*')),
    install_requires=[
        'defusedxml',
        'attrs',
        'python-dateutil'
    ],

    extras_require={
        'tests': [
            'pytest',
            'pytest-cov',
            'python-coveralls',
            'pycodestyle'
        ]
    }
)
63 |
--------------------------------------------------------------------------------
/tests/atom/test_atom.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from atoma.atom import (
4 | AtomFeed, parse_atom_file, parse_atom_bytes, FeedParseError
5 | )
6 | from atoma import FeedXMLError
7 |
# Byte string handed to parse_atom_bytes() by test_read_bytes.
data = b"""\


Example Feed
foo
2003-12-13T18:30:02Z

"""
16 |
17 |
def test_read_bytes():
    """parse_atom_bytes() turns the module-level ``data`` into an AtomFeed."""
    feed = parse_atom_bytes(data)
    assert isinstance(feed, AtomFeed)
20 |
21 |
def test_broken_missing_id():
    """The broken-missing-id.xml fixture is rejected with FeedParseError."""
    fixture = 'tests/atom/broken-missing-id.xml'
    with pytest.raises(FeedParseError):
        parse_atom_file(fixture)
25 |
26 |
def test_broken_missing_author():
    """broken-xkcd.xml parses with empty author lists on feed and entry."""
    # The RFC mandates that at least one of feed or entries must have an author
    # but this is rarely the case in practice.
    feed = parse_atom_file('tests/atom/broken-xkcd.xml')
    assert feed.authors == []
    assert feed.entries[0].authors == []
33 |
34 |
def test_broken_missing_updated():
    """broken-missing-updated.xml parses with ``updated`` set to None."""
    # The RFC mandates that feed and entries have an updated date
    # but this is rarely the case in practice.
    feed = parse_atom_file('tests/atom/broken-missing-updated.xml')
    assert feed.updated is None
    assert feed.entries[0].updated is None
41 |
42 |
def test_broken_empty_fields():
    """Empty optional tags parse as missing values; an empty id is an error."""
    # As a general rule, XML tags should not be empty. In practice optional
    # fields are sometimes present in the feed but with an empty tag
    feed = parse_atom_file('tests/atom/broken-empty-summary.xml')
    assert feed.entries[0].summary is None

    feed = parse_atom_file('tests/atom/broken-empty-title.xml')
    assert feed.title is None

    feed = parse_atom_file('tests/atom/broken-empty-updated.xml')
    assert feed.updated is None

    # Both author fixtures must yield no authors on the feed and first entry.
    for fixture in ('tests/atom/broken-empty-author.xml',
                    'tests/atom/broken-missing-author-name.xml'):
        feed = parse_atom_file(fixture)
        assert feed.authors == []
        assert feed.entries[0].authors == []

    # Require fields (id...) that have empty tags should throw an error
    with pytest.raises(FeedParseError):
        parse_atom_file('tests/atom/broken-empty-id.xml')
66 |
67 |
def test_broken_not_xml():
    """Bytes that are not XML make parse_atom_bytes() raise FeedXMLError."""
    payload = b'This is not an XML document'
    with pytest.raises(FeedXMLError):
        parse_atom_bytes(payload)
71 |
--------------------------------------------------------------------------------
/tests/json_feed/jsonfeed.org.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "https://jsonfeed.org/version/1",
3 | "user_comment": "This feed allows you to read the posts from this site in any feed reader that supports the JSON Feed format. To add this feed to your reader, copy the following URL — https://jsonfeed.org/feed.json — and add it your reader.",
4 | "title": "JSON Feed",
5 | "description": "JSON Feed is a pragmatic syndication format for blogs, microblogs, and other time-based content.",
6 | "home_page_url": "https://jsonfeed.org/",
7 | "feed_url": "https://jsonfeed.org/feed.json",
8 | "author": {
9 | "name": "Brent Simmons and Manton Reece",
10 | "url": "https://jsonfeed.org/"
11 | },
12 | "items": [
13 | {
14 | "title": "Announcing JSON Feed",
15 | "date_published": "2017-05-17T08:02:12-07:00",
16 | "id": "https://jsonfeed.org/2017/05/17/announcing_json_feed",
17 | "url": "https://jsonfeed.org/2017/05/17/announcing_json_feed",
18 | "content_html": "
We — Manton Reece and Brent Simmons — have noticed that JSON has become the developers’ choice for APIs, and that developers will often go out of their way to avoid XML. JSON is simpler to read and write, and it’s less prone to bugs.
\n\n
So we developed JSON Feed, a format similar to RSS and Atom but in JSON. It reflects the lessons learned from our years of work reading and publishing feeds.
\n\n
See the spec. It’s at version 1, which may be the only version ever needed. If future versions are needed, version 1 feeds will still be valid feeds.
\n\n
Notes
\n\n
We have a WordPress plugin and, coming soon, a JSON Feed Parser for Swift. As more code is written, by us and others, we’ll update the code page.
This website — the Markdown files and supporting resources — is up on GitHub, and you’re welcome to comment there.
\n\n
This website is also a blog, and you can subscribe to the RSS feed or the JSON feed (if your reader supports it).
\n\n
We worked with a number of people on this over the course of several months. We list them, and thank them, at the bottom of the spec. But — most importantly — Craig Hockenberry spent a little time making it look pretty. :)
\n"
19 | }
20 | ]
21 | }
22 |
--------------------------------------------------------------------------------
/tests/rss/specification.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Liftoff News
5 | http://liftoff.msfc.nasa.gov/
6 | Liftoff to Space Exploration.
7 |
8 | en-us
9 | Tue, 10 Jun 2003 04:00:00 GMT
10 | Tue, 10 Jun 2003 09:41:01 GMT
11 | http://blogs.law.harvard.edu/tech/rss
12 | Weblog Editor 2.0
13 | editor@example.com
14 | webmaster@example.com
15 |
16 |
17 | Star City
18 | http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp
19 |
20 | How do Americans get ready to work with Russians aboard
21 | the International Space Station? They take a crash course in
22 | culture, language and protocol at Russia's <a
23 | href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.
24 |
25 | Tue, 03 Jun 2003 09:39:21 GMT
26 | http://liftoff.msfc.nasa.gov/2003/06/03.html#item573
27 |
28 |
29 |
30 | Sky watchers in Europe, Asia, and parts of Alaska and
31 | Canada will experience a <a
32 | href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial
33 | eclipse of the Sun</a> on Saturday, May 31st.
34 |
35 | Fri, 30 May 2003 11:06:42 GMT
36 | http://liftoff.msfc.nasa.gov/2003/05/30.html#item572
37 |
38 |
39 |
40 | The Engine That Does More
41 | http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp
42 | Before man travels to Mars, NASA hopes to design new
43 | engines that will let us fly through the Solar System more
44 | quickly.
45 | The proposed VASIMR engine would do that.
46 |
47 | Tue, 27 May 2003 08:37:32 GMT
48 | http://liftoff.msfc.nasa.gov/2003/05/27.html#item571
49 |
50 |
51 |
52 | Astronauts' Dirty Laundry
53 | http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp
54 | Compared to earlier spacecraft, the International Space
55 | Station has many luxuries, but laundry facilities are not one of
56 | them. Instead, astronauts have other options.
57 |
58 | Tue, 20 May 2003 08:56:02 GMT
59 | http://liftoff.msfc.nasa.gov/2003/05/20.html#item570
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/tests/opml/test_opml.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from dateutil.tz import tzutc
4 | import pytest
5 |
6 | from atoma.opml import (
7 | parse_opml_file, parse_opml_bytes, get_feed_list, OPML, OPMLOutline
8 | )
9 | from atoma import FeedXMLError
10 |
# Byte string handed to parse_opml_bytes() by test_read_bytes.
data = b"""\



states.opml





"""
22 |
23 |
def test_read_bytes():
    """parse_opml_bytes() turns the module-level ``data`` into an OPML."""
    document = parse_opml_bytes(data)
    assert isinstance(document, OPML)
26 |
27 |
def test_nested_subscription_list():
    """get_feed_list() returns the five feed URLs of the nested fixture."""
    expected_urls = [
        'http://{}.com/rss.xml'.format(number) for number in range(1, 6)
    ]
    document = parse_opml_file('tests/opml/nested-subscription-list.xml')
    assert get_feed_list(document) == expected_urls
37 |
38 |
def test_missing_outline_title():
    """Feed URLs are still extracted from the broken-no-title.xml fixture."""
    expected_urls = [
        'https://xkcd.com/rss.xml',
        'http://antirez.com/rss',
        'https://what-if.xkcd.com/feed.atom',
    ]
    document = parse_opml_file('tests/opml/broken-no-title.xml')
    assert get_feed_list(document) == expected_urls
46 |
47 |
def test_subscription_list():
    """Parse tests/opml/subscription-list.xml and compare the whole tree."""
    def rss2_outline(name, xml_url, description, html_url):
        # Both expected outlines use the same string for text and title,
        # share type/language/version and have no nested outlines.
        return OPMLOutline(
            text=name,
            type='rss',
            xml_url=xml_url,
            description=description,
            html_url=html_url,
            language='unknown',
            title=name,
            version='RSS2',
            outlines=[]
        )

    cnet = rss2_outline(
        'CNET News.com',
        'http://news.com.com/2547-1_3-0-5.xml',
        'Tech news and business reports by CNET News.com.',
        'http://news.com.com/',
    )
    washington_post = rss2_outline(
        'washingtonpost.com - Politics',
        'http://www.washingtonpost.com/wp-srv/'
        'politics/rssheadlines.xml',
        'Politics',
        'http://www.washingtonpost.com/wp-dyn/'
        'politics?nav=rss_politics',
    )
    expected = OPML(
        title='mySubscriptions.opml',
        owner_name='Dave Winer',
        owner_email='dave@scripting.com',
        date_created=datetime(2005, 6, 18, 12, 11, 52, tzinfo=tzutc()),
        date_modified=datetime(2005, 8, 2, 21, 42, 48, tzinfo=tzutc()),
        expansion_state=None,
        vertical_scroll_state=1,
        window_top=61,
        window_left=304,
        window_bottom=562,
        window_right=842,
        outlines=[cnet, washington_post]
    )
    assert parse_opml_file('tests/opml/subscription-list.xml') == expected
89 |
90 |
def test_broken_not_xml():
    """Bytes that are not XML make parse_opml_bytes() raise FeedXMLError."""
    payload = b'This is not an XML document'
    with pytest.raises(FeedXMLError):
        parse_opml_bytes(payload)
94 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Atoma
2 | =====
3 |
4 | .. image:: https://github.com/NicolasLM/atoma/actions/workflows/test.yml/badge.svg
5 | :target: https://github.com/NicolasLM/atoma/actions/workflows/test.yml
6 | .. image:: https://codecov.io/gh/NicolasLM/atoma/branch/main/graph/badge.svg
7 | :target: https://codecov.io/gh/NicolasLM/atoma
8 |
9 |
10 |
11 | Atom, RSS and JSON feed parser for Python 3.
12 |
13 | Quickstart
14 | ----------
15 |
16 | Install Atoma with pip::
17 |
18 | pip install atoma
19 |
20 | Load and parse an Atom XML file:
21 |
22 | .. code:: python
23 |
24 | >>> import atoma
25 | >>> feed = atoma.parse_atom_file('atom-feed.xml')
26 | >>> feed.subtitle.value
27 | 'The blog relating the daily life of web agency developers'
28 | >>> len(feed.entries)
29 | 5
30 |
31 | A small change is needed if you are dealing with an RSS XML file:
32 |
33 | .. code:: python
34 |
35 | >>> feed = atoma.parse_rss_file('rss-feed.xml')
36 |
37 | Parsing feeds from the Internet is easy as well:
38 |
39 | .. code:: python
40 |
41 | >>> import atoma, requests
42 | >>> response = requests.get('http://lucumr.pocoo.org/feed.atom')
43 | >>> feed = atoma.parse_atom_bytes(response.content)
44 | >>> feed.title.value
45 | "Armin Ronacher's Thoughts and Writings"
46 |
47 | Features
48 | --------
49 |
50 | * RSS 2.0 - `RSS 2.0 Specification `_
51 | * Atom Syndication Format v1 - `RFC4287 `_
52 | * JSON Feed v1 - `JSON Feed specification `_
53 | * OPML 2.0, to share lists of feeds - `OPML 2.0 `_
54 | * Typed: feeds decomposed into meaningful Python objects
55 | * Secure: uses defusedxml to load untrusted feeds
56 | * Compatible with Python 3.7+
57 |
58 | Security warning
59 | ----------------
60 |
61 | If you use this library to display content from feeds in a web page, you NEED
62 | to clean the HTML contained in the feeds to prevent `Cross-site scripting (XSS)
63 | `_. The `bleach
64 | `_ library is recommended for cleaning feeds.
65 |
66 | Useful Resources
67 | ----------------
68 |
69 | To use this library a basic understanding of feeds is required. For Atom, the
70 | `Introduction to Atom `_ is a must
71 | read. The `RFC 4287 `_ can help lift some
72 | ambiguities. Finally the `feed validator `_ is
73 | great to test hand-crafted feeds.
74 |
75 | For RSS, the `RSS specification `_ and
76 | `rssboard.org `_ have a ton of information and
77 | examples.
78 |
79 | For OPML, the `OPML specification
80 | `_ has a paragraph dedicated
81 | to its usage for syndication.
82 |
83 | Non-implemented Features
84 | ------------------------
85 |
86 | Some seldom used features are not implemented:
87 |
88 | * XML signature and encryption
89 | * Some Atom and RSS extensions
90 | * Atom content other than ``text``, ``html`` and ``xhtml``
91 |
92 | License
93 | -------
94 |
95 | MIT
96 |
--------------------------------------------------------------------------------
/atoma/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from dateutil.tz import gettz
3 | from xml.etree.ElementTree import Element
4 | from typing import Optional
5 |
6 | import dateutil.parser
7 | from defusedxml.ElementTree import parse as defused_xml_parse, ParseError
8 |
9 | from .exceptions import FeedXMLError, FeedParseError
10 |
# Prefix-to-URI map passed as ``namespaces`` to Element.find() in get_child().
ns = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'feed': 'http://www.w3.org/2005/Atom'
}

# Common timezone abbreviations defined in RFC 822, used by RSS
# https://tools.ietf.org/html/rfc822#section-5.1
# Integer values are UTC offsets in seconds (EST: -18000 s = -5 h). The
# mapping is passed as ``tzinfos`` to dateutil.parser.parse() in
# try_parse_date().
tzinfos = {
    'UT': gettz('GMT'),
    'EST': -18000,
    'EDT': -14400,
    'CST': -21600,
    'CDT': -18000,
    'MST': -25200,
    'MDT': -21600,
    'PST': -28800,
    'PDT': -25200
}
29 |
30 |
def parse_xml(xml_content):
    """Parse *xml_content* with defusedxml and return the resulting tree.

    :param xml_content: source handed unchanged to defusedxml's ``parse``;
        callers in this package pass a file name or a ``BytesIO`` object.
    :raises FeedXMLError: if the parser reports a ``ParseError``. The
        original error is attached as ``__cause__`` so the parser's
        diagnostic (line/column) is not lost.
    """
    try:
        return defused_xml_parse(xml_content)
    except ParseError as err:
        # Explicit chaining marks the ParseError as the direct cause instead
        # of an incidental "during handling of the above exception" context.
        raise FeedXMLError('Not a valid XML document') from err
36 |
37 |
def get_child(element: Element, name,
              optional: bool=True) -> Optional[Element]:
    """Return the first child of *element* matching *name*.

    The lookup uses ``Element.find`` with the module-level ``ns`` namespace
    map. A missing child yields ``None`` when *optional* is true and raises
    ``FeedParseError`` otherwise.
    """
    found = element.find(name, namespaces=ns)

    # Either the child exists, or its absence is tolerated (found is None).
    if found is not None or optional:
        return found

    raise FeedParseError(
        'Could not parse feed: "{}" does not have a "{}"'
        .format(element.tag, name)
    )
52 |
53 |
def get_text(element: Element, name, optional: bool=True) -> Optional[str]:
    """Return the stripped text of the child *name* of *element*.

    ``None`` is returned when the child is missing or has no text and
    *optional* is true. When *optional* is false a missing child makes
    ``get_child`` raise ``FeedParseError`` and a child without text raises
    ``FeedParseError`` here.
    """
    node = get_child(element, name, optional)
    if node is None:
        return None

    content = node.text
    if content is not None:
        return content.strip()

    # The tag is present but carries no text.
    if optional:
        return None

    raise FeedParseError(
        'Could not parse feed: "{}" text is required but is empty'
        .format(name)
    )
69 |
70 |
def get_int(element: Element, name, optional: bool=True) -> Optional[int]:
    """Return the text of child *name* converted with ``int()``.

    ``None`` is returned whenever ``get_text`` returns ``None``. Text that
    ``int()`` cannot convert raises ``ValueError``, which is not caught here.
    """
    raw = get_text(element, name, optional)
    return None if raw is None else int(raw)
77 |
78 |
def get_datetime(element: Element, name,
                 optional: bool=True) -> Optional[datetime]:
    """Return the text of child *name* parsed by ``try_parse_date``.

    ``None`` is returned when ``get_text`` yields no text or when the text
    cannot be parsed as a date.
    """
    raw = get_text(element, name, optional)
    return None if raw is None else try_parse_date(raw)
86 |
87 |
def try_parse_date(date_str: str) -> Optional[datetime]:
    """Parse *date_str* into a timezone-aware datetime, or return ``None``.

    Parsing is delegated to ``dateutil.parser.parse`` in fuzzy mode with the
    module-level ``tzinfos`` abbreviations. ``ValueError`` and
    ``OverflowError`` from the parser are turned into ``None``.
    """
    try:
        parsed = dateutil.parser.parse(date_str, fuzzy=True, tzinfos=tzinfos)
    except (ValueError, OverflowError):
        return None

    if parsed.tzinfo is not None:
        return parsed

    # TZ naive datetime, make it a TZ aware datetime by assuming it
    # contains UTC time
    return parsed.replace(tzinfo=timezone.utc)
100 |
101 |
def try_parse_length(length) -> Optional[int]:
    """Convert *length* to a non-negative int, or return ``None``.

    ``None`` is returned when ``int()`` rejects the value with ``TypeError``
    or ``ValueError``, and when the converted number is negative.
    """
    try:
        parsed = int(length)
    except (TypeError, ValueError):
        return None

    return parsed if parsed >= 0 else None
112 |
--------------------------------------------------------------------------------
/tests/atom/test_rfc_more_extensive.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from dateutil.tz import tzutc, tzoffset
4 |
5 | from atoma.atom import (
6 | AtomFeed, AtomEntry, AtomTextConstruct, AtomTextType, AtomPerson, AtomLink,
7 | AtomGenerator, parse_atom_file
8 | )
9 |
10 |
def test_rfc_more_extensive():
    """Parse tests/atom/rfc-more-extensive.xml and compare the whole tree."""
    def construct(kind, value):
        # Text construct of the given type without a language.
        return AtomTextConstruct(text_type=kind, lang=None, value=value)

    def person(name, uri=None, email=None):
        return AtomPerson(name=name, uri=uri, email=email)

    def link(href, rel, type_, hreflang=None, length=None):
        # No expected link carries a title.
        return AtomLink(href=href, rel=rel, type_=type_, hreflang=hreflang,
                        title=None, length=length)

    # Feed and entry are expected to share the same "updated" timestamp.
    updated = datetime.datetime(2005, 7, 31, 12, 29, 29, tzinfo=tzutc())
    published = datetime.datetime(2003, 12, 13, 8, 29, 29,
                                  tzinfo=tzoffset(None, -14400))

    entry = AtomEntry(
        title=construct(AtomTextType.text, 'Atom draft-07 snapshot'),
        id_='tag:example.org,2003:3.2397',
        updated=updated,
        authors=[
            person('Mark Pilgrim', uri='http://example.org/',
                   email='f8dy@example.com')
        ],
        contributors=[person('Sam Ruby'), person('Joe Gregorio')],
        links=[
            link('http://example.org/2005/04/02/atom', 'alternate',
                 'text/html'),
            link('http://example.org/audio/ph34r_my_podcast.mp3',
                 'enclosure', 'audio/mpeg', length=1337)
        ],
        categories=[],
        published=published,
        rights=None,
        summary=None,
        content=construct(AtomTextType.xhtml, ''),
        source=None
    )
    feed = AtomFeed(
        title=construct(AtomTextType.text, 'dive into mark'),
        id_='tag:example.org,2003:3',
        updated=updated,
        authors=[],
        contributors=[],
        links=[
            link('http://example.org/', 'alternate', 'text/html',
                 hreflang='en'),
            link('http://example.org/feed.atom', 'self',
                 'application/atom+xml')
        ],
        categories=[],
        generator=AtomGenerator(name='Example Toolkit',
                                uri='http://www.example.com/', version='1.0'),
        subtitle=construct(AtomTextType.html,
                           'A lot of effort\n '
                           'went into making this effortless'),
        rights=construct(AtomTextType.text,
                         'Copyright (c) 2003, Mark Pilgrim'),
        icon=None,
        logo=None,
        entries=[entry]
    )
    assert (
        parse_atom_file('tests/atom/rfc-more-extensive.xml') == feed
    )
74 |
--------------------------------------------------------------------------------
/tests/rss/test_rss.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from atoma.rss import (
4 | parse_rss_file, parse_rss_bytes, RSSChannel, FeedParseError
5 | )
6 | from atoma import FeedXMLError
7 |
# Byte string handed to parse_rss_bytes() by test_read_bytes.
data = b"""\



Foo
http://foo.bar
Foo bar.

Baz item



"""

# Expected description of the third item of tests/rss/encoding.xml; compared
# against the parsed value in test_encoding.
cdata_description = """\
I'm headed for France. I wasn't gonna go this year, but then last week \
Valley Girl came out and I \
said to myself, Joe Bob, you gotta get out of the country for a while."""
26 |
27 |
def test_read_bytes():
    """parse_rss_bytes() turns the module-level ``data`` into an RSSChannel."""
    channel = parse_rss_bytes(data)
    assert isinstance(channel, RSSChannel)
30 |
31 |
def test_broken_missing_title():
    """broken-missing-title.xml parses with the channel title set to None."""
    # RSS feed title is mandatory by specs, but some feeds in the wild
    # do not provide it
    channel = parse_rss_file('tests/rss/broken-missing-title.xml')
    assert channel.title is None
37 |
38 |
def test_broken_missing_description():
    """broken-missing-description.xml parses with description set to None."""
    # RSS feed description is mandatory by specs, but some feeds in the wild
    # do not provide it
    channel = parse_rss_file('tests/rss/broken-missing-description.xml')
    assert channel.description is None
44 |
45 |
def test_broken_missing_link():
    """Check link and guid of the channel and first three items."""
    # RSS feed link is mandatory by specs, but some feeds in the wild
    # do not provide it
    channel = parse_rss_file('tests/rss/broken-missing-link.xml')
    assert channel.link is None

    first = channel.items[0]
    assert first.link is None
    assert first.guid is None

    second = channel.items[1]
    assert second.link == 'http://link1'
    assert second.guid == 'http://link1'

    third = channel.items[2]
    assert third.link == 'http://link2'
    assert third.guid == '646326554'
57 |
58 |
def test_broken_missing_source_url():
    """The first item's source keeps its title while its url is None."""
    # The URL of a source is mandatory by specs, but some feeds in the wild
    # do not provide it
    channel = parse_rss_file('tests/rss/broken-missing-source-url.xml')
    source = channel.items[0].source
    assert source.title == 'New York Times'
    assert source.url is None
65 |
66 |
def test_broken_enclosure():
    """The first three items keep the enclosure url; length/type are None."""
    # The length and type of an enclosure are mandatory by specs,
    # but some feeds in the wild do not provide them
    channel = parse_rss_file('tests/rss/broken-enclosure.xml')
    for index in range(3):
        enclosure = channel.items[index].enclosures[0]
        assert enclosure.url == 'https://foo.com/test.mp3'
        assert enclosure.length is None
        assert enclosure.type is None
75 |
76 |
def test_broken_version():
    """The broken-version.xml fixture is rejected with FeedParseError."""
    fixture = 'tests/rss/broken-version.xml'
    with pytest.raises(FeedParseError):
        parse_rss_file(fixture)
80 |
81 |
def test_broken_no_channel():
    """The broken-no-channel.xml fixture is rejected with FeedParseError."""
    fixture = 'tests/rss/broken-no-channel.xml'
    with pytest.raises(FeedParseError):
        parse_rss_file(fixture)
85 |
86 |
def test_broken_not_xml():
    """Bytes that are not XML make parse_rss_bytes() raise FeedXMLError."""
    payload = b'This is not an XML document'
    with pytest.raises(FeedXMLError):
        parse_rss_bytes(payload)
90 |
91 |
def test_encoding():
    """Check decoded titles and descriptions of tests/rss/encoding.xml."""
    items = parse_rss_file('tests/rss/encoding.xml').items
    assert items[0].title == 'The & entity'
    assert items[1].title == "Nice what's he weigh?"

    third = items[2]
    assert third.title == "Rïchàrd Plop's ☃"
    assert third.description == cdata_description

    assert items[3].description is None
99 |
--------------------------------------------------------------------------------
/atoma/opml.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from io import BytesIO
3 | from typing import Optional, List
4 | from xml.etree.ElementTree import Element
5 |
6 | import attr
7 |
8 | from .utils import parse_xml, get_text, get_int, get_datetime
9 |
10 |
@attr.s
class OPMLOutline:
    """One ``outline`` element of an OPML document and its nested outlines.

    ``_get_outlines`` fills the fields positionally, so their declaration
    order is part of the constructor contract. Each string field is ``None``
    when the corresponding XML attribute is absent.
    """
    # Value of the ``text`` attribute.
    text: Optional[str] = attr.ib()
    # Value of the ``type`` attribute; get_feed_list() selects 'rss'.
    type: Optional[str] = attr.ib()
    # Value of the ``xmlUrl`` attribute.
    xml_url: Optional[str] = attr.ib()
    # Value of the ``description`` attribute.
    description: Optional[str] = attr.ib()
    # Value of the ``htmlUrl`` attribute.
    html_url: Optional[str] = attr.ib()
    # Value of the ``language`` attribute.
    language: Optional[str] = attr.ib()
    # Value of the ``title`` attribute.
    title: Optional[str] = attr.ib()
    # Value of the ``version`` attribute.
    version: Optional[str] = attr.ib()

    # Direct ``outline`` children of this element, parsed recursively.
    outlines: List['OPMLOutline'] = attr.ib()
23 |
24 |
@attr.s
class OPML:
    """Parsed OPML document: ``head`` values plus the ``body`` outlines.

    ``_parse_opml`` fills the head fields positionally, so their declaration
    order is part of the constructor contract.
    """
    # Text of head/title.
    title: Optional[str] = attr.ib()
    # Text of head/ownerName.
    owner_name: Optional[str] = attr.ib()
    # Text of head/ownerEmail.
    owner_email: Optional[str] = attr.ib()
    # head/dateCreated parsed by get_datetime().
    date_created: Optional[datetime] = attr.ib()
    # head/dateModified parsed by get_datetime().
    date_modified: Optional[datetime] = attr.ib()
    # Text of head/expansionState, kept as a string.
    expansion_state: Optional[str] = attr.ib()

    # head/vertScrollState converted by get_int().
    vertical_scroll_state: Optional[int] = attr.ib()
    # head/windowTop, windowLeft, windowBottom and windowRight converted by
    # get_int().
    window_top: Optional[int] = attr.ib()
    window_left: Optional[int] = attr.ib()
    window_bottom: Optional[int] = attr.ib()
    window_right: Optional[int] = attr.ib()

    # Direct ``outline`` children of ``body``, parsed recursively.
    outlines: List[OPMLOutline] = attr.ib()
41 |
42 |
def _get_outlines(element: Element) -> List[OPMLOutline]:
    """Convert the direct ``outline`` children of *element* recursively.

    Attributes that are absent on an outline become ``None``.
    """
    def build(node: Element) -> OPMLOutline:
        attributes = node.attrib
        return OPMLOutline(
            text=attributes.get('text'),
            type=attributes.get('type'),
            xml_url=attributes.get('xmlUrl'),
            description=attributes.get('description'),
            html_url=attributes.get('htmlUrl'),
            language=attributes.get('language'),
            title=attributes.get('title'),
            version=attributes.get('version'),
            outlines=_get_outlines(node),
        )

    return [build(node) for node in element.findall('outline')]
60 |
61 |
def _parse_opml(root: Element) -> OPML:
    """Build an ``OPML`` object from the root element of a parsed document.

    :param root: root element; its ``head`` and ``body`` children are read.
    :returns: the head values (each ``None`` when its tag is missing or
        empty) together with the outlines found under ``body``.
    :raises FeedParseError: if *root* has no ``head`` or no ``body`` child.
        Without this check such a document failed with an ``AttributeError``
        on ``None`` inside the helper functions.
    """
    # Local import: this module's top-level imports do not include
    # FeedParseError.
    from .exceptions import FeedParseError

    head = root.find('head')
    body = root.find('body')

    # Compare with None explicitly; truth-testing an Element reflects its
    # number of children, not its presence.
    for tag, node in (('head', head), ('body', body)):
        if node is None:
            raise FeedParseError(
                'Could not parse feed: "{}" does not have a "{}"'
                .format(root.tag, tag)
            )

    return OPML(
        get_text(head, 'title'),
        get_text(head, 'ownerName'),
        get_text(head, 'ownerEmail'),
        get_datetime(head, 'dateCreated'),
        get_datetime(head, 'dateModified'),
        get_text(head, 'expansionState'),
        get_int(head, 'vertScrollState'),
        get_int(head, 'windowTop'),
        get_int(head, 'windowLeft'),
        get_int(head, 'windowBottom'),
        get_int(head, 'windowRight'),
        outlines=_get_outlines(body)
    )
80 |
81 |
def parse_opml_file(filename: str) -> OPML:
    """Parse an OPML document from a local XML file."""
    tree = parse_xml(filename)
    return _parse_opml(tree.getroot())
86 |
87 |
def parse_opml_bytes(data: bytes) -> OPML:
    """Parse an OPML document from a byte-string containing XML data."""
    # parse_xml() is given a file-like object wrapping the bytes.
    tree = parse_xml(BytesIO(data))
    return _parse_opml(tree.getroot())
92 |
93 |
def get_feed_list(opml_obj: OPML) -> List[str]:
    """Walk an OPML document to extract the list of feed it contains.

    Outlines are visited depth-first in document order. An outline
    contributes its ``xml_url`` when its type is ``'rss'`` and the URL is
    non-empty.
    """
    def walk(node):
        for child in node.outlines:
            if child.type == 'rss' and child.xml_url:
                yield child.xml_url

            # Only descend into outlines that actually have children.
            if child.outlines:
                yield from walk(child)

    return list(walk(opml_obj))
108 |
--------------------------------------------------------------------------------
/tests/atom/broken-xkcd.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | xkcd.com
4 |
5 | https://xkcd.com/
6 | 2018-02-02T00:00:00Z
7 |
8 | Chicken Pox and Name Statistics
9 |
10 | 2018-02-02T00:00:00Z
11 | https://xkcd.com/1950/
12 | <img
13 | src="https://imgs.xkcd.com/comics/chicken_pox_and_name_statistics.png"
14 | title="People with all six of those names agree that it's weird that
15 | we have teeth, when you think about it for too long. Just about
16 | everyone agrees on that, except—in a still-unexplained statistical
17 | anomaly—people named "Trevor."" alt="People with
18 | all six of those names agree that it's weird that we have teeth,
19 | when you think about it for too long. Just about everyone agrees on
20 | that, except—in a still-unexplained statistical anomaly—people named
21 | "Trevor."" />
22 |
23 |
24 |
25 | Fruit Collider
26 |
27 | 2018-01-31T00:00:00Z
28 | https://xkcd.com/1949/
29 | <img
30 | src="https://imgs.xkcd.com/comics/fruit_collider.png" title="The
31 | most delicious exotic fruit discovered this way is the strawberry
32 | banana. Sadly, it's only stable in puree form, so it's currently
33 | limited to yogurt and smoothies, but they're building a massive
34 | collider in Europe to search for a strawberry banana that can be
35 | eaten whole." alt="The most delicious exotic fruit discovered this
36 | way is the strawberry banana. Sadly, it's only stable in puree form,
37 | so it's currently limited to yogurt and smoothies, but they're
38 | building a massive collider in Europe to search for a strawberry
39 | banana that can be eaten whole." />
40 |
41 |
42 |
43 | Campaign Fundraising Emails
44 |
45 | 2018-01-29T00:00:00Z
46 | https://xkcd.com/1948/
47 | <img
48 | src="https://imgs.xkcd.com/comics/campaign_fundraising_emails.png"
49 | title="The establishment doesn't take us seriously. You know who
50 | else they didn't take seriously? Hitler. I'll be like him, but a
51 | GOOD guy instead of..." alt="The establishment doesn't take us
52 | seriously. You know who else they didn't take seriously? Hitler.
53 | I'll be like him, but a GOOD guy instead of..." />
54 |
55 |
56 |
57 | Night Sky
58 |
59 | 2018-01-26T00:00:00Z
60 | https://xkcd.com/1947/
61 | <img
62 | src="https://imgs.xkcd.com/comics/night_sky.png" title="There's a
63 | mountain lion nearby, but it didn't notice you because it's reading
64 | Facebook." alt="There's a mountain lion nearby, but it didn't notice
65 | you because it's reading Facebook." />
66 |
67 |
68 |
--------------------------------------------------------------------------------
/tests/rss/test_specification.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from dateutil.tz import tzutc
4 |
5 | from atoma.rss import RSSChannel, RSSItem, parse_rss_file
6 |
7 |
8 | def test_specification():
# Builds the expected RSSChannel by hand (four RSSItem objects plus the
# channel-level fields) and asserts that parsing
# 'tests/rss/specification.xml' yields an object equal to it.
# NOTE(review): the embedded line numbers in this listing skip 16 and 32-33
# and the indentation is gone, so the description literals below may have
# lost text during extraction -- confirm against the repository copy.
9 | item_1 = RSSItem(
10 | title='Star City',
11 | link='http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp',
12 | description='How do Americans get ready to work with Russians aboard\n'
13 | ' the International Space Station? They tak'
14 | 'e a crash course in\n culture, language'
15 | ' and protocol at Russia\'s Star '
17 | 'City.',
18 | author=None,
19 | categories=[],
20 | comments=None,
21 | enclosures=[],
22 | guid='http://liftoff.msfc.nasa.gov/2003/06/03.html#item573',
23 | pub_date=datetime(2003, 6, 3, 9, 39, 21, tzinfo=tzutc()),
24 | source=None,
25 | content_encoded=None
26 | )
# Second item is expected with neither a title nor a link (both None).
27 | item_2 = RSSItem(
28 | title=None,
29 | link=None,
30 | description='Sky watchers in Europe, Asia, and parts of Alaska and\n '
31 | ' Canada will experience a partial\n eclipse of the '
34 | 'Sun on Saturday, May 31st.',
35 | author=None,
36 | categories=[],
37 | comments=None,
38 | enclosures=[],
39 | guid='http://liftoff.msfc.nasa.gov/2003/05/30.html#item572',
40 | pub_date=datetime(2003, 5, 30, 11, 6, 42, tzinfo=tzutc()),
41 | source=None,
42 | content_encoded=None
43 | )
44 | item_3 = RSSItem(
45 | title='The Engine That Does More',
46 | link='http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp',
47 | description='Before man travels to Mars, NASA hopes to design new\n '
48 | ' engines that will let us fly through the '
49 | 'Solar System more\n quickly.\n '
50 | ' The proposed VASIMR engine would do that.',
51 | author=None,
52 | categories=[],
53 | comments=None,
54 | enclosures=[],
55 | guid='http://liftoff.msfc.nasa.gov/2003/05/27.html#item571',
56 | pub_date=datetime(2003, 5, 27, 8, 37, 32, tzinfo=tzutc()),
57 | source=None,
58 | content_encoded=None
59 | )
60 | item_4 = RSSItem(
61 | title="Astronauts' Dirty Laundry",
62 | link='http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp',
63 | description='Compared to earlier spacecraft, the International Space\n'
64 | ' Station has many luxuries, but laundry '
65 | 'facilities are not one of\n them. Instead,'
66 | ' astronauts have other options.',
67 | author=None,
68 | categories=[],
69 | comments=None,
70 | enclosures=[],
71 | guid='http://liftoff.msfc.nasa.gov/2003/05/20.html#item570',
72 | pub_date=datetime(2003, 5, 20, 8, 56, 2, tzinfo=tzutc()),
73 | source=None,
74 | content_encoded=None
75 | )
76 |
# Channel-level expectation; the items are listed in the order item_1..item_4.
77 | expected = RSSChannel(
78 | title='Liftoff News',
79 | link='http://liftoff.msfc.nasa.gov/',
80 | description='Liftoff to Space Exploration.',
81 | language='en-us',
82 | copyright=None,
83 | managing_editor='editor@example.com',
84 | web_master='webmaster@example.com',
85 | pub_date=datetime(2003, 6, 10, 4, 0, tzinfo=tzutc()),
86 | last_build_date=datetime(2003, 6, 10, 9, 41, 1, tzinfo=tzutc()),
87 | categories=[],
88 | generator='Weblog Editor 2.0',
89 | docs='http://blogs.law.harvard.edu/tech/rss',
90 | ttl=None,
91 | image=None,
92 | items=[item_1, item_2, item_3, item_4],
93 | content_encoded=None
94 | )
# The fixture path is relative; presumably the tests are run from the
# repository root -- verify against the test runner configuration.
95 | assert parse_rss_file('tests/rss/specification.xml') == expected
96 |
--------------------------------------------------------------------------------
/tests/json_feed/test_specs.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from dateutil.tz import tzoffset
4 |
5 | from atoma.json_feed import (
6 | parse_json_feed_file, JSONFeed, JSONFeedAuthor, JSONFeedItem
7 | )
8 |
9 | content_html = (
10 | '
We —\xa0Manton Reece and Brent Simmons —\xa0have noticed that JSON '
11 | 'has become the developers’ choice for APIs, and that developers will '
12 | 'often go out of their way to avoid XML. JSON is simpler to read and '
13 | 'write, and it’s less prone to bugs.
\n\n
So we developed JSON Feed, '
14 | 'a format similar to '
15 | 'RSS and Atom but '
16 | 'in JSON. It reflects the lessons learned from our years of work reading '
17 | 'and publishing feeds.
\n\n
'
18 | 'See the spec. It’s at version 1, which may be the only version ever '
19 | 'needed. If future versions are needed, version 1 feeds will still be '
20 | 'valid feeds.
\n\n
Notes
\n\n
We have a WordPress plugin and, coming soon, a '
22 | 'JSON Feed Parser for Swift. As more code is written, by us and others, '
23 | 'we’ll update the code page.
This website —\xa0the Markdown files and supporting '
27 | 'resources —\xa0is up '
28 | 'on GitHub, and you’re welcome to comment there.
\n\n
This '
29 | 'website is also a blog, and you can subscribe to the RSS feed or the JSON feed (if your reader supports it).'
32 | '
\n\n
We worked with a number of people on this over the course of '
33 | 'several months. We list them, and thank them, at the bottom of the spec. But — most importantly — '
35 | 'Craig Hockenberry spent a little time '
36 | 'making it look pretty. :)