├── atoma ├── const.py ├── exceptions.py ├── __init__.py ├── utils.py ├── opml.py ├── rss.py ├── simple.py ├── json_feed.py └── atom.py ├── MANIFEST.in ├── tests ├── rss │ ├── broken-no-channel.xml │ ├── broken-missing-description.xml │ ├── broken-missing-title.xml │ ├── broken-missing-source-url.xml │ ├── broken-version.xml │ ├── broken-missing-link.xml │ ├── broken-enclosure.xml │ ├── encoding.xml │ ├── little-used-elements.xml │ ├── test_little_used_elements.py │ ├── specification.xml │ ├── test_rss.py │ └── test_specification.py ├── atom │ ├── broken-missing-id.xml │ ├── broken-empty-author.xml │ ├── broken-empty-id.xml │ ├── broken-empty-updated.xml │ ├── broken-empty-title.xml │ ├── broken-missing-updated.xml │ ├── broken-empty-summary.xml │ ├── broken-missing-author-name.xml │ ├── rfc-minimal.xml │ ├── unicode.xml │ ├── rfc-more-extensive.xml │ ├── test_rfc_minimal.py │ ├── test_atom.py │ ├── test_rfc_more_extensive.py │ ├── broken-xkcd.xml │ └── test_unicode.py ├── opml │ ├── nested-subscription-list.xml │ ├── broken-no-title.xml │ ├── subscription-list.xml │ └── test_opml.py ├── json_feed │ ├── test_json_feed.py │ ├── podcast.json │ ├── jsonfeed.org.json │ └── test_specs.py └── test_utils.py ├── .gitignore ├── Makefile ├── LICENSE ├── .github └── workflows │ └── test.yml ├── setup.py └── README.rst /atoma/const.py: -------------------------------------------------------------------------------- 1 | VERSION = '0.0.17' 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst -------------------------------------------------------------------------------- /tests/rss/broken-no-channel.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .pyc 2 | __pycache__ 3 | .cache 4 | build 5 | dist 6 | atoma.egg-info 7 | .coverage 8 | .pytest_cache 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | publish: 2 | pip install -U pip setuptools wheel twine 3 | python setup.py sdist 4 | python setup.py bdist_wheel 5 | twine upload dist/* 6 | rm -fr build dist atoma.egg-info 7 | 8 | clean: 9 | rm -fr build dist atoma.egg-info 10 | 11 | -------------------------------------------------------------------------------- /tests/rss/broken-missing-description.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://foo.bar 5 | Foo bar 6 | 7 | Baz item 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/atom/broken-missing-id.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 2003-12-13T18:30:02Z 5 | 6 | John Doe 7 | 8 | -------------------------------------------------------------------------------- /tests/rss/broken-missing-title.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://foo.bar 5 | Foo bar. 6 | 7 | Baz item 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/rss/broken-missing-source-url.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Foo 5 | http://foo.bar 6 | Foo bar. 
7 | 8 | Baz item 9 | New York Times 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /atoma/exceptions.py: -------------------------------------------------------------------------------- 1 | class FeedParseError(Exception): 2 | """Document is an invalid feed.""" 3 | 4 | 5 | class FeedDocumentError(Exception): 6 | """Document is not a supported file.""" 7 | 8 | 9 | class FeedXMLError(FeedDocumentError): 10 | """Document is not valid XML.""" 11 | 12 | 13 | class FeedJSONError(FeedDocumentError): 14 | """Document is not valid JSON.""" 15 | -------------------------------------------------------------------------------- /tests/rss/broken-version.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Liftoff News 5 | http://liftoff.msfc.nasa.gov/ 6 | Liftoff to Space Exploration. 7 | 8 | 9 | The &amp; entity 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /atoma/__init__.py: -------------------------------------------------------------------------------- 1 | from .atom import parse_atom_file, parse_atom_bytes 2 | from .rss import parse_rss_file, parse_rss_bytes 3 | from .json_feed import ( 4 | parse_json_feed, parse_json_feed_file, parse_json_feed_bytes 5 | ) 6 | from .opml import parse_opml_file, parse_opml_bytes 7 | from .exceptions import ( 8 | FeedParseError, FeedDocumentError, FeedXMLError, FeedJSONError 9 | ) 10 | from .const import VERSION 11 | 12 | __version__ = VERSION 13 | -------------------------------------------------------------------------------- /tests/atom/broken-empty-author.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Foo 4 | 2003-12-13T18:30:02Z 5 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 6 | 7 | 8 | Atom-Powered Robots Run Amok 9 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 10 | 2003-12-13T18:30:02Z 11 | 12 | 
-------------------------------------------------------------------------------- /tests/atom/broken-empty-id.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 2003-12-13T18:30:02Z 5 | 6 | 7 | Atom-Powered Robots Run Amok 8 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 9 | 2003-12-13T18:30:02Z 10 | 11 | John Doe 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/rss/broken-missing-link.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Foo 5 | Foo bar. 6 | 7 | Item 0 8 | 9 | 10 | Item 1 11 | http://link1 12 | 13 | 14 | Item 2 15 | 646326554 16 | http://link2 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/atom/broken-empty-updated.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Foo 4 | 5 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 6 | 7 | Atom-Powered Robots Run Amok 8 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 9 | 2003-12-13T18:30:02Z 10 | 11 | John Doe 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/atom/broken-empty-title.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 2003-12-13T18:30:02Z 5 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 6 | 7 | Atom-Powered Robots Run Amok 8 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 9 | 2003-12-13T18:30:02Z 10 | 11 | John Doe 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/atom/broken-missing-updated.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 5 | 6 | Atom-Powered Robots Run Amok 7 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 8 | Some text. 
9 | 10 | John 11 | john@doe.org 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/opml/nested-subscription-list.xml: -------------------------------------------------------------------------------- 1 | Nested -------------------------------------------------------------------------------- /tests/atom/broken-empty-summary.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 2003-12-13T18:30:02Z 5 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 6 | 7 | Atom-Powered Robots Run Amok 8 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 9 | 2003-12-13T18:30:02Z 10 | 11 | 12 | John Doe 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/atom/broken-missing-author-name.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example Feed 4 | 2003-12-13T18:30:02Z 5 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 6 | 7 | Atom-Powered Robots Run Amok 8 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 9 | 2003-12-13T18:30:02Z 10 | Some text. 11 | 12 | john@doe.org 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/rss/broken-enclosure.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Foo 5 | http://foo.bar 6 | Foo bar. 7 | 8 | Baz item 9 | 10 | 11 | 12 | Foo item 13 | 14 | 15 | 16 | Foo item 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/atom/rfc-minimal.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Feed 5 | 6 | 2003-12-13T18:30:02Z 7 | 8 | John Doe 9 | 10 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 11 | 12 | 13 | Atom-Powered Robots Run Amok 14 | 15 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 16 | 2003-12-13T18:30:02Z 17 | Some text. 
18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/opml/broken-no-title.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | The Old Reader 5 | Tue, 02 Sep 2018 22:28:04 GMT 6 | Tue, 02 Sep 2018 22:28:04 GMT 7 | Foo 8 | foo@bar.com 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/json_feed/test_json_feed.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from atoma.json_feed import ( 3 | parse_json_feed_file, parse_json_feed_bytes, JSONFeedAttachment 4 | ) 5 | 6 | 7 | def test_attachments(): 8 | parsed = parse_json_feed_file('tests/json_feed/podcast.json') 9 | expected = JSONFeedAttachment( 10 | url='http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a', 11 | mime_type='audio/x-m4a', 12 | title=None, 13 | size_in_bytes=89970236, 14 | duration=timedelta(seconds=6629) 15 | ) 16 | assert parsed.items[0].attachments == [expected] 17 | 18 | 19 | def test_parse_bytes(): 20 | with open('tests/json_feed/jsonfeed.org.json', mode='rb') as f: 21 | data = f.read() 22 | parsed = parse_json_feed_bytes(data) 23 | assert parsed.title == 'JSON Feed' 24 | -------------------------------------------------------------------------------- /tests/rss/encoding.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Liftoff News 5 | http://liftoff.msfc.nasa.gov/ 6 | Liftoff to Space Exploration. 7 | 8 | 9 | The &amp; entity 10 | 11 | 12 | Nice <gorilla> what's he weigh? 
13 | 14 | 15 | Rïchàrd Plop's ☃ 16 | Valley Girl came out and I said to myself, Joe Bob, you gotta get out of the country for a while.]]> 17 | 18 | 19 | Foo 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Nicolas Le Manchet 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /tests/rss/little-used-elements.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Foo 7 | http://foo.bar 8 | Foo bar. 
9 | 10 | Media 11 | Public domain 12 | 60 13 | 14 | 15 | http://dallas.example.com 16 | Dallas Times-Herald 17 | http://dallas.example.com/masthead.gif 18 | Read the Dallas Times-Herald 19 | 32 20 | 96 21 | 22 | 23 | 24 | Baz item 25 | Los Angeles Herald-Examiner 26 | Photo 27 | Video 28 | 29 | 30 | What a beautiful day!

]]>
31 |
32 | 33 |
34 |
35 | -------------------------------------------------------------------------------- /tests/opml/subscription-list.xml: -------------------------------------------------------------------------------- 1 | mySubscriptions.opml Sat, 18 Jun 2005 12:11:52 GMT Tue, 02 Aug 2005 21:42:48 GMT Dave Winer dave@scripting.com 1 61 304 562 842 -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | 3 | from dateutil.tz import tzoffset 4 | 5 | from atoma.utils import try_parse_date, try_parse_length 6 | 7 | 8 | def test_try_parse_date(): 9 | expected = datetime( 10 | 2018, 11, 30, 17, 0, tzinfo=timezone(timedelta(seconds=32400)) 11 | ) 12 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00 +0900') == expected 13 | 14 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00:00 +0900') is None 15 | 16 | expected = datetime(2018, 11, 30, 17, 0, tzinfo=timezone.utc) 17 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00 GMT') == expected 18 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00 UT') == expected 19 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00 Z') == expected 20 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00') == expected 21 | 22 | expected = datetime(2018, 11, 30, 17, 0, tzinfo=tzoffset('PST', -28800)) 23 | assert try_parse_date('Fri, 30 Nov 2018 17:00:00 PST') == expected 24 | 25 | expected = datetime(2018, 10, 10, 18, 0, tzinfo=timezone.utc) 26 | assert try_parse_date('Web, 10 Oct 2018 18:00:00 +0000') == expected 27 | 28 | 29 | def test_try_parse_length(): 30 | assert try_parse_length(10) == 10 31 | assert try_parse_length(545332) == 545332 32 | assert try_parse_length(10.5633) == 10 33 | assert try_parse_length('10') == 10 34 | 35 | assert try_parse_length('foo') is None 36 | assert try_parse_length(-1) is None 37 | assert try_parse_length(None) is None 38 | 
-------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | env: 6 | FORCE_COLOR: 1 7 | 8 | jobs: 9 | test: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 15 | os: [windows-latest, macos-latest, ubuntu-latest] 16 | include: 17 | # Include new variables for Codecov 18 | - { codecov-flag: GHA_Windows, os: windows-latest } 19 | - { codecov-flag: GHA_macOS, os: macos-latest } 20 | - { codecov-flag: GHA_Ubuntu, os: ubuntu-latest } 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | cache: pip 30 | cache-dependency-path: "setup.py" 31 | 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install -U pip 35 | python -m pip install -U wheel 36 | python -m pip install -e ".[tests]" 37 | 38 | - name: Run tests 39 | run: | 40 | pytest -v --cov=atoma tests/ --cov-report xml 41 | pycodestyle --ignore=E252 atoma tests 42 | 43 | - name: Upload coverage 44 | uses: codecov/codecov-action@v3 45 | with: 46 | flags: ${{ matrix.codecov-flag }} 47 | name: ${{ matrix.os }} Python ${{ matrix.python-version }} 48 | -------------------------------------------------------------------------------- /tests/atom/unicode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Rïchàrd Plop's blog 4 | http://richard.plop/feed.atom 5 | 2017-06-05T00:00:00Z 6 | 7 | 8 | Rïchàrd Plop's personal blog. 
9 | Werkzeug 10 | 11 | 12 | 13 | Article n°1 14 | http://richard.plop/2017/6/5/article-1 15 | 2017-06-05T00:00:00Z 16 | 17 | 18 | Rïchàrd Plop 19 | 20 | <p></p> 21 | 22 | 23 | 24 | Unicode snowman 25 | http://richard.plop/2017/6/5/article-2 26 | 2016-12-29T00:00:00Z 27 | 28 | 29 | Rïchàrd Plop 30 | 31 | 32 | 33 | 34 | 35 | Unicode snowman 3 36 | http://richard.plop/2017/6/5/article-3 37 | 2016-12-29T00:00:00Z 38 | 39 | 40 | 41 | http://example.org/ 42 | Example, Inc. 43 | 2003-12-13T18:30:02Z 44 | 45 | Foo Bar 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /tests/json_feed/podcast.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "https://jsonfeed.org/version/1.1", 3 | "user_comment": "This is a podcast feed. You can add this feed to your podcast client using the following URL: http://therecord.co/feed.json", 4 | "title": "The Record", 5 | "home_page_url": "http://therecord.co/", 6 | "feed_url": "http://therecord.co/feed.json", 7 | "items": [ 8 | { 9 | "id": "http://therecord.co/chris-parrish", 10 | "title": "Special #1 - Chris Parrish", 11 | "url": "http://therecord.co/chris-parrish", 12 | "content_text": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.", 13 | "content_html": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. 
He lives on Bainbridge Island, a quick ferry ride from Seattle.", 14 | "summary": "Brent interviews Chris Parrish, co-host of The Record and one-half of Aged & Distilled.", 15 | "date_published": "2014-05-09T14:04:00-07:00", 16 | "attachments": [ 17 | { 18 | "url": "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a", 19 | "mime_type": "audio/x-m4a", 20 | "size_in_bytes": 89970236, 21 | "duration_in_seconds": 6629 22 | } 23 | ] 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /tests/rss/test_little_used_elements.py: -------------------------------------------------------------------------------- 1 | from atoma.rss import ( 2 | RSSChannel, RSSItem, RSSEnclosure, RSSSource, RSSImage, parse_rss_file 3 | ) 4 | 5 | 6 | def test_little_used_elements(): 7 | item = RSSItem( 8 | title='Baz item', 9 | link=None, 10 | description=None, 11 | author=None, 12 | categories=['Photo', 'Video'], 13 | comments=None, 14 | enclosures=[ 15 | RSSEnclosure( 16 | url='http://dallas.example.com/joebob_050689.mp3', 17 | length=24986239, 18 | type='audio/mpeg' 19 | ), 20 | RSSEnclosure( 21 | url='http://dallas.example.com/foo.json', 22 | length=0, 23 | type='application/json') 24 | ], 25 | guid=None, 26 | pub_date=None, 27 | source=RSSSource( 28 | title='Los Angeles Herald-Examiner', 29 | url='http://la.example.com/rss.xml' 30 | ), 31 | content_encoded='

What a beautiful day!

' 32 | ) 33 | expected = RSSChannel( 34 | title='Foo', 35 | link='http://foo.bar', 36 | description='Foo bar.', 37 | language=None, 38 | copyright='Public domain', 39 | managing_editor=None, 40 | web_master=None, 41 | pub_date=None, 42 | last_build_date=None, 43 | categories=['Media'], 44 | generator=None, 45 | docs=None, 46 | ttl=60, 47 | image=RSSImage( 48 | url='http://dallas.example.com/masthead.gif', 49 | title='Dallas Times-Herald', 50 | link='http://dallas.example.com', 51 | width=96, 52 | height=32, 53 | description='Read the Dallas Times-Herald' 54 | ), 55 | items=[item], 56 | content_encoded=None 57 | ) 58 | assert parse_rss_file('tests/rss/little-used-elements.xml') == expected 59 | -------------------------------------------------------------------------------- /tests/atom/rfc-more-extensive.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dive into mark 5 | 6 | 7 | A <em>lot</em> of effort 8 | went into making this effortless 9 | 10 | 11 | 2005-07-31T12:29:29Z 12 | 13 | tag:example.org,2003:3 14 | 15 | 17 | 19 | 20 | Copyright (c) 2003, Mark Pilgrim 21 | 22 | 23 | Example Toolkit 24 | 25 | 26 | 27 | Atom draft-07 snapshot 28 | 30 | 32 | tag:example.org,2003:3.2397 33 | 2005-07-31T12:29:29Z 34 | 2003-12-13T08:29:29-04:00 35 | 36 | Mark Pilgrim 37 | http://example.org/ 38 | f8dy@example.com 39 | 40 | 41 | Sam Ruby 42 | 43 | 44 | Joe Gregorio 45 | 46 | 48 |
49 |

50 | [Update: The Atom draft is finished.] 51 |

52 |
53 |
54 |
55 | 56 |
-------------------------------------------------------------------------------- /tests/atom/test_rfc_minimal.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from dateutil.tz import tzutc 4 | 5 | from atoma.atom import ( 6 | AtomFeed, AtomEntry, AtomTextConstruct, AtomTextType, AtomPerson, AtomLink, 7 | parse_atom_file 8 | ) 9 | 10 | 11 | def test_rfc_minimal(): 12 | expected_entry = AtomEntry( 13 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 14 | value='Atom-Powered Robots Run Amok'), 15 | id_='urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a', 16 | updated=datetime.datetime(2003, 12, 13, 18, 30, 2, tzinfo=tzutc()), 17 | authors=[ 18 | AtomPerson(name='John Doe', uri=None, email=None) 19 | ], 20 | contributors=[], 21 | links=[ 22 | AtomLink(href='http://example.org/2003/12/13/atom03', rel=None, 23 | type_=None, hreflang=None, title=None, length=None) 24 | ], 25 | categories=[], 26 | published=None, 27 | rights=None, 28 | summary=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 29 | value='Some text.'), 30 | content=None, 31 | source=None 32 | ) 33 | expected = AtomFeed( 34 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 35 | value='Example Feed'), 36 | id_='urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6', 37 | updated=datetime.datetime(2003, 12, 13, 18, 30, 2, tzinfo=tzutc()), 38 | authors=[ 39 | AtomPerson(name='John Doe', uri=None, email=None) 40 | ], 41 | contributors=[], 42 | links=[ 43 | AtomLink(href='http://example.org/', rel=None, type_=None, 44 | hreflang=None, title=None, length=None) 45 | ], 46 | categories=[], 47 | generator=None, 48 | subtitle=None, 49 | rights=None, 50 | icon=None, 51 | logo=None, 52 | entries=[ 53 | expected_entry 54 | ] 55 | ) 56 | assert parse_atom_file('tests/atom/rfc-minimal.xml') == expected 57 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | with open(path.join(here, 'LICENSE'), encoding='utf-8') as f: 11 | long_description += f.read() 12 | 13 | with open(path.join(here, 'atoma', 'const.py'), encoding='utf-8') as fp: 14 | version = dict() 15 | exec(fp.read(), version) 16 | version = version['VERSION'] 17 | 18 | setup( 19 | name='atoma', 20 | version=version, 21 | description='Atom, RSS and JSON feed parser for Python 3', 22 | long_description=long_description, 23 | long_description_content_type='text/x-rst', 24 | url='https://github.com/NicolasLM/atoma', 25 | author='Nicolas Le Manchet', 26 | author_email='nicolas@lemanchet.fr', 27 | license='MIT', 28 | python_requires=">=3.7", 29 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 30 | classifiers=[ 31 | 'Development Status :: 4 - Beta', 32 | 'Intended Audience :: Developers', 33 | 'Topic :: Software Development :: Libraries', 34 | 'License :: OSI Approved :: MIT License', 35 | 'Natural Language :: English', 36 | 'Programming Language :: Python :: 3', 37 | 'Programming Language :: Python :: 3 :: Only', 38 | 'Programming Language :: Python :: 3.7', 39 | 'Programming Language :: Python :: 3.8', 40 | 'Programming Language :: Python :: 3.9', 41 | 'Programming Language :: Python :: 3.10', 42 | 'Programming Language :: Python :: 3.11', 43 | 'Topic :: Text Processing :: Markup :: XML' 44 | ], 45 | keywords='atom rss json feed feeds syndication parser RFC4287', 46 | 47 | packages=find_packages(include=('atoma', 'atoma.*')), 48 | install_requires=[ 49 | 'defusedxml', 50 | 'attrs', 51 | 'python-dateutil' 52 | ], 53 | 54 | extras_require={ 55 | 'tests': [ 56 | 'pytest', 57 | 'pytest-cov', 58 | 
'python-coveralls', 59 | 'pycodestyle' 60 | ] 61 | } 62 | ) 63 | -------------------------------------------------------------------------------- /tests/atom/test_atom.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from atoma.atom import ( 4 | AtomFeed, parse_atom_file, parse_atom_bytes, FeedParseError 5 | ) 6 | from atoma import FeedXMLError 7 | 8 | data = b"""\ 9 | 10 | 11 | Example Feed 12 | foo 13 | 2003-12-13T18:30:02Z 14 | 15 | """ 16 | 17 | 18 | def test_read_bytes(): 19 | assert isinstance(parse_atom_bytes(data), AtomFeed) 20 | 21 | 22 | def test_broken_missing_id(): 23 | with pytest.raises(FeedParseError): 24 | parse_atom_file('tests/atom/broken-missing-id.xml') 25 | 26 | 27 | def test_broken_missing_author(): 28 | # The RFC mandates that at least one of feed or entries must have an author 29 | # but this is rarely the case in practice. 30 | parsed = parse_atom_file('tests/atom/broken-xkcd.xml') 31 | assert parsed.authors == list() 32 | assert parsed.entries[0].authors == list() 33 | 34 | 35 | def test_broken_missing_updated(): 36 | # The RFC mandates that feed and entries have an updated date 37 | # but this is rarely the case in practice. 38 | parsed = parse_atom_file('tests/atom/broken-missing-updated.xml') 39 | assert parsed.updated is None 40 | assert parsed.entries[0].updated is None 41 | 42 | 43 | def test_broken_empty_fields(): 44 | # As a general rule, XML tags should not be empty. 
In practice optional 45 | # fields are sometimes present in the feed but with an empty tag 46 | parsed = parse_atom_file('tests/atom/broken-empty-summary.xml') 47 | assert parsed.entries[0].summary is None 48 | 49 | parsed = parse_atom_file('tests/atom/broken-empty-title.xml') 50 | assert parsed.title is None 51 | 52 | parsed = parse_atom_file('tests/atom/broken-empty-updated.xml') 53 | assert parsed.updated is None 54 | 55 | parsed = parse_atom_file('tests/atom/broken-empty-author.xml') 56 | assert parsed.authors == [] 57 | assert parsed.entries[0].authors == [] 58 | 59 | parsed = parse_atom_file('tests/atom/broken-missing-author-name.xml') 60 | assert parsed.authors == [] 61 | assert parsed.entries[0].authors == [] 62 | 63 | # Require fields (id...) that have empty tags should throw an error 64 | with pytest.raises(FeedParseError): 65 | parse_atom_file('tests/atom/broken-empty-id.xml') 66 | 67 | 68 | def test_broken_not_xml(): 69 | with pytest.raises(FeedXMLError): 70 | parse_atom_bytes(b'This is not an XML document') 71 | -------------------------------------------------------------------------------- /tests/json_feed/jsonfeed.org.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "https://jsonfeed.org/version/1", 3 | "user_comment": "This feed allows you to read the posts from this site in any feed reader that supports the JSON Feed format. 
To add this feed to your reader, copy the following URL — https://jsonfeed.org/feed.json — and add it your reader.", 4 | "title": "JSON Feed", 5 | "description": "JSON Feed is a pragmatic syndication format for blogs, microblogs, and other time-based content.", 6 | "home_page_url": "https://jsonfeed.org/", 7 | "feed_url": "https://jsonfeed.org/feed.json", 8 | "author": { 9 | "name": "Brent Simmons and Manton Reece", 10 | "url": "https://jsonfeed.org/" 11 | }, 12 | "items": [ 13 | { 14 | "title": "Announcing JSON Feed", 15 | "date_published": "2017-05-17T08:02:12-07:00", 16 | "id": "https://jsonfeed.org/2017/05/17/announcing_json_feed", 17 | "url": "https://jsonfeed.org/2017/05/17/announcing_json_feed", 18 | "content_html": "

We — Manton Reece and Brent Simmons — have noticed that JSON has become the developers’ choice for APIs, and that developers will often go out of their way to avoid XML. JSON is simpler to read and write, and it’s less prone to bugs.

\n\n

So we developed JSON Feed, a format similar to RSS and Atom but in JSON. It reflects the lessons learned from our years of work reading and publishing feeds.

\n\n

See the spec. It’s at version 1, which may be the only version ever needed. If future versions are needed, version 1 feeds will still be valid feeds.

\n\n

Notes

\n\n

We have a WordPress plugin and, coming soon, a JSON Feed Parser for Swift. As more code is written, by us and others, we’ll update the code page.

\n\n

See Mapping RSS and Atom to JSON Feed for more on the similarities between the formats.

\n\n

This website — the Markdown files and supporting resources — is up on GitHub, and you’re welcome to comment there.

\n\n

This website is also a blog, and you can subscribe to the RSS feed or the JSON feed (if your reader supports it).

\n\n

We worked with a number of people on this over the course of several months. We list them, and thank them, at the bottom of the spec. But — most importantly — Craig Hockenberry spent a little time making it look pretty. :)

\n" 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/rss/specification.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Liftoff News 5 | http://liftoff.msfc.nasa.gov/ 6 | Liftoff to Space Exploration. 7 | 8 | en-us 9 | Tue, 10 Jun 2003 04:00:00 GMT 10 | Tue, 10 Jun 2003 09:41:01 GMT 11 | http://blogs.law.harvard.edu/tech/rss 12 | Weblog Editor 2.0 13 | editor@example.com 14 | webmaster@example.com 15 | 16 | 17 | Star City 18 | http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp 19 | 20 | How do Americans get ready to work with Russians aboard 21 | the International Space Station? They take a crash course in 22 | culture, language and protocol at Russia's <a 23 | href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. 24 | 25 | Tue, 03 Jun 2003 09:39:21 GMT 26 | http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 27 | 28 | 29 | 30 | Sky watchers in Europe, Asia, and parts of Alaska and 31 | Canada will experience a <a 32 | href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial 33 | eclipse of the Sun</a> on Saturday, May 31st. 34 | 35 | Fri, 30 May 2003 11:06:42 GMT 36 | http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 37 | 38 | 39 | 40 | The Engine That Does More 41 | http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp 42 | Before man travels to Mars, NASA hopes to design new 43 | engines that will let us fly through the Solar System more 44 | quickly. 45 | The proposed VASIMR engine would do that. 46 | 47 | Tue, 27 May 2003 08:37:32 GMT 48 | http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 49 | 50 | 51 | 52 | Astronauts' Dirty Laundry 53 | http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp 54 | Compared to earlier spacecraft, the International Space 55 | Station has many luxuries, but laundry facilities are not one of 56 | them. Instead, astronauts have other options. 
57 | 58 | Tue, 20 May 2003 08:56:02 GMT 59 | http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /tests/opml/test_opml.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from dateutil.tz import tzutc 4 | import pytest 5 | 6 | from atoma.opml import ( 7 | parse_opml_file, parse_opml_bytes, get_feed_list, OPML, OPMLOutline 8 | ) 9 | from atoma import FeedXMLError 10 | 11 | data = b"""\ 12 | 13 | 14 | 15 | states.opml 16 | 17 | 18 | 19 | 20 | 21 | """ 22 | 23 | 24 | def test_read_bytes(): 25 | assert isinstance(parse_opml_bytes(data), OPML) 26 | 27 | 28 | def test_nested_subscription_list(): 29 | o = parse_opml_file('tests/opml/nested-subscription-list.xml') 30 | assert get_feed_list(o) == [ 31 | 'http://1.com/rss.xml', 32 | 'http://2.com/rss.xml', 33 | 'http://3.com/rss.xml', 34 | 'http://4.com/rss.xml', 35 | 'http://5.com/rss.xml', 36 | ] 37 | 38 | 39 | def test_missing_outline_title(): 40 | o = parse_opml_file('tests/opml/broken-no-title.xml') 41 | assert get_feed_list(o) == [ 42 | 'https://xkcd.com/rss.xml', 43 | 'http://antirez.com/rss', 44 | 'https://what-if.xkcd.com/feed.atom', 45 | ] 46 | 47 | 48 | def test_subscription_list(): 49 | expected = OPML( 50 | title='mySubscriptions.opml', 51 | owner_name='Dave Winer', 52 | owner_email='dave@scripting.com', 53 | date_created=datetime(2005, 6, 18, 12, 11, 52, tzinfo=tzutc()), 54 | date_modified=datetime(2005, 8, 2, 21, 42, 48, tzinfo=tzutc()), 55 | expansion_state=None, 56 | vertical_scroll_state=1, 57 | window_top=61, 58 | window_left=304, 59 | window_bottom=562, 60 | window_right=842, 61 | outlines=[ 62 | OPMLOutline( 63 | text='CNET News.com', 64 | type='rss', 65 | xml_url='http://news.com.com/2547-1_3-0-5.xml', 66 | description='Tech news and business reports by CNET News.com.', 67 | html_url='http://news.com.com/', 68 | 
language='unknown', 69 | title='CNET News.com', 70 | version='RSS2', 71 | outlines=[] 72 | ), 73 | OPMLOutline( 74 | text='washingtonpost.com - Politics', 75 | type='rss', 76 | xml_url='http://www.washingtonpost.com/wp-srv/' 77 | 'politics/rssheadlines.xml', 78 | description='Politics', 79 | html_url='http://www.washingtonpost.com/wp-dyn/' 80 | 'politics?nav=rss_politics', 81 | language='unknown', 82 | title='washingtonpost.com - Politics', 83 | version='RSS2', 84 | outlines=[] 85 | ) 86 | ] 87 | ) 88 | assert parse_opml_file('tests/opml/subscription-list.xml') == expected 89 | 90 | 91 | def test_broken_not_xml(): 92 | with pytest.raises(FeedXMLError): 93 | parse_opml_bytes(b'This is not an XML document') 94 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Atoma 2 | ===== 3 | 4 | .. image:: https://github.com/NicolasLM/atoma/actions/workflows/test.yml/badge.svg 5 | :target: https://github.com/NicolasLM/atoma/actions/workflows/test.yml 6 | .. image:: https://codecov.io/gh/NicolasLM/atoma/branch/main/graph/badge.svg 7 | :target: https://codecov.io/gh/NicolasLM/atoma 8 | 9 | 10 | 11 | Atom, RSS and JSON feed parser for Python 3. 12 | 13 | Quickstart 14 | ---------- 15 | 16 | Install Atoma with pip:: 17 | 18 | pip install atoma 19 | 20 | Load and parse an Atom XML file: 21 | 22 | .. code:: python 23 | 24 | >>> import atoma 25 | >>> feed = atoma.parse_atom_feed('atom-feed.xml') 26 | >>> feed.description 27 | 'The blog relating the daily life of web agency developers' 28 | >>> len(feed.items) 29 | 5 30 | 31 | A small change is needed if you are dealing with an RSS XML file: 32 | 33 | .. code:: python 34 | 35 | >>> feed = atoma.parse_rss_feed('rss-feed.xml') 36 | 37 | Parsing feeds from the Internet is easy as well: 38 | 39 | .. 
code:: python 40 | 41 | >>> import atoma, requests 42 | >>> response = requests.get('http://lucumr.pocoo.org/feed.atom') 43 | >>> feed = atoma.parse_atom_bytes(response.content) 44 | >>> feed.title.value 45 | "Armin Ronacher's Thoughts and Writings" 46 | 47 | Features 48 | -------- 49 | 50 | * RSS 2.0 - `RSS 2.0 Specification `_ 51 | * Atom Syndication Format v1 - `RFC4287 `_ 52 | * JSON Feed v1 - `JSON Feed specification `_ 53 | * OPML 2.0, to share lists of feeds - `OPML 2.0 `_ 54 | * Typed: feeds decomposed into meaningful Python objects 55 | * Secure: uses defusedxml to load untrusted feeds 56 | * Compatible with Python 3.6+ 57 | 58 | Security warning 59 | ---------------- 60 | 61 | If you use this library to display content from feeds in a web page, you NEED 62 | to clean the HTML contained in the feeds to prevent `Cross-site scripting (XSS) 63 | `_. The `bleach 64 | `_ library is recommended for cleaning feeds. 65 | 66 | Useful Resources 67 | ---------------- 68 | 69 | To use this library a basic understanding of feeds is required. For Atom, the 70 | `Introduction to Atom `_ is a must 71 | read. The `RFC 4287 `_ can help lift some 72 | ambiguities. Finally the `feed validator `_ is 73 | great to test hand-crafted feeds. 74 | 75 | For RSS, the `RSS specification `_ and 76 | `rssboard.org `_ have a ton of information and 77 | examples. 
from datetime import datetime, timezone
from typing import Optional
from xml.etree.ElementTree import Element

import dateutil.parser
from dateutil.tz import gettz
from defusedxml.ElementTree import parse as defused_xml_parse, ParseError

# FeedParseError is intentionally re-exported from this module: sibling
# modules (e.g. rss.py) import it from here.
from .exceptions import FeedXMLError, FeedParseError

# XML namespaces used when looking up children of Atom/RSS elements.
ns = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'feed': 'http://www.w3.org/2005/Atom'
}

# Common timezone abbreviations defined in RFC 822, used by RSS
# https://tools.ietf.org/html/rfc822#section-5.1
# Values are either tzinfo objects or fixed UTC offsets in seconds, as
# accepted by dateutil's `tzinfos` parameter.
tzinfos = {
    'UT': gettz('GMT'),
    'EST': -18000,
    'EDT': -14400,
    'CST': -21600,
    'CDT': -18000,
    'MST': -25200,
    'MDT': -21600,
    'PST': -28800,
    'PDT': -25200
}


def parse_xml(xml_content):
    """Parse an XML document (file path or file object) with defusedxml.

    defusedxml protects against XML bomb attacks when loading untrusted
    feeds.

    Raises FeedXMLError when the content is not well-formed XML.
    """
    try:
        return defused_xml_parse(xml_content)
    except ParseError as error:
        # Chain explicitly so the underlying parse error stays visible
        # in the traceback without being reported as "another exception
        # occurred during handling".
        raise FeedXMLError('Not a valid XML document') from error


def get_child(element: Element, name,
              optional: bool = True) -> Optional[Element]:
    """Return the first child of `element` named `name`, or None.

    Lookup honors the namespaces declared in `ns`. When the child is
    missing, return None if `optional` is true, otherwise raise
    FeedParseError.
    """
    child = element.find(name, namespaces=ns)

    if child is None:
        if optional:
            return None
        raise FeedParseError(
            'Could not parse feed: "{}" does not have a "{}"'
            .format(element.tag, name)
        )

    return child


def get_text(element: Element, name,
             optional: bool = True) -> Optional[str]:
    """Return the stripped text content of the child named `name`.

    A missing child or a child with empty text yields None if
    `optional` is true, otherwise FeedParseError is raised.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    if child.text is None:
        if optional:
            return None

        raise FeedParseError(
            'Could not parse feed: "{}" text is required but is empty'
            .format(name)
        )

    return child.text.strip()


def get_int(element: Element, name, optional: bool = True) -> Optional[int]:
    """Return the text of the child named `name` converted to int, or None."""
    text = get_text(element, name, optional)
    if text is None:
        return None

    return int(text)


def get_datetime(element: Element, name,
                 optional: bool = True) -> Optional[datetime]:
    """Return the text of the child named `name` parsed as a date, or None."""
    text = get_text(element, name, optional)
    if text is None:
        return None

    return try_parse_date(text)


def try_parse_date(date_str: str) -> Optional[datetime]:
    """Best-effort parse of a feed date string, returning None on failure.

    Naive datetimes are assumed to be UTC so that the result is always
    timezone-aware.
    """
    try:
        date = dateutil.parser.parse(date_str, fuzzy=True, tzinfos=tzinfos)
    except (ValueError, OverflowError):
        return None

    if date.tzinfo is None:
        # TZ naive datetime, make it a TZ aware datetime by assuming it
        # contains UTC time
        date = date.replace(tzinfo=timezone.utc)

    return date


def try_parse_length(length) -> Optional[int]:
    """Parse an enclosure length, returning None when absent or invalid.

    The RSS spec mandates a non-negative integer, but feeds in the wild
    often provide junk values, hence the lenient parsing.
    """
    try:
        length = int(length)
    except (TypeError, ValueError):
        return None

    if length < 0:
        return None

    return length
lang=None, 14 | value='Atom draft-07 snapshot'), 15 | id_='tag:example.org,2003:3.2397', 16 | updated=datetime.datetime(2005, 7, 31, 12, 29, 29, tzinfo=tzutc()), 17 | authors=[ 18 | AtomPerson(name='Mark Pilgrim', uri='http://example.org/', 19 | email='f8dy@example.com') 20 | ], 21 | contributors=[ 22 | AtomPerson(name='Sam Ruby', uri=None, email=None), 23 | AtomPerson(name='Joe Gregorio', uri=None, email=None) 24 | ], 25 | links=[ 26 | AtomLink(href='http://example.org/2005/04/02/atom', 27 | rel='alternate', type_='text/html', hreflang=None, 28 | title=None, length=None), 29 | AtomLink(href='http://example.org/audio/ph34r_my_podcast.mp3', 30 | rel='enclosure', type_='audio/mpeg', hreflang=None, 31 | title=None, length=1337) 32 | ], 33 | categories=[], 34 | published=datetime.datetime(2003, 12, 13, 8, 29, 29, 35 | tzinfo=tzoffset(None, -14400)), 36 | rights=None, 37 | summary=None, 38 | content=AtomTextConstruct(text_type=AtomTextType.xhtml, lang=None, 39 | value=''), 40 | source=None 41 | ) 42 | expected = AtomFeed( 43 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 44 | value='dive into mark'), 45 | id_='tag:example.org,2003:3', 46 | updated=datetime.datetime(2005, 7, 31, 12, 29, 29, tzinfo=tzutc()), 47 | authors=[], 48 | contributors=[], 49 | links=[ 50 | AtomLink(href='http://example.org/', rel='alternate', 51 | type_='text/html', hreflang='en', title=None, 52 | length=None), 53 | AtomLink(href='http://example.org/feed.atom', rel='self', 54 | type_='application/atom+xml', hreflang=None, title=None, 55 | length=None) 56 | ], 57 | categories=[], 58 | generator=AtomGenerator(name='Example Toolkit', 59 | uri='http://www.example.com/', version='1.0'), 60 | subtitle=AtomTextConstruct(text_type=AtomTextType.html, lang=None, 61 | value='A lot of effort\n ' 62 | 'went into making this effortless'), 63 | rights=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 64 | value='Copyright (c) 2003, Mark Pilgrim'), 65 | icon=None, 66 | logo=None, 67 | 
entries=[ 68 | expected_entry 69 | ] 70 | ) 71 | assert ( 72 | parse_atom_file('tests/atom/rfc-more-extensive.xml') == expected 73 | ) 74 | -------------------------------------------------------------------------------- /tests/rss/test_rss.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from atoma.rss import ( 4 | parse_rss_file, parse_rss_bytes, RSSChannel, FeedParseError 5 | ) 6 | from atoma import FeedXMLError 7 | 8 | data = b"""\ 9 | 10 | 11 | 12 | Foo 13 | http://foo.bar 14 | Foo bar. 15 | 16 | Baz item 17 | 18 | 19 | 20 | """ 21 | 22 | cdata_description = """\ 23 | I'm headed for France. I wasn't gonna go this year, but then last week \ 24 | Valley Girl came out and I \ 25 | said to myself, Joe Bob, you gotta get out of the country for a while.""" 26 | 27 | 28 | def test_read_bytes(): 29 | assert isinstance(parse_rss_bytes(data), RSSChannel) 30 | 31 | 32 | def test_broken_missing_title(): 33 | # RSS feed title is mandatory by specs, but some feeds in the wild 34 | # do not provide it 35 | p = parse_rss_file('tests/rss/broken-missing-title.xml') 36 | assert p.title is None 37 | 38 | 39 | def test_broken_missing_description(): 40 | # RSS feed description is mandatory by specs, but some feeds in the wild 41 | # do not provide it 42 | p = parse_rss_file('tests/rss/broken-missing-description.xml') 43 | assert p.description is None 44 | 45 | 46 | def test_broken_missing_link(): 47 | # RSS feed link is mandatory by specs, but some feeds in the wild 48 | # do not provide it 49 | p = parse_rss_file('tests/rss/broken-missing-link.xml') 50 | assert p.link is None 51 | assert p.items[0].link is None 52 | assert p.items[0].guid is None 53 | assert p.items[1].link == 'http://link1' 54 | assert p.items[1].guid == 'http://link1' 55 | assert p.items[2].link == 'http://link2' 56 | assert p.items[2].guid == '646326554' 57 | 58 | 59 | def test_broken_missing_source_url(): 60 | # The URL of a source is mandatory by 
specs, but some feeds in the wild 61 | # do not provide it 62 | p = parse_rss_file('tests/rss/broken-missing-source-url.xml') 63 | assert p.items[0].source.title == 'New York Times' 64 | assert p.items[0].source.url is None 65 | 66 | 67 | def test_broken_enclosure(): 68 | # The length and type of an enclosure are mandatory by specs, 69 | # but some feeds in the wild do not provide them 70 | p = parse_rss_file('tests/rss/broken-enclosure.xml') 71 | for i in range(0, 3): 72 | assert p.items[i].enclosures[0].url == 'https://foo.com/test.mp3' 73 | assert p.items[i].enclosures[0].length is None 74 | assert p.items[i].enclosures[0].type is None 75 | 76 | 77 | def test_broken_version(): 78 | with pytest.raises(FeedParseError): 79 | parse_rss_file('tests/rss/broken-version.xml') 80 | 81 | 82 | def test_broken_no_channel(): 83 | with pytest.raises(FeedParseError): 84 | parse_rss_file('tests/rss/broken-no-channel.xml') 85 | 86 | 87 | def test_broken_not_xml(): 88 | with pytest.raises(FeedXMLError): 89 | parse_rss_bytes(b'This is not an XML document') 90 | 91 | 92 | def test_encoding(): 93 | parsed = parse_rss_file('tests/rss/encoding.xml') 94 | assert parsed.items[0].title == 'The & entity' 95 | assert parsed.items[1].title == "Nice what's he weigh?" 
from datetime import datetime
from io import BytesIO
from typing import Optional, List
from xml.etree.ElementTree import Element

import attr

from .exceptions import FeedParseError
from .utils import parse_xml, get_text, get_int, get_datetime


@attr.s
class OPMLOutline:
    """A single OPML <outline> node, possibly containing nested outlines."""

    text: Optional[str] = attr.ib()
    type: Optional[str] = attr.ib()
    xml_url: Optional[str] = attr.ib()
    description: Optional[str] = attr.ib()
    html_url: Optional[str] = attr.ib()
    language: Optional[str] = attr.ib()
    title: Optional[str] = attr.ib()
    version: Optional[str] = attr.ib()

    # Child <outline> elements, empty for leaf nodes.
    outlines: List['OPMLOutline'] = attr.ib()


@attr.s
class OPML:
    """A parsed OPML document: <head> metadata plus top-level outlines."""

    title: Optional[str] = attr.ib()
    owner_name: Optional[str] = attr.ib()
    owner_email: Optional[str] = attr.ib()
    date_created: Optional[datetime] = attr.ib()
    date_modified: Optional[datetime] = attr.ib()
    expansion_state: Optional[str] = attr.ib()

    vertical_scroll_state: Optional[int] = attr.ib()
    window_top: Optional[int] = attr.ib()
    window_left: Optional[int] = attr.ib()
    window_bottom: Optional[int] = attr.ib()
    window_right: Optional[int] = attr.ib()

    outlines: List[OPMLOutline] = attr.ib()


def _get_outlines(element: Element) -> List[OPMLOutline]:
    """Recursively convert the <outline> children of `element`."""
    return [
        OPMLOutline(
            outline.attrib.get('text'),
            outline.attrib.get('type'),
            outline.attrib.get('xmlUrl'),
            outline.attrib.get('description'),
            outline.attrib.get('htmlUrl'),
            outline.attrib.get('language'),
            outline.attrib.get('title'),
            outline.attrib.get('version'),
            _get_outlines(outline)
        )
        for outline in element.findall('outline')
    ]


def _parse_opml(root: Element) -> OPML:
    """Build an OPML object from the document root element.

    Raises FeedParseError when the mandatory <body> element is missing.
    """
    head = root.find('head')
    body = root.find('body')

    if body is None:
        # Without a body there are no outlines and nothing meaningful to
        # return; fail with the package's error instead of crashing with
        # an AttributeError further down.
        raise FeedParseError('OPML document has no "body" element')

    if head is None:
        # The spec makes <head> mandatory, but tolerate documents
        # lacking it: all metadata fields simply end up as None.
        head = Element('head')

    return OPML(
        get_text(head, 'title'),
        get_text(head, 'ownerName'),
        get_text(head, 'ownerEmail'),
        get_datetime(head, 'dateCreated'),
        get_datetime(head, 'dateModified'),
        get_text(head, 'expansionState'),
        get_int(head, 'vertScrollState'),
        get_int(head, 'windowTop'),
        get_int(head, 'windowLeft'),
        get_int(head, 'windowBottom'),
        get_int(head, 'windowRight'),
        outlines=_get_outlines(body)
    )


def parse_opml_file(filename: str) -> OPML:
    """Parse an OPML document from a local XML file."""
    root = parse_xml(filename).getroot()
    return _parse_opml(root)


def parse_opml_bytes(data: bytes) -> OPML:
    """Parse an OPML document from a byte-string containing XML data."""
    root = parse_xml(BytesIO(data)).getroot()
    return _parse_opml(root)


def get_feed_list(opml_obj: OPML) -> List[str]:
    """Walk an OPML document to extract the list of feeds it contains."""
    rv = list()

    def collect(obj):
        for outline in obj.outlines:
            if outline.type == 'rss' and outline.xml_url:
                rv.append(outline.xml_url)

            if outline.outlines:
                collect(outline)

    collect(opml_obj)
    return rv
agree that it's weird that 15 | we have teeth, when you think about it for too long. Just about 16 | everyone agrees on that, except—in a still-unexplained statistical 17 | anomaly—people named &quot;Trevor.&quot;" alt="People with 18 | all six of those names agree that it's weird that we have teeth, 19 | when you think about it for too long. Just about everyone agrees on 20 | that, except—in a still-unexplained statistical anomaly—people named 21 | &quot;Trevor.&quot;" /> 22 | 23 | 24 | 25 | Fruit Collider 26 | 27 | 2018-01-31T00:00:00Z 28 | https://xkcd.com/1949/ 29 | <img 30 | src="https://imgs.xkcd.com/comics/fruit_collider.png" title="The 31 | most delicious exotic fruit discovered this way is the strawberry 32 | banana. Sadly, it's only stable in puree form, so it's currently 33 | limited to yogurt and smoothies, but they're building a massive 34 | collider in Europe to search for a strawberry banana that can be 35 | eaten whole." alt="The most delicious exotic fruit discovered this 36 | way is the strawberry banana. Sadly, it's only stable in puree form, 37 | so it's currently limited to yogurt and smoothies, but they're 38 | building a massive collider in Europe to search for a strawberry 39 | banana that can be eaten whole." /> 40 | 41 | 42 | 43 | Campaign Fundraising Emails 44 | 45 | 2018-01-29T00:00:00Z 46 | https://xkcd.com/1948/ 47 | <img 48 | src="https://imgs.xkcd.com/comics/campaign_fundraising_emails.png" 49 | title="The establishment doesn't take us seriously. You know who 50 | else they didn't take seriously? Hitler. I'll be like him, but a 51 | GOOD guy instead of..." alt="The establishment doesn't take us 52 | seriously. You know who else they didn't take seriously? Hitler. 53 | I'll be like him, but a GOOD guy instead of..." 
/> 54 | 55 | 56 | 57 | Night Sky 58 | 59 | 2018-01-26T00:00:00Z 60 | https://xkcd.com/1947/ 61 | <img 62 | src="https://imgs.xkcd.com/comics/night_sky.png" title="There's a 63 | mountain lion nearby, but it didn't notice you because it's reading 64 | Facebook." alt="There's a mountain lion nearby, but it didn't notice 65 | you because it's reading Facebook." /> 66 | 67 | 68 | -------------------------------------------------------------------------------- /tests/rss/test_specification.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from dateutil.tz import tzutc 4 | 5 | from atoma.rss import RSSChannel, RSSItem, parse_rss_file 6 | 7 | 8 | def test_specification(): 9 | item_1 = RSSItem( 10 | title='Star City', 11 | link='http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp', 12 | description='How do Americans get ready to work with Russians aboard\n' 13 | ' the International Space Station? They tak' 14 | 'e a crash course in\n culture, language' 15 | ' and protocol at Russia\'s Star ' 17 | 'City.', 18 | author=None, 19 | categories=[], 20 | comments=None, 21 | enclosures=[], 22 | guid='http://liftoff.msfc.nasa.gov/2003/06/03.html#item573', 23 | pub_date=datetime(2003, 6, 3, 9, 39, 21, tzinfo=tzutc()), 24 | source=None, 25 | content_encoded=None 26 | ) 27 | item_2 = RSSItem( 28 | title=None, 29 | link=None, 30 | description='Sky watchers in Europe, Asia, and parts of Alaska and\n ' 31 | ' Canada will experience a partial\n eclipse of the ' 34 | 'Sun on Saturday, May 31st.', 35 | author=None, 36 | categories=[], 37 | comments=None, 38 | enclosures=[], 39 | guid='http://liftoff.msfc.nasa.gov/2003/05/30.html#item572', 40 | pub_date=datetime(2003, 5, 30, 11, 6, 42, tzinfo=tzutc()), 41 | source=None, 42 | content_encoded=None 43 | ) 44 | item_3 = RSSItem( 45 | title='The Engine That Does More', 46 | link='http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp', 47 | description='Before man 
travels to Mars, NASA hopes to design new\n ' 48 | ' engines that will let us fly through the ' 49 | 'Solar System more\n quickly.\n ' 50 | ' The proposed VASIMR engine would do that.', 51 | author=None, 52 | categories=[], 53 | comments=None, 54 | enclosures=[], 55 | guid='http://liftoff.msfc.nasa.gov/2003/05/27.html#item571', 56 | pub_date=datetime(2003, 5, 27, 8, 37, 32, tzinfo=tzutc()), 57 | source=None, 58 | content_encoded=None 59 | ) 60 | item_4 = RSSItem( 61 | title="Astronauts' Dirty Laundry", 62 | link='http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp', 63 | description='Compared to earlier spacecraft, the International Space\n' 64 | ' Station has many luxuries, but laundry ' 65 | 'facilities are not one of\n them. Instead,' 66 | ' astronauts have other options.', 67 | author=None, 68 | categories=[], 69 | comments=None, 70 | enclosures=[], 71 | guid='http://liftoff.msfc.nasa.gov/2003/05/20.html#item570', 72 | pub_date=datetime(2003, 5, 20, 8, 56, 2, tzinfo=tzutc()), 73 | source=None, 74 | content_encoded=None 75 | ) 76 | 77 | expected = RSSChannel( 78 | title='Liftoff News', 79 | link='http://liftoff.msfc.nasa.gov/', 80 | description='Liftoff to Space Exploration.', 81 | language='en-us', 82 | copyright=None, 83 | managing_editor='editor@example.com', 84 | web_master='webmaster@example.com', 85 | pub_date=datetime(2003, 6, 10, 4, 0, tzinfo=tzutc()), 86 | last_build_date=datetime(2003, 6, 10, 9, 41, 1, tzinfo=tzutc()), 87 | categories=[], 88 | generator='Weblog Editor 2.0', 89 | docs='http://blogs.law.harvard.edu/tech/rss', 90 | ttl=None, 91 | image=None, 92 | items=[item_1, item_2, item_3, item_4], 93 | content_encoded=None 94 | ) 95 | assert parse_rss_file('tests/rss/specification.xml') == expected 96 | -------------------------------------------------------------------------------- /tests/json_feed/test_specs.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from dateutil.tz import 
tzoffset 4 | 5 | from atoma.json_feed import ( 6 | parse_json_feed_file, JSONFeed, JSONFeedAuthor, JSONFeedItem 7 | ) 8 | 9 | content_html = ( 10 | '

We —\xa0Manton Reece and Brent Simmons —\xa0have noticed that JSON ' 11 | 'has become the developers’ choice for APIs, and that developers will ' 12 | 'often go out of their way to avoid XML. JSON is simpler to read and ' 13 | 'write, and it’s less prone to bugs.

\n\n

So we developed JSON Feed, ' 14 | 'a format similar to ' 15 | 'RSS and Atom but ' 16 | 'in JSON. It reflects the lessons learned from our years of work reading ' 17 | 'and publishing feeds.

\n\n

' 18 | 'See the spec. It’s at version 1, which may be the only version ever ' 19 | 'needed. If future versions are needed, version 1 feeds will still be ' 20 | 'valid feeds.

\n\n

Notes

\n\n

We have a WordPress plugin and, coming soon, a ' 22 | 'JSON Feed Parser for Swift. As more code is written, by us and others, ' 23 | 'we’ll update the code page.

' 24 | '\n\n

See Mapping RSS ' 25 | 'and Atom to JSON Feed for more on the similarities between the ' 26 | 'formats.

\n\n

This website —\xa0the Markdown files and supporting ' 27 | 'resources —\xa0is up ' 28 | 'on GitHub, and you’re welcome to comment there.

\n\n

This ' 29 | 'website is also a blog, and you can subscribe to the RSS feed or the JSON feed (if your reader supports it).' 32 | '

\n\n

We worked with a number of people on this over the course of ' 33 | 'several months. We list them, and thank them, at the bottom of the spec. But — most importantly — ' 35 | 'Craig Hockenberry spent a little time ' 36 | 'making it look pretty. :)

\n' 37 | ) 38 | 39 | 40 | def test_rfc_minimal(): 41 | expect = JSONFeed( 42 | version='https://jsonfeed.org/version/1', 43 | title='JSON Feed', 44 | home_page_url='https://jsonfeed.org/', 45 | feed_url='https://jsonfeed.org/feed.json', 46 | description='JSON Feed is a pragmatic syndication format for blogs, ' 47 | 'microblogs, and other time-based content.', 48 | user_comment='This feed allows you to read the posts from this site ' 49 | 'in any feed reader that supports the JSON Feed format. ' 50 | 'To add this feed to your reader, copy the following URL ' 51 | '— https://jsonfeed.org/feed.json — and add it your ' 52 | 'reader.', 53 | next_url=None, 54 | icon=None, 55 | favicon=None, 56 | author=JSONFeedAuthor( 57 | name='Brent Simmons and Manton Reece', 58 | url='https://jsonfeed.org/', 59 | avatar=None 60 | ), 61 | authors=[JSONFeedAuthor( 62 | name='Brent Simmons and Manton Reece', 63 | url='https://jsonfeed.org/', 64 | avatar=None 65 | )], 66 | language=None, 67 | expired=False, 68 | items=[JSONFeedItem( 69 | id_='https://jsonfeed.org/2017/05/17/announcing_json_feed', 70 | url='https://jsonfeed.org/2017/05/17/announcing_json_feed', 71 | external_url=None, 72 | title='Announcing JSON Feed', 73 | content_html=content_html, 74 | content_text=None, 75 | summary=None, 76 | image=None, 77 | banner_image=None, 78 | date_published=datetime.datetime(2017, 5, 17, 8, 2, 12, 79 | tzinfo=tzoffset(None, -25200)), 80 | date_modified=None, 81 | author=None, 82 | authors=[], 83 | language=None, 84 | tags=[], 85 | attachments=[])] 86 | ) 87 | assert parse_json_feed_file('tests/json_feed/jsonfeed.org.json') == expect 88 | -------------------------------------------------------------------------------- /tests/atom/test_unicode.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from dateutil.tz import tzutc 4 | 5 | from atoma.atom import ( 6 | AtomFeed, AtomEntry, AtomTextConstruct, AtomTextType, AtomPerson, AtomLink, 7 
| AtomGenerator, AtomCategory, parse_atom_file 8 | ) 9 | 10 | 11 | def test_rfc_unicode(): 12 | 13 | expected_entry_1 = AtomEntry( 14 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 15 | value='Article n°1'), 16 | id_='http://richard.plop/2017/6/5/article-1', 17 | updated=datetime.datetime(2017, 6, 5, 0, 0, tzinfo=tzutc()), 18 | authors=[ 19 | AtomPerson(name='Rïchàrd Plop', uri=None, email=None) 20 | ], 21 | contributors=[], 22 | links=[ 23 | AtomLink(href='http://richard.plop/2017/6/5/article-1', rel=None, 24 | type_=None, hreflang=None, title=None, length=None) 25 | ], 26 | categories=[], 27 | published=None, 28 | rights=None, 29 | summary=None, 30 | content=AtomTextConstruct(text_type=AtomTextType.html, lang=None, 31 | value='

'), 32 | source=None 33 | ) 34 | expected_entry_2 = AtomEntry( 35 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 36 | value='Unicode snowman'), 37 | id_='http://richard.plop/2017/6/5/article-2', 38 | updated=datetime.datetime(2016, 12, 29, 0, 0, tzinfo=tzutc()), 39 | authors=[ 40 | AtomPerson(name='Rïchàrd Plop', uri=None, email=None) 41 | ], 42 | contributors=[], 43 | links=[ 44 | AtomLink(href='http://richard.plop/2017/6/5/unicode-snowman', 45 | rel=None, type_=None, hreflang=None, title=None, 46 | length=None) 47 | ], 48 | categories=[], 49 | published=None, 50 | rights=None, 51 | summary=None, 52 | content=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 53 | value='☃'), 54 | source=None 55 | ) 56 | source_feed = AtomFeed( 57 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 58 | value='Example, Inc.'), 59 | id_='http://example.org/', 60 | updated=datetime.datetime(2003, 12, 13, 18, 30, 2, tzinfo=tzutc()), 61 | authors=[ 62 | AtomPerson(name='Foo Bar', uri=None, email=None) 63 | ], 64 | contributors=[], 65 | links=[], 66 | categories=[], 67 | generator=None, 68 | subtitle=None, 69 | rights=None, 70 | icon=None, 71 | logo=None, 72 | entries=[] 73 | ) 74 | expected_entry_3 = AtomEntry( 75 | title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 76 | value='Unicode snowman 3'), 77 | id_='http://richard.plop/2017/6/5/article-3', 78 | updated=datetime.datetime(2016, 12, 29, 0, 0, tzinfo=tzutc()), 79 | authors=[ 80 | AtomPerson(name='Foo Bar', uri=None, email=None) 81 | ], 82 | contributors=[], 83 | links=[ 84 | AtomLink(href='http://richard.plop/2017/6/5/unicode-snowman', 85 | rel=None, type_=None, hreflang=None, title=None, 86 | length=None) 87 | ], 88 | categories=[], 89 | published=None, 90 | rights=None, 91 | summary=None, 92 | content=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 93 | value='☃'), 94 | source=source_feed 95 | ) 96 | expected = AtomFeed( 97 | 
title=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 98 | value="Rïchàrd Plop's blog"), 99 | id_='http://richard.plop/feed.atom', 100 | updated=datetime.datetime(2017, 6, 5, 0, 0, tzinfo=tzutc()), 101 | authors=[], 102 | contributors=[], 103 | links=[ 104 | AtomLink(href='http://richard.plop/', rel=None, type_=None, 105 | hreflang=None, title=None, length=None), 106 | AtomLink(href='http://richard.plop/feed.atom', rel='self', 107 | type_=None, hreflang=None, title=None, length=None) 108 | ], 109 | categories=[ 110 | AtomCategory(term='python', scheme=None, label='Python') 111 | ], 112 | generator=AtomGenerator(name='Werkzeug', uri=None, version=None), 113 | subtitle=AtomTextConstruct(text_type=AtomTextType.text, lang=None, 114 | value="Rïchàrd Plop's personal blog."), 115 | rights=None, 116 | icon=None, 117 | logo=None, 118 | entries=[expected_entry_1, expected_entry_2, expected_entry_3] 119 | ) 120 | assert parse_atom_file('tests/atom/unicode.xml') == expected 121 | -------------------------------------------------------------------------------- /atoma/rss.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from io import BytesIO 3 | from typing import Optional, List 4 | from xml.etree.ElementTree import Element 5 | 6 | import attr 7 | 8 | from .utils import ( 9 | parse_xml, get_child, get_text, get_int, get_datetime, FeedParseError, 10 | try_parse_length 11 | ) 12 | 13 | 14 | @attr.s 15 | class RSSImage: 16 | url: str = attr.ib() 17 | title: Optional[str] = attr.ib() 18 | link: str = attr.ib() 19 | width: int = attr.ib() 20 | height: int = attr.ib() 21 | description: Optional[str] = attr.ib() 22 | 23 | 24 | @attr.s 25 | class RSSEnclosure: 26 | url: str = attr.ib() 27 | length: Optional[int] = attr.ib() 28 | type: Optional[str] = attr.ib() 29 | 30 | 31 | @attr.s 32 | class RSSSource: 33 | title: str = attr.ib() 34 | url: Optional[str] = attr.ib() 35 | 36 | 37 | @attr.s 38 | class 
RSSItem: 39 | title: Optional[str] = attr.ib() 40 | link: Optional[str] = attr.ib() 41 | description: Optional[str] = attr.ib() 42 | author: Optional[str] = attr.ib() 43 | categories: List[str] = attr.ib() 44 | comments: Optional[str] = attr.ib() 45 | enclosures: List[RSSEnclosure] = attr.ib() 46 | guid: Optional[str] = attr.ib() 47 | pub_date: Optional[datetime] = attr.ib() 48 | source: Optional[RSSSource] = attr.ib() 49 | 50 | # Extension 51 | content_encoded: Optional[str] = attr.ib() 52 | 53 | 54 | @attr.s 55 | class RSSChannel: 56 | title: Optional[str] = attr.ib() 57 | link: Optional[str] = attr.ib() 58 | description: Optional[str] = attr.ib() 59 | language: Optional[str] = attr.ib() 60 | copyright: Optional[str] = attr.ib() 61 | managing_editor: Optional[str] = attr.ib() 62 | web_master: Optional[str] = attr.ib() 63 | pub_date: Optional[datetime] = attr.ib() 64 | last_build_date: Optional[datetime] = attr.ib() 65 | categories: List[str] = attr.ib() 66 | generator: Optional[str] = attr.ib() 67 | docs: Optional[str] = attr.ib() 68 | ttl: Optional[int] = attr.ib() 69 | image: Optional[RSSImage] = attr.ib() 70 | 71 | items: List[RSSItem] = attr.ib() 72 | 73 | # Extension 74 | content_encoded: Optional[str] = attr.ib() 75 | 76 | 77 | def _get_image(element: Element, name, 78 | optional: bool=True) -> Optional[RSSImage]: 79 | child = get_child(element, name, optional) 80 | if child is None: 81 | return None 82 | 83 | return RSSImage( 84 | get_text(child, 'url', optional=False), 85 | get_text(child, 'title'), 86 | get_text(child, 'link', optional=False), 87 | get_int(child, 'width') or 88, 88 | get_int(child, 'height') or 31, 89 | get_text(child, 'description') 90 | ) 91 | 92 | 93 | def _get_source(element: Element, name, 94 | optional: bool=True) -> Optional[RSSSource]: 95 | child = get_child(element, name, optional) 96 | if child is None: 97 | return None 98 | 99 | return RSSSource( 100 | child.text.strip(), 101 | child.attrib.get('url'), 102 | ) 103 | 104 | 105 
| def _get_enclosure(element: Element) -> RSSEnclosure: 106 | return RSSEnclosure( 107 | element.attrib['url'], 108 | try_parse_length(element.attrib.get('length')), 109 | element.attrib.get('type'), 110 | ) 111 | 112 | 113 | def _get_link(element: Element) -> Optional[str]: 114 | """Attempt to retrieve item link. 115 | 116 | Use the GUID as a fallback if it is a permalink. 117 | """ 118 | link = get_text(element, 'link') 119 | if link is not None: 120 | return link 121 | 122 | guid = get_child(element, 'guid') 123 | if guid is not None and guid.attrib.get('isPermaLink') == 'true': 124 | return get_text(element, 'guid') 125 | 126 | return None 127 | 128 | 129 | def _get_item(element: Element) -> RSSItem: 130 | root = element 131 | 132 | title = get_text(root, 'title') 133 | link = _get_link(root) 134 | description = get_text(root, 'description') 135 | author = get_text(root, 'author') 136 | categories = [e.text for e in root.findall('category')] 137 | comments = get_text(root, 'comments') 138 | enclosure = [_get_enclosure(e) for e in root.findall('enclosure')] 139 | guid = get_text(root, 'guid') 140 | pub_date = get_datetime(root, 'pubDate') 141 | source = _get_source(root, 'source') 142 | 143 | content_encoded = get_text(root, 'content:encoded') 144 | 145 | return RSSItem( 146 | title, 147 | link, 148 | description, 149 | author, 150 | categories, 151 | comments, 152 | enclosure, 153 | guid, 154 | pub_date, 155 | source, 156 | content_encoded 157 | ) 158 | 159 | 160 | def _parse_rss(root: Element) -> RSSChannel: 161 | rss_version = root.get('version') 162 | if rss_version != '2.0': 163 | raise FeedParseError('Cannot process RSS feed version "{}"' 164 | .format(rss_version)) 165 | 166 | root = root.find('channel') 167 | if root is None: 168 | raise FeedParseError('RSS does not have a channel') 169 | 170 | title = get_text(root, 'title') 171 | link = get_text(root, 'link') 172 | description = get_text(root, 'description') 173 | language = get_text(root, 'language') 
174 | copyright = get_text(root, 'copyright') 175 | managing_editor = get_text(root, 'managingEditor') 176 | web_master = get_text(root, 'webMaster') 177 | pub_date = get_datetime(root, 'pubDate') 178 | last_build_date = get_datetime(root, 'lastBuildDate') 179 | categories = [e.text for e in root.findall('category')] 180 | generator = get_text(root, 'generator') 181 | docs = get_text(root, 'docs') 182 | ttl = get_int(root, 'ttl') 183 | 184 | image = _get_image(root, 'image') 185 | items = [_get_item(e) for e in root.findall('item')] 186 | 187 | content_encoded = get_text(root, 'content:encoded') 188 | 189 | return RSSChannel( 190 | title, 191 | link, 192 | description, 193 | language, 194 | copyright, 195 | managing_editor, 196 | web_master, 197 | pub_date, 198 | last_build_date, 199 | categories, 200 | generator, 201 | docs, 202 | ttl, 203 | image, 204 | items, 205 | content_encoded 206 | ) 207 | 208 | 209 | def parse_rss_file(filename: str) -> RSSChannel: 210 | """Parse an RSS feed from a local XML file.""" 211 | root = parse_xml(filename).getroot() 212 | return _parse_rss(root) 213 | 214 | 215 | def parse_rss_bytes(data: bytes) -> RSSChannel: 216 | """Parse an RSS feed from a byte-string containing XML data.""" 217 | root = parse_xml(BytesIO(data)).getroot() 218 | return _parse_rss(root) 219 | -------------------------------------------------------------------------------- /atoma/simple.py: -------------------------------------------------------------------------------- 1 | """Simple API that abstracts away the differences between feed types.""" 2 | 3 | from datetime import datetime, timedelta 4 | import html 5 | import os 6 | from typing import Optional, List, Tuple 7 | import urllib.parse 8 | 9 | import attr 10 | 11 | from . 
import atom, rss, json_feed 12 | from .exceptions import ( 13 | FeedParseError, FeedDocumentError, FeedXMLError, FeedJSONError 14 | ) 15 | 16 | 17 | @attr.s 18 | class Attachment: 19 | link: str = attr.ib() 20 | mime_type: Optional[str] = attr.ib() 21 | title: Optional[str] = attr.ib() 22 | size_in_bytes: Optional[int] = attr.ib() 23 | duration: Optional[timedelta] = attr.ib() 24 | 25 | 26 | @attr.s 27 | class Article: 28 | id: str = attr.ib() 29 | title: Optional[str] = attr.ib() 30 | link: Optional[str] = attr.ib() 31 | content: str = attr.ib() 32 | published_at: Optional[datetime] = attr.ib() 33 | updated_at: Optional[datetime] = attr.ib() 34 | attachments: List[Attachment] = attr.ib() 35 | 36 | 37 | @attr.s 38 | class Feed: 39 | title: str = attr.ib() 40 | subtitle: Optional[str] = attr.ib() 41 | link: Optional[str] = attr.ib() 42 | updated_at: Optional[datetime] = attr.ib() 43 | articles: List[Article] = attr.ib() 44 | 45 | 46 | def _adapt_atom_feed(atom_feed: atom.AtomFeed) -> Feed: 47 | articles = list() 48 | for entry in atom_feed.entries: 49 | if entry.content is not None: 50 | content = entry.content.value 51 | elif entry.summary is not None: 52 | content = entry.summary.value 53 | else: 54 | content = '' 55 | published_at, updated_at = _get_article_dates(entry.published, 56 | entry.updated) 57 | # Find article link and attachments 58 | article_link = None 59 | attachments = list() 60 | for candidate_link in entry.links: 61 | if candidate_link.rel in ('alternate', None): 62 | article_link = candidate_link.href 63 | elif candidate_link.rel == 'enclosure': 64 | attachments.append(Attachment( 65 | title=_get_attachment_title(candidate_link.title, 66 | candidate_link.href), 67 | link=candidate_link.href, 68 | mime_type=candidate_link.type_, 69 | size_in_bytes=candidate_link.length, 70 | duration=None 71 | )) 72 | 73 | if entry.title is None: 74 | entry_title = None 75 | elif entry.title.text_type in (atom.AtomTextType.html, 76 | atom.AtomTextType.xhtml): 77 | 
entry_title = html.unescape(entry.title.value).strip() 78 | else: 79 | entry_title = entry.title.value 80 | 81 | articles.append(Article( 82 | entry.id_, 83 | entry_title, 84 | article_link, 85 | content, 86 | published_at, 87 | updated_at, 88 | attachments 89 | )) 90 | 91 | # Find feed link 92 | link = None 93 | for candidate_link in atom_feed.links: 94 | if candidate_link.rel == 'self': 95 | link = candidate_link.href 96 | break 97 | 98 | return Feed( 99 | atom_feed.title.value if atom_feed.title else atom_feed.id_, 100 | atom_feed.subtitle.value if atom_feed.subtitle else None, 101 | link, 102 | atom_feed.updated, 103 | articles 104 | ) 105 | 106 | 107 | def _adapt_rss_channel(rss_channel: rss.RSSChannel) -> Feed: 108 | articles = list() 109 | for item in rss_channel.items: 110 | attachments = [ 111 | Attachment(link=e.url, mime_type=e.type, size_in_bytes=e.length, 112 | title=_get_attachment_title(None, e.url), duration=None) 113 | for e in item.enclosures 114 | ] 115 | articles.append(Article( 116 | item.guid or item.link, 117 | item.title, 118 | item.link, 119 | item.content_encoded or item.description or '', 120 | item.pub_date, 121 | None, 122 | attachments 123 | )) 124 | 125 | if rss_channel.title is None and rss_channel.link is None: 126 | raise FeedParseError('RSS feed does not have a title nor a link') 127 | 128 | return Feed( 129 | rss_channel.title if rss_channel.title else rss_channel.link, 130 | rss_channel.description, 131 | rss_channel.link, 132 | rss_channel.pub_date, 133 | articles 134 | ) 135 | 136 | 137 | def _adapt_json_feed(json_feed: json_feed.JSONFeed) -> Feed: 138 | articles = list() 139 | for item in json_feed.items: 140 | attachments = [ 141 | Attachment(a.url, a.mime_type, 142 | _get_attachment_title(a.title, a.url), 143 | a.size_in_bytes, a.duration) 144 | for a in item.attachments 145 | ] 146 | articles.append(Article( 147 | item.id_, 148 | item.title, 149 | item.url, 150 | item.content_html or item.content_text or '', 151 | 
item.date_published, 152 | item.date_modified, 153 | attachments 154 | )) 155 | 156 | return Feed( 157 | json_feed.title, 158 | json_feed.description, 159 | json_feed.feed_url, 160 | None, 161 | articles 162 | ) 163 | 164 | 165 | def _get_article_dates(published_at: Optional[datetime], 166 | updated_at: Optional[datetime] 167 | ) -> Tuple[Optional[datetime], Optional[datetime]]: 168 | if published_at and updated_at: 169 | return published_at, updated_at 170 | 171 | if updated_at: 172 | return updated_at, None 173 | 174 | if published_at: 175 | return published_at, None 176 | 177 | return None, None 178 | 179 | 180 | def _get_attachment_title(attachment_title: Optional[str], link: str) -> str: 181 | if attachment_title: 182 | return attachment_title 183 | 184 | parsed_link = urllib.parse.urlparse(link) 185 | return os.path.basename(parsed_link.path) 186 | 187 | 188 | def _simple_parse(pairs, content) -> Feed: 189 | is_xml = True 190 | is_json = True 191 | for parser, adapter in pairs: 192 | try: 193 | return adapter(parser(content)) 194 | except FeedXMLError: 195 | is_xml = False 196 | except FeedJSONError: 197 | is_json = False 198 | except FeedParseError: 199 | continue 200 | 201 | if not is_xml and not is_json: 202 | raise FeedDocumentError('File is not a supported feed type') 203 | 204 | raise FeedParseError('File is not a valid supported feed') 205 | 206 | 207 | def simple_parse_file(filename: str) -> Feed: 208 | """Parse an Atom, RSS or JSON feed from a local file.""" 209 | pairs = ( 210 | (rss.parse_rss_file, _adapt_rss_channel), 211 | (atom.parse_atom_file, _adapt_atom_feed), 212 | (json_feed.parse_json_feed_file, _adapt_json_feed) 213 | ) 214 | return _simple_parse(pairs, filename) 215 | 216 | 217 | def simple_parse_bytes(data: bytes) -> Feed: 218 | """Parse an Atom, RSS or JSON feed from a byte-string containing data.""" 219 | pairs = ( 220 | (rss.parse_rss_bytes, _adapt_rss_channel), 221 | (atom.parse_atom_bytes, _adapt_atom_feed), 222 | 
(json_feed.parse_json_feed_bytes, _adapt_json_feed) 223 | ) 224 | return _simple_parse(pairs, data) 225 | -------------------------------------------------------------------------------- /atoma/json_feed.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import json 3 | from typing import Optional, List, Tuple 4 | 5 | import attr 6 | 7 | from .exceptions import FeedParseError, FeedJSONError 8 | from .utils import try_parse_date 9 | 10 | 11 | @attr.s 12 | class JSONFeedAuthor: 13 | 14 | name: Optional[str] = attr.ib() 15 | url: Optional[str] = attr.ib() 16 | avatar: Optional[str] = attr.ib() 17 | 18 | 19 | @attr.s 20 | class JSONFeedAttachment: 21 | 22 | url: str = attr.ib() 23 | mime_type: str = attr.ib() 24 | title: Optional[str] = attr.ib() 25 | size_in_bytes: Optional[int] = attr.ib() 26 | duration: Optional[timedelta] = attr.ib() 27 | 28 | 29 | @attr.s 30 | class JSONFeedItem: 31 | 32 | id_: str = attr.ib() 33 | url: Optional[str] = attr.ib() 34 | external_url: Optional[str] = attr.ib() 35 | title: Optional[str] = attr.ib() 36 | content_html: Optional[str] = attr.ib() 37 | content_text: Optional[str] = attr.ib() 38 | summary: Optional[str] = attr.ib() 39 | image: Optional[str] = attr.ib() 40 | banner_image: Optional[str] = attr.ib() 41 | date_published: Optional[datetime] = attr.ib() 42 | date_modified: Optional[datetime] = attr.ib() 43 | author: Optional[JSONFeedAuthor] = attr.ib() # Deprecated in JSON Feed 1.1 44 | authors: List[JSONFeedAuthor] = attr.ib() 45 | language: Optional[str] = attr.ib() 46 | tags: List[str] = attr.ib() 47 | attachments: List[JSONFeedAttachment] = attr.ib() 48 | 49 | 50 | @attr.s 51 | class JSONFeed: 52 | 53 | version: str = attr.ib() 54 | title: str = attr.ib() 55 | home_page_url: Optional[str] = attr.ib() 56 | feed_url: Optional[str] = attr.ib() 57 | description: Optional[str] = attr.ib() 58 | user_comment: Optional[str] = attr.ib() 59 | next_url: 
Optional[str] = attr.ib() 60 | icon: Optional[str] = attr.ib() 61 | favicon: Optional[str] = attr.ib() 62 | author: Optional[JSONFeedAuthor] = attr.ib() # Deprecated in JSON Feed 1.1 63 | authors: List[JSONFeedAuthor] = attr.ib() 64 | language: Optional[str] = attr.ib() 65 | expired: bool = attr.ib() 66 | 67 | items: List[JSONFeedItem] = attr.ib() 68 | 69 | 70 | def _get_items(root: dict) -> List[JSONFeedItem]: 71 | rv = [] 72 | items = root.get('items', []) 73 | if not items: 74 | return rv 75 | 76 | for item in items: 77 | rv.append(_get_item(item)) 78 | 79 | return rv 80 | 81 | 82 | def _get_item(item_dict: dict) -> JSONFeedItem: 83 | author, authors = _get_author(item_dict) 84 | return JSONFeedItem( 85 | id_=_get_text(item_dict, 'id', optional=False), 86 | url=_get_text(item_dict, 'url'), 87 | external_url=_get_text(item_dict, 'external_url'), 88 | title=_get_text(item_dict, 'title'), 89 | content_html=_get_text(item_dict, 'content_html'), 90 | content_text=_get_text(item_dict, 'content_text'), 91 | summary=_get_text(item_dict, 'summary'), 92 | image=_get_text(item_dict, 'image'), 93 | banner_image=_get_text(item_dict, 'banner_image'), 94 | date_published=_get_datetime(item_dict, 'date_published'), 95 | date_modified=_get_datetime(item_dict, 'date_modified'), 96 | author=author, 97 | authors=authors, 98 | language=_get_text(item_dict, 'language'), 99 | tags=_get_tags(item_dict, 'tags'), 100 | attachments=_get_attachments(item_dict, 'attachments') 101 | ) 102 | 103 | 104 | def _get_attachments(root, name) -> List[JSONFeedAttachment]: 105 | rv = list() 106 | for attachment_dict in root.get(name, []): 107 | rv.append(JSONFeedAttachment( 108 | _get_text(attachment_dict, 'url', optional=False), 109 | _get_text(attachment_dict, 'mime_type', optional=False), 110 | _get_text(attachment_dict, 'title'), 111 | _get_int(attachment_dict, 'size_in_bytes'), 112 | _get_duration(attachment_dict, 'duration_in_seconds') 113 | )) 114 | return rv 115 | 116 | 117 | def 
_get_tags(root, name) -> List[str]: 118 | tags = root.get(name, []) 119 | return [tag for tag in tags if isinstance(tag, str)] 120 | 121 | 122 | def _get_datetime(root: dict, name, optional: bool=True) -> Optional[datetime]: 123 | text = _get_text(root, name, optional) 124 | if text is None: 125 | return None 126 | 127 | return try_parse_date(text) 128 | 129 | 130 | def _get_expired(root: dict) -> bool: 131 | if root.get('expired') is True: 132 | return True 133 | 134 | return False 135 | 136 | 137 | def _get_author(root: dict) ->\ 138 | Tuple[Optional[JSONFeedAuthor], List[JSONFeedAuthor]]: 139 | """Retrieve the author/authors of a JSON Feed. 140 | 141 | In JSON Feed version 1.0, only a single author was available and is 142 | superseded in version 1.1 by the authors key. 143 | """ 144 | authors = root.get('authors', []) 145 | if not authors and root.get('author'): 146 | authors.append(root.get('author')) 147 | 148 | rv = [ 149 | JSONFeedAuthor( 150 | name=_get_text(author_dict, 'name'), 151 | url=_get_text(author_dict, 'url'), 152 | avatar=_get_text(author_dict, 'avatar'), 153 | ) 154 | for author_dict in authors 155 | ] 156 | try: 157 | return rv[0], rv 158 | except IndexError: 159 | return None, rv 160 | 161 | 162 | def _get_int(root: dict, name: str, optional: bool=True) -> Optional[int]: 163 | rv = root.get(name) 164 | if not optional and rv is None: 165 | raise FeedParseError('Could not parse feed: "{}" int is required but ' 166 | 'is empty'.format(name)) 167 | 168 | if optional and rv is None: 169 | return None 170 | 171 | if not isinstance(rv, int): 172 | raise FeedParseError('Could not parse feed: "{}" is not an int' 173 | .format(name)) 174 | 175 | return rv 176 | 177 | 178 | def _get_duration(root: dict, name: str, 179 | optional: bool=True) -> Optional[timedelta]: 180 | duration = _get_int(root, name, optional) 181 | if duration is None: 182 | return None 183 | 184 | return timedelta(seconds=duration) 185 | 186 | 187 | def _get_text(root: dict, name: 
str, optional: bool=True) -> Optional[str]: 188 | rv = root.get(name) 189 | if not optional and rv is None: 190 | raise FeedParseError('Could not parse feed: "{}" text is required but ' 191 | 'is empty'.format(name)) 192 | 193 | if optional and rv is None: 194 | return None 195 | 196 | if not isinstance(rv, str): 197 | raise FeedParseError('Could not parse feed: "{}" is not a string' 198 | .format(name)) 199 | 200 | return rv 201 | 202 | 203 | def parse_json_feed(root: dict) -> JSONFeed: 204 | author, authors = _get_author(root) 205 | return JSONFeed( 206 | version=_get_text(root, 'version', optional=False), 207 | title=_get_text(root, 'title', optional=False), 208 | home_page_url=_get_text(root, 'home_page_url'), 209 | feed_url=_get_text(root, 'feed_url'), 210 | description=_get_text(root, 'description'), 211 | user_comment=_get_text(root, 'user_comment'), 212 | next_url=_get_text(root, 'next_url'), 213 | icon=_get_text(root, 'icon'), 214 | favicon=_get_text(root, 'favicon'), 215 | author=author, 216 | authors=authors, 217 | language=_get_text(root, 'language'), 218 | expired=_get_expired(root), 219 | items=_get_items(root) 220 | ) 221 | 222 | 223 | def parse_json_feed_file(filename: str) -> JSONFeed: 224 | """Parse a JSON feed from a local json file.""" 225 | with open(filename) as f: 226 | try: 227 | root = json.load(f) 228 | except (json.decoder.JSONDecodeError, UnicodeDecodeError): 229 | raise FeedJSONError('Not a valid JSON document') 230 | 231 | return parse_json_feed(root) 232 | 233 | 234 | def parse_json_feed_bytes(data: bytes) -> JSONFeed: 235 | """Parse a JSON feed from a byte-string containing JSON data.""" 236 | try: 237 | root = json.loads(data) 238 | except (json.decoder.JSONDecodeError, UnicodeDecodeError): 239 | raise FeedJSONError('Not a valid JSON document') 240 | 241 | return parse_json_feed(root) 242 | -------------------------------------------------------------------------------- /atoma/atom.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import enum 3 | from io import BytesIO 4 | from typing import Optional, List 5 | from xml.etree.ElementTree import Element 6 | 7 | import attr 8 | 9 | from .utils import ( 10 | parse_xml, get_child, get_text, get_datetime, FeedParseError, ns, 11 | try_parse_length 12 | ) 13 | 14 | 15 | class AtomTextType(enum.Enum): 16 | text = "text" 17 | html = "html" 18 | xhtml = "xhtml" 19 | 20 | 21 | @attr.s 22 | class AtomTextConstruct: 23 | text_type: str = attr.ib() 24 | lang: Optional[str] = attr.ib() 25 | value: str = attr.ib() 26 | 27 | 28 | @attr.s 29 | class AtomEntry: 30 | title: AtomTextConstruct = attr.ib() 31 | id_: str = attr.ib() 32 | 33 | # Should be mandatory but many feeds use published instead 34 | updated: Optional[datetime] = attr.ib() 35 | 36 | authors: List['AtomPerson'] = attr.ib() 37 | contributors: List['AtomPerson'] = attr.ib() 38 | links: List['AtomLink'] = attr.ib() 39 | categories: List['AtomCategory'] = attr.ib() 40 | published: Optional[datetime] = attr.ib() 41 | rights: Optional[AtomTextConstruct] = attr.ib() 42 | summary: Optional[AtomTextConstruct] = attr.ib() 43 | content: Optional[AtomTextConstruct] = attr.ib() 44 | source: Optional['AtomFeed'] = attr.ib() 45 | 46 | 47 | @attr.s 48 | class AtomFeed: 49 | title: Optional[AtomTextConstruct] = attr.ib() 50 | id_: str = attr.ib() 51 | 52 | # Should be mandatory but many feeds do not include it 53 | updated: Optional[datetime] = attr.ib() 54 | 55 | authors: List['AtomPerson'] = attr.ib() 56 | contributors: List['AtomPerson'] = attr.ib() 57 | links: List['AtomLink'] = attr.ib() 58 | categories: List['AtomCategory'] = attr.ib() 59 | generator: Optional['AtomGenerator'] = attr.ib() 60 | subtitle: Optional[AtomTextConstruct] = attr.ib() 61 | rights: Optional[AtomTextConstruct] = attr.ib() 62 | icon: Optional[str] = attr.ib() 63 | logo: Optional[str] = attr.ib() 64 | 65 | entries: 
List[AtomEntry] = attr.ib() 66 | 67 | 68 | @attr.s 69 | class AtomPerson: 70 | name: str = attr.ib() 71 | uri: Optional[str] = attr.ib() 72 | email: Optional[str] = attr.ib() 73 | 74 | 75 | @attr.s 76 | class AtomLink: 77 | href: str = attr.ib() 78 | rel: Optional[str] = attr.ib() 79 | type_: Optional[str] = attr.ib() 80 | hreflang: Optional[str] = attr.ib() 81 | title: Optional[str] = attr.ib() 82 | length: Optional[int] = attr.ib() 83 | 84 | 85 | @attr.s 86 | class AtomCategory: 87 | term: str = attr.ib() 88 | scheme: Optional[str] = attr.ib() 89 | label: Optional[str] = attr.ib() 90 | 91 | 92 | @attr.s 93 | class AtomGenerator: 94 | name: str = attr.ib() 95 | uri: Optional[str] = attr.ib() 96 | version: Optional[str] = attr.ib() 97 | 98 | 99 | def _get_generator(element: Element, name, 100 | optional: bool=True) -> Optional[AtomGenerator]: 101 | child = get_child(element, name, optional) 102 | if child is None: 103 | return None 104 | 105 | return AtomGenerator( 106 | child.text.strip(), 107 | child.attrib.get('uri'), 108 | child.attrib.get('version'), 109 | ) 110 | 111 | 112 | def _get_text_construct(element: Element, name, 113 | optional: bool=True) -> Optional[AtomTextConstruct]: 114 | child = get_child(element, name, optional) 115 | if child is None: 116 | return None 117 | 118 | try: 119 | text_type = AtomTextType(child.attrib['type']) 120 | except KeyError: 121 | text_type = AtomTextType.text 122 | 123 | try: 124 | lang = child.lang 125 | except AttributeError: 126 | lang = None 127 | 128 | if child.text is None: 129 | if optional: 130 | return None 131 | 132 | raise FeedParseError( 133 | 'Could not parse atom feed: "{}" text is required but is empty' 134 | .format(name) 135 | ) 136 | 137 | return AtomTextConstruct( 138 | text_type, 139 | lang, 140 | child.text.strip() 141 | ) 142 | 143 | 144 | def _get_person(element: Element) -> Optional[AtomPerson]: 145 | try: 146 | return AtomPerson( 147 | get_text(element, 'feed:name', optional=False), 148 | 
get_text(element, 'feed:uri'), 149 | get_text(element, 'feed:email') 150 | ) 151 | except FeedParseError: 152 | return None 153 | 154 | 155 | def _get_link(element: Element) -> AtomLink: 156 | return AtomLink( 157 | element.attrib['href'], 158 | element.attrib.get('rel'), 159 | element.attrib.get('type'), 160 | element.attrib.get('hreflang'), 161 | element.attrib.get('title'), 162 | try_parse_length(element.attrib.get('length')) 163 | ) 164 | 165 | 166 | def _get_category(element: Element) -> AtomCategory: 167 | return AtomCategory( 168 | element.attrib['term'], 169 | element.attrib.get('scheme'), 170 | element.attrib.get('label'), 171 | ) 172 | 173 | 174 | def _get_entry(element: Element, 175 | default_authors: List[AtomPerson]) -> AtomEntry: 176 | root = element 177 | 178 | # Mandatory 179 | title = _get_text_construct(root, 'feed:title') 180 | id_ = get_text(root, 'feed:id') 181 | 182 | # Optional 183 | try: 184 | source = _parse_atom(get_child(root, 'feed:source', optional=False), 185 | parse_entries=False) 186 | except FeedParseError: 187 | source = None 188 | source_authors = [] 189 | else: 190 | source_authors = source.authors 191 | 192 | authors = [_get_person(e) 193 | for e in root.findall('feed:author', ns)] or default_authors 194 | authors = [a for a in authors if a is not None] 195 | authors = authors or default_authors or source_authors 196 | 197 | contributors = [_get_person(e) 198 | for e in root.findall('feed:contributor', ns) if e] 199 | contributors = [c for c in contributors if c is not None] 200 | 201 | links = [_get_link(e) for e in root.findall('feed:link', ns)] 202 | categories = [_get_category(e) for e in root.findall('feed:category', ns)] 203 | 204 | updated = get_datetime(root, 'feed:updated') 205 | published = get_datetime(root, 'feed:published') 206 | rights = _get_text_construct(root, 'feed:rights') 207 | summary = _get_text_construct(root, 'feed:summary') 208 | content = _get_text_construct(root, 'feed:content') 209 | 210 | return 
AtomEntry( 211 | title, 212 | id_, 213 | updated, 214 | authors, 215 | contributors, 216 | links, 217 | categories, 218 | published, 219 | rights, 220 | summary, 221 | content, 222 | source 223 | ) 224 | 225 | 226 | def _parse_atom(root: Element, parse_entries: bool=True) -> AtomFeed: 227 | # Mandatory 228 | id_ = get_text(root, 'feed:id', optional=False) 229 | 230 | # Optional 231 | title = _get_text_construct(root, 'feed:title') 232 | updated = get_datetime(root, 'feed:updated') 233 | authors = [_get_person(e) 234 | for e in root.findall('feed:author', ns) if e] 235 | authors = [a for a in authors if a is not None] 236 | contributors = [_get_person(e) 237 | for e in root.findall('feed:contributor', ns) if e] 238 | contributors = [c for c in contributors if c is not None] 239 | links = [_get_link(e) 240 | for e in root.findall('feed:link', ns)] 241 | categories = [_get_category(e) 242 | for e in root.findall('feed:category', ns)] 243 | 244 | generator = _get_generator(root, 'feed:generator') 245 | subtitle = _get_text_construct(root, 'feed:subtitle') 246 | rights = _get_text_construct(root, 'feed:rights') 247 | icon = get_text(root, 'feed:icon') 248 | logo = get_text(root, 'feed:logo') 249 | 250 | if parse_entries: 251 | entries = [_get_entry(e, authors) 252 | for e in root.findall('feed:entry', ns)] 253 | else: 254 | entries = [] 255 | 256 | atom_feed = AtomFeed( 257 | title, 258 | id_, 259 | updated, 260 | authors, 261 | contributors, 262 | links, 263 | categories, 264 | generator, 265 | subtitle, 266 | rights, 267 | icon, 268 | logo, 269 | entries 270 | ) 271 | return atom_feed 272 | 273 | 274 | def parse_atom_file(filename: str) -> AtomFeed: 275 | """Parse an Atom feed from a local XML file.""" 276 | root = parse_xml(filename).getroot() 277 | return _parse_atom(root) 278 | 279 | 280 | def parse_atom_bytes(data: bytes) -> AtomFeed: 281 | """Parse an Atom feed from a byte-string containing XML data.""" 282 | root = parse_xml(BytesIO(data)).getroot() 283 | return 
_parse_atom(root) 284 | --------------------------------------------------------------------------------