├── test_requirements.txt
├── .gitignore
├── .coveragerc
├── tox.ini
├── HISTORY.rst
├── src
    └── ao3
    │   ├── utils.py
    │   ├── __init__.py
    │   ├── users.py
    │   └── works.py
├── LICENSE
├── tests
    └── test_utils.py
├── setup.py
├── examples
    └── kudos_to_pinboard.py
└── README.rst


/test_requirements.txt:
--------------------------------------------------------------------------------
1 | coverage
2 | pytest
3 | pytest-cov
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python detritus
 2 | *.egg-info
 3 | build
 4 | dist
 5 | 
 6 | # Test files
 7 | .cache
 8 | .coverage
 9 | .tox
10 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | include =
4 |     **/.tox/*/lib/*/site-packages/ao3/*.py
5 | 
6 | [report]
7 | show_missing = True
8 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27, py33, py34, py35, py36, pypy, lint
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     -r{toxinidir}/test_requirements.txt
 7 | commands =
 8 |     coverage run -m py.test {posargs} {toxinidir}/tests/
 9 |     coverage report
10 | 
11 | [testenv:pypy]
12 | commands = py.test {posargs} {toxinidir}/tests/
13 | 
14 | [testenv:lint]
15 | basepython = python3.6
16 | deps = flake8
17 | commands = flake8 --max-complexity 10 src tests
18 | 


--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | Release History
 2 | ===============
 3 | 
 4 | 0.2.0 (15 January 2017)
 5 | ***********************
 6 | 
 7 | - This changes the ``reading_history()`` method to include the date when a
 8 |   user last read the item.  This opens the door to caching the reading history,
 9 |   not fetching the whole thing on every run.
10 | 
11 | 0.1.2 (14 January 2017)
12 | ***********************
13 | 
14 | - This should have been 0.1.0 (actually includes ``ao3.utils``).
15 | 
16 | 0.1.0 (14 January 2017)
17 | ***********************
18 | 
19 | - First PyPI release.
20 | 


--------------------------------------------------------------------------------
/src/ao3/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8
 2 | """Utility functions."""
 3 | 
 4 | import re
 5 | 
 6 | # Regex for extracting the work ID from an AO3 URL.  Designed to match URLs
 7 | # of the form
 8 | #
 9 | #     https://archiveofourown.org/works/1234567
10 | #     http://archiveofourown.org/works/1234567
11 | #
WORK_URL_REGEX = re.compile(
    # The dot is escaped so that only the literal host name matches.
    r'^https?://archiveofourown\.org/works/'
    r'(?P<work_id>[0-9]+)'
)


def work_id_from_url(url):
    """Given an AO3 URL, return the work ID.

    :param url: a URL of the form ``http(s)://archiveofourown.org/works/<id>``;
        anything after the numeric ID (e.g. a query string) is ignored.
    :returns: the work ID, as a string of digits.
    :raises RuntimeError: if ``url`` is not a recognised AO3 work URL.
    """
    match = WORK_URL_REGEX.match(url)
    if match is None:
        # Wrap the URL in a 1-tuple so the %-formatting is safe for any value.
        raise RuntimeError('%r is not a recognised AO3 work URL' % (url,))
    return match.group('work_id')
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Alex Chan
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/src/ao3/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8
 2 | 
 3 | import requests
 4 | from . import utils
 5 | from .users import User
 6 | from .works import Work
 7 | 
 8 | class AO3(object):
 9 |     """A scraper for the Archive of Our Own (AO3)."""
10 | 
11 |     def __init__(self):
12 |         self.user = None
13 |         self.session = requests.Session()
14 | 
15 |     def __repr__(self):
16 |         return '%s()' % (type(self).__name__)
17 | 
18 |     def work(self, id):
19 |         """Look up a work that's been posted to AO3.
20 | 
21 |         :param id: the work ID.  In the URL to a work, this is the number.
22 |             e.g. the work ID of http://archiveofourown.org/works/1234 is 1234.
23 |         """
24 |         return Work(id=id, sess=self.session)
25 | 
26 |     def login(self, username, password):
27 |         """Log in to the archive.
28 | 
29 |         This allows you to access pages that are only available while
30 |         logged in.  This doesn't do any checking that the password is correct.
31 | 
32 |         """
33 |         self.user = User(username=username, password=password, sess=self.session)
34 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8
 2 | """Tests for ao3.utils."""
 3 | 
 4 | import pytest
 5 | 
 6 | from ao3 import utils
 7 | 
 8 | 
 9 | @pytest.mark.parametrize('url, work_id', [
10 |     ('https://archiveofourown.org/works/1', '1'),
11 |     ('https://archiveofourown.org/works/1234567', '1234567'),
12 |     ('https://archiveofourown.org/works/1?view_adult=true', '1'),
13 |     ('https://archiveofourown.org/works/1234567?view_adult=true', '1234567'),
14 |     ('http://archiveofourown.org/works/1?view_adult=true', '1'),
15 |     ('http://archiveofourown.org/works/1234567?view_adult=true', '1234567'),
16 | ])
17 | def test_work_id_from_url(url, work_id):
18 |     assert utils.work_id_from_url(url) == work_id
19 | 
20 | 
@pytest.mark.parametrize('bad_url', [
    'http://google.co.uk',
    'http://archiveofourown.org/users/username',
])
def test_work_id_from_bad_url_raises_runtimeerror(bad_url):
    """Trying to get a work ID from a non-work URL raises a RuntimeError."""
    with pytest.raises(RuntimeError) as exc:
        utils.work_id_from_url(bad_url)
    # Exceptions have no ``.message`` attribute on Python 3; ``str()`` works
    # on every interpreter listed in tox.ini.
    assert 'not a recognised AO3 work URL' in str(exc.value)
30 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8 -*-
 3 | 
 4 | import codecs
 5 | import os
 6 | 
 7 | from setuptools import find_packages, setup
 8 | 
 9 | 
def local_file(name):
    """Return the path of ``name``, a file that sits next to this script.

    The result is expressed relative to the current working directory.
    """
    script_dir = os.path.dirname(__file__)
    joined = os.path.join(script_dir, name)
    return os.path.relpath(joined)
12 | 
13 | 
SOURCE = local_file('src')
README = local_file('README.rst')

# Read the README through a context manager so the file handle is closed
# deterministically rather than left for the garbage collector.
with codecs.open(README, encoding='utf-8') as readme_file:
    long_description = readme_file.read()


setup(
    name='ao3',
    version='0.2.0',
    description='A Python API for scraping AO3 (the Archive of Our Own)',
    long_description=long_description,
    url='https://github.com/alexwlchan/ao3',
    author='Alex Chan',
    author_email='alex@alexwlchan.net',
    license='MIT',

    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Other Audience',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
    # The package lives under src/, so both discovery and the package root
    # point there.
    packages=find_packages(SOURCE),
    package_dir={'': SOURCE},
    install_requires=[
        'beautifulsoup4>=4.5.3, <5',
        'requests>=2.12.4, <3',
    ],
)
49 | 


--------------------------------------------------------------------------------
/examples/kudos_to_pinboard.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8
 3 | """
 4 | A script for syncing kudos from AO3 to Pinboard.
 5 | 
 6 | You need the ``ao3`` module installed to use this script.  Original idea from
 7 | https://twitter.com/anatsuno/status/427177496875122688
 8 | 
 9 | To use this script:
10 | 
11 | 1.  Enable Viewing History on AO3
12 |     (My Preferences > Preferences > Misc > Turn on Viewing History)
13 | 
14 | 2.  Fill in your AO3 and Pinboard credentials below.
15 | 
16 | 3.  Run the script: ``python kudos_to_pinboard.py``.  Any items on AO3 that
17 |     you've read in the last seven days, and where you've left kudos, will
18 |     be added to Pinboard if you don't already have a bookmark.
19 | 
20 |     The new bookmarks are tagged `ao3_kudos_sync`.
21 | 
22 | """
23 | 
24 | from datetime import datetime, timedelta
25 | 
26 | from ao3 import AO3
27 | from ao3.works import RestrictedWork
28 | import requests
29 | 
30 | 
# AO3 login credentials
AO3_USERNAME = '<USERNAME>'
AO3_PASSWORD = '<PASSWORD>'

# Pinboard API token.  https://pinboard.in/settings/password
PINBOARD_API_TOKEN = '<API_TOKEN>'


def main():
    """Bookmark on Pinboard every work read in the last week that has kudos.

    Logs in to AO3, walks the reading history, and for each work the user
    left kudos on, asks the Pinboard API to add a bookmark tagged
    ``ao3_kudos_sync``.  Existing bookmarks are not replaced.
    """
    api = AO3()
    api.login(username=AO3_USERNAME, password=AO3_PASSWORD)

    # Work out the cutoff once, so every entry is compared to the same date.
    cutoff = (datetime.now() - timedelta(days=7)).date()

    for work_id, last_read in api.user.reading_history():
        # Stop at the first entry older than the cutoff.
        # NOTE(review): this assumes the history is listed most recent
        # first -- confirm against AO3.
        if last_read < cutoff:
            break
        try:
            work = api.work(id=work_id)
        except RestrictedWork:
            print('Skipping %s as a restricted work' % work_id)
            continue
        if api.user.username not in work.kudos_left_by:
            continue

        title = '%s - %s - %s [Archive of Our Own]' % (
            work.title, work.author, work.fandoms[0])
        print('Saving %s to Pinboard...' % work.url)
        resp = requests.get('https://api.pinboard.in/v1/posts/add', params={
            'url': work.url,
            'description': title,
            'tags': 'ao3_kudos_sync',
            'replace': 'no',
            'auth_token': PINBOARD_API_TOKEN,
            'format': 'json',
        })
        # Report a failed request instead of ignoring it, then carry on so
        # one failure does not abort the whole sync.
        if resp.status_code != 200:
            print('Failed to save %s to Pinboard (HTTP %s)' % (
                work.url, resp.status_code))


if __name__ == '__main__':
    main()
67 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | **Maintenance note, 19 July 2020:** This isn't actively maintained, and it hasn't been for a long time.
  2 | I created this library/repo to accompany a `blog post I wrote in 2017 <https://alexwlchan.net/2017/01/scrape-logged-in-ao3/>`_, but I haven't looked at it much since then and I don't have much time for open source these days.
  3 | 
  4 | FWIW, if I were to work on this again, I'd start by decoupling the HTML parsing and the I/O logic (see my PyCon UK talk about `sans I/O programming <https://alexwlchan.net/2019/10/sans-io-programming/>`_).
  5 | 
  6 | I hope this repo serves as a useful pointer, but don't expect updates any time soon.
  7 | 
  8 | ----
  9 | 
 10 | ao3.py
 11 | ======
 12 | 
 13 | This Python package provides a scripted interface to some of the data on
 14 | `AO3 <https://archiveofourown.org/>`_ (the Archive of Our Own).
 15 | 
 16 | It is **not** an official API.
 17 | 
 18 | Motivation
 19 | **********
 20 | 
 21 | I want to be able to write Python scripts that use data from AO3.
 22 | 
 23 | An official API for AO3 data has been `on the roadmap <http://archiveofourown.org/admin_posts/295>`_
 24 | for a couple of years.  Until that appears, I've cobbled together my own
 25 | page-scraping code that does the job.  It's a bit messy and fragile, but it
 26 | seems to work most of the time.
 27 | 
 28 | If/when we get the proper API, I'd drop this in a heartbeat and do it
 29 | properly.
 30 | 
 31 | Installation
 32 | ************
 33 | 
 34 | Install using pip:
 35 | 
 36 | .. code-block:: console
 37 | 
 38 |    $ pip install ao3
 39 | 
 40 | I'm trying to support Python 2.7, Python 3.3+ and PyPy.
 41 | 
 42 | Usage
 43 | *****
 44 | 
 45 | Create an API instance:
 46 | 
 47 | .. code-block:: pycon
 48 | 
 49 |    >>> from ao3 import AO3
 50 |    >>> api = AO3()
 51 | 
 52 | Looking up information about a work
 53 | -----------------------------------
 54 | 
 55 | Getting a work:
 56 | 
 57 | .. code-block:: pycon
 58 | 
 59 |    >>> work = api.work(id='258626')
 60 | 
 61 | The ``id`` is the numeric portion of the URL.  For example, the work ID of
 62 | ``https://archiveofourown.org/works/258626`` is ``258626``.
 63 | 
 64 | Get a URL:
 65 | 
 66 | .. code-block:: pycon
 67 | 
 68 |    >>> work.url
 69 |    'https://archiveofourown.org/works/258626'
 70 | 
 71 | You can then look up a number of attributes, similar to the Stats panel at the
 72 | top of a page.  Here's the full set you can look up:
 73 | 
 74 | .. code-block:: pycon
 75 | 
 76 |    >>> work.title
 77 |    'The Morning After'
 78 | 
 79 |    >>> work.author
 80 |    'ambyr'
 81 | 
 82 |    >>> work.summary
 83 |    "<p>Delicious just can't understand why it's the shy, quiet ones who get all the girls.</p>"
 84 | 
 85 |    >>> work.rating
 86 |    ['Teen And Up Audiences']
 87 | 
 88 |    >>> work.warnings
 89 |    []
 90 | 
 91 | (An empty list is synonymous with "No Archive Warnings", so that it's a falsey
 92 | value.)
 93 | 
 94 | .. code-block:: pycon
 95 | 
 96 |    >>> work.category
 97 |    ['F/M']
 98 | 
 99 |    >>> work.fandoms
100 |    ['Anthropomorfic - Fandom']
101 | 
102 |    >>> work.relationship
103 |    ['Pinboard/Fandom']
104 | 
105 |    >>> work.characters
106 |    ['Pinboard', 'Delicious - Character', 'Diigo - Character']
107 | 
108 |    >>> work.additional_tags
109 |    ['crackfic', 'Meta', 'so very not my usual thing']
110 | 
111 |    >>> work.language
112 |    'English'
113 | 
114 |    >>> work.published
115 |    datetime.date(2011, 9, 29)
116 | 
117 |    >>> work.words
118 |    605
119 | 
120 |    >>> work.comments
121 |    122
122 | 
123 |    >>> work.kudos
124 |    1238
125 | 
126 |    >>> for name in work.kudos_left_by:
127 |    ...     print(name)
128 |    ...
129 |    winterbelles
130 |    AnonEhouse
131 |    SailAweigh
132 |    # and so on
133 | 
134 |    >>> work.bookmarks
135 |    99
136 | 
137 |    >>> work.hits
138 |    43037
139 | 
140 | There's also a method for dumping all the information about a work into JSON,
141 | for easy export/passing into other places:
142 | 
143 | .. code-block:: pycon
144 | 
145 |    >>> work.json()
146 |    '{"rating": ["Teen And Up Audiences"], "fandoms": ["Anthropomorfic - Fandom"], "characters": ["Pinboard", "Delicious - Character", "Diigo - Character"], "language": "English", "additional_tags": ["crackfic", "Meta", "so very not my usual thing"], "warnings": [], "id": "258626", "stats": {"hits": 43037, "words": 605, "bookmarks": 99, "comments": 122, "published": "2011-09-29", "kudos": 1238}, "author": "ambyr", "category": ["F/M"], "title": "The Morning After", "relationship": ["Pinboard/Fandom"], "summary": "<p>Delicious just can\'t understand why it\'s the shy, quiet ones who get all the girls.</p>"}'
147 | 
148 | Looking up your account
149 | -----------------------
150 | 
151 | If you have an account on AO3, you can log in to access pages that aren't
152 | available to the public:
153 | 
154 | .. code-block:: pycon
155 | 
156 |    >>> api.login('username', 'password')
157 | 
158 | If you have Viewing History enabled, you can get a list of work IDs from 
159 | that history, like so:
160 | 
161 | .. code-block:: pycon
162 | 
163 |    >>> for entry in api.user.reading_history():
164 |    ...     print(entry.work_id)
165 |    ...
166 |    '123'
167 |    '456'
168 |    '789'
169 |    # and so on
170 | 
171 | You can enable Viewing History in the settings pane.
172 | 
173 | One interesting side effect of this is that you can use it to get a list
174 | of works where you've left kudos:
175 | 
176 | .. code-block:: python
177 | 
178 |    from ao3 import AO3
179 |    from ao3.works import RestrictedWork
180 | 
181 |    api = AO3()
182 |    api.login('username', 'password')
183 | 
184 |    for entry in api.user.reading_history():
185 |        try:
186 |            work = api.work(id=entry.work_id)
187 |        except RestrictedWork:
188 |            continue
189 |        print(work.url + '... ', end='')
190 |        if api.user.username in work.kudos_left_by:
191 |            print('yes')
192 |        else:
193 |            print('no')
194 | 
195 | Warning: this is `very` slow.  It has to go back and load a page for everything
196 | you've ever read.  Don't use this if you're on a connection with limited
197 | bandwidth.
198 | 
199 | This doesn't include "restricted" works -- works that require you to be a
200 | logged-in user to see them.
201 | 
202 | (The reading page tells you when you last read something.  If you cached the
203 | results, and then subsequent runs only rechecked fics you'd read since the
204 | last run, you could make this quite efficient.  Exercise for the reader.)
205 | 
206 | Looking up your bookmarks
207 | -------------------------
208 | 
209 | If you log in as a user, you can look up the bookmarks for that user. You can
210 | get the bookmarks as a list of AO3 ID numbers or as a list of work objects.
211 | 
212 | Warning: This is very slow, as the API has to go back and retrieve every
213 | page.
214 | 
215 | Get the bookmarks as works:
216 | 
217 | .. code-block:: pycon
218 | 
219 |    >>> for bookmark in api.user.bookmarks():
220 |    ...     print(bookmark.title)
221 |    ...
222 |    'Story Name'
223 |    'Fanfiction Title'
224 |    'Read This Fic'
225 |    # and so on
226 | 
227 | Get the bookmarks as a list of id numbers:
228 | 
229 | .. code-block:: pycon
230 | 
231 |    >>> for bookmark_id in api.user.bookmarks_ids():
232 |    ...     print(bookmark_id)
233 |    ...
234 |    '123'
235 |    '456'
236 |    '789'
237 |    # and so on
238 | 
239 | 
240 | 
241 | License
242 | *******
243 | 
244 | The project is licensed under the MIT license.
245 | 


--------------------------------------------------------------------------------
/src/ao3/users.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8
  2 | 
  3 | from datetime import datetime
  4 | import collections
  5 | import itertools
  6 | import re
  7 | 
  8 | from bs4 import BeautifulSoup, Tag
  9 | import requests
 10 | 
 11 | from .works import Work
 12 | 
 13 | 
 14 | ReadingHistoryItem = collections.namedtuple(
 15 |     'ReadingHistoryItem', ['work_id', 'last_read'])
 16 | 
 17 | 
 18 | class User(object):
 19 | 
 20 |     def __init__(self, username, password, sess=None):
 21 |         self.username = username
 22 | 
 23 |         if sess == None:
 24 |             sess = requests.Session()
 25 | 
 26 |         req = sess.get('https://archiveofourown.org')
 27 |         soup = BeautifulSoup(req.text, features='html.parser')
 28 | 
 29 |         authenticity_token = soup.find('input', {'name': 'authenticity_token'})['value']
 30 | 
 31 |         req = sess.post('https://archiveofourown.org/user_sessions', params={
 32 |             'authenticity_token': authenticity_token,
 33 |             'user_session[login]': username,
 34 |             'user_session[password]': password,
 35 |         })
 36 | 
 37 |         # Unfortunately AO3 doesn't use HTTP status codes to communicate
 38 |         # results -- it's a 200 even if the login fails.
 39 |         if 'Please try again' in req.text:
 40 |             raise RuntimeError(
 41 |                 'Error logging in to AO3; is your password correct?')
 42 | 
 43 |         self.sess = sess
 44 |     def __repr__(self):
 45 |         return '%s(username=%r)' % (type(self).__name__, self.username)
 46 | 
 47 |     def bookmarks_ids(self):
 48 |         """
 49 |         Returns a list of the user's bookmarks' ids. Ignores external work bookmarks.
 50 | 
 51 |         User must be logged in to see private bookmarks.
 52 |         """
 53 | 
 54 |         api_url = (
 55 |             'https://archiveofourown.org/users/%s/bookmarks?page=%%d'
 56 |             % self.username)
 57 | 
 58 |         bookmarks = []
 59 | 
 60 |         num_works = 0
 61 |         for page_no in itertools.count(start=1):
 62 |             # print("Finding page: \t" + str(page_no) + " of bookmarks. \t" + str(num_works) + " bookmarks ids found.")
 63 | 
 64 |             req = self.sess.get(api_url % page_no)
 65 |             soup = BeautifulSoup(req.text, features='html.parser')
 66 | 
 67 |             # The entries are stored in a list of the form:
 68 |             #
 69 |             #     <ol class="bookmark index group">
 70 |             #       <li id="bookmark_12345" class="bookmark blurb group" role="article">
 71 |             #         ...
 72 |             #       </li>
 73 |             #       <li id="bookmark_67890" class="bookmark blurb group" role="article">
 74 |             #         ...
 75 |             #       </li>
 76 |             #       ...
 77 |             #     </o
 78 | 
 79 |             ol_tag = soup.find('ol', attrs={'class': 'bookmark'})
 80 |             
 81 | 
 82 |             for li_tag in ol_tag.findAll('li', attrs={'class': 'blurb'}):
 83 |                 num_works = num_works + 1
 84 |                 try:
 85 |                     # <h4 class="heading">
 86 |                     #     <a href="/works/12345678">Work Title</a>
 87 |                     #     <a href="/users/authorname/pseuds/authorpseud" rel="author">Author Name</a>
 88 |                     # </h4>
 89 | 
 90 |                     for h4_tag in li_tag.findAll('h4', attrs={'class': 'heading'}):
 91 |                         for link in h4_tag.findAll('a'):
 92 |                             if ('works' in link.get('href')) and not ('external_works' in link.get('href')):
 93 |                                 work_id = link.get('href').replace('/works/', '')
 94 |                                 bookmarks.append(work_id)
 95 |                 except KeyError:
 96 |                     # A deleted work shows up as
 97 |                     #
 98 |                     #      <li class="deleted reading work blurb group">
 99 |                     #
100 |                     # There's nothing that we can do about that, so just skip
101 |                     # over it.
102 |                     if 'deleted' in li_tag.attrs['class']:
103 |                         pass
104 |                     else:
105 |                         raise
106 | 
107 | 
108 |             # The pagination button at the end of the page is of the form
109 |             #
110 |             #     <li class="next" title="next"> ... </li>
111 |             #
112 |             # If there's another page of results, this contains an <a> tag
113 |             # pointing to the next page.  Otherwise, it contains a <span>
114 |             # tag with the 'disabled' class.
115 |             try:
116 |                 next_button = soup.find('li', attrs={'class': 'next'})
117 |                 if next_button.find('span', attrs={'class': 'disabled'}):
118 |                     break
119 |             except:
120 |                 # In case of absence of "next"
121 |                 break
122 | 
123 |         return bookmarks
124 | 
125 |     def bookmarks(self):
126 |         """
127 |         Returns a list of the user's bookmarks as Work objects.
128 | 
129 |         Takes forever.
130 | 
131 |         User must be logged in to see private bookmarks.
132 |         """
133 | 
134 |         bookmark_total = 0
135 |         bookmark_ids = self.bookmarks_ids()
136 |         bookmarks = []
137 | 
138 |         for bookmark_id in bookmark_ids:
139 |             work = Work(bookmark_id, self.sess)
140 |             bookmarks.append(work)
141 | 
142 |             bookmark_total = bookmark_total + 1
143 |             # print (str(bookmark_total) + "\t bookmarks found.")
144 | 
145 |         return bookmarks
146 | 
147 |     def reading_history(self):
148 |         """Returns a list of articles in the user's reading history.
149 | 
150 |         This requires the user to turn on the Viewing History feature.
151 | 
152 |         This generates a series of ``ReadingHistoryItem`` instances,
153 |         a 2-tuple ``(work_id, last_read)``.
154 |         """
155 |         # TODO: What happens if you don't have this feature enabled?
156 | 
157 |         # URL for the user's reading history page
158 |         api_url = (
159 |             'https://archiveofourown.org/users/%s/readings?page=%%d' %
160 |             self.username)
161 | 
162 |         for page_no in itertools.count(start=1):
163 |             req = self.sess.get(api_url % page_no)
164 |             soup = BeautifulSoup(req.text, features='html.parser')
165 | 
166 |             # The entries are stored in a list of the form:
167 |             #
168 |             #     <ol class="reading work index group">
169 |             #       <li id="work_12345" class="reading work blurb group">
170 |             #         ...
171 |             #       </li>
172 |             #       <li id="work_67890" class="reading work blurb group">
173 |             #         ...
174 |             #       </li>
175 |             #       ...
176 |             #     </ol>
177 |             #
178 |             ol_tag = soup.find('ol', attrs={'class': 'reading'})
179 |             for li_tag in ol_tag.findAll('li', attrs={'class': 'blurb'}):
180 |                 try:
181 |                     work_id = li_tag.attrs['id'].replace('work_', '')
182 | 
183 |                     # Within the <li>, the last viewed date is stored as
184 |                     #
185 |                     #     <h4 class="viewed heading">
186 |                     #         <span>Last viewed:</span> 24 Dec 2012
187 |                     #
188 |                     #         (Latest version.)
189 |                     #
190 |                     #         Viewed once
191 |                     #     </h4>
192 |                     #
193 |                     h4_tag = li_tag.find('h4', attrs={'class': 'viewed'})
194 |                     date_str = re.search(
195 |                         r'[0-9]{1,2} [A-Z][a-z]+ [0-9]{4}',
196 |                         h4_tag.contents[2]).group(0)
197 |                     date = datetime.strptime(date_str, '%d %b %Y').date()
198 | 
199 |                     yield work_id, date
200 |                 except KeyError:
201 |                     # A deleted work shows up as
202 |                     #
203 |                     #      <li class="deleted reading work blurb group">
204 |                     #
205 |                     # There's nothing that we can do about that, so just skip
206 |                     # over it.
207 |                     if 'deleted' in li_tag.attrs['class']:
208 |                         pass
209 |                     else:
210 |                         raise
211 | 
212 |             # The pagination button at the end of the page is of the form
213 |             #
214 |             #     <li class="next" title="next"> ... </li>
215 |             #
216 |             # If there's another page of results, this contains an <a> tag
217 |             # pointing to the next page.  Otherwise, it contains a <span>
218 |             # tag with the 'disabled' class.
219 |             try:
220 |                 next_button = soup.find('li', attrs={'class': 'next'})
221 |                 if next_button.find('span', attrs={'class': 'disabled'}):
222 |                     break
223 |             except:
224 |                 # In case of absence of "next"
225 |                 break
226 | 


--------------------------------------------------------------------------------
/src/ao3/works.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8
  2 | 
  3 | from datetime import datetime
  4 | import json
  5 | 
  6 | from bs4 import BeautifulSoup, Tag
  7 | import requests
  8 | 
  9 | 
 10 | class WorkNotFound(Exception):
 11 |     pass
 12 | 
 13 | 
 14 | class RestrictedWork(Exception):
 15 |     pass
 16 | 
 17 | 
 18 | class Work(object):
 19 | 
 20 |     def __init__(self, id, sess=None):
 21 |         self.id = id
 22 | 
 23 |         # Fetch the HTML for this work
 24 |         if sess == None:
 25 |             sess = requests.Session()
 26 |             
 27 |         req = sess.get('https://archiveofourown.org/works/%s' % self.id)
 28 | 
 29 |         if req.status_code == 404:
 30 |             raise WorkNotFound('Unable to find a work with id %r' % self.id)
 31 |         elif req.status_code != 200:
 32 |             raise RuntimeError('Unexpected error from AO3 API: %r (%r)' % (
 33 |                 req.text, req.statuscode))
 34 | 
 35 |         # For some works, AO3 throws up an interstitial page asking you to
 36 |         # confirm that you really want to see the adult works.  Yes, we do.
 37 |         if 'This work could have adult content' in req.text:
 38 |             req = sess.get(
 39 |                 'https://archiveofourown.org/works/%s?view_adult=true' %
 40 |                 self.id)
 41 | 
 42 |         # Check for restricted works, which require you to be logged in
 43 |         # first.  See https://archiveofourown.org/admin_posts/138
 44 |         # To make this work, we'd need to have a common Session object
 45 |         # across all the API classes.  Not impossible, but fiddlier than I
 46 |         # care to implement right now.
 47 |         # TODO: Fix this.
 48 |         if 'This work is only available to registered users' in req.text:
 49 |             raise RestrictedWork('Looking at work ID %s requires login')
 50 | 
 51 |         self._html = req.text
 52 |         self._soup = BeautifulSoup(self._html, 'html.parser')
 53 | 
 54 |     def __repr__(self):
 55 |         return '%s(id=%r)' % (type(self).__name__, self.id)
 56 | 
 57 |     def __eq__(self, other):
 58 |         return self.id == other.id
 59 | 
 60 |     def __ne__(self, other):
 61 |         return not (self == other)
 62 | 
 63 |     def __hash__(self):
 64 |         return hash(repr(self))
 65 | 
 66 |     @property
 67 |     def url(self):
 68 |         """A URL to this work."""
 69 |         return 'https://archiveofourown.org/works/%s' % self.id
 70 | 
 71 |     @property
 72 |     def title(self):
 73 |         """The title of this work."""
 74 |         # The title of the work is stored in an <h2> tag of the form
 75 |         #
 76 |         #     <h2 class="title heading">[title]</h2>
 77 |         #
 78 |         # TODO: Retrieve title from restricted work
 79 |         title_tag = self._soup.find('h2', attrs={'class': 'title'})
 80 |         return title_tag.contents[0].strip()
 81 | 
 82 |     @property
 83 |     def author(self):
 84 |         """The author of this work."""
 85 |         # The author of the work is kept in the byline, in the form
 86 |         #
 87 |         #     <h3 class="byline heading">
 88 |         #       <a href="/users/[author_name]" rel="author">[author_name]</a>
 89 |         #     </h3>
 90 |         #
 91 |         byline_tag = self._soup.find('h3', attrs={'class': 'byline'})
 92 |         a_tag = [t
 93 |                  for t in byline_tag.contents
 94 |                  if isinstance(t, Tag)]
 95 |         assert len(a_tag) == 1
 96 |         return a_tag[0].contents[0].strip()
 97 | 
 98 |     @property
 99 |     def summary(self):
100 |         """The author summary of the work."""
101 |         # The author summary is kept in the following format:
102 |         #
103 |         #     <div class="summary module" role="complementary">
104 |         #       <h3 class="heading">Summary:</h3>
105 |         #       <blockquote class="userstuff">
106 |         #         [author_summary_html]
107 |         #       </blockquote>
108 |         #     </div>
109 |         #
110 |         summary_div = self._soup.find('div', attrs={'class': 'summary'})
111 |         blockquote = summary_div.find('blockquote')
112 |         return blockquote.renderContents().decode('utf8').strip()
113 | 
114 |     def _lookup_stat(self, class_name, default=None):
115 |         """Returns the value of a stat."""
116 |         # The stats are stored in a series of divs of the form
117 |         #
118 |         #     <dd class="[field_name]">[field_value]</div>
119 |         #
120 |         # This is a convenience method for looking up values from these divs.
121 |         #
122 |         dd_tag = self._soup.find('dd', attrs={'class': class_name})
123 |         if dd_tag is None:
124 |             return default
125 |         if 'tags' in dd_tag.attrs['class']:
126 |             return self._lookup_list_stat(dd_tag=dd_tag)
127 |         return dd_tag.contents[0]
128 | 
129 |     def _lookup_list_stat(self, dd_tag):
130 |         """Returns the value of a list statistic.
131 | 
132 |         Some statistics can have multiple values (e.g. the list of characters).
133 |         This helper method should be used to retrieve those.
134 | 
135 |         """
136 |         # A list tag is stored in the form
137 |         #
138 |         #     <dd class="[field_name] tags">
139 |         #       <ul class="commas">
140 |         #         <li><a href="/further-works">[value 1]</a></li>
141 |         #         <li><a href="/more-info">[value 2]</a></li>
142 |         #         <li class="last"><a href="/more-works">[value 3]</a></li>
143 |         #       </ul>
144 |         #     </dd>
145 |         #
146 |         # We want to get the data from the individual <li> elements.
147 |         li_tags = dd_tag.findAll('li')
148 |         a_tags = [t.contents[0] for t in li_tags]
149 |         return [t.contents[0] for t in a_tags]
150 | 
151 |     @property
152 |     def rating(self):
153 |         """The age rating for this work."""
154 |         return self._lookup_stat('rating', [])
155 | 
156 |     @property
157 |     def warnings(self):
158 |         """Any archive warnings on the work."""
159 |         value = self._lookup_stat('warning', [])
160 |         if value == ['No Archive Warnings Apply']:
161 |             return []
162 |         else:
163 |             return value
164 | 
165 |     @property
166 |     def category(self):
167 |         """The category of the work."""
168 |         return self._lookup_stat('category', [])
169 | 
170 |     @property
171 |     def fandoms(self):
172 |         """The fandoms in this work."""
173 |         return self._lookup_stat('fandom', [])
174 | 
175 |     @property
176 |     def relationship(self):
177 |         """The relationships in this work."""
178 |         return self._lookup_stat('relationship', [])
179 | 
180 |     @property
181 |     def characters(self):
182 |         """The characters in this work."""
183 |         return self._lookup_stat('character', [])
184 | 
185 |     @property
186 |     def additional_tags(self):
187 |         """Any additional tags on the work."""
188 |         return self._lookup_stat('freeform', [])
189 | 
190 |     @property
191 |     def language(self):
192 |         """The language in which this work is published."""
193 |         return self._lookup_stat('language', "").strip()
194 | 
195 |     @property
196 |     def published(self):
197 |         """The date when this work was published."""
198 |         date_str = self._lookup_stat('published')
199 |         date_val = datetime.strptime(date_str, '%Y-%m-%d')
200 |         return date_val.date()
201 | 
202 |     @property
203 |     def words(self):
204 |         """The number of words in this work."""
205 |         return int(self._lookup_stat('words', 0))
206 | 
207 |     @property
208 |     def comments(self):
209 |         """The number of comments on this work."""
210 |         return int(self._lookup_stat('comments', 0))
211 | 
212 |     @property
213 |     def kudos(self):
214 |         """The number of kudos on this work."""
215 |         return int(self._lookup_stat('kudos', 0))
216 | 
217 |     @property
218 |     def kudos_left_by(self):
219 |         """Returns a list of usernames who left kudos on this work."""
220 |         # The list of usernames who left kudos is stored in the following
221 |         # format:
222 |         #
223 |         #     <div id="kudos">
224 |         #       <p class="kudos">
225 |         #         <a href="/users/[username1]">[username1]</a>
226 |         #         <a href="/users/[username2]">[username2]</a>
227 |         #         ...
228 |         #       </p>
229 |         #     </div>
230 |         #
231 |         # And yes, this really does include every username.  The fic with the
232 |         # most kudos is http://archiveofourown.org/works/2080878, and this
233 |         # approach successfully retrieved the username of everybody who
234 |         # left kudos.
235 |         kudos_div = self._soup.find('div', attrs={'id': 'kudos'})
236 |         for a_tag in kudos_div.findAll('a'):
237 | 
238 |             # If a fic has lots of kudos, not all the users who left kudos
239 |             # are displayed by default.  There's a link for expanding the
240 |             # list of users:
241 |             #
242 |             #     <a href="/works/[work_id]/kudos" id="kudos_summary">
243 |             #
244 |             # and another for collapsing the list afterward:
245 |             #
246 |             #     <a href="#" id="kudos_collapser">
247 |             #
248 |             if a_tag.attrs.get('id') in ('kudos_collapser', 'kudos_summary'):
249 |                 continue
250 | 
251 |             # There's sometimes a kudos summary that can be expanded to
252 | 
253 |             yield a_tag.attrs['href'].replace('/users/', '')
254 | 
255 |     @property
256 |     def bookmarks(self):
257 |         """The number of times this work has been bookmarked."""
258 |         # This returns a link of the form
259 |         #
260 |         #     <a href="/works/9079264/bookmarks">102</a>
261 |         #
262 |         # It might be nice to follow that page and get a list of who has
263 |         # bookmarked this, but for now just return the number.
264 |         return int(self._lookup_stat('bookmarks').contents[0])
265 | 
266 |     @property
267 |     def hits(self):
268 |         """The number of hits this work has received."""
269 |         return int(self._lookup_stat('hits', 0))
270 | 
271 |     def json(self, *args, **kwargs):
272 |         """Provide a complete representation of the work in JSON.
273 | 
274 |         *args and **kwargs are passed directly to `json.dumps()` from the
275 |         standard library.
276 | 
277 |         """
278 |         data = {
279 |             'id': self.id,
280 |             'title': self.title,
281 |             'author': self.author,
282 |             'summary': self.summary,
283 |             'rating': self.rating,
284 |             'warnings': self.warnings,
285 |             'category': self.category,
286 |             'fandoms': self.fandoms,
287 |             'relationship': self.relationship,
288 |             'characters': self.characters,
289 |             'additional_tags': self.additional_tags,
290 |             'language': self.language,
291 |             'stats': {
292 |                 'published': str(self.published),
293 |                 'words': self.words,
294 |                 # TODO: chapters
295 |                 'comments': self.comments,
296 |                 'kudos': self.kudos,
297 |                 'bookmarks': self.bookmarks,
298 |                 'hits': self.hits,
299 |             }
300 |         }
301 |         return json.dumps(data, *args, **kwargs)
302 | 


--------------------------------------------------------------------------------