├── test_requirements.txt ├── .gitignore ├── .coveragerc ├── tox.ini ├── HISTORY.rst ├── src └── ao3 │ ├── utils.py │ ├── __init__.py │ ├── users.py │ └── works.py ├── LICENSE ├── tests └── test_utils.py ├── setup.py ├── examples └── kudos_to_pinboard.py └── README.rst /test_requirements.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | pytest 3 | pytest-cov 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python detritus 2 | *.egg-info 3 | build 4 | dist 5 | 6 | # Test files 7 | .cache 8 | .coverage 9 | .tox 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | include = 4 | **/.tox/*/lib/*/site-packages/ao3/*.py 5 | 6 | [report] 7 | show_missing = True 8 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py33, py34, py35, py36, pypy, lint 3 | 4 | [testenv] 5 | deps = 6 | -r{toxinidir}/test_requirements.txt 7 | commands = 8 | coverage run -m py.test {posargs} {toxinidir}/tests/ 9 | coverage report 10 | 11 | [testenv:pypy] 12 | commands = py.test {posargs} {toxinidir}/tests/ 13 | 14 | [testenv:lint] 15 | basepython = python3.6 16 | deps = flake8 17 | commands = flake8 --max-complexity 10 src tests 18 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | Release History 2 | =============== 3 | 4 | 0.2.0 (15 January 2017) 5 | *********************** 6 | 7 | - This changes the ``reading_history()`` method to include the date when a 8 | user last read the item. This opens the door to caching the reading history, 9 | not fetching the whole thing on every run. 10 | 11 | 0.1.2 (14 January 2017) 12 | *********************** 13 | 14 | - This should have been 0.1.0 (actually includes ``ao3.utils``). 15 | 16 | 0.1.0 (14 January 2017) 17 | *********************** 18 | 19 | - First PyPI release. 20 | -------------------------------------------------------------------------------- /src/ao3/utils.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 2 | """Utility functions.""" 3 | 4 | import re 5 | 6 | # Regex for extracting the work ID from an AO3 URL. Designed to match URLs 7 | # of the form 8 | # 9 | # https://archiveofourown.org/works/1234567 10 | # http://archiveofourown.org/works/1234567 11 | # 12 | WORK_URL_REGEX = re.compile( 13 | r'^https?://archiveofourown.org/works/' 14 | r'(?P[0-9]+)' 15 | ) 16 | 17 | 18 | def work_id_from_url(url): 19 | """Given an AO3 URL, return the work ID.""" 20 | match = WORK_URL_REGEX.match(url) 21 | if match: 22 | return match.group('work_id') 23 | else: 24 | raise RuntimeError('%r is not a recognised AO3 work URL') 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Alex Chan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /src/ao3/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 2 | 3 | import requests 4 | from . import utils 5 | from .users import User 6 | from .works import Work 7 | 8 | class AO3(object): 9 | """A scraper for the Archive of Our Own (AO3).""" 10 | 11 | def __init__(self): 12 | self.user = None 13 | self.session = requests.Session() 14 | 15 | def __repr__(self): 16 | return '%s()' % (type(self).__name__) 17 | 18 | def work(self, id): 19 | """Look up a work that's been posted to AO3. 20 | 21 | :param id: the work ID. In the URL to a work, this is the number. 22 | e.g. the work ID of http://archiveofourown.org/works/1234 is 1234. 23 | """ 24 | return Work(id=id, sess=self.session) 25 | 26 | def login(self, username, password): 27 | """Log in to the archive. 28 | 29 | This allows you to access pages that are only available while 30 | logged in. This doesn't do any checking that the password is correct. 31 | 32 | """ 33 | self.user = User(username=username, password=password, sess=self.session) 34 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 2 | """Tests for ao3.utils.""" 3 | 4 | import pytest 5 | 6 | from ao3 import utils 7 | 8 | 9 | @pytest.mark.parametrize('url, work_id', [ 10 | ('https://archiveofourown.org/works/1', '1'), 11 | ('https://archiveofourown.org/works/1234567', '1234567'), 12 | ('https://archiveofourown.org/works/1?view_adult=true', '1'), 13 | ('https://archiveofourown.org/works/1234567?view_adult=true', '1234567'), 14 | ('http://archiveofourown.org/works/1?view_adult=true', '1'), 15 | ('http://archiveofourown.org/works/1234567?view_adult=true', '1234567'), 16 | ]) 17 | def test_work_id_from_url(url, work_id): 18 | assert utils.work_id_from_url(url) == work_id 19 | 20 | 21 | @pytest.mark.parametrize('bad_url', [ 22 | 'http://google.co.uk', 23 | 'http://archiveofourown.org/users/username', 24 | ]) 25 | def test_work_id_from_bad_url_raises_runtimeerror(bad_url): 26 | """Trying to get a work ID from a non-work URL raises a RuntimeError.""" 27 | with pytest.raises(RuntimeError) as exc: 28 | utils.work_id_from_url(bad_url) 29 | assert 'not a recognised AO3 work URL' in exc.value.message 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import codecs 5 | import os 6 | 7 | from setuptools import find_packages, setup 8 | 9 | 10 | def local_file(name): 11 | return os.path.relpath(os.path.join(os.path.dirname(__file__), name)) 12 | 13 | 14 | SOURCE = local_file('src') 15 | README = local_file('README.rst') 16 | long_description = codecs.open(README, encoding='utf-8').read() 17 | 18 | 19 | setup( 20 | name='ao3', 21 | version='0.2.0', 22 | description='A Python API for scraping AO3 (the Archive of Our Own)', 23 | long_description=long_description, 24 | url='https://github.com/alexwlchan/ao3', 25 | author='Alex Chan', 26 | author_email='alex@alexwlchan.net', 27 | license='MIT', 28 | 29 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 30 | classifiers=[ 31 | 'Development Status :: 3 - Alpha', 32 | 'Intended Audience :: Other Audience', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 2', 35 | 'Programming Language :: Python :: 2.7', 36 | 'Programming Language :: Python :: 3', 37 | 'Programming Language :: Python :: 3.3', 38 | 'Programming Language :: Python :: 3.4', 39 | 'Programming Language :: Python :: 3.5', 40 | 'Programming Language :: Python :: 3.6', 41 | ], 42 | packages=find_packages(SOURCE), 43 | package_dir={'': SOURCE}, 44 | install_requires=[ 45 | 'beautifulsoup4>=4.5.3, <5', 46 | 'requests>=2.12.4, <3', 47 | ], 48 | ) 49 | -------------------------------------------------------------------------------- /examples/kudos_to_pinboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 3 | """ 4 | A script for syncing kudos from AO3 to Pinboard. 5 | 6 | You need the ``ao3`` module installed to use this script. Original idea from 7 | https://twitter.com/anatsuno/status/427177496875122688 8 | 9 | To use this script: 10 | 11 | 1. Enable Viewing History on AO3 12 | (My Preferences > Preferences > Misc > Turn on Viewing History) 13 | 14 | 2. Fill in your AO3 and Pinboard credentials below. 15 | 16 | 3. Run the script: ``python kudos_to_pinboard.py``. Any items on AO3 that 17 | you've read in the last seven days, and where you've left kudos, will 18 | be added to Pinboard if you don't already have a bookmark. 19 | 20 | The new bookmarks are tagged `ao3_kudos_sync`. 21 | 22 | """ 23 | 24 | from datetime import datetime, timedelta 25 | 26 | from ao3 import AO3 27 | from ao3.works import RestrictedWork 28 | import requests 29 | 30 | 31 | # AO3 login credentials 32 | AO3_USERNAME = '' 33 | AO3_PASSWORD = '' 34 | 35 | # Pinboard API token. https://pinboard.in/settings/password 36 | PINBOARD_API_TOKEN = '' 37 | 38 | 39 | def main(): 40 | api = AO3() 41 | api.login(username=AO3_USERNAME, password=AO3_PASSWORD) 42 | 43 | for work_id, last_read in api.user.reading_history(): 44 | if last_read < (datetime.now() - timedelta(days=7)).date(): 45 | break 46 | try: 47 | work = api.work(id=work_id) 48 | except RestrictedWork: 49 | print('Skipping %s as a restricted work' % work_id) 50 | continue 51 | if api.user.username in work.kudos_left_by: 52 | title = '%s - %s - %s [Archive of Our Own]' % ( 53 | work.title, work.author, work.fandoms[0]) 54 | print('Saving %s to Pinboard...' % work.url) 55 | requests.get('https://api.pinboard.in/v1/posts/add', params={ 56 | 'url': work.url, 57 | 'description': title, 58 | 'tags': 'ao3_kudos_sync', 59 | 'replace': 'no', 60 | 'auth_token': PINBOARD_API_TOKEN, 61 | 'format': 'json', 62 | }) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | **Maintenance note, 19 July 2020:** This isn't actively maintained, and it hasn't been for a long time. 2 | I created this library/repo to accompany a `blog post I wrote in 2018 `_, but I haven't looked at it much since then and I don't have much time for open source these days. 3 | 4 | FWIW, if I were to work on this again, I'd start by decoupling the HTML parsing and the I/O logic (see my PyCon UK talk about `sans I/O programming `_). 5 | 6 | I hope this repo serves as a useful pointer, but don't expect updates any time soon. 7 | 8 | ---- 9 | 10 | ao3.py 11 | ====== 12 | 13 | This Python package provides a scripted interface to some of the data on 14 | `AO3 `_ (the Archive of Our Own). 15 | 16 | It is **not** an official API. 17 | 18 | Motivation 19 | ********** 20 | 21 | I want to be able to write Python scripts that use data from AO3. 22 | 23 | An official API for AO3 data has been `on the roadmap `_ 24 | for a couple of years. Until that appears, I've cobbled together my own 25 | page-scraping code that does the job. It's a bit messy and fragile, but it 26 | seems to work most of the time. 27 | 28 | If/when we get the proper API, I'd drop this in a heartbeat and do it 29 | properly. 30 | 31 | Installation 32 | ************ 33 | 34 | Install using pip: 35 | 36 | .. code-block:: console 37 | 38 | $ pip install ao3 39 | 40 | I'm trying to support Python 2.7, Python 3.3+ and PyPy. 41 | 42 | Usage 43 | ***** 44 | 45 | Create an API instance: 46 | 47 | .. code-block:: pycon 48 | 49 | >>> from ao3 import AO3, 50 | >>> api = AO3() 51 | 52 | Looking up information about a work 53 | ----------------------------------- 54 | 55 | Getting a work: 56 | 57 | .. code-block:: pycon 58 | 59 | >>> work = api.work(id='258626') 60 | 61 | The ``id`` is the numeric portion of the URL. For example, the work ID of 62 | ``https://archiveofourown.org/works/258626`` is ``258626``. 63 | 64 | Get a URL: 65 | 66 | .. code-block:: pycon 67 | 68 | >>> work.url 69 | 'https://archiveofourown.org/works/258626' 70 | 71 | You can then look up a number of attributes, similar to the Stats panel at the 72 | top of a page. Here's the full set you can look up: 73 | 74 | .. code-block:: pycon 75 | 76 | >>> work.title 77 | 'The Morning After' 78 | 79 | >>> work.author 80 | 'ambyr' 81 | 82 | >>> work.summary 83 | "

Delicious just can't understand why it's the shy, quiet ones who get all the girls.

" 84 | 85 | >>> work.rating 86 | ['Teen And Up Audiences'] 87 | 88 | >>> work.warnings 89 | [] 90 | 91 | (An empty list is synonymous with "No Archive Warnings", so that it's a falsey 92 | value.) 93 | 94 | .. code-block:: pycon 95 | 96 | >>> work.category 97 | ['F/M'] 98 | 99 | >>> work.fandoms 100 | ['Anthropomorfic - Fandom'] 101 | 102 | >>> work.relationship 103 | ['Pinboard/Fandom'] 104 | 105 | >>> work.characters 106 | ['Pinboard', 'Delicious - Character', 'Diigo - Character'] 107 | 108 | >>> work.additional_tags 109 | ['crackfic', 'Meta', 'so very not my usual thing'] 110 | 111 | >>> work.language 112 | 'English' 113 | 114 | >>> work.published 115 | datetime.date(2011, 9, 29) 116 | 117 | >>> work.words 118 | 605 119 | 120 | >>> work.comments 121 | 122 122 | 123 | >>> work.kudos 124 | 1238 125 | 126 | >>> for name in work.kudos_left_by: 127 | ... print(name) 128 | ... 129 | winterbelles 130 | AnonEhouse 131 | SailAweigh 132 | # and so on 133 | 134 | >>> work.bookmarks 135 | 99 136 | 137 | >>> work.hits 138 | 43037 139 | 140 | There's also a method for dumping all the information about a work into JSON, 141 | for easy export/passing into other places: 142 | 143 | .. code-block:: pycon 144 | 145 | >>> work.json() 146 | '{"rating": ["Teen And Up Audiences"], "fandoms": ["Anthropomorfic - Fandom"], "characters": ["Pinboard", "Delicious - Character", "Diigo - Character"], "language": "English", "additional_tags": ["crackfic", "Meta", "so very not my usual thing"], "warnings": [], "id": "258626", "stats": {"hits": 43037, "words": 605, "bookmarks": 99, "comments": 122, "published": "2011-09-29", "kudos": 1238}, "author": "ambyr", "category": ["F/M"], "title": "The Morning After", "relationship": ["Pinboard/Fandom"], "summary": "

Delicious just can\'t understand why it\'s the shy, quiet ones who get all the girls.

"}' 147 | 148 | Looking up your account 149 | ----------------------- 150 | 151 | If you have an account on AO3, you can log in to access pages that aren't 152 | available to the public: 153 | 154 | .. code-block:: pycon 155 | 156 | >>> api.login('username', 'password') 157 | 158 | If you have Viewing History enabled, you can get a list of work IDs from 159 | that history, like so: 160 | 161 | .. code-block:: pycon 162 | 163 | >>> for entry in api.user.reading_history(): 164 | ... print(entry.work_id) 165 | ... 166 | '123' 167 | '456' 168 | '789' 169 | # and so on 170 | 171 | You can enable Viewing History in the settings pane. 172 | 173 | One interesting side effect of this is that you can use it to get a list 174 | of works where you've left kudos: 175 | 176 | .. code-block:: python 177 | 178 | from ao3 import AO3 179 | from ao3.works import RestrictedWork 180 | 181 | api = AO3() 182 | api.login('username', 'password') 183 | 184 | for entry in api.user.reading_history(): 185 | try: 186 | work = api.work(id=entry.work_id) 187 | except RestrictedWork: 188 | continue 189 | print(work.url + '... ', end='') 190 | if api.user.username in work.kudos_left_by: 191 | print('yes') 192 | else: 193 | print('no') 194 | 195 | Warning: this is `very` slow. It has to go back and load a page for everything 196 | you've ever read. Don't use this if you're on a connection with limited 197 | bandwidth. 198 | 199 | This doesn't include "restricted" works -- works that require you to be a 200 | logged-in user to see them. 201 | 202 | (The reading page tells you when you last read something. If you cached the 203 | results, and then subsequent runs only rechecked fics you'd read since the 204 | last run, you could make this quite efficient. Exercise for the reader.) 205 | 206 | Looking up your bookmarks 207 | ------------------------- 208 | 209 | If you login as a user you can look up the bookmarks for that user. You can 210 | get the bookmarks as a list of AO3 id numbers or as a list of work objects. 211 | 212 | Warning: This is very slow as as the api has to go back and retrieve every 213 | page. 214 | 215 | Get the bookmarks as works: 216 | 217 | .. code-block:: pycon 218 | 219 | >>> for bookmark in api.user.bookmarks(): 220 | ... print(bookmark.title) 221 | ... 222 | 'Story Name' 223 | 'Fanfiction Title' 224 | 'Read This Fic' 225 | # and so on 226 | 227 | Get the bookmarks as a list of id numbers: 228 | 229 | .. code-block:: pycon 230 | 231 | >>> for bookmark_id in api.user.bookmarks_ids(): 232 | ... print(bookmark_id) 233 | ... 234 | '123' 235 | '456' 236 | '789' 237 | # and so on 238 | 239 | 240 | 241 | License 242 | ******* 243 | 244 | The project is licensed under the MIT license. 245 | -------------------------------------------------------------------------------- /src/ao3/users.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 2 | 3 | from datetime import datetime 4 | import collections 5 | import itertools 6 | import re 7 | 8 | from bs4 import BeautifulSoup, Tag 9 | import requests 10 | 11 | from .works import Work 12 | 13 | 14 | ReadingHistoryItem = collections.namedtuple( 15 | 'ReadingHistoryItem', ['work_id', 'last_read']) 16 | 17 | 18 | class User(object): 19 | 20 | def __init__(self, username, password, sess=None): 21 | self.username = username 22 | 23 | if sess == None: 24 | sess = requests.Session() 25 | 26 | req = sess.get('https://archiveofourown.org') 27 | soup = BeautifulSoup(req.text, features='html.parser') 28 | 29 | authenticity_token = soup.find('input', {'name': 'authenticity_token'})['value'] 30 | 31 | req = sess.post('https://archiveofourown.org/user_sessions', params={ 32 | 'authenticity_token': authenticity_token, 33 | 'user_session[login]': username, 34 | 'user_session[password]': password, 35 | }) 36 | 37 | # Unfortunately AO3 doesn't use HTTP status codes to communicate 38 | # results -- it's a 200 even if the login fails. 39 | if 'Please try again' in req.text: 40 | raise RuntimeError( 41 | 'Error logging in to AO3; is your password correct?') 42 | 43 | self.sess = sess 44 | def __repr__(self): 45 | return '%s(username=%r)' % (type(self).__name__, self.username) 46 | 47 | def bookmarks_ids(self): 48 | """ 49 | Returns a list of the user's bookmarks' ids. Ignores external work bookmarks. 50 | 51 | User must be logged in to see private bookmarks. 52 | """ 53 | 54 | api_url = ( 55 | 'https://archiveofourown.org/users/%s/bookmarks?page=%%d' 56 | % self.username) 57 | 58 | bookmarks = [] 59 | 60 | num_works = 0 61 | for page_no in itertools.count(start=1): 62 | # print("Finding page: \t" + str(page_no) + " of bookmarks. \t" + str(num_works) + " bookmarks ids found.") 63 | 64 | req = self.sess.get(api_url % page_no) 65 | soup = BeautifulSoup(req.text, features='html.parser') 66 | 67 | # The entries are stored in a list of the form: 68 | # 69 | #
    70 | #
  1. 71 | # ... 72 | #
  2. 73 | #
  3. 74 | # ... 75 | #
  4. 76 | # ... 77 | # 86 | # Work Title 87 | # 88 | # 89 | 90 | for h4_tag in li_tag.findAll('h4', attrs={'class': 'heading'}): 91 | for link in h4_tag.findAll('a'): 92 | if ('works' in link.get('href')) and not ('external_works' in link.get('href')): 93 | work_id = link.get('href').replace('/works/', '') 94 | bookmarks.append(work_id) 95 | except KeyError: 96 | # A deleted work shows up as 97 | # 98 | #
  5. 99 | # 100 | # There's nothing that we can do about that, so just skip 101 | # over it. 102 | if 'deleted' in li_tag.attrs['class']: 103 | pass 104 | else: 105 | raise 106 | 107 | 108 | # The pagination button at the end of the page is of the form 109 | # 110 | #
  6. 111 | # 112 | # If there's another page of results, this contains an tag 113 | # pointing to the next page. Otherwise, it contains a 114 | # tag with the 'disabled' class. 115 | try: 116 | next_button = soup.find('li', attrs={'class': 'next'}) 117 | if next_button.find('span', attrs={'class': 'disabled'}): 118 | break 119 | except: 120 | # In case of absence of "next" 121 | break 122 | 123 | return bookmarks 124 | 125 | def bookmarks(self): 126 | """ 127 | Returns a list of the user's bookmarks as Work objects. 128 | 129 | Takes forever. 130 | 131 | User must be logged in to see private bookmarks. 132 | """ 133 | 134 | bookmark_total = 0 135 | bookmark_ids = self.bookmarks_ids() 136 | bookmarks = [] 137 | 138 | for bookmark_id in bookmark_ids: 139 | work = Work(bookmark_id, self.sess) 140 | bookmarks.append(work) 141 | 142 | bookmark_total = bookmark_total + 1 143 | # print (str(bookmark_total) + "\t bookmarks found.") 144 | 145 | return bookmarks 146 | 147 | def reading_history(self): 148 | """Returns a list of articles in the user's reading history. 149 | 150 | This requires the user to turn on the Viewing History feature. 151 | 152 | This generates a series of ``ReadingHistoryItem`` instances, 153 | a 2-tuple ``(work_id, last_read)``. 154 | """ 155 | # TODO: What happens if you don't have this feature enabled? 156 | 157 | # URL for the user's reading history page 158 | api_url = ( 159 | 'https://archiveofourown.org/users/%s/readings?page=%%d' % 160 | self.username) 161 | 162 | for page_no in itertools.count(start=1): 163 | req = self.sess.get(api_url % page_no) 164 | soup = BeautifulSoup(req.text, features='html.parser') 165 | 166 | # The entries are stored in a list of the form: 167 | # 168 | #
      169 | #
    1. 170 | # ... 171 | #
    2. 172 | #
    3. 173 | # ... 174 | #
    4. 175 | # ... 176 | #
    177 | # 178 | ol_tag = soup.find('ol', attrs={'class': 'reading'}) 179 | for li_tag in ol_tag.findAll('li', attrs={'class': 'blurb'}): 180 | try: 181 | work_id = li_tag.attrs['id'].replace('work_', '') 182 | 183 | # Within the
  7. , the last viewed date is stored as 184 | # 185 | #

    186 | # Last viewed: 24 Dec 2012 187 | # 188 | # (Latest version.) 189 | # 190 | # Viewed once 191 | #

    192 | # 193 | h4_tag = li_tag.find('h4', attrs={'class': 'viewed'}) 194 | date_str = re.search( 195 | r'[0-9]{1,2} [A-Z][a-z]+ [0-9]{4}', 196 | h4_tag.contents[2]).group(0) 197 | date = datetime.strptime(date_str, '%d %b %Y').date() 198 | 199 | yield work_id, date 200 | except KeyError: 201 | # A deleted work shows up as 202 | # 203 | #
  8. 204 | # 205 | # There's nothing that we can do about that, so just skip 206 | # over it. 207 | if 'deleted' in li_tag.attrs['class']: 208 | pass 209 | else: 210 | raise 211 | 212 | # The pagination button at the end of the page is of the form 213 | # 214 | #
  9. 215 | # 216 | # If there's another page of results, this contains an
    tag 217 | # pointing to the next page. Otherwise, it contains a 218 | # tag with the 'disabled' class. 219 | try: 220 | next_button = soup.find('li', attrs={'class': 'next'}) 221 | if next_button.find('span', attrs={'class': 'disabled'}): 222 | break 223 | except: 224 | # In case of absence of "next" 225 | break 226 | -------------------------------------------------------------------------------- /src/ao3/works.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 2 | 3 | from datetime import datetime 4 | import json 5 | 6 | from bs4 import BeautifulSoup, Tag 7 | import requests 8 | 9 | 10 | class WorkNotFound(Exception): 11 | pass 12 | 13 | 14 | class RestrictedWork(Exception): 15 | pass 16 | 17 | 18 | class Work(object): 19 | 20 | def __init__(self, id, sess=None): 21 | self.id = id 22 | 23 | # Fetch the HTML for this work 24 | if sess == None: 25 | sess = requests.Session() 26 | 27 | req = sess.get('https://archiveofourown.org/works/%s' % self.id) 28 | 29 | if req.status_code == 404: 30 | raise WorkNotFound('Unable to find a work with id %r' % self.id) 31 | elif req.status_code != 200: 32 | raise RuntimeError('Unexpected error from AO3 API: %r (%r)' % ( 33 | req.text, req.statuscode)) 34 | 35 | # For some works, AO3 throws up an interstitial page asking you to 36 | # confirm that you really want to see the adult works. Yes, we do. 37 | if 'This work could have adult content' in req.text: 38 | req = sess.get( 39 | 'https://archiveofourown.org/works/%s?view_adult=true' % 40 | self.id) 41 | 42 | # Check for restricted works, which require you to be logged in 43 | # first. See https://archiveofourown.org/admin_posts/138 44 | # To make this work, we'd need to have a common Session object 45 | # across all the API classes. Not impossible, but fiddlier than I 46 | # care to implement right now. 47 | # TODO: Fix this. 48 | if 'This work is only available to registered users' in req.text: 49 | raise RestrictedWork('Looking at work ID %s requires login') 50 | 51 | self._html = req.text 52 | self._soup = BeautifulSoup(self._html, 'html.parser') 53 | 54 | def __repr__(self): 55 | return '%s(id=%r)' % (type(self).__name__, self.id) 56 | 57 | def __eq__(self, other): 58 | return self.id == other.id 59 | 60 | def __ne__(self, other): 61 | return not (self == other) 62 | 63 | def __hash__(self): 64 | return hash(repr(self)) 65 | 66 | @property 67 | def url(self): 68 | """A URL to this work.""" 69 | return 'https://archiveofourown.org/works/%s' % self.id 70 | 71 | @property 72 | def title(self): 73 | """The title of this work.""" 74 | # The title of the work is stored in an

    tag of the form 75 | # 76 | #

    [title]

    77 | # 78 | # TODO: Retrieve title from restricted work 79 | title_tag = self._soup.find('h2', attrs={'class': 'title'}) 80 | return title_tag.contents[0].strip() 81 | 82 | @property 83 | def author(self): 84 | """The author of this work.""" 85 | # The author of the work is kept in the byline, in the form 86 | # 87 | #
    90 | # 91 | byline_tag = self._soup.find('h3', attrs={'class': 'byline'}) 92 | a_tag = [t 93 | for t in byline_tag.contents 94 | if isinstance(t, Tag)] 95 | assert len(a_tag) == 1 96 | return a_tag[0].contents[0].strip() 97 | 98 | @property 99 | def summary(self): 100 | """The author summary of the work.""" 101 | # The author summary is kept in the following format: 102 | # 103 | # 109 | # 110 | summary_div = self._soup.find('div', attrs={'class': 'summary'}) 111 | blockquote = summary_div.find('blockquote') 112 | return blockquote.renderContents().decode('utf8').strip() 113 | 114 | def _lookup_stat(self, class_name, default=None): 115 | """Returns the value of a stat.""" 116 | # The stats are stored in a series of divs of the form 117 | # 118 | #
    [field_value] 119 | # 120 | # This is a convenience method for looking up values from these divs. 121 | # 122 | dd_tag = self._soup.find('dd', attrs={'class': class_name}) 123 | if dd_tag is None: 124 | return default 125 | if 'tags' in dd_tag.attrs['class']: 126 | return self._lookup_list_stat(dd_tag=dd_tag) 127 | return dd_tag.contents[0] 128 | 129 | def _lookup_list_stat(self, dd_tag): 130 | """Returns the value of a list statistic. 131 | 132 | Some statistics can have multiple values (e.g. the list of characters). 133 | This helper method should be used to retrieve those. 134 | 135 | """ 136 | # A list tag is stored in the form 137 | # 138 | #
    139 | # 144 | #
    145 | # 146 | # We want to get the data from the individual
  10. elements. 147 | li_tags = dd_tag.findAll('li') 148 | a_tags = [t.contents[0] for t in li_tags] 149 | return [t.contents[0] for t in a_tags] 150 | 151 | @property 152 | def rating(self): 153 | """The age rating for this work.""" 154 | return self._lookup_stat('rating', []) 155 | 156 | @property 157 | def warnings(self): 158 | """Any archive warnings on the work.""" 159 | value = self._lookup_stat('warning', []) 160 | if value == ['No Archive Warnings Apply']: 161 | return [] 162 | else: 163 | return value 164 | 165 | @property 166 | def category(self): 167 | """The category of the work.""" 168 | return self._lookup_stat('category', []) 169 | 170 | @property 171 | def fandoms(self): 172 | """The fandoms in this work.""" 173 | return self._lookup_stat('fandom', []) 174 | 175 | @property 176 | def relationship(self): 177 | """The relationships in this work.""" 178 | return self._lookup_stat('relationship', []) 179 | 180 | @property 181 | def characters(self): 182 | """The characters in this work.""" 183 | return self._lookup_stat('character', []) 184 | 185 | @property 186 | def additional_tags(self): 187 | """Any additional tags on the work.""" 188 | return self._lookup_stat('freeform', []) 189 | 190 | @property 191 | def language(self): 192 | """The language in which this work is published.""" 193 | return self._lookup_stat('language', "").strip() 194 | 195 | @property 196 | def published(self): 197 | """The date when this work was published.""" 198 | date_str = self._lookup_stat('published') 199 | date_val = datetime.strptime(date_str, '%Y-%m-%d') 200 | return date_val.date() 201 | 202 | @property 203 | def words(self): 204 | """The number of words in this work.""" 205 | return int(self._lookup_stat('words', 0)) 206 | 207 | @property 208 | def comments(self): 209 | """The number of comments on this work.""" 210 | return int(self._lookup_stat('comments', 0)) 211 | 212 | @property 213 | def kudos(self): 214 | """The number of kudos on this work.""" 215 | return int(self._lookup_stat('kudos', 0)) 216 | 217 | @property 218 | def kudos_left_by(self): 219 | """Returns a list of usernames who left kudos on this work.""" 220 | # The list of usernames who left kudos is stored in the following 221 | # format: 222 | # 223 | #
    224 | #

    225 | # [username1] 226 | # [username2] 227 | # ... 228 | #

    229 | #
    230 | # 231 | # And yes, this really does include every username. The fic with the 232 | # most kudos is http://archiveofourown.org/works/2080878, and this 233 | # approach successfully retrieved the username of everybody who 234 | # left kudos. 235 | kudos_div = self._soup.find('div', attrs={'id': 'kudos'}) 236 | for a_tag in kudos_div.findAll('a'): 237 | 238 | # If a fic has lots of kudos, not all the users who left kudos 239 | # are displayed by default. There's a link for expanding the 240 | # list of users: 241 | # 242 | # 243 | # 244 | # and another for collapsing the list afterward: 245 | # 246 | # 247 | # 248 | if a_tag.attrs.get('id') in ('kudos_collapser', 'kudos_summary'): 249 | continue 250 | 251 | # There's sometimes a kudos summary that can be expanded to 252 | 253 | yield a_tag.attrs['href'].replace('/users/', '') 254 | 255 | @property 256 | def bookmarks(self): 257 | """The number of times this work has been bookmarked.""" 258 | # This returns a link of the form 259 | # 260 | # 102 261 | # 262 | # It might be nice to follow that page and get a list of who has 263 | # bookmarked this, but for now just return the number. 264 | return int(self._lookup_stat('bookmarks').contents[0]) 265 | 266 | @property 267 | def hits(self): 268 | """The number of hits this work has received.""" 269 | return int(self._lookup_stat('hits', 0)) 270 | 271 | def json(self, *args, **kwargs): 272 | """Provide a complete representation of the work in JSON. 273 | 274 | *args and **kwargs are passed directly to `json.dumps()` from the 275 | standard library. 276 | 277 | """ 278 | data = { 279 | 'id': self.id, 280 | 'title': self.title, 281 | 'author': self.author, 282 | 'summary': self.summary, 283 | 'rating': self.rating, 284 | 'warnings': self.warnings, 285 | 'category': self.category, 286 | 'fandoms': self.fandoms, 287 | 'relationship': self.relationship, 288 | 'characters': self.characters, 289 | 'additional_tags': self.additional_tags, 290 | 'language': self.language, 291 | 'stats': { 292 | 'published': str(self.published), 293 | 'words': self.words, 294 | # TODO: chapters 295 | 'comments': self.comments, 296 | 'kudos': self.kudos, 297 | 'bookmarks': self.bookmarks, 298 | 'hits': self.hits, 299 | } 300 | } 301 | return json.dumps(data, *args, **kwargs) 302 | --------------------------------------------------------------------------------