├── .gitignore ├── LICENSE ├── README.md ├── bin └── update_pages.py ├── pyscp ├── __init__.py ├── core.py ├── orm.py ├── resources │ ├── cover.png │ ├── pages │ │ ├── cover.xhtml │ │ ├── intro.xhtml │ │ ├── license.xhtml │ │ └── title.xhtml │ ├── stafflist.txt │ ├── stylesheet.css │ └── templates │ │ ├── container.xml │ │ ├── content.opf │ │ ├── page.xhtml │ │ └── toc.ncx ├── snapshot.py ├── stats │ ├── __init__.py │ ├── counters.py │ ├── filters.py │ ├── scalars.py │ └── updater.py ├── utils.py └── wikidot.py ├── setup.py └── tests └── test_core.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | *.sublime-* 3 | .idea/ 4 | *.log 5 | *.pass 6 | .project 7 | */.coverage 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 anqxyr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyscp 2 | 3 | **pyscp** is a python library for interacting with wikidot-hosted websites. The library is mainly intended for use by the administrative staff of the www.scp-wiki.net website, and has a host of features exclusive to it. However, the majority of the core functionality should be applicable to any wikidot-based site. 4 | 5 | ## Installation 6 | 7 | Download the latest code, open the containing folder, and run the following command: 8 | ``` 9 | pip install . --user 10 | ``` 11 | Done. 12 | 13 | ## Examples 14 | 15 | ### Accessing Pages 16 | 17 | ```python 18 | import pyscp 19 | 20 | wiki = pyscp.wikidot.Wiki('www.scp-wiki.net') 21 | p = wiki('scp-837') 22 | print( 23 | '"{}" has a rating of {}, {} revisions, and {} comments.' 24 | .format(p.title, p.rating, len(p.history), len(p.comments))) 25 | ``` 26 | ``` 27 | "SCP-837: Multiplying Clay" has a rating of 108, 14 revisions, and 54 comments. 28 | ``` 29 | 30 | You can access other sites as well: 31 | 32 | ```python 33 | ru_wiki = pyscp.wikidot.Wiki('scpfoundation.ru') 34 | p = ru_wiki('scp-837') 35 | print('"{}" was created by {} on {}.'.format(p.title, p.author, p.created)) 36 | ``` 37 | ``` 38 | "SCP-837 - Глина умножения" was created by Gene R on 2012-12-26 11:12:13. 39 | ``` 40 | 41 | If the site doesn't use a custom domain, you can use the name of the site instead of the full url. E.g. `Wiki('scpsandbox2')` is the same as `Wiki('scpsandbox2.wikidot.com')`. 
42 | 43 | ### Editing Pages 44 | 45 | ```python 46 | 47 | wiki = pyscp.wikidot.Wiki('scpsandbox2') 48 | wiki.auth('example_username', 'example_password') 49 | p = wiki('test') 50 | last_revision = p.history[-1].number 51 | p.edit( 52 | source='= This is centered **text** that uses Wikidot markup.', 53 | title="you can skip the title if you don't want to change it", 54 | #you can leave out the comment too, but that'd be rude 55 | comment='testing automated editing') 56 | print(p.text) # see if it worked 57 | p.revert(last_revision) # let's revert it back to what it was. 58 | ``` 59 | ``` 60 | This is centered text that uses Wikidot markup. 61 | ``` 62 | 63 | 64 | ### Snapshots 65 | 66 | When working with a large number of pages, it could be faster to create a snapshot of the site than to download the pages one by one. Snapshots are optimized to download a large amount of data in the shortest possible time using multithreading. 67 | 68 | ```python 69 | import pyscp 70 | 71 | creator = pyscp.snapshot.SnapshotCreator('www.scp-wiki.net', 'snapshot_file.db') 72 | creator.take_snapshot(forums=False) 73 | # that's where we wait half an hour for it to finish 74 | ``` 75 | 76 | Once a snapshot is created, you can use `snapshot.Wiki` to read pages the same as in the first example: 77 | 78 | ```python 79 | wiki = pyscp.snapshot.Wiki('www.scp-wiki.net', 'snapshot_file.db') 80 | p = wiki('scp-9005-2') 81 | print( 82 | '"{}" has a rating of {}, was created by {}, and is awesome.' 83 | .format(p.title, p.rating, p.author)) 84 | print('Other pages by {}:'.format(p.author)) 85 | for other in wiki.list_pages(author=p.author): 86 | print( 87 | '{} (rating: {}, created: {})' 88 | .format(other.title, other.rating, other.created)) 89 | ``` 90 | ``` 91 | Page "SCP-9005-2" has a rating of 80, was created by yellowdrakex, and is awesome. 
92 | Other pages by yellowdrakex: 93 | ClusterfREDACTED (rating: 112, created: 2011-10-20 18:08:49) 94 | Dr Rights' Draft Box (rating: None, created: 2009-02-01 18:58:36) 95 | Dr. Rights' Personal Log (rating: 3, created: 2008-11-26 23:03:27) 96 | Dr. Rights' Personnel File (rating: 13, created: 2008-11-24 20:45:34) 97 | Fifteen To Sixteen (rating: 17, created: 2010-02-15 05:55:58) 98 | Great Short Story Concepts (rating: 1, created: 2010-06-03 19:26:06) 99 | RUN AWAY FOREVURRR (rating: 79, created: 2011-10-24 16:34:23) 100 | SCP-288: The "Stepford Marriage" Rings (rating: 56, created: 2008-11-27 07:47:01) 101 | SCP-291: Disassembler/Reassembler (rating: 113, created: 2008-11-24 20:11:11) 102 | ... 103 | ``` 104 | -------------------------------------------------------------------------------- /bin/update_pages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Update wiki pages. 5 | 6 | This script is used to update scp-wiki tale hubs and other such pages. 
7 | """ 8 | 9 | ############################################################################### 10 | # Module Imports 11 | ############################################################################### 12 | 13 | import arrow 14 | import collections 15 | import logging 16 | import pyscp 17 | import re 18 | import string 19 | 20 | ############################################################################### 21 | 22 | log = logging.getLogger('pyscp') 23 | 24 | ############################################################################### 25 | 26 | TEMPLATE = """ 27 | [[# {name}]] 28 | [[div class="section"]] 29 | +++ {disp} 30 | [#top ⇑] 31 | {header} 32 | {body} 33 | [[/div]] 34 | 35 | """ 36 | 37 | ############################################################################### 38 | 39 | 40 | class Updater: 41 | 42 | def __init__(self, wiki, pages): 43 | self.wiki = wiki 44 | self.pages = pages 45 | 46 | def disp(self): 47 | return self.keys() 48 | 49 | def get_author(self, page): 50 | return page.build_attribution_string( 51 | user_formatter='[[user {}]]', separator=' _\n') 52 | 53 | def get_section(self, idx): 54 | name = self.keys()[idx] 55 | disp = self.disp()[idx] 56 | pages = [p for p in self.pages if self.keyfunc(p) == name] 57 | 58 | if pages: 59 | body = '\n'.join(map( 60 | self.format_page, sorted(pages, key=self.sortfunc))) 61 | else: 62 | body = self.NODATA 63 | 64 | return TEMPLATE.format( 65 | name=name.replace(' ', '-'), 66 | disp=disp, 67 | header=self.HEADER, 68 | body=body) 69 | 70 | def update(self, *targets): 71 | output = [''] 72 | for idx in range(len(self.keys())): 73 | section = self.get_section(idx) 74 | if len(output[-1]) + len(section) < 180000: 75 | output[-1] += section 76 | else: 77 | output.append(section) 78 | for idx, target in enumerate(targets): 79 | source = output[idx] if idx < len(output) else '' 80 | self.wiki(target).revert(0) 81 | self.wiki(target).edit(source, comment='automated update') 82 | log.info('{} 
{}'.format(target, len(source))) 83 | 84 | ############################################################################### 85 | 86 | 87 | class TaleUpdater(Updater): 88 | 89 | HEADER = '||~ Title||~ Author||~ Created||' 90 | NODATA = '||||||= **NO DATA AVAILABLE**||' 91 | 92 | def format_page(self, page=None): 93 | return '||[[[{}|]]]||{}||//{}//||\n||||||{}||'.format( 94 | page._body['fullname'], self.get_author(page), 95 | page.created[:10], page._body['preview']) 96 | 97 | def update(self, target): 98 | targets = [ 99 | 'component:tales-by-{}-{}'.format(target, i + 1) for i in range(5)] 100 | super().update(*targets) 101 | 102 | 103 | class TalesByTitle(TaleUpdater): 104 | 105 | def keys(self): 106 | return list(string.ascii_uppercase) + ['misc'] 107 | 108 | def keyfunc(self, page): 109 | if not page._body['title']: 110 | return 'misc' 111 | l = page._body['title'][0] 112 | return l.upper() if l.isalpha() else 'misc' 113 | 114 | def sortfunc(self, page): 115 | return page._body['title'].lower() 116 | 117 | 118 | class TalesByAuthor(TaleUpdater): 119 | 120 | def keys(self): 121 | return sorted(list(string.ascii_uppercase) + ['Dr', 'misc']) 122 | 123 | def keyfunc(self, page): 124 | templates = collections.defaultdict(lambda: '{user}') 125 | authors = page.build_attribution_string(templates).split(', ') 126 | author = authors[0] 127 | if re.match(r'Dr[^a-z]|Doctor|Doc[^a-z]', author): 128 | return 'Dr' 129 | elif author[0].isalpha(): 130 | return author[0].upper() 131 | else: 132 | return 'misc' 133 | 134 | def sortfunc(self, page): 135 | author = sorted(page.metadata.keys())[0] 136 | return author.lower() 137 | 138 | 139 | class TalesByDate(TaleUpdater): 140 | 141 | def disp(self): 142 | return [ 143 | arrow.get(i, 'YYYY-MM').format('MMMM YYYY') for i in self.keys()] 144 | 145 | def keys(self): 146 | return [i.format('YYYY-MM') for i in 147 | arrow.Arrow.range('month', arrow.get('2008-07'), arrow.now())] 148 | 149 | def keyfunc(self, page=None): 150 | return 
page.created[:7] 151 | 152 | def sortfunc(self, page): 153 | return page.created 154 | 155 | 156 | def update_tale_hubs(wiki): 157 | pages = list(wiki.list_pages( 158 | tags='tale -hub -_sys', 159 | body='title created_by created_at preview tags')) 160 | TalesByTitle(wiki, pages).update('title') 161 | TalesByAuthor(wiki, pages).update('author') 162 | TalesByDate(wiki, pages).update('date') 163 | 164 | ############################################################################### 165 | 166 | 167 | class CreditUpdater(Updater): 168 | 169 | HEADER = '' 170 | NODATA = '||||= **NO DATA AVAILABLE**||' 171 | 172 | def format_page(self, page): 173 | return '||[[[{}|{}]]]||{}||'.format( 174 | page._body['fullname'], 175 | page.title.replace('[', '').replace(']', ''), 176 | self.get_author(page)) 177 | 178 | def sortfunc(self, page): 179 | title = [] 180 | for word in re.split('([0-9]+)', page._body['title']): 181 | if word.isdigit(): 182 | title.append(int(word)) 183 | else: 184 | title.append(word.lower()) 185 | return title 186 | 187 | def update(self, target): 188 | super().update('component:credits-' + target) 189 | 190 | 191 | class SeriesCredits(CreditUpdater): 192 | 193 | def __init__(self, wiki, pages, series): 194 | super().__init__(wiki, pages) 195 | self.series = (series - 1) * 1000 196 | 197 | def keys(self): 198 | return ['{:03}-{:03}'.format(i or 2, i + 99) 199 | for i in range(self.series, self.series + 999, 100)] 200 | 201 | def keyfunc(self, page): 202 | num = re.search('[scp]+-([0-9]+)$', page._body['fullname']) 203 | if not num: 204 | return 205 | num = (int(num.group(1)) // 100) * 100 206 | return '{:03}-{:03}'.format(num or 2, num + 99) 207 | 208 | 209 | class MiscCredits(CreditUpdater): 210 | 211 | def __init__(self, wiki, pages): 212 | self.proposals = pyscp.wikidot.Wiki('scp-wiki')('scp-001').links 213 | super().__init__(wiki, pages) 214 | 215 | def keys(self): 216 | return 'proposals explained joke archived'.split() 217 | 218 | def disp(self): 219 
| return [ 220 | '001 Proposals', 'Explained Phenomena', 221 | 'Joke Articles', 'Archived Articles'] 222 | 223 | def keyfunc(self, page): 224 | if page.url in self.proposals: 225 | return 'proposals' 226 | for tag in ('explained', 'joke', 'archived'): 227 | if tag in page.tags: 228 | return tag 229 | 230 | 231 | def update_credit_hubs(wiki): 232 | pages = list(wiki.list_pages( 233 | tag='scp', body='title created_by tags')) 234 | wiki = pyscp.wikidot.Wiki('scpsandbox2') 235 | with open('pyscp_bot.pass') as file: 236 | wiki.auth('jarvis-bot', file.read()) 237 | 238 | SeriesCredits(wiki, pages, 1).update('series1') 239 | SeriesCredits(wiki, pages, 2).update('series2') 240 | SeriesCredits(wiki, pages, 3).update('series3') 241 | MiscCredits(wiki, pages).update('misc') 242 | 243 | ############################################################################### 244 | 245 | wiki = pyscp.wikidot.Wiki('scp-wiki') 246 | with open('/media/hdd0/code/pyscp/bin/pyscp_bot.pass') as file: 247 | wiki.auth('jarvis-bot', file.read()) 248 | 249 | pyscp.utils.default_logging() 250 | #update_credit_hubs(wiki) 251 | 252 | update_tale_hubs(wiki) 253 | -------------------------------------------------------------------------------- /pyscp/__init__.py: -------------------------------------------------------------------------------- 1 | from pyscp import core, utils, snapshot, wikidot -------------------------------------------------------------------------------- /pyscp/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Abstract Base Classes. 5 | 6 | pyscp builds most of its functionality on top of three large classes: Wiki, 7 | Page, and Thread. This module contains the abstract base classes for those 8 | three. The ABC-s define the abstact methods that each child must implement, 9 | as well as some common functionality that builds on top of the abstract 10 | methods. 
11 | 12 | Each class inheriting from the ABC-s must implement its own realization of 13 | the abstract methods, and can also provide additional methods unique to it. 14 | 15 | This module also defines the named tuples for simple containers used by the 16 | three core classes, such as Revision or Vote. 17 | """ 18 | 19 | 20 | ############################################################################### 21 | # Module Imports 22 | ############################################################################### 23 | 24 | import abc 25 | import arrow 26 | import bs4 27 | import collections 28 | import functools 29 | import itertools 30 | import re 31 | import urllib.parse 32 | import logging 33 | 34 | import pyscp.utils 35 | 36 | ############################################################################### 37 | # Global Constants And Variables 38 | ############################################################################### 39 | 40 | log = logging.getLogger(__name__) 41 | 42 | ############################################################################### 43 | # Abstract Base Classes 44 | ############################################################################### 45 | 46 | 47 | class Page(metaclass=abc.ABCMeta): 48 | """ 49 | Page Abstract Base Class. 50 | 51 | Page object are wrappers around individual wiki-pages, and allow simple 52 | operations with them, such as retrieving the rating or the author. 53 | 54 | Each Page instance is attached to a specific instance of the Wiki class. 55 | The wiki may be used by the page to retrieve a list of titles or other 56 | similar wiki-wide information that may be used by the Page to, in turn, 57 | deduce some information about itself. 58 | 59 | Typically, the Page instances should not be created directly. Instead, 60 | calling an instance of a Wiki class will creating a Page instance 61 | attached to that wiki. 
62 | """ 63 | 64 | ########################################################################### 65 | # Special Methods 66 | ########################################################################### 67 | 68 | def __init__(self, wiki, url): 69 | self.url = url 70 | self._wiki = wiki 71 | 72 | def __repr__(self): 73 | return '{}.{}({}, {})'.format( 74 | self.__module__, self.__class__.__name__, 75 | repr(self.url), repr(self._wiki)) 76 | 77 | def __eq__(self, other): 78 | if not hasattr(other, 'url') or not hasattr(other, '_wiki'): 79 | return False 80 | return self.url == other.url and self._wiki is other._wiki 81 | 82 | ########################################################################### 83 | # Abstract Methods 84 | ########################################################################### 85 | 86 | @property 87 | @abc.abstractmethod 88 | def _pdata(self): 89 | """ 90 | Commonly used data about the page. 91 | 92 | This method should return a tuple, the first three elements of which 93 | are the id number of the page; the id number of the page's comments 94 | thread; and the html contents of the page. 95 | 96 | Any additional elements of the tuple are left to the discretion 97 | of the individual Page implimentations. 98 | """ 99 | pass 100 | 101 | @property 102 | @abc.abstractmethod 103 | def history(self): 104 | """ 105 | Revision history of the page. 106 | 107 | Should return a sorted list of Revision named tuples. 108 | """ 109 | pass 110 | 111 | @property 112 | @abc.abstractmethod 113 | def votes(self): 114 | """ 115 | Page votes. 116 | 117 | Should return a list of Vote named tuples. 118 | """ 119 | pass 120 | 121 | @property 122 | @abc.abstractmethod 123 | def tags(self): 124 | """ 125 | Page tags. 126 | 127 | Should return a set of strings. 
128 | """ 129 | pass 130 | 131 | ########################################################################### 132 | # Internal Methods 133 | ########################################################################### 134 | 135 | @property 136 | def _id(self): 137 | """Unique ID number of the page.""" 138 | return self._pdata[0] 139 | 140 | @pyscp.utils.cached_property 141 | def _thread(self): 142 | """Thread object corresponding to the page's comments thread.""" 143 | return self._wiki.Thread(self._wiki, self._pdata[1]) 144 | 145 | @property 146 | def _raw_title(self): 147 | """Title as displayed on the page.""" 148 | title = self._soup.find(id='page-title') 149 | return title.text.strip() if title else '' 150 | 151 | @property 152 | def _raw_author(self): 153 | return self.history[0].user 154 | 155 | @property 156 | def _soup(self): 157 | """BeautifulSoup of the contents of the page.""" 158 | return bs4.BeautifulSoup(self.html, 'lxml') 159 | 160 | ########################################################################### 161 | # Properties 162 | ########################################################################### 163 | 164 | @property 165 | def html(self): 166 | """HTML contents of the page.""" 167 | return self._pdata[2] 168 | 169 | @property 170 | def posts(self): 171 | """List of the comments made on the page.""" 172 | return self._thread.posts 173 | 174 | @property 175 | def comments(self): 176 | """Alias for Page.posts.""" 177 | return self._thread.posts 178 | 179 | @property 180 | def text(self): 181 | """Plain text of the page.""" 182 | return self._soup.find(id='page-content').text 183 | 184 | @property 185 | def wordcount(self): 186 | """Number of words encountered on the page.""" 187 | return len(re.findall(r"[\w'█_-]+", self.text)) 188 | 189 | @property 190 | def images(self): 191 | """Number of images dislayed on the page.""" 192 | # TODO: needs more work. 
193 | return [i['src'] for i in self._soup('img')] 194 | 195 | @property 196 | def name(self): 197 | return self.url.split('/')[-1] 198 | 199 | @property 200 | def title(self): 201 | """ 202 | Title of the page. 203 | 204 | In case of SCP articles, will include the title from the 'series' page. 205 | """ 206 | try: 207 | return '{}: {}'.format( 208 | self._raw_title, self._wiki.titles()[self.url]) 209 | except KeyError: 210 | return self._raw_title 211 | 212 | @property 213 | def created(self): 214 | """When was the page created.""" 215 | return self.history[0].time 216 | 217 | @property 218 | def metadata(self): 219 | """ 220 | Return page metadata. 221 | 222 | Authors in this case includes all users related to the creation 223 | and subsequent maintenance of the page. The values of the dict 224 | describe the user's relationship to the page. 225 | """ 226 | data = [i for i in self._wiki.metadata() if i.url == self.url] 227 | data = {i.user: i for i in data} 228 | 229 | if 'author' not in {i.role for i in data.values()}: 230 | meta = Metadata(self.url, self._raw_author, 'author', None) 231 | data[self._raw_author] = meta 232 | 233 | for k, v in data.items(): 234 | if v.role == 'author' and not v.date: 235 | data[k] = v._replace(date=self.created) 236 | 237 | return data 238 | 239 | @property 240 | def rating(self): 241 | """Rating of the page, excluding deleted accounts.""" 242 | return sum( 243 | v.value for v in self.votes if v.user != '(account deleted)') 244 | 245 | @property 246 | @pyscp.utils.listify() 247 | def links(self): 248 | """ 249 | Other pages linked from this one. 250 | 251 | Returns an ordered list of unique urls. Off-site links or links to 252 | images are not included. 
253 | """ 254 | unique = set() 255 | for element in self._soup.select('#page-content a'): 256 | href = element.get('href', None) 257 | if (not href or href[0] != '/' or # bad or absolute link 258 | href[-4:] in ('.png', '.jpg', '.gif')): 259 | continue 260 | url = self._wiki.site + href.rstrip('|') 261 | if url not in unique: 262 | unique.add(url) 263 | yield url 264 | 265 | @property 266 | def parent(self): 267 | """Parent of the current page.""" 268 | if not self.html: 269 | return None 270 | breadcrumb = self._soup.select('#breadcrumbs a') 271 | if breadcrumb: 272 | return self._wiki.site + breadcrumb[-1]['href'] 273 | 274 | @property 275 | def is_mainlist(self): 276 | """ 277 | Indicate whether the page is a mainlist scp article. 278 | 279 | This is an scp-wiki exclusive property. 280 | """ 281 | if 'scp-wiki' not in self._wiki.site: 282 | return False 283 | if 'scp' not in self.tags: 284 | return False 285 | return bool(re.search(r'/scp-[0-9]{3,4}$', self.url)) 286 | 287 | ########################################################################### 288 | # Methods 289 | ########################################################################### 290 | 291 | def build_attribution_string( 292 | self, templates=None, group_templates=None, separator=', ', 293 | user_formatter=None): 294 | """ 295 | Create an attribution string based on the page's metadata. 296 | 297 | This is a commonly needed operation. The result should be a nicely 298 | formatted, human-readable description of who was and is involved with 299 | the page, and in what role. 
300 | """ 301 | roles = 'author rewrite translator maintainer'.split() 302 | 303 | if not templates: 304 | templates = {i: '{{user}} ({})'.format(i) for i in roles} 305 | 306 | items = list(self.metadata.values()) 307 | items.sort(key=lambda x: [roles.index(x.role), x.date]) 308 | 309 | # group users in the same role on the same date together 310 | itemdict = collections.OrderedDict() 311 | for i in items: 312 | user = user_formatter.format(i.user) if user_formatter else i.user 313 | key = (i.role, i.date) 314 | itemdict[key] = itemdict.get(key, []) + [user] 315 | 316 | output = [] 317 | 318 | for (role, date), users in itemdict.items(): 319 | 320 | hdate = arrow.get(date).humanize() if date else '' 321 | 322 | if group_templates and len(users) > 1: 323 | output.append( 324 | group_templates[role].format( 325 | date=date, 326 | hdate=hdate, 327 | users=', '.join(users[:-1]), 328 | last_user=users[-1])) 329 | else: 330 | for user in users: 331 | output.append( 332 | templates[role].format( 333 | date=date, hdate=hdate, user=user)) 334 | 335 | return separator.join(output) 336 | 337 | 338 | class Thread(metaclass=abc.ABCMeta): 339 | """ 340 | Thread Abstract Base Class. 341 | 342 | Thread objects represent individual forum threads. Most pages have a 343 | corresponding comments thread, accessible via Page._thread. 344 | """ 345 | 346 | def __init__(self, wiki, _id, title=None, description=None): 347 | self._wiki = wiki 348 | self._id, self.title, self.description = _id, title, description 349 | 350 | @abc.abstractmethod 351 | def posts(self): 352 | """Posts in this thread.""" 353 | pass 354 | 355 | 356 | class Wiki(metaclass=abc.ABCMeta): 357 | """ 358 | Wiki Abstract Base Class. 359 | 360 | Wiki objects provide wiki-wide functionality not limited to individual 361 | pages or threads. 
362 | """ 363 | 364 | ########################################################################### 365 | # Class Attributes 366 | ########################################################################### 367 | 368 | # should point to the respective Page and Thread classes in each submodule. 369 | 370 | Page = Page 371 | Thread = Thread 372 | 373 | ########################################################################### 374 | # Special Methods 375 | ########################################################################### 376 | 377 | def __init__(self, site): 378 | parsed = urllib.parse.urlparse(site) 379 | netloc = parsed.netloc if parsed.netloc else parsed.path 380 | if '.' not in netloc: 381 | netloc += '.wikidot.com' 382 | self.site = urllib.parse.urlunparse(['http', netloc, '', '', '', '']) 383 | self._title_data = {} 384 | 385 | def __call__(self, name): 386 | url = name if self.site in name else '{}/{}'.format(self.site, name) 387 | url = url.replace(' ', '-').replace('_', '-').lower() 388 | return self.Page(self, url) 389 | 390 | ########################################################################### 391 | 392 | @functools.lru_cache(maxsize=1) 393 | def metadata(self): 394 | """ 395 | List page ownership metadata. 396 | 397 | This method is exclusive to the scp-wiki, and is used to fine-tune 398 | the page ownership information beyond what is possible with Wikidot. 399 | This allows a single page to have an author different from the user 400 | who created the zeroth revision of the page, or even have multiple 401 | users attached to the page in various roles. 
402 | """ 403 | if 'scp-wiki' not in self.site: 404 | return [] 405 | soup = self('attribution-metadata')._soup 406 | results = [] 407 | for row in soup('tr')[1:]: 408 | name, user, type_, date = [i.text.strip() for i in row('td')] 409 | name = name.lower() 410 | url = '{}/{}'.format(self.site, name) 411 | results.append(pyscp.core.Metadata(url, user, type_, date)) 412 | return results 413 | 414 | def _update_titles(self): 415 | for name in ( 416 | 'scp-series', 'scp-series-2', 'scp-series-3', 'scp-series-4', 'scp-series-5', 417 | 'joke-scps', 'scp-ex', 'archived-scps'): 418 | page = self(name) 419 | try: 420 | soup = page._soup 421 | except: 422 | continue 423 | self._title_data[name] = soup 424 | 425 | @functools.lru_cache(maxsize=1) 426 | @pyscp.utils.ignore(value={}) 427 | @pyscp.utils.log_errors(logger=log.error) 428 | def titles(self): 429 | """Dict of url/title pairs for scp articles.""" 430 | if 'scp-wiki' not in self.site: 431 | return {} 432 | 433 | self._update_titles() 434 | 435 | elems = [i.select('ul > li') for i in self._title_data.values()] 436 | elems = list(itertools.chain(*elems)) 437 | try: 438 | elems += list(self('scp-001')._soup(class_='series')[1]('p')) 439 | except: 440 | pass 441 | 442 | titles = {} 443 | for elem in elems: 444 | 445 | sep = ' - ' if ' - ' in elem.text else ', ' 446 | try: 447 | url1 = self.site + elem.a['href'] 448 | skip, title = elem.text.split(sep, maxsplit=1) 449 | except (ValueError, TypeError): 450 | continue 451 | 452 | if title != '[ACCESS DENIED]': 453 | url2 = self.site + '/' + skip.lower() 454 | titles[url1] = titles[url2] = title 455 | 456 | return titles 457 | 458 | def list_pages(self, **kwargs): 459 | """Return pages matching the specified criteria.""" 460 | pages = self._list_pages_parsed(**kwargs) 461 | author = kwargs.pop('author', None) 462 | if not author: 463 | # if 'author' isn't specified, there's no need to check rewrites 464 | return pages 465 | include, exclude = set(), set() 466 | for meta in 
self.metadata(): 467 | if meta.user == author: 468 | # if username matches, include regardless of type 469 | include.add(meta.url) 470 | elif meta.role == 'author': 471 | # exclude only if override type is author. 472 | # if url has author and rewrite author, 473 | # it will appear in list_pages for both. 474 | exclude.add(meta.url) 475 | urls = {p.url for p in pages} | include - exclude 476 | # if no other options beside author were specified, 477 | # just return everything we can 478 | if not kwargs: 479 | return map(self, sorted(urls)) 480 | # otherwise, retrieve the list of urls without the author parameter 481 | # to check which urls we should return and in which order 482 | pages = self._list_pages_parsed(**kwargs) 483 | return [p for p in pages if p.url in urls] 484 | 485 | ############################################################################### 486 | # Named Tuple Containers 487 | ############################################################################### 488 | 489 | nt = collections.namedtuple 490 | Revision = nt('Revision', 'id number user time comment') 491 | Vote = nt('Vote', 'user value') 492 | Post = nt('Post', 'id title content user time parent') 493 | File = nt('File', 'url name filetype size') 494 | Metadata = nt('Metadata', 'url user role date') 495 | Category = nt('Category', 'id title description size') 496 | Image = nt('Image', 'url source status notes data') 497 | del nt 498 | -------------------------------------------------------------------------------- /pyscp/orm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | import concurrent.futures 8 | import logging 9 | import peewee 10 | import queue 11 | 12 | from itertools import islice 13 | 14 | 
############################################################################### 15 | # Global Constants And Variables 16 | ############################################################################### 17 | 18 | log = logging.getLogger('pyscp.orm') 19 | pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) 20 | queue = queue.Queue() 21 | 22 | 23 | def queue_execution(fn, args=(), kw={}): 24 | queue.put(dict(fn=fn, args=args, kw=kw)) 25 | pool.submit(async_write) 26 | 27 | ############################################################################### 28 | # Database ORM Classes 29 | ############################################################################### 30 | 31 | db = peewee.Proxy() 32 | 33 | 34 | class BaseModel(peewee.Model): 35 | 36 | class Meta: 37 | database = db 38 | 39 | @classmethod 40 | def create(cls, **kw): 41 | queue_execution(fn=super().create, kw=kw) 42 | 43 | @classmethod 44 | def create_table(cls): 45 | if not hasattr(cls, '_id_cache'): 46 | cls._id_cache = [] 47 | queue_execution(fn=super().create_table, args=(True,)) 48 | 49 | @classmethod 50 | def insert_many(cls, data): 51 | data_iter = iter(data) 52 | chunk = list(islice(data_iter, 500)) 53 | while chunk: 54 | queue_execution( 55 | fn=lambda x: super(BaseModel, cls).insert_many(x).execute(), 56 | args=(chunk, )) 57 | chunk = list(islice(data_iter, 500)) 58 | 59 | @classmethod 60 | def convert_to_id(cls, data, key='user'): 61 | for row in data: 62 | if row[key] not in cls._id_cache: 63 | cls._id_cache.append(row[key]) 64 | row[key] = cls._id_cache.index(row[key]) + 1 65 | yield row 66 | 67 | @classmethod 68 | def write_ids(cls, field_name): 69 | cls.insert_many([ 70 | {'id': cls._id_cache.index(value) + 1, field_name: value} 71 | for value in set(cls._id_cache)]) 72 | cls._id_cache.clear() 73 | 74 | 75 | class ForumCategory(BaseModel): 76 | title = peewee.CharField() 77 | description = peewee.TextField() 78 | 79 | 80 | class ForumThread(BaseModel): 81 | category = 
peewee.ForeignKeyField(ForumCategory, null=True) 82 | title = peewee.CharField(null=True) 83 | description = peewee.TextField(null=True) 84 | 85 | 86 | class Page(BaseModel): 87 | url = peewee.CharField(unique=True) 88 | html = peewee.TextField() 89 | thread = peewee.ForeignKeyField( 90 | ForumThread, related_name='page', null=True) 91 | 92 | 93 | class User(BaseModel): 94 | name = peewee.CharField(unique=True) 95 | 96 | 97 | class Revision(BaseModel): 98 | page = peewee.ForeignKeyField(Page, related_name='revisions', index=True) 99 | user = peewee.ForeignKeyField(User, related_name='revisions', index=True) 100 | number = peewee.IntegerField() 101 | time = peewee.DateTimeField() 102 | comment = peewee.CharField(null=True) 103 | 104 | 105 | class Vote(BaseModel): 106 | page = peewee.ForeignKeyField(Page, related_name='votes', index=True) 107 | user = peewee.ForeignKeyField(User, related_name='votes', index=True) 108 | value = peewee.IntegerField() 109 | 110 | 111 | class ForumPost(BaseModel): 112 | thread = peewee.ForeignKeyField( 113 | ForumThread, related_name='posts', index=True) 114 | user = peewee.ForeignKeyField(User, related_name='posts', index=True) 115 | parent = peewee.ForeignKeyField('self', null=True) 116 | title = peewee.CharField(null=True) 117 | time = peewee.DateTimeField() 118 | content = peewee.TextField() 119 | 120 | 121 | class Tag(BaseModel): 122 | name = peewee.CharField(unique=True) 123 | 124 | 125 | class PageTag(BaseModel): 126 | page = peewee.ForeignKeyField(Page, related_name='tags', index=True) 127 | tag = peewee.ForeignKeyField(Tag, related_name='pages', index=True) 128 | 129 | 130 | class OverrideType(BaseModel): 131 | name = peewee.CharField(unique=True) 132 | 133 | 134 | class Override(BaseModel): 135 | url = peewee.ForeignKeyField(Page, to_field=Page.url, index=True) 136 | user = peewee.ForeignKeyField(User, index=True) 137 | type = peewee.ForeignKeyField(OverrideType) 138 | 139 | 140 | class ImageStatus(BaseModel): 141 | name = 
peewee.CharField(unique=True) 142 | 143 | 144 | class Image(BaseModel): 145 | url = peewee.CharField(unique=True) 146 | source = peewee.CharField() 147 | data = peewee.BlobField() 148 | status = peewee.ForeignKeyField(ImageStatus) 149 | notes = peewee.TextField(null=True) 150 | 151 | ############################################################################### 152 | # Helper Functions 153 | ############################################################################### 154 | 155 | 156 | def async_write(buffer=[]): 157 | item = queue.get() 158 | buffer.append(item) 159 | if len(buffer) > 500 or queue.empty(): 160 | log.debug('Processing {} queue items.'.format(len(buffer))) 161 | with db.transaction(): 162 | write_buffer(buffer) 163 | buffer.clear() 164 | 165 | 166 | def write_buffer(buffer): 167 | for item in buffer: 168 | try: 169 | item['fn'](*item.get('args', ()), **item.get('kw', {})) 170 | except: 171 | log.exception( 172 | 'Exception while processing queue item: {}' 173 | .format(item)) 174 | queue.task_done() 175 | 176 | 177 | def create_tables(*tables): 178 | for table in tables: 179 | eval(table).create_table() 180 | 181 | 182 | def connect(dbpath): 183 | log.info('Connecting to the database at {}'.format(dbpath)) 184 | db.initialize(peewee.SqliteDatabase(dbpath)) 185 | db.connect() 186 | 187 | 188 | ############################################################################### 189 | # Macros 190 | ############################################################################### 191 | 192 | 193 | def votes_by_user(user): 194 | up, down = [], [] 195 | for vote in (Vote.select().join(User).where(User.name == user)): 196 | if vote.value == 1: 197 | up.append(vote.page.url) 198 | else: 199 | down.append(vote.page.url) 200 | return {'+': up, '-': down} 201 | -------------------------------------------------------------------------------- /pyscp/resources/cover.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anqxyr/pyscp/fc85c808495f8f47783db6fb12a79ce7727e919c/pyscp/resources/cover.png -------------------------------------------------------------------------------- /pyscp/resources/pages/cover.xhtml: -------------------------------------------------------------------------------- 1 |
2 | 3 |
-------------------------------------------------------------------------------- /pyscp/resources/pages/intro.xhtml: -------------------------------------------------------------------------------- 1 |
2 |

Mankind in its present state has been around for a quarter of a million years, yet only the last 4,000 have been of any significance.

3 |

So, what did we do for nearly 250,000 years? We huddled in caves and around small fires, fearful of the things that we didn't understand. It was more than explaining why the sun came up, it was the mystery of enormous birds with heads of men and rocks that came to life. So we called them 'gods' and 'demons', begged them to spare us, and prayed for salvation.

4 |

In time, their numbers dwindled and ours rose. The world began to make more sense when there were fewer things to fear, yet the unexplained can never truly go away, as if the universe demands the absurd and impossible.

5 |

Mankind must not go back to hiding in fear. No one else will protect us, and we must stand up for ourselves.

6 |

While the rest of mankind dwells in the light, we must stand in the darkness to fight it, contain it, and shield it from the eyes of the public, so that others may live in a sane and normal world.

7 |
8 |

We secure. We contain. We protect.

9 |

— The Administrator

10 |
11 |
12 | -------------------------------------------------------------------------------- /pyscp/resources/pages/license.xhtml: -------------------------------------------------------------------------------- 1 |

This book contains the collected works of the SCP Foundation, a collaborative fiction writing website. All contents are licensed under the CC-BY-SA 3.0 license. The stories comprising the book are available online at www.scp-wiki.net .

-------------------------------------------------------------------------------- /pyscp/resources/pages/title.xhtml: -------------------------------------------------------------------------------- 1 |
2 |

SCP Foundation

3 |
Ebook edition
4 |
-------------------------------------------------------------------------------- /pyscp/resources/stafflist.txt: -------------------------------------------------------------------------------- 1 | Accelerando 2 | anqxyr 3 | Blaroth 4 | Bouncl 5 | Chubert 6 | Crayne 7 | Devereaux 8 | Dexanote 9 | djkaktus 10 | Doctor Anborough 11 | DrBright 12 | DrClef 13 | DrEverettMann 14 | Drewbear 15 | Eskobar 16 | Faminepulse 17 | Fantem 18 | FlameShirt 19 | FortuneFavorsBold 20 | Gaffney 21 | Kalinin 22 | Kate McTiriss 23 | LurkD 24 | MisterFlames 25 | murphy_slaw 26 | Nioki 27 | Photosynthetic 28 | Pig_catapult 29 | Pixeltasim 30 | ProcyonLotor 31 | pxdnbluesoul 32 | Reject 33 | Riemann 34 | Roget 35 | Rumetzen 36 | Silberescher 37 | Sophia Light 38 | SoullessSingularity 39 | spikebrennan 40 | thattallfellow 41 | thedeadlymoose 42 | TroyL 43 | Tuomey Tombstone 44 | Vincent_Redgrave 45 | Vivax 46 | weizhong 47 | Wogglebug 48 | Zyn 49 | -------------------------------------------------------------------------------- /pyscp/resources/stylesheet.css: -------------------------------------------------------------------------------- 1 | @namespace h "http://www.w3.org/1999/xhtml"; 2 | .title1 { 3 | text-align: center; 4 | } 5 | .title1-bold { 6 | font-size: 250%; 7 | font-weight: bold; 8 | } 9 | .title2 { 10 | font-size: 150%; 11 | font-weight: bold; 12 | text-align: center; 13 | } 14 | .bold { 15 | font-weight: bold; 16 | } 17 | .italic { 18 | font-style: italic; 19 | } 20 | .license { 21 | font-style: italic; 22 | margin-left: 10%; 23 | margin-top: 40%; 24 | max-width: 80%; 25 | text-align: justify; 26 | } 27 | .quote { 28 | background-color: #fafafa; 29 | border: 1px dashed #bbb; 30 | margin: 0.5em 5%; 31 | padding: 0 1em; 32 | } 33 | .collapsible { 34 | background-color: #fafafa; 35 | margin: 0.5em 5%; 36 | padding: 0 1em; 37 | } 38 | .collaps-title { 39 | border-bottom: 1px solid #444; 40 | font-weight: bold; 41 | margin: 0 -1em; 42 | padding: 0.5em 1em; 43 | } 44 | 
.collapsible .quote{ 45 | background-color: #E0E0E0; 46 | } 47 | .scp-title { 48 | font-size: 120%; 49 | font-weight: bold; 50 | margin: 2em 0; 51 | } 52 | .tale-title { 53 | font-size: 120%; 54 | font-style: italic; 55 | margin: 2em 0; 56 | text-align: center; 57 | } 58 | table { 59 | border-bottom: 1px solid #999; 60 | border-collapse: separate; 61 | border-right: 1px solid #999; 62 | border-spacing: 0; 63 | } 64 | table th { 65 | background-color: #e0e0e0; 66 | } 67 | table th, table td { 68 | border-left: 1px solid #999; 69 | border-top: 1px solid #999; 70 | max-width: 1600px; 71 | overflow: hidden; 72 | padding: 10px; 73 | page-break-inside: avoid; 74 | } 75 | .intro { 76 | font-size: 130%; 77 | margin: 5em 5%; 78 | } 79 | .sign { 80 | text-align: center; 81 | } 82 | .attrib { 83 | font-size: 80%; 84 | } 85 | .link { 86 | font-family: monospace; 87 | font-weight: bold; 88 | text-decoration: underline; 89 | } 90 | .footer { 91 | text-align: center; 92 | } 93 | * { 94 | font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif; 95 | } 96 | .scp-image-block { 97 | float: right; 98 | clear: right; 99 | margin: 0 2em 1em 2em; 100 | border: solid 1px #666; 101 | box-shadow: 0 1px 6px rgba(0,0,0,.25); 102 | width: 300px; 103 | } 104 | .scp-image-block.block-left { 105 | float: left; 106 | clear: left; 107 | margin: 0 2em 1em 0; 108 | } 109 | .scp-image-block img { 110 | border: 0; 111 | width: 300px; 112 | } 113 | .scp-image-block .scp-image-caption { 114 | background-color: #eee; 115 | border-top: solid 1px #666; 116 | padding: 2px 0; 117 | font-size: 80%; 118 | font-weight: bold; 119 | text-align: center; 120 | width: 300px; 121 | } 122 | .scp-image-block > p { 123 | margin: 0; 124 | } 125 | .scp-image-block .scp-image-caption > p { 126 | margin: 0; 127 | padding: 0 10px; 128 | } 129 | -------------------------------------------------------------------------------- 
/pyscp/resources/templates/container.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /pyscp/resources/templates/content.opf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pyscp/resources/templates/page.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | <link href="../stylesheet.css" rel="stylesheet" type="text/css"/> 7 | </head> 8 | <body/> 9 | </html> -------------------------------------------------------------------------------- /pyscp/resources/templates/toc.ncx: -------------------------------------------------------------------------------- 1 | <?xml version='1.0' encoding='UTF-8'?> 2 | <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> 3 | <head> 4 | <meta content="" name="dtb:uid"/> 5 | <meta content="0" name="dtb:depth"/> 6 | <meta content="0" name="dtb:totalPageCount"/> 7 | <meta content="0" name="dtb:maxPageNumber"/> 8 | </head> 9 | <docTitle> 10 | <text/> 11 | </docTitle> 12 | <navMap/> 13 | </ncx> -------------------------------------------------------------------------------- /pyscp/snapshot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Snapshot access classes. 5 | 6 | This module contains the classes that facilitate information extraction 7 | and communication with the sqlite Snapshots. 
8 | """ 9 | 10 | ############################################################################### 11 | # Module Imports 12 | ############################################################################### 13 | 14 | import bs4 15 | import concurrent.futures 16 | import functools 17 | import itertools 18 | import logging 19 | import operator 20 | import pathlib 21 | import re 22 | import requests 23 | 24 | from pyscp import core, orm, utils 25 | 26 | ############################################################################### 27 | # Global Constants And Variables 28 | ############################################################################### 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | ############################################################################### 33 | 34 | 35 | class Page(core.Page): 36 | """Page object.""" 37 | 38 | ########################################################################### 39 | # Internal Methods 40 | ########################################################################### 41 | 42 | def _query(self, ptable, stable='User'): 43 | """Generate SQL queries used to retrieve data.""" 44 | pt, st = [getattr(orm, i) for i in (ptable, stable)] 45 | return pt.select(pt, st.name).join(st).where(pt.page == self._id) 46 | 47 | @utils.cached_property 48 | def _pdata(self): 49 | """Preload the ids and contents of the page.""" 50 | pdata = orm.Page.get(orm.Page.url == self.url) 51 | return pdata.id, pdata._data['thread'], pdata.html 52 | 53 | ########################################################################### 54 | # Properties 55 | ########################################################################### 56 | 57 | @property 58 | def html(self): 59 | """Return HTML contents of the page.""" 60 | return self._pdata[2] 61 | 62 | @utils.cached_property 63 | def history(self): 64 | """Return the revisions of the page.""" 65 | revs = self._query('Revision') 66 | revs = sorted(revs, key=lambda x: x.number) 67 | return 
[core.Revision( 68 | r.id, r.number, r.user.name, str(r.time), r.comment) 69 | for r in revs] 70 | 71 | @utils.cached_property 72 | def votes(self): 73 | """Return all votes made on the page.""" 74 | return [core.Vote(v.user.name, v.value) 75 | for v in self._query('Vote')] 76 | 77 | @utils.cached_property 78 | def tags(self): 79 | """Return the set of tags with which the page is tagged.""" 80 | return {pt.tag.name for pt in self._query('PageTag', 'Tag')} 81 | 82 | 83 | class Thread(core.Thread): 84 | """Discussion/forum thread.""" 85 | 86 | @utils.cached_property 87 | def posts(self): 88 | """Post objects belonging to this thread.""" 89 | fp = orm.ForumPost 90 | us = orm.User 91 | query = fp.select(fp, us.name).join(us).where(fp.thread == self._id) 92 | return [core.Post( 93 | p.id, p.title, p.content, p.user.name, 94 | str(p.time), p._data['parent']) 95 | for p in query] 96 | 97 | 98 | class Wiki(core.Wiki): 99 | """Snapshot of a Wikidot website.""" 100 | 101 | Page = Page 102 | Thread = Thread 103 | # Tautology = Tautology 104 | 105 | ########################################################################### 106 | # Special Methods 107 | ########################################################################### 108 | 109 | def __init__(self, site, dbpath): 110 | """Create wiki instance.""" 111 | super().__init__(site) 112 | if not pathlib.Path(dbpath).exists(): 113 | raise FileNotFoundError(dbpath) 114 | self.dbpath = dbpath 115 | orm.connect(dbpath) 116 | 117 | def __repr__(self): 118 | """Pretty-print current instance.""" 119 | return '{}.{}({}, {})'.format( 120 | self.__module__, 121 | self.__class__.__qualname__, 122 | repr(self.site), 123 | repr(self.dbpath)) 124 | 125 | ########################################################################### 126 | # Internal Methods 127 | ########################################################################### 128 | 129 | @staticmethod 130 | def _filter_author(author): 131 | return (orm.Page.select(orm.Page.url) 
132 | .join(orm.Revision).join(orm.User) 133 | .where(orm.Revision.number == 0) 134 | .where(orm.User.name == author)) 135 | 136 | @staticmethod 137 | def _filter_tag(tag): 138 | return (orm.Page.select(orm.Page.url) 139 | .join(orm.PageTag).join(orm.Tag) 140 | .where(orm.Tag.name == tag)) 141 | 142 | @staticmethod 143 | def _get_operator(string): 144 | symbol, *values = re.split(r'(\d+)', string) 145 | opdict = { 146 | '>': 'gt', '<': 'lt', '>=': 'ge', '<=': 'le', '=': 'eq', '': 'eq'} 147 | if symbol not in opdict: 148 | raise ValueError 149 | return getattr(operator, opdict[symbol]), values 150 | 151 | def _filter_rating(self, rating): 152 | compare, values = self._get_operator(rating) 153 | rating = int(values[0]) 154 | return (orm.Page.select(orm.Page.url) 155 | .join(orm.Vote).group_by(orm.Page.url) 156 | .having(compare(orm.peewee.fn.sum(orm.Vote.value), rating))) 157 | 158 | def _filter_created(self, created): 159 | compare, values = self._get_operator(created) 160 | date = '-'.join(values[::2]) 161 | return (orm.Page.select(orm.Page.url) 162 | .join(orm.Revision).where(orm.Revision.number == 0) 163 | .group_by(orm.Page.url) 164 | .having(compare( 165 | orm.peewee.fn.substr(orm.Revision.time, 1, len(date)), 166 | date))) 167 | 168 | def _list_pages_parsed(self, **kwargs): 169 | query = orm.Page.select(orm.Page.url) 170 | keys = ('author', 'tag', 'rating', 'created') 171 | keys = [k for k in keys if k in kwargs] 172 | for k in keys: 173 | query = query & getattr(self, '_filter_' + k)(kwargs[k]) 174 | if 'limit' in kwargs: 175 | query = query.limit(kwargs['limit']) 176 | return map(self, [p.url for p in query]) 177 | 178 | ########################################################################### 179 | # SCP-Wiki Specific Methods 180 | ########################################################################### 181 | 182 | @functools.lru_cache(maxsize=1) 183 | def list_images(self): 184 | """Image metadata.""" 185 | query = ( 186 | orm.Image.select(orm.Image, 
orm.ImageStatus.name) 187 | .join(orm.ImageStatus)) 188 | return [core.Image(r.url, r.source, r.status.name, r.notes, r.data) 189 | for r in query] 190 | 191 | ############################################################################### 192 | 193 | 194 | class SnapshotCreator: 195 | """ 196 | Create a snapshot of a wikidot site. 197 | 198 | This class uses WikidotConnector to iterate over all the pages of a site, 199 | and save the html content, revision history, votes, and the discussion 200 | of each to a sqlite database. Optionally, standalone forum threads can be 201 | saved too. 202 | 203 | In case of the scp-wiki, some additional information is saved: 204 | images for which their CC status has been confirmed, and info about 205 | overwriting page authorship. 206 | 207 | In general, this class will not save images hosted on the site that is 208 | being saved. Only the html content, discussions, and revision/vote 209 | metadata is saved. 210 | """ 211 | 212 | def __init__(self, dbpath): 213 | """Create an instance.""" 214 | if pathlib.Path(dbpath).exists(): 215 | raise FileExistsError(dbpath) 216 | orm.connect(dbpath) 217 | self.pool = concurrent.futures.ThreadPoolExecutor(max_workers=20) 218 | 219 | def take_snapshot(self, wiki, forums=False): 220 | """Take new snapshot.""" 221 | self.wiki = wiki 222 | self._save_all_pages() 223 | if forums: 224 | self._save_forums() 225 | if 'scp-wiki' in self.wiki.site: 226 | self._save_meta() 227 | orm.queue.join() 228 | self._save_cache() 229 | orm.queue.join() 230 | log.info('Snapshot succesfully taken.') 231 | 232 | def _save_all_pages(self): 233 | """Iterate over the site pages, call _save_page for each.""" 234 | orm.create_tables( 235 | 'Page', 'Revision', 'Vote', 'ForumPost', 236 | 'PageTag', 'ForumThread', 'User', 'Tag') 237 | count = next( 238 | self.wiki.list_pages(body='total', limit=1))._body['total'] 239 | bar = utils.ProgressBar('SAVING PAGES'.ljust(20), int(count)) 240 | bar.start() 241 | for _ in 
self.pool.map(self._save_page, self.wiki.list_pages()): 242 | bar.value += 1 243 | bar.stop() 244 | 245 | @utils.ignore(requests.HTTPError) 246 | def _save_page(self, page): 247 | """Download contents, revisions, votes and discussion of the page.""" 248 | orm.Page.create( 249 | id=page._id, url=page.url, thread=page._thread._id, html=page.html) 250 | 251 | revisions = orm.User.convert_to_id(i._asdict() for i in page.history) 252 | votes = orm.User.convert_to_id(i._asdict() for i in page.votes) 253 | tags = [{'tag': t} for t in page.tags] 254 | tags = orm.Tag.convert_to_id(tags, key='tag') 255 | 256 | def _insert(table, data): 257 | table.insert_many(dict(i, page=page._id) for i in data) 258 | 259 | _insert(orm.Revision, revisions) 260 | _insert(orm.Vote, votes) 261 | _insert(orm.PageTag, tags) 262 | 263 | self._save_thread(page._thread) 264 | 265 | def _save_forums(self): 266 | """Download and save standalone forum threads.""" 267 | orm.create_tables( 268 | 'ForumPost', 'ForumThread', 'ForumCategory', 'User') 269 | cats = self.wiki.list_categories() 270 | cats = [i for i in cats if i.title != 'Per page discussions'] 271 | orm.ForumCategory.insert_many(dict( 272 | id=c.id, 273 | title=c.title, 274 | description=c.description) for c in cats) 275 | total_size = sum(c.size for c in cats) 276 | bar = utils.ProgressBar('SAVING FORUM THREADS', total_size) 277 | bar.start() 278 | for cat in cats: 279 | threads = set(self.wiki.list_threads(cat.id)) 280 | c_id = itertools.repeat(cat.id) 281 | for _ in self.pool.map(self._save_thread, threads, c_id): 282 | bar.value += 1 283 | bar.stop() 284 | 285 | def _save_thread(self, thread, c_id=None): 286 | orm.ForumThread.create( 287 | category=c_id, id=thread._id, 288 | title=thread.title, description=thread.description) 289 | posts = orm.User.convert_to_id([i._asdict() for i in thread.posts]) 290 | orm.ForumPost.insert_many( 291 | dict(p, thread=thread._id) for p in posts) 292 | 293 | def _save_meta(self): 294 | orm.create_tables( 
295 | 'Image', 'ImageStatus') 296 | licenses = { 297 | 'PERMISSION GRANTED', 'BY-NC-SA CC', 'BY-SA CC', 'PUBLIC DOMAIN'} 298 | images = [i for i in self.wiki.list_images() if i.status in licenses] 299 | self.ibar = utils.ProgressBar( 300 | 'SAVING IMAGES'.ljust(20), len(images)) 301 | self.ibar.start() 302 | data = list(self.pool.map(self._save_image, images)) 303 | self.ibar.stop() 304 | images = orm.ImageStatus.convert_to_id( 305 | [i._asdict() for i in images], key='status') 306 | orm.Image.insert_many( 307 | dict(i, data=d) for i, d in zip(images, data) if d) 308 | 309 | @utils.ignore(requests.RequestException) 310 | def _save_image(self, image): 311 | self.ibar.value += 1 312 | if not image.source: 313 | log.info('Image source not specified: ' + image.url) 314 | return 315 | return self.wiki.req.get(image.url, allow_redirects=True).content 316 | 317 | def _save_cache(self): 318 | for table in orm.User, orm.Tag, orm.OverrideType, orm.ImageStatus: 319 | if hasattr(table, '_id_cache') and table._id_cache: 320 | table.write_ids('name') 321 | -------------------------------------------------------------------------------- /pyscp/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anqxyr/pyscp/fc85c808495f8f47783db6fb12a79ce7727e919c/pyscp/stats/__init__.py -------------------------------------------------------------------------------- /pyscp/stats/counters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Counters. 5 | 6 | Take a list of pages and a scalar, and return a collections.Counter instance. 
7 | """ 8 | 9 | ############################################################################### 10 | # Imports 11 | ############################################################################### 12 | 13 | import collections 14 | import re 15 | 16 | ############################################################################### 17 | 18 | 19 | def make_counter(pages, func, key): 20 | """Generic counter factory.""" 21 | subgroups = collections.defaultdict(list) 22 | for p in pages: 23 | key_value = key(p) 24 | if key_value: 25 | subgroups[key_value].append(p) 26 | return collections.Counter({k: func(v) for k, v in subgroups.items()}) 27 | 28 | 29 | def author(pages, func): 30 | """Group per page author.""" 31 | return make_counter(pages, func, lambda p: p.author) 32 | 33 | 34 | def month(pages, func): 35 | """Group per month the page was posted on.""" 36 | return make_counter(pages, func, lambda p: p.created[:7]) 37 | 38 | 39 | def page(pages, func): 40 | """Each page into its own group.""" 41 | return make_counter(pages, func, lambda p: p.url) 42 | 43 | 44 | def block(pages, func): 45 | """Group skips based on which 100-block they're in.""" 46 | def key(page): 47 | if 'scp' not in page.tags: 48 | return 49 | match = re.search(r'[0-9]{3,4}$', page.url) 50 | if not match: 51 | return 52 | match = int(match.group()) 53 | if match == 1: 54 | return 55 | return str((match // 100) * 100).zfill(3) 56 | return make_counter(pages, func, key) 57 | 58 | 59 | def chain(pages, func, *counters): 60 | """Apply counters one after another.""" 61 | if len(counters) == 1: 62 | return counters[0](pages, func) 63 | results = collections.Counter() 64 | for key, val in counters[0](pages, lambda x: x).items(): 65 | for ikey, ival in chain(val, func, *counters[1:]).items(): 66 | results['%s, %s' % (key, ikey)] = ival 67 | return results 68 | -------------------------------------------------------------------------------- /pyscp/stats/filters.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Filters. 5 | 6 | Take a list of pages and return a subset of the list. 7 | """ 8 | 9 | ############################################################################### 10 | # Imports 11 | ############################################################################### 12 | 13 | import pyscp.stats.counters as cn 14 | import pyscp.stats.scalars as sc 15 | 16 | ############################################################################### 17 | 18 | 19 | def tag(pages, tag): 20 | """Pages with a given tag.""" 21 | if not tag: 22 | return pages 23 | return [p for p in pages if tag in p.tags] 24 | 25 | 26 | def user(pages, user): 27 | """Pages by a certain user.""" 28 | return [p for p in pages if p.author == user] 29 | 30 | 31 | # TODO: needs more indicative name. 32 | def min_authored(pages, min_val=3): 33 | """Pages by authors who have at least min_val pages.""" 34 | authors = cn.author(pages, sc.count) 35 | return [p for p in pages if authors[p.author] >= min_val] 36 | 37 | 38 | def filter_rating(pages, min_val=20): 39 | """Pages with rating above min_val.""" 40 | return [p for p in pages if p.rating > min_val] 41 | -------------------------------------------------------------------------------- /pyscp/stats/scalars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Scalars. 5 | 6 | Take a list of pages and return a single value. 
7 | """ 8 | 9 | def upvotes(pages): 10 | """Upvotes.""" 11 | return sum([v.value for v in p.votes].count(1) for p in pages) 12 | 13 | 14 | def rating(pages): 15 | """Net rating.""" 16 | return sum(p.rating for p in pages) 17 | 18 | 19 | def rating_average(pages): 20 | """Average rating.""" 21 | return rating(pages) / len(pages) 22 | 23 | 24 | def divided(pages): 25 | """Controversy score.""" 26 | return sum(len(p.votes) / p.rating for p in pages) 27 | 28 | 29 | def redactions(pages): 30 | """Redaction score.""" 31 | return sum( 32 | p.text.count('█') + 33 | 20 * sum(map(p.text.count, ('REDACTED', 'EXPUNGED'))) 34 | for p in pages) 35 | 36 | 37 | def wordcount(pages): 38 | return sum(p.wordcount for p in pages) 39 | 40 | 41 | def wordcount_average(pages): 42 | return wordcount(pages) / len(pages) 43 | -------------------------------------------------------------------------------- /pyscp/stats/updater.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Stat Updater. 5 | 6 | Calculate stats from one wiki and write it into another. 
7 | """ 8 | 9 | ############################################################################### 10 | # Module Imports 11 | ############################################################################### 12 | 13 | import logging 14 | 15 | from pyscp import snapshot, wikidot, utils 16 | from pyscp.stats import scalars, counters, filters 17 | 18 | ############################################################################### 19 | # Global Constants And Variables 20 | ############################################################################### 21 | 22 | log = logging.getLogger(__name__) 23 | 24 | ############################################################################### 25 | 26 | 27 | class Updater: 28 | 29 | scalars_author = ( 30 | ('Pages Created', len), 31 | ('Net Rating', scalars.rating), 32 | ('Average Rating', scalars.rating_average), 33 | ('Wordcount', scalars.wordcount), 34 | ('Average Wordcount', scalars.wordcount_average)) 35 | 36 | def __init__(self, source, target): 37 | self.pages = list(source.list_pages()) 38 | self.target = target 39 | self.exist = [p.url for p in target.list_pages()] 40 | 41 | @staticmethod 42 | def source_counter(counter): 43 | """Build wikidot markup source for ranking pages.""" 44 | source = ['||~ Rank||~ User||~ Score||'] 45 | # sort by score, then alphabetically by user 46 | items = sorted(counter.items(), key=lambda x: x[0].lower()) 47 | items = sorted(items, key=lambda x: x[1], reverse=True) 48 | template = '||{}||[[[user:{}]]]||{}||' 49 | for idx, (user, score) in enumerate(items): 50 | source.append(template.format(idx + 1, user, score)) 51 | return '\n'.join(source) 52 | 53 | def source_author(self, user): 54 | """Build source code for the user's authorship stats.""" 55 | pages = filters.user(self.pages, user) 56 | source = ['++ Authorship Statistics'] 57 | if not pages: 58 | source.append('This user have not authored any pages.') 59 | return '\n'.join(source) 60 | for descr, func in self.scalars_author: 61 | text = 
'[[[ranking:{}]]]:@@{}@@**{}**'.format( 62 | descr, ' ' * (40 - len(descr)), round(func(pages), 2)) 63 | source.append('{{%s}}' % text) 64 | return '\n'.join(source) 65 | 66 | def post(self, name, source): 67 | """Update if exists; create if not; retry if failed.""" 68 | p = self.target(name) 69 | for _ in range(10): # retry ten times max 70 | if p.url in self.exist: 71 | response = p.edit(source) 72 | else: 73 | title = name.split(':') 74 | response = p.create(source, title) 75 | if response['status'] == 'ok': 76 | return 77 | log.error('Failed to post: %s', name) 78 | 79 | def update_users(self): 80 | """Update the stats wiki with the author stats.""" 81 | users = {p.author for p in self.pages} 82 | for user in utils.pbar(users, 'UPDATING USER STATS'): 83 | self.post('user:' + user, self.source_author(user)) 84 | 85 | def update_rankings(self): 86 | for descr, func in utils.pbar( 87 | self.scalars_author, 'UPDATING RANKINGS'): 88 | value = self.source_counter(counters.author(self.pages, func)) 89 | self.post('ranking:' + descr, round(value, 2)) 90 | 91 | 92 | ############################################################################### 93 | 94 | if __name__ == "__main__": 95 | source = snapshot.Wiki( 96 | 'www.scp-wiki.net', '/home/anqxyr/heap/_scp/scp-wiki.2015-06-23.db') 97 | target = wikidot.Wiki('scp-stats') 98 | target.auth('placeholder', 'placeholder') 99 | up = Updater(source, target) 100 | up.update_rankings() 101 | up.update_users() 102 | -------------------------------------------------------------------------------- /pyscp/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | import logging 8 | import re 9 | import time 10 | import threading 11 | import signal 12 | import functools 13 | 
import inspect

###############################################################################
# Decorators
###############################################################################

###############################################################################
# Decorator decorator is a simplified version of the code from the funcy lib.
# https://github.com/Suor/funcy
###############################################################################


class Call:
    """Delayed function call: stores func and args, executed on demand."""

    def __init__(self, func, args, kwargs):
        self.func, self.args, self.kwargs = func, args, kwargs

    def __call__(self):
        return self.func(*self.args, **self.kwargs)


def decorator(deco):
    """
    Turn a one-argument function into a decorator (funcy-style).

    If *deco* takes extra arguments besides the call, it becomes a
    decorator factory; otherwise it is usable as a plain decorator.
    """
    # BUG FIX: inspect.getargspec was deprecated and removed in
    # Python 3.11; getfullargspec (available since 3.3) is a drop-in
    # replacement here ('keywords' is named 'varkw' on the full spec).
    spec = inspect.getfullargspec(deco)
    if len(spec.args) > 1 or spec.varargs or spec.varkw:
        @functools.wraps(deco)
        def _fab(*dargs, **dkwargs):
            return make_decorator(deco, *dargs, **dkwargs)
        return _fab
    else:
        return functools.wraps(deco)(make_decorator(deco))


def make_decorator(deco, *dargs, **dkwargs):
    """Wrap *deco* so the decorated function is passed in as a Call."""
    def _decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            call = Call(func, args, kwargs)
            return deco(call, *dargs, **dkwargs)
        return wrapper
    return _decorator

###############################################################################


@decorator
def listify(call, wrapper=list):
    """Convert the return value of the function into a list (or *wrapper*)."""
    return wrapper(call())


@decorator
def morph(call, catch_exc, raise_exc):
    """Re-raise exceptions of type *catch_exc* as *raise_exc*."""
    try:
        return call()
    except catch_exc as error:
        raise raise_exc(error) from error


@decorator
def ignore(call, error=Exception, value=None):
    """Return *value* instead of raising when *error* occurs."""
    try:
        return call()
    except error:
        return value


@decorator
def log_errors(call, logger=print):
    """Log exceptions with *logger*, then re-raise them."""
    try:
        return call()
    except Exception as error:
        logger(error)
        # Bare raise re-raises the active exception with its original
        # traceback intact.
        raise


@decorator
def decochain(call, *decs):
    """Apply several decorators at once, in the given order."""
    fn = call.func
    for dec in reversed(decs):
        fn = dec(fn)
    return fn(*call.args, **call.kwargs)


class cached_property:
    """Compute the property once, then cache it in the instance's _cache."""

    def __init__(self, func):
        self.func = func
        functools.update_wrapper(self, func)

    def __get__(self, obj, cls):
        # BUG FIX: accessing the attribute on the class itself passes
        # obj=None; return the descriptor instead of crashing.
        if obj is None:
            return self
        if not hasattr(obj, '_cache'):
            obj._cache = {}
        if self.func.__name__ not in obj._cache:
            obj._cache[self.func.__name__] = self.func(obj)
        return obj._cache[self.func.__name__]

###############################################################################


def split(text, delimeters):
    """Split *text* on any of the given delimiter strings."""
    pattern = '|'.join(map(re.escape, delimeters))
    return re.split(pattern, text)


class ProgressBar:
    """Terminal progress bar that redraws itself once a second."""

    def __init__(self, title, max_value):
        self.title = title
        self.max_value = max_value
        self.value = 0
        # Make Ctrl-C stop the redraw thread before propagating.
        signal.signal(signal.SIGINT, self.exit)

    def start(self):
        self.finished = False
        self.time_started = time.time()
        threading.Thread(target=self.run).start()

    def update(self):
        print(self.line() + '\r', end='')

    def line(self):
        """Render the bar: title, block characters, percentage, elapsed."""
        # BUG FIX: guard against max_value == 0 (empty iterables) which
        # previously raised ZeroDivisionError; report 100% instead.
        done = self.value / self.max_value if self.max_value else 1
        filled = 40 * done
        parts = ' ▏▎▍▌▋▊▉'
        current = int(filled * len(parts)) % len(parts)
        bar = '█' * int(filled) + parts[current] + ' ' * 40
        tm = time.gmtime(time.time() - self.time_started)
        return '{} |{}| {:>3}% ({}:{:02}:{:02}) '.format(
            self.title,
            bar[:40],
            int(100 * done),
            tm.tm_hour, tm.tm_min, tm.tm_sec)

    def run(self):
        while not self.finished:
            self.update()
            time.sleep(1)

    def stop(self):
        self.finished = True
        print(self.line())

    def exit(self, signum, frame):
        self.stop()
        raise KeyboardInterrupt


def pbar(it, title=None, max=None):
    """Yield items from *it* while displaying a progress bar."""
    # 'max' shadows the builtin, but the name is part of the public
    # signature (callers may pass it by keyword), so it is kept.
    max = len(it) if max is None else max
    title = '' if title is None else title + ' '
    bar = ProgressBar(title, max)
    bar.start()
    for i in it:
        yield i
        bar.value += 1
        bar.update()
    bar.stop()

###############################################################################


class LogCount:
    """Logging filter that numbers the records passing through it."""

    def __init__(self):
        self.count = 1

    def filter(self, record):
        record.count = self.count
        self.count += 1
        return True


def log_sql_debug():
    """Dump numbered peewee SQL queries to the terminal."""
    logger = logging.getLogger('peewee')
    logger.setLevel(logging.DEBUG)
    logger.addFilter(LogCount())
    term = logging.StreamHandler()
    term.setFormatter(logging.Formatter('{count} {message}', style='{'))
    logger.addHandler(term)


def default_logging(debug=False):
    """Set up the default terminal + file logging for pyscp."""
    term = logging.StreamHandler()
    file = logging.FileHandler('pyscp.log', mode='a', delay=True)
    if debug:
        term.setLevel(logging.DEBUG)
        file.setLevel(logging.DEBUG)
    else:
        term.setLevel(logging.INFO)
        file.setLevel(logging.INFO)
    term.setFormatter(logging.Formatter('{message}', style='{'))
    file.setFormatter(
        logging.Formatter('{asctime} {levelname:8s} {message}', style='{'))
    logger = logging.getLogger('pyscp')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(term)
    logger.addHandler(file)

###############################################################################

# -----------------------------------------------------------------------------
# pyscp/wikidot.py
# -----------------------------------------------------------------------------

#!/usr/bin/env python3

"""
Wikidot access classes.

This module contains the classes that facilitate information extraction
and communication with the Wikidot-hosted sites.
"""
8 | """ 9 | 10 | ############################################################################### 11 | # Module Imports 12 | ############################################################################### 13 | 14 | import arrow 15 | import bs4 16 | import functools 17 | import itertools 18 | import logging 19 | import pyscp 20 | import re 21 | import requests 22 | 23 | ############################################################################### 24 | # Global Constants And Variables 25 | ############################################################################### 26 | 27 | log = logging.getLogger(__name__) 28 | 29 | 30 | ############################################################################### 31 | # Utility Classes 32 | ############################################################################### 33 | 34 | class InsistentRequest(requests.Session): 35 | """Make an auto-retrying request that handles connection loss.""" 36 | 37 | def __init__(self, max_attempts=10): 38 | super().__init__() 39 | self.max_attempts = max_attempts 40 | 41 | def __repr__(self): 42 | return '{}(max_attempts={})'.format( 43 | self.__class__.__name__, self.max_attempts) 44 | 45 | def request(self, method, url, **kwargs): 46 | logged_kwargs = hide_pass(kwargs) 47 | logged_kwargs = repr(logged_kwargs) if logged_kwargs else '' 48 | log.debug('%s: %s %s', method, url, logged_kwargs) 49 | 50 | kwargs.setdefault('timeout', 60) 51 | kwargs.setdefault('allow_redirects', False) 52 | for _ in range(self.max_attempts): 53 | try: 54 | resp = super().request(method=method, url=url, **kwargs) 55 | except ( 56 | requests.ConnectionError, 57 | requests.Timeout, 58 | requests.exceptions.ChunkedEncodingError): 59 | continue 60 | if 200 <= resp.status_code < 300: 61 | return resp 62 | elif 300 <= resp.status_code < 400: 63 | raise requests.HTTPError( 64 | 'Redirect attempted with url: {}'.format(url)) 65 | elif 400 <= resp.status_code < 600: 66 | continue 67 | raise requests.ConnectionError( 68 | 
'Max retries exceeded with url: {}'.format(url)) 69 | 70 | def get(self, url, **kwargs): 71 | return self.request('GET', url, **kwargs) 72 | 73 | def post(self, url, **kwargs): 74 | return self.request('POST', url, **kwargs) 75 | 76 | 77 | def hide_pass(nested_dict): 78 | result = {} 79 | for k, v in nested_dict.items(): 80 | if k in ('pass', 'password', 'pasw'): 81 | result[k] = '********' 82 | elif isinstance(v, dict): 83 | result[k] = hide_pass(v) 84 | else: 85 | result[k] = v 86 | return result 87 | 88 | 89 | ############################################################################### 90 | 91 | 92 | class Page(pyscp.core.Page): 93 | """Create Page object.""" 94 | 95 | def __init__(self, wiki, url): 96 | super().__init__(wiki, url) 97 | self._body = {} 98 | 99 | ########################################################################### 100 | # Internal Methods 101 | ########################################################################### 102 | 103 | def _module(self, *args, **kwargs): 104 | """Call Wikidot module.""" 105 | return self._wiki._module(*args, page_id=self._id, **kwargs) 106 | 107 | def _action(self, event, **kwargs): 108 | """Execute WikiPageAction.""" 109 | return self._module( 110 | 'Empty', action='WikiPageAction', event=event, **kwargs) 111 | 112 | def _vote(self, value): 113 | """Vote on the page.""" 114 | return self._action( 115 | 'RateAction', 116 | event='ratePage' if value else 'cancelVote', 117 | points=value, 118 | force=True) 119 | 120 | def _flush(self, *names): 121 | if not hasattr(self, '_cache'): 122 | return 123 | self._cache = {k: v for k, v in self._cache.items() if k not in names} 124 | 125 | @pyscp.utils.cached_property 126 | def _pdata(self): 127 | data = self._wiki.req.get(self.url).text 128 | soup = bs4.BeautifulSoup(data, 'lxml') 129 | return (int(re.search('pageId = ([0-9]+);', data).group(1)), 130 | parse_element_id(soup.find(id='discuss-button')), 131 | str(soup.find(id='main-content')), 132 | {e.text for e in 
soup.select('.page-tags a')}) 133 | 134 | @property 135 | def _raw_title(self): 136 | if 'title' in self._body: 137 | return self._body['title'] 138 | return super()._raw_title 139 | 140 | @property 141 | def _raw_author(self): 142 | if 'created_by' in self._body: 143 | return self._body['created_by'] 144 | return super()._raw_author 145 | 146 | ########################################################################### 147 | # Properties 148 | ########################################################################### 149 | 150 | @property 151 | def html(self): 152 | return self._pdata[2] 153 | 154 | @pyscp.utils.cached_property 155 | @pyscp.utils.listify() 156 | def history(self): 157 | """Return the revision history of the page.""" 158 | data = self._module( 159 | 'history/PageRevisionListModule', page=1, perpage=99999)['body'] 160 | soup = bs4.BeautifulSoup(data, 'lxml') 161 | for row in reversed(soup('tr')[1:]): 162 | rev_id = int(row['id'].split('-')[-1]) 163 | cells = row('td') 164 | number = int(cells[0].text.strip('.')) 165 | user = cells[4].text 166 | time = parse_element_time(cells[5]) 167 | comment = cells[6].text if cells[6].text else None 168 | yield pyscp.core.Revision(rev_id, number, user, time, comment) 169 | 170 | @pyscp.utils.cached_property 171 | def votes(self): 172 | """Return all votes made on the page.""" 173 | data = self._module('pagerate/WhoRatedPageModule')['body'] 174 | soup = bs4.BeautifulSoup(data, 'lxml') 175 | spans = [i.text.strip() for i in soup('span')] 176 | pairs = zip(spans[::2], spans[1::2]) 177 | return [pyscp.core.Vote(u, 1 if v == '+' else -1) for u, v in pairs] 178 | 179 | @property 180 | def tags(self): 181 | if 'tags' in self._body: 182 | return set(self._body['tags'].split()) 183 | return self._pdata[3] 184 | 185 | @property 186 | def source(self): 187 | data = self._module('viewsource/ViewSourceModule')['body'] 188 | soup = bs4.BeautifulSoup(data, 'lxml') 189 | return soup.text[11:].strip().replace(chr(160), ' ') 190 
| 191 | @property 192 | def created(self): 193 | if 'created_at' in self._body: 194 | time = arrow.get(self._body['created_at'], 'DD MMM YYYY HH:mm') 195 | return time.format('YYYY-MM-DD HH:mm:ss') 196 | return super().created 197 | 198 | @property 199 | def rating(self): 200 | if 'rating' in self._body: 201 | return int(self._body['rating']) 202 | return super().rating 203 | 204 | @pyscp.utils.cached_property 205 | def files(self): 206 | """List all files attached to the page.""" 207 | data = self._module('files/PageFilesModule')['body'] 208 | soup = bs4.BeautifulSoup(data, 'lxml') 209 | if not soup.select('table.page-files'): 210 | return [] 211 | files = soup.select('table.page-files')[0]('tr')[1:] 212 | parsed = [] 213 | for file in files: 214 | url = self._wiki.site + file.find('a')['href'] 215 | name = file.find('a').text.strip() 216 | filetype = file('td')[1].text.strip() 217 | size = file('td')[2].text.strip() 218 | parsed.append(pyscp.core.File(url, name, filetype, size)) 219 | return parsed 220 | 221 | ########################################################################### 222 | # Page-Modifying Methods 223 | ########################################################################### 224 | 225 | def edit(self, source, title=None, comment=None): 226 | """Overwrite the page with the new source and title.""" 227 | if title is None: 228 | title = self._raw_title 229 | self._flush('html', 'history', 'source') 230 | wiki_page = self.url.split('/')[-1] 231 | lock = self._module( 232 | 'edit/PageEditModule', 233 | mode='page', 234 | wiki_page=wiki_page, 235 | force_lock=True) 236 | return self._action( 237 | 'savePage', 238 | source=source, 239 | title=title, 240 | comments=comment, 241 | wiki_page=wiki_page, 242 | lock_id=lock['lock_id'], 243 | lock_secret=lock['lock_secret'], 244 | revision_id=lock.get('page_revision_id', None)) 245 | 246 | def create(self, source, title, comment=None): 247 | if not hasattr(self, '_cache'): 248 | self._cache = {} 249 | 
self._cache['_pdata'] = (None, None, None) 250 | response = self.edit(source, title, comment) 251 | del self._cache['_pdata'] 252 | return response 253 | 254 | def revert(self, rev_n): 255 | """Revert the page to a previous revision.""" 256 | self._flush('html', 'history', 'source', 'tags') 257 | return self._action('revert', revisionId=self.history[rev_n].id) 258 | 259 | def set_tags(self, tags): 260 | """Replace the tags of the page.""" 261 | res = self._action('saveTags', tags=' '.join(tags)) 262 | self._flush('history', '_pdata') 263 | return res 264 | 265 | def upload(self, name, data): 266 | url = self._wiki.site + '/default--flow/files__UploadTarget' 267 | kwargs = dict( 268 | pageId=self._id, 269 | page_id=self._id, 270 | action='FileAction', 271 | event='uploadFile', 272 | MAX_FILE_SIZE=52428800) 273 | response = self._wiki.req.post( 274 | url, 275 | data=kwargs, 276 | files={'userfile': (name, data)}, 277 | cookies={'wikidot_token7': '123456'}) 278 | response = bs4.BeautifulSoup(response.text, 'lxml') 279 | status = response.find(id='status').text 280 | message = response.find(id='message').text 281 | if status != 'ok': 282 | raise RuntimeError(message) 283 | return response 284 | 285 | ########################################################################### 286 | # Voting Methods 287 | ########################################################################### 288 | 289 | def upvote(self): 290 | self._vote(1) 291 | self._flush('votes') 292 | 293 | def downvote(self): 294 | self._vote(-1) 295 | self._flush('votes') 296 | 297 | def cancel_vote(self): 298 | self._vote(0) 299 | self._flush('votes') 300 | 301 | 302 | class Thread(pyscp.core.Thread): 303 | 304 | @pyscp.utils.cached_property 305 | @pyscp.utils.listify() 306 | def posts(self): 307 | if self._id is None: 308 | return 309 | pages = self._wiki._pager( 310 | 'forum/ForumViewThreadPostsModule', _key='pageNo', t=self._id) 311 | pages = (bs4.BeautifulSoup(p['body'], 'lxml').body for p in pages) 312 
| pages = (p for p in pages if p) 313 | posts = (p(class_='post-container', recursive=False) for p in pages) 314 | posts = itertools.chain.from_iterable(posts) 315 | for post, parent in crawl_posts(posts): 316 | post_id = int(post['id'].split('-')[1]) 317 | title = post.find(class_='title').text.strip() 318 | title = title if title else None 319 | content = post.find(class_='content') 320 | content.attrs.clear() 321 | content = str(content) 322 | user = post.find(class_='printuser').text 323 | time = parse_element_time(post) 324 | yield pyscp.core.Post(post_id, title, content, user, time, parent) 325 | 326 | def new_post(self, source, title=None, parent_id=None): 327 | return self._wiki._module( 328 | 'Empty', 329 | threadId=self._id, 330 | parentId=parent_id, 331 | title=title, 332 | source=source, 333 | action='ForumAction', 334 | event='savePost') 335 | 336 | 337 | class Wiki(pyscp.core.Wiki): 338 | """ 339 | Create a Wiki object. 340 | 341 | This class does not use any of the official Wikidot API, and instead 342 | relies on sending http post/get requests to internal Wikidot pages and 343 | parsing the returned data. 344 | """ 345 | 346 | Page = Page 347 | Thread = Thread 348 | # Tautology = Tautology 349 | 350 | ########################################################################### 351 | # Special Methods 352 | ########################################################################### 353 | 354 | def __init__(self, site): 355 | super().__init__(site) 356 | self.req = InsistentRequest() 357 | 358 | def __repr__(self): 359 | return '{}.{}({})'.format( 360 | self.__module__, 361 | self.__class__.__name__, 362 | repr(self.site)) 363 | 364 | ########################################################################### 365 | # Internal Methods 366 | ########################################################################### 367 | 368 | @pyscp.utils.log_errors(log.warning) 369 | def _module(self, _name, **kwargs): 370 | """ 371 | Call a Wikidot module. 
372 | 373 | This method is responsible for most of the class' functionality. 374 | Almost all other methods of the class are using _module in one way 375 | or another. 376 | """ 377 | response = self.req.post( 378 | self.site + '/ajax-module-connector.php', 379 | data=dict( 380 | pageId=kwargs.get('page_id', None), # fuck wikidot 381 | moduleName=_name, 382 | # token7 can be any 6-digit number, as long as it's the same 383 | # in the payload and in the cookie 384 | wikidot_token7='123456', 385 | **kwargs), 386 | headers={'Content-Type': 'application/x-www-form-urlencoded;'}, 387 | cookies={'wikidot_token7': '123456'}).json() 388 | if response['status'] != 'ok': 389 | log.error(response) 390 | raise RuntimeError(response.get('message') or response['status']) 391 | return response 392 | 393 | def _pager(self, _name, _key, _update=None, **kwargs): 394 | """Iterate over multi-page module results.""" 395 | first_page = self._module(_name, **kwargs) 396 | yield first_page 397 | counter = bs4.BeautifulSoup( 398 | first_page['body'], 'lxml').find(class_='pager-no') 399 | if not counter: 400 | return 401 | for idx in range(2, int(counter.text.split(' ')[-1]) + 1): 402 | kwargs.update({_key: idx if _update is None else _update(idx)}) 403 | yield self._module(_name, **kwargs) 404 | 405 | def _list_pages_raw(self, **kwargs): 406 | """ 407 | Call ListPages module. 408 | 409 | Wikidot's ListPages is an extremely versatile php module that can be 410 | used to retrieve all sorts of interesting informations, from urls of 411 | pages created by a given user, and up to full html contents of every 412 | page on the site. 413 | """ 414 | yield from self._pager( 415 | 'list/ListPagesModule', 416 | _key='offset', 417 | _update=lambda x: 250 * (x - 1), 418 | perPage=250, 419 | **kwargs) 420 | 421 | def _list_pages_parsed(self, **kwargs): 422 | """ 423 | Call ListPages module and parse the results. 424 | 425 | Sets default arguments, parses ListPages body into a namedtuple. 
426 | Returns Page instances with a _body grafted in. 427 | """ 428 | keys = set(kwargs.pop('body', '').split() + ['fullname']) 429 | kwargs['module_body'] = '\n'.join( 430 | map('||{0}||%%{0}%% ||'.format, keys)) 431 | kwargs['created_by'] = kwargs.pop('author', None) 432 | lists = self._list_pages_raw(**kwargs) 433 | soups = (bs4.BeautifulSoup(p['body'], 'lxml') for p in lists) 434 | pages = (s.select('div.list-pages-item') for s in soups) 435 | pages = itertools.chain.from_iterable(pages) 436 | for page in pages: 437 | data = { 438 | r('td')[0].text: r('td')[1].text.strip() for r in page('tr')} 439 | page = self(data['fullname']) 440 | page._body = data 441 | yield page 442 | 443 | ########################################################################### 444 | # Public Methods 445 | ########################################################################### 446 | 447 | def auth(self, username, password): 448 | """Login to wikidot with the given username/password pair.""" 449 | return self.req.post( 450 | 'https://www.wikidot.com/default--flow/login__LoginPopupScreen', 451 | data=dict( 452 | login=username, 453 | password=password, 454 | action='Login2Action', 455 | event='login')) 456 | 457 | def list_categories(self): 458 | """Return forum categories.""" 459 | data = self._module('forum/ForumStartModule')['body'] 460 | soup = bs4.BeautifulSoup(data, 'lxml') 461 | for elem in [e.parent for e in soup(class_='name')]: 462 | cat_id = parse_element_id(elem.select('.title a')[0]) 463 | title, description, size = [ 464 | elem.find(class_=i).text.strip() 465 | for i in ('title', 'description', 'threads')] 466 | yield pyscp.core.Category( 467 | cat_id, title, description, int(size)) 468 | 469 | def list_threads(self, category_id): 470 | """Return threads in the given category.""" 471 | pages = self._pager( 472 | 'forum/ForumViewCategoryModule', _key='p', c=category_id) 473 | soups = (bs4.BeautifulSoup(p['body'], 'lxml') for p in pages) 474 | elems = (s(class_='name') 
for s in soups) 475 | for elem in itertools.chain(*elems): 476 | thread_id = parse_element_id(elem.select('.title a')[0]) 477 | title, description = [ 478 | elem.find(class_=i).text.strip() 479 | for i in ('title', 'description')] 480 | yield self.Thread(self, thread_id, title, description) 481 | 482 | def send_pm(self, username, text, title=None): 483 | lookup = self.req.get( 484 | 'https://www.wikidot.com/quickmodule.php?' 485 | 'module=UserLookupQModule&q=' + username).json() 486 | if not lookup['users'] or lookup['users'][0]['name'] != username: 487 | raise ValueError('Username Not Found') 488 | user_id = lookup['users'][0]['user_id'] 489 | return self.req.post( 490 | 'https://www.wikidot.com/ajax-module-connector.php', 491 | data=dict( 492 | moduleName='Empty', 493 | source=text, 494 | subject=title, 495 | to_user_id=user_id, 496 | action='DashboardMessageAction', 497 | event='send', 498 | wikidot_token7='123456'), 499 | headers={'Content-Type': 'application/x-www-form-urlencoded;'}, 500 | cookies={'wikidot_token7': '123456'}).json() 501 | 502 | ########################################################################### 503 | # SCP-Wiki Specific Methods 504 | ########################################################################### 505 | 506 | @functools.lru_cache(maxsize=1) 507 | @pyscp.utils.listify() 508 | def list_images(self): 509 | if 'scp-wiki' not in self.site: 510 | return 511 | base = 'http://scpsandbox2.wikidot.com/image-review-{}' 512 | urls = [base.format(i) for i in range(1, 36)] 513 | pages = [self.req.get(u).text for u in urls] 514 | soups = [bs4.BeautifulSoup(p, 'lxml') for p in pages] 515 | elems = [s('tr') for s in soups] 516 | elems = itertools.chain(*elems) 517 | elems = [e('td') for e in elems] 518 | elems = [e for e in elems if e] 519 | for elem in elems: 520 | url = elem[0].find('img')['src'] 521 | source = elem[2].a['href'] if elem[2]('a') else None 522 | status, notes = [elem[i].text for i in (3, 4)] 523 | status, notes = [i if i 
else None for i in (status, notes)] 524 | yield pyscp.core.Image(url, source, status, notes, None) 525 | 526 | ############################################################################### 527 | 528 | 529 | @pyscp.utils.ignore((IndexError, TypeError)) 530 | def parse_element_id(element): 531 | """Extract the id number from the link.""" 532 | return int(element['href'].split('/')[2].split('-')[1]) 533 | 534 | 535 | def parse_element_time(element): 536 | """Extract and format time from an html element.""" 537 | unixtime = element.find(class_='odate')['class'][1].split('_')[1] 538 | return arrow.get(unixtime).format('YYYY-MM-DD HH:mm:ss') 539 | 540 | 541 | def crawl_posts(post_containers, parent=None): 542 | """ 543 | Retrieve posts from the comment tree. 544 | 545 | For each post-container in the given list, returns a tuple of 546 | (post, parent). Then recurses onto all the post-container children 547 | of the current post-container. 548 | """ 549 | for container in post_containers: 550 | yield container.find(class_='post'), parent 551 | yield from crawl_posts( 552 | container(class_='post-container', recursive=False), 553 | int(container['id'].split('-')[1])) 554 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', encoding="utf8") as f: 4 | readme = f.read() 5 | 6 | setuptools.setup( 7 | name='pyscp', 8 | version='1.0.18', 9 | description='Python API and utilities for the scp-wiki.net website.', 10 | long_description=readme, 11 | url='https://github.com/anqxyr/pyscp/', 12 | author='anqxyr', 13 | author_email='anqxyr@gmail.com', 14 | license='MIT', 15 | classifiers=[ 16 | 'Development Status :: 4 - Beta', 17 | 'Intended Audience :: Other Audience', 18 | 'License :: OSI Approved :: MIT License', 19 | 'Operating System :: OS Independent', 20 | 'Programming Language :: Python :: 3.4'], 21 | 
packages=['pyscp'], 22 | install_requires=[ 23 | 'arrow', 24 | 'beautifulsoup4', 25 | 'blessings', 26 | 'lxml==3.3.3', 27 | 'requests', 28 | 'peewee==2.8.0'], 29 | ) 30 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | from pyscp.core import WikidotConnector, SnapshotConnector 8 | import pytest 9 | import random 10 | 11 | ############################################################################### 12 | 13 | DBPATH = '/home/anqxyr/heap/_scp/scp-wiki.2015-03-16.db' 14 | USERNAME = '' 15 | PASSWORD = ("""""") 16 | 17 | 18 | @pytest.mark.parametrize('cn', [ 19 | WikidotConnector('www.scp-wiki.net'), 20 | SnapshotConnector('www.scp-wiki.net', DBPATH)]) 21 | class TestSCPWikiConnectors: 22 | 23 | def test_revision(self, cn): 24 | revision = cn('scp-1511').history[0] 25 | assert revision.revision_id == 39167223 26 | assert revision.page_id == 18578010 27 | assert revision.number == 0 28 | assert revision.user == 'anqxyr' 29 | assert revision.time == '2013-06-30 16:34:37' 30 | assert revision.comment == 'INITIATE HEAVEN SUBROUTINE' 31 | 32 | def test_post(self, cn): 33 | post = cn('SCP-1511').comments[0] 34 | assert post.post_id == 1806664 35 | assert post.thread_id == 666715 36 | assert post.parent is None 37 | assert post.title is None 38 | assert post.user == 'FlameShirt' 39 | assert post.time == '2013-06-30 16:47:22' 40 | assert post.wordcount == 26 41 | 42 | def test_list_pages(self, cn): 43 | pages = list(cn.list_pages(author='anqxyr', tag='crystalline')) 44 | assert pages == ['http://www.scp-wiki.net/scp-1511'] 45 | 46 | def test_list_pages_rewrites(self, cn): 47 | pages = 
list(cn.list_pages(author='thedeadlymoose', tag='thermal')) 48 | assert 'http://www.scp-wiki.net/scp-003' in pages 49 | 50 | 51 | class TestActiveMethods: 52 | 53 | @pytest.fixture 54 | def wiki(self, cache=[]): 55 | if cache: 56 | return cache[0] 57 | if not USERNAME or not PASSWORD: 58 | pytest.skip('need authentication data') 59 | wiki = WikidotConnector('testwiki2') 60 | wiki.auth(USERNAME, PASSWORD) 61 | cache.append(wiki) 62 | return wiki 63 | 64 | def test_edit_page(self, wiki): 65 | value = random.randint(0, 1000000) 66 | p = wiki('page1') 67 | p.edit(value, comment='automated test') 68 | assert p.source == str(value) 69 | 70 | def test_revert(self, wiki): 71 | p = wiki('page1') 72 | p.revert_to(24) 73 | assert p.source == 'no source here' 74 | 75 | def test_set_tags(self, wiki): 76 | value = random.randint(0, 1000000) 77 | p = wiki('page1') 78 | p.set_tags(p.tags + [str(value)]) 79 | assert str(value) in p.tags 80 | 81 | 82 | if __name__ == '__main__': 83 | pytest.main() 84 | --------------------------------------------------------------------------------