├── .gitignore ├── LICENSE ├── README.md ├── bin └── update_pages.py ├── pyscp ├── __init__.py ├── core.py ├── orm.py ├── resources │ ├── cover.png │ ├── pages │ │ ├── cover.xhtml │ │ ├── intro.xhtml │ │ ├── license.xhtml │ │ └── title.xhtml │ ├── stafflist.txt │ ├── stylesheet.css │ └── templates │ │ ├── container.xml │ │ ├── content.opf │ │ ├── page.xhtml │ │ └── toc.ncx ├── snapshot.py ├── stats │ ├── __init__.py │ ├── counters.py │ ├── filters.py │ ├── scalars.py │ └── updater.py ├── utils.py └── wikidot.py ├── setup.py └── tests └── test_core.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | *.sublime-* 3 | .idea/ 4 | *.log 5 | *.pass 6 | .project 7 | */.coverage 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 anqxyr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyscp 2 | 3 | **pyscp** is a python library for interacting with wikidot-hosted websites. The library is mainly intended for use by the administrative staff of the www.scp-wiki.net website, and has a host of features exclusive to it. However, the majority of the core functionality should be applicable to any wikidot-based site. 4 | 5 | ## Installation 6 | 7 | Download the latest code, open the containing folder, and run the following command: 8 | ``` 9 | pip install . --user 10 | ``` 11 | Done. 12 | 13 | ## Examples 14 | 15 | ### Accessing Pages 16 | 17 | ```python 18 | import pyscp 19 | 20 | wiki = pyscp.wikidot.Wiki('www.scp-wiki.net') 21 | p = wiki('scp-837') 22 | print( 23 | '"{}" has a rating of {}, {} revisions, and {} comments.' 24 | .format(p.title, p.rating, len(p.history), len(p.comments))) 25 | ``` 26 | ``` 27 | "SCP-837: Multiplying Clay" has a rating of 108, 14 revisions, and 54 comments. 28 | ``` 29 | 30 | You can access other sites as well: 31 | 32 | ```python 33 | ru_wiki = pyscp.wikidot.Wiki('scpfoundation.ru') 34 | p = ru_wiki('scp-837') 35 | print('"{}" was created by {} on {}.'.format(p.title, p.author, p.created)) 36 | ``` 37 | ``` 38 | "SCP-837 - Глина умножения" was created by Gene R on 2012-12-26 11:12:13. 39 | ``` 40 | 41 | If the site doesn't use a custom domain, you can use the name of the site instead of the full url. E.g. `Wiki('scpsandbox2')` is the same as `Wiki('scpsandbox2.wikidot.com')`. 
42 | 43 | ### Editing Pages 44 | 45 | ```python 46 | 47 | wiki = pyscp.wikidot.Wiki('scpsandbox2') 48 | wiki.auth('example_username', 'example_password') 49 | p = wiki('test') 50 | last_revision = p.history[-1].number 51 | p.edit( 52 | source='= This is centered **text** that uses Wikidot markup.', 53 | title="you can skip the title if you don't want to change it", 54 | #you can leave out the comment too, but that'd be rude 55 | comment='testing automated editing') 56 | print(p.text) # see if it worked 57 | p.revert(last_revision) # let's revert it back to what it was. 58 | ``` 59 | ``` 60 | This is centered text that uses Wikidot markup. 61 | ``` 62 | 63 | 64 | ### Snapshots 65 | 66 | When working with a large number of pages, it could be faster to create a snapshot of the site than to download the pages one by one. Snapshots are optimized to download a large amount of data in the shortest possible time using multithreading. 67 | 68 | ```python 69 | import pyscp 70 | 71 | creator = pyscp.snapshot.SnapshotCreator('www.scp-wiki.net', 'snapshot_file.db') 72 | creator.take_snapshot(forums=False) 73 | # that's where we wait half an hour for it to finish 74 | ``` 75 | 76 | Once a snapshot is created, you can use `snapshot.Wiki` to read pages the same as in the first example: 77 | 78 | ```python 79 | wiki = pyscp.snapshot.Wiki('www.scp-wiki.net', 'snapshot_file.db') 80 | p = wiki('scp-9005-2') 81 | print( 82 | '"{}" has a rating of {}, was created by {}, and is awesome.' 83 | .format(p.title, p.rating, p.author)) 84 | print('Other pages by {}:'.format(p.author)) 85 | for other in wiki.list_pages(author=p.author): 86 | print( 87 | '{} (rating: {}, created: {})' 88 | .format(other.title, other.rating, other.created)) 89 | ``` 90 | ``` 91 | Page "SCP-9005-2" has a rating of 80, was created by yellowdrakex, and is awesome. 
92 | Other pages by yellowdrakex: 93 | ClusterfREDACTED (rating: 112, created: 2011-10-20 18:08:49) 94 | Dr Rights' Draft Box (rating: None, created: 2009-02-01 18:58:36) 95 | Dr. Rights' Personal Log (rating: 3, created: 2008-11-26 23:03:27) 96 | Dr. Rights' Personnel File (rating: 13, created: 2008-11-24 20:45:34) 97 | Fifteen To Sixteen (rating: 17, created: 2010-02-15 05:55:58) 98 | Great Short Story Concepts (rating: 1, created: 2010-06-03 19:26:06) 99 | RUN AWAY FOREVURRR (rating: 79, created: 2011-10-24 16:34:23) 100 | SCP-288: The "Stepford Marriage" Rings (rating: 56, created: 2008-11-27 07:47:01) 101 | SCP-291: Disassembler/Reassembler (rating: 113, created: 2008-11-24 20:11:11) 102 | ... 103 | ``` 104 | -------------------------------------------------------------------------------- /bin/update_pages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Update wiki pages. 5 | 6 | This script is used to update scp-wiki tale hubs and other such pages. 
7 | """ 8 | 9 | ############################################################################### 10 | # Module Imports 11 | ############################################################################### 12 | 13 | import arrow 14 | import collections 15 | import logging 16 | import pyscp 17 | import re 18 | import string 19 | 20 | ############################################################################### 21 | 22 | log = logging.getLogger('pyscp') 23 | 24 | ############################################################################### 25 | 26 | TEMPLATE = """ 27 | [[# {name}]] 28 | [[div class="section"]] 29 | +++ {disp} 30 | [#top ⇑] 31 | {header} 32 | {body} 33 | [[/div]] 34 | 35 | """ 36 | 37 | ############################################################################### 38 | 39 | 40 | class Updater: 41 | 42 | def __init__(self, wiki, pages): 43 | self.wiki = wiki 44 | self.pages = pages 45 | 46 | def disp(self): 47 | return self.keys() 48 | 49 | def get_author(self, page): 50 | return page.build_attribution_string( 51 | user_formatter='[[user {}]]', separator=' _\n') 52 | 53 | def get_section(self, idx): 54 | name = self.keys()[idx] 55 | disp = self.disp()[idx] 56 | pages = [p for p in self.pages if self.keyfunc(p) == name] 57 | 58 | if pages: 59 | body = '\n'.join(map( 60 | self.format_page, sorted(pages, key=self.sortfunc))) 61 | else: 62 | body = self.NODATA 63 | 64 | return TEMPLATE.format( 65 | name=name.replace(' ', '-'), 66 | disp=disp, 67 | header=self.HEADER, 68 | body=body) 69 | 70 | def update(self, *targets): 71 | output = [''] 72 | for idx in range(len(self.keys())): 73 | section = self.get_section(idx) 74 | if len(output[-1]) + len(section) < 180000: 75 | output[-1] += section 76 | else: 77 | output.append(section) 78 | for idx, target in enumerate(targets): 79 | source = output[idx] if idx < len(output) else '' 80 | self.wiki(target).revert(0) 81 | self.wiki(target).edit(source, comment='automated update') 82 | log.info('{} 
{}'.format(target, len(source))) 83 | 84 | ############################################################################### 85 | 86 | 87 | class TaleUpdater(Updater): 88 | 89 | HEADER = '||~ Title||~ Author||~ Created||' 90 | NODATA = '||||||= **NO DATA AVAILABLE**||' 91 | 92 | def format_page(self, page=None): 93 | return '||[[[{}|]]]||{}||//{}//||\n||||||{}||'.format( 94 | page._body['fullname'], self.get_author(page), 95 | page.created[:10], page._body['preview']) 96 | 97 | def update(self, target): 98 | targets = [ 99 | 'component:tales-by-{}-{}'.format(target, i + 1) for i in range(5)] 100 | super().update(*targets) 101 | 102 | 103 | class TalesByTitle(TaleUpdater): 104 | 105 | def keys(self): 106 | return list(string.ascii_uppercase) + ['misc'] 107 | 108 | def keyfunc(self, page): 109 | if not page._body['title']: 110 | return 'misc' 111 | l = page._body['title'][0] 112 | return l.upper() if l.isalpha() else 'misc' 113 | 114 | def sortfunc(self, page): 115 | return page._body['title'].lower() 116 | 117 | 118 | class TalesByAuthor(TaleUpdater): 119 | 120 | def keys(self): 121 | return sorted(list(string.ascii_uppercase) + ['Dr', 'misc']) 122 | 123 | def keyfunc(self, page): 124 | templates = collections.defaultdict(lambda: '{user}') 125 | authors = page.build_attribution_string(templates).split(', ') 126 | author = authors[0] 127 | if re.match(r'Dr[^a-z]|Doctor|Doc[^a-z]', author): 128 | return 'Dr' 129 | elif author[0].isalpha(): 130 | return author[0].upper() 131 | else: 132 | return 'misc' 133 | 134 | def sortfunc(self, page): 135 | author = sorted(page.metadata.keys())[0] 136 | return author.lower() 137 | 138 | 139 | class TalesByDate(TaleUpdater): 140 | 141 | def disp(self): 142 | return [ 143 | arrow.get(i, 'YYYY-MM').format('MMMM YYYY') for i in self.keys()] 144 | 145 | def keys(self): 146 | return [i.format('YYYY-MM') for i in 147 | arrow.Arrow.range('month', arrow.get('2008-07'), arrow.now())] 148 | 149 | def keyfunc(self, page=None): 150 | return 
page.created[:7] 151 | 152 | def sortfunc(self, page): 153 | return page.created 154 | 155 | 156 | def update_tale_hubs(wiki): 157 | pages = list(wiki.list_pages( 158 | tags='tale -hub -_sys', 159 | body='title created_by created_at preview tags')) 160 | TalesByTitle(wiki, pages).update('title') 161 | TalesByAuthor(wiki, pages).update('author') 162 | TalesByDate(wiki, pages).update('date') 163 | 164 | ############################################################################### 165 | 166 | 167 | class CreditUpdater(Updater): 168 | 169 | HEADER = '' 170 | NODATA = '||||= **NO DATA AVAILABLE**||' 171 | 172 | def format_page(self, page): 173 | return '||[[[{}|{}]]]||{}||'.format( 174 | page._body['fullname'], 175 | page.title.replace('[', '').replace(']', ''), 176 | self.get_author(page)) 177 | 178 | def sortfunc(self, page): 179 | title = [] 180 | for word in re.split('([0-9]+)', page._body['title']): 181 | if word.isdigit(): 182 | title.append(int(word)) 183 | else: 184 | title.append(word.lower()) 185 | return title 186 | 187 | def update(self, target): 188 | super().update('component:credits-' + target) 189 | 190 | 191 | class SeriesCredits(CreditUpdater): 192 | 193 | def __init__(self, wiki, pages, series): 194 | super().__init__(wiki, pages) 195 | self.series = (series - 1) * 1000 196 | 197 | def keys(self): 198 | return ['{:03}-{:03}'.format(i or 2, i + 99) 199 | for i in range(self.series, self.series + 999, 100)] 200 | 201 | def keyfunc(self, page): 202 | num = re.search('[scp]+-([0-9]+)$', page._body['fullname']) 203 | if not num: 204 | return 205 | num = (int(num.group(1)) // 100) * 100 206 | return '{:03}-{:03}'.format(num or 2, num + 99) 207 | 208 | 209 | class MiscCredits(CreditUpdater): 210 | 211 | def __init__(self, wiki, pages): 212 | self.proposals = pyscp.wikidot.Wiki('scp-wiki')('scp-001').links 213 | super().__init__(wiki, pages) 214 | 215 | def keys(self): 216 | return 'proposals explained joke archived'.split() 217 | 218 | def disp(self): 219 
| return [ 220 | '001 Proposals', 'Explained Phenomena', 221 | 'Joke Articles', 'Archived Articles'] 222 | 223 | def keyfunc(self, page): 224 | if page.url in self.proposals: 225 | return 'proposals' 226 | for tag in ('explained', 'joke', 'archived'): 227 | if tag in page.tags: 228 | return tag 229 | 230 | 231 | def update_credit_hubs(wiki): 232 | pages = list(wiki.list_pages( 233 | tag='scp', body='title created_by tags')) 234 | wiki = pyscp.wikidot.Wiki('scpsandbox2') 235 | with open('pyscp_bot.pass') as file: 236 | wiki.auth('jarvis-bot', file.read()) 237 | 238 | SeriesCredits(wiki, pages, 1).update('series1') 239 | SeriesCredits(wiki, pages, 2).update('series2') 240 | SeriesCredits(wiki, pages, 3).update('series3') 241 | MiscCredits(wiki, pages).update('misc') 242 | 243 | ############################################################################### 244 | 245 | wiki = pyscp.wikidot.Wiki('scp-wiki') 246 | with open('/media/hdd0/code/pyscp/bin/pyscp_bot.pass') as file: 247 | wiki.auth('jarvis-bot', file.read()) 248 | 249 | pyscp.utils.default_logging() 250 | #update_credit_hubs(wiki) 251 | 252 | update_tale_hubs(wiki) 253 | -------------------------------------------------------------------------------- /pyscp/__init__.py: -------------------------------------------------------------------------------- 1 | from pyscp import core, utils, snapshot, wikidot -------------------------------------------------------------------------------- /pyscp/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Abstract Base Classes. 5 | 6 | pyscp builds most of its functionality on top of three large classes: Wiki, 7 | Page, and Thread. This module contains the abstract base classes for those 8 | three. The ABC-s define the abstact methods that each child must implement, 9 | as well as some common functionality that builds on top of the abstract 10 | methods. 
11 | 12 | Each class inheriting from the ABC-s must implement its own realization of 13 | the abstract methods, and can also provide additional methods unique to it. 14 | 15 | This module also defines the named tuples for simple containers used by the 16 | three core classes, such as Revision or Vote. 17 | """ 18 | 19 | 20 | ############################################################################### 21 | # Module Imports 22 | ############################################################################### 23 | 24 | import abc 25 | import arrow 26 | import bs4 27 | import collections 28 | import functools 29 | import itertools 30 | import re 31 | import urllib.parse 32 | import logging 33 | 34 | import pyscp.utils 35 | 36 | ############################################################################### 37 | # Global Constants And Variables 38 | ############################################################################### 39 | 40 | log = logging.getLogger(__name__) 41 | 42 | ############################################################################### 43 | # Abstract Base Classes 44 | ############################################################################### 45 | 46 | 47 | class Page(metaclass=abc.ABCMeta): 48 | """ 49 | Page Abstract Base Class. 50 | 51 | Page object are wrappers around individual wiki-pages, and allow simple 52 | operations with them, such as retrieving the rating or the author. 53 | 54 | Each Page instance is attached to a specific instance of the Wiki class. 55 | The wiki may be used by the page to retrieve a list of titles or other 56 | similar wiki-wide information that may be used by the Page to, in turn, 57 | deduce some information about itself. 58 | 59 | Typically, the Page instances should not be created directly. Instead, 60 | calling an instance of a Wiki class will creating a Page instance 61 | attached to that wiki. 
62 | """ 63 | 64 | ########################################################################### 65 | # Special Methods 66 | ########################################################################### 67 | 68 | def __init__(self, wiki, url): 69 | self.url = url 70 | self._wiki = wiki 71 | 72 | def __repr__(self): 73 | return '{}.{}({}, {})'.format( 74 | self.__module__, self.__class__.__name__, 75 | repr(self.url), repr(self._wiki)) 76 | 77 | def __eq__(self, other): 78 | if not hasattr(other, 'url') or not hasattr(other, '_wiki'): 79 | return False 80 | return self.url == other.url and self._wiki is other._wiki 81 | 82 | ########################################################################### 83 | # Abstract Methods 84 | ########################################################################### 85 | 86 | @property 87 | @abc.abstractmethod 88 | def _pdata(self): 89 | """ 90 | Commonly used data about the page. 91 | 92 | This method should return a tuple, the first three elements of which 93 | are the id number of the page; the id number of the page's comments 94 | thread; and the html contents of the page. 95 | 96 | Any additional elements of the tuple are left to the discretion 97 | of the individual Page implimentations. 98 | """ 99 | pass 100 | 101 | @property 102 | @abc.abstractmethod 103 | def history(self): 104 | """ 105 | Revision history of the page. 106 | 107 | Should return a sorted list of Revision named tuples. 108 | """ 109 | pass 110 | 111 | @property 112 | @abc.abstractmethod 113 | def votes(self): 114 | """ 115 | Page votes. 116 | 117 | Should return a list of Vote named tuples. 118 | """ 119 | pass 120 | 121 | @property 122 | @abc.abstractmethod 123 | def tags(self): 124 | """ 125 | Page tags. 126 | 127 | Should return a set of strings. 
128 | """ 129 | pass 130 | 131 | ########################################################################### 132 | # Internal Methods 133 | ########################################################################### 134 | 135 | @property 136 | def _id(self): 137 | """Unique ID number of the page.""" 138 | return self._pdata[0] 139 | 140 | @pyscp.utils.cached_property 141 | def _thread(self): 142 | """Thread object corresponding to the page's comments thread.""" 143 | return self._wiki.Thread(self._wiki, self._pdata[1]) 144 | 145 | @property 146 | def _raw_title(self): 147 | """Title as displayed on the page.""" 148 | title = self._soup.find(id='page-title') 149 | return title.text.strip() if title else '' 150 | 151 | @property 152 | def _raw_author(self): 153 | return self.history[0].user 154 | 155 | @property 156 | def _soup(self): 157 | """BeautifulSoup of the contents of the page.""" 158 | return bs4.BeautifulSoup(self.html, 'lxml') 159 | 160 | ########################################################################### 161 | # Properties 162 | ########################################################################### 163 | 164 | @property 165 | def html(self): 166 | """HTML contents of the page.""" 167 | return self._pdata[2] 168 | 169 | @property 170 | def posts(self): 171 | """List of the comments made on the page.""" 172 | return self._thread.posts 173 | 174 | @property 175 | def comments(self): 176 | """Alias for Page.posts.""" 177 | return self._thread.posts 178 | 179 | @property 180 | def text(self): 181 | """Plain text of the page.""" 182 | return self._soup.find(id='page-content').text 183 | 184 | @property 185 | def wordcount(self): 186 | """Number of words encountered on the page.""" 187 | return len(re.findall(r"[\w'█_-]+", self.text)) 188 | 189 | @property 190 | def images(self): 191 | """Number of images dislayed on the page.""" 192 | # TODO: needs more work. 
193 | return [i['src'] for i in self._soup('img')] 194 | 195 | @property 196 | def name(self): 197 | return self.url.split('/')[-1] 198 | 199 | @property 200 | def title(self): 201 | """ 202 | Title of the page. 203 | 204 | In case of SCP articles, will include the title from the 'series' page. 205 | """ 206 | try: 207 | return '{}: {}'.format( 208 | self._raw_title, self._wiki.titles()[self.url]) 209 | except KeyError: 210 | return self._raw_title 211 | 212 | @property 213 | def created(self): 214 | """When was the page created.""" 215 | return self.history[0].time 216 | 217 | @property 218 | def metadata(self): 219 | """ 220 | Return page metadata. 221 | 222 | Authors in this case includes all users related to the creation 223 | and subsequent maintenance of the page. The values of the dict 224 | describe the user's relationship to the page. 225 | """ 226 | data = [i for i in self._wiki.metadata() if i.url == self.url] 227 | data = {i.user: i for i in data} 228 | 229 | if 'author' not in {i.role for i in data.values()}: 230 | meta = Metadata(self.url, self._raw_author, 'author', None) 231 | data[self._raw_author] = meta 232 | 233 | for k, v in data.items(): 234 | if v.role == 'author' and not v.date: 235 | data[k] = v._replace(date=self.created) 236 | 237 | return data 238 | 239 | @property 240 | def rating(self): 241 | """Rating of the page, excluding deleted accounts.""" 242 | return sum( 243 | v.value for v in self.votes if v.user != '(account deleted)') 244 | 245 | @property 246 | @pyscp.utils.listify() 247 | def links(self): 248 | """ 249 | Other pages linked from this one. 250 | 251 | Returns an ordered list of unique urls. Off-site links or links to 252 | images are not included. 
253 | """ 254 | unique = set() 255 | for element in self._soup.select('#page-content a'): 256 | href = element.get('href', None) 257 | if (not href or href[0] != '/' or # bad or absolute link 258 | href[-4:] in ('.png', '.jpg', '.gif')): 259 | continue 260 | url = self._wiki.site + href.rstrip('|') 261 | if url not in unique: 262 | unique.add(url) 263 | yield url 264 | 265 | @property 266 | def parent(self): 267 | """Parent of the current page.""" 268 | if not self.html: 269 | return None 270 | breadcrumb = self._soup.select('#breadcrumbs a') 271 | if breadcrumb: 272 | return self._wiki.site + breadcrumb[-1]['href'] 273 | 274 | @property 275 | def is_mainlist(self): 276 | """ 277 | Indicate whether the page is a mainlist scp article. 278 | 279 | This is an scp-wiki exclusive property. 280 | """ 281 | if 'scp-wiki' not in self._wiki.site: 282 | return False 283 | if 'scp' not in self.tags: 284 | return False 285 | return bool(re.search(r'/scp-[0-9]{3,4}$', self.url)) 286 | 287 | ########################################################################### 288 | # Methods 289 | ########################################################################### 290 | 291 | def build_attribution_string( 292 | self, templates=None, group_templates=None, separator=', ', 293 | user_formatter=None): 294 | """ 295 | Create an attribution string based on the page's metadata. 296 | 297 | This is a commonly needed operation. The result should be a nicely 298 | formatted, human-readable description of who was and is involved with 299 | the page, and in what role. 
300 | """ 301 | roles = 'author rewrite translator maintainer'.split() 302 | 303 | if not templates: 304 | templates = {i: '{{user}} ({})'.format(i) for i in roles} 305 | 306 | items = list(self.metadata.values()) 307 | items.sort(key=lambda x: [roles.index(x.role), x.date]) 308 | 309 | # group users in the same role on the same date together 310 | itemdict = collections.OrderedDict() 311 | for i in items: 312 | user = user_formatter.format(i.user) if user_formatter else i.user 313 | key = (i.role, i.date) 314 | itemdict[key] = itemdict.get(key, []) + [user] 315 | 316 | output = [] 317 | 318 | for (role, date), users in itemdict.items(): 319 | 320 | hdate = arrow.get(date).humanize() if date else '' 321 | 322 | if group_templates and len(users) > 1: 323 | output.append( 324 | group_templates[role].format( 325 | date=date, 326 | hdate=hdate, 327 | users=', '.join(users[:-1]), 328 | last_user=users[-1])) 329 | else: 330 | for user in users: 331 | output.append( 332 | templates[role].format( 333 | date=date, hdate=hdate, user=user)) 334 | 335 | return separator.join(output) 336 | 337 | 338 | class Thread(metaclass=abc.ABCMeta): 339 | """ 340 | Thread Abstract Base Class. 341 | 342 | Thread objects represent individual forum threads. Most pages have a 343 | corresponding comments thread, accessible via Page._thread. 344 | """ 345 | 346 | def __init__(self, wiki, _id, title=None, description=None): 347 | self._wiki = wiki 348 | self._id, self.title, self.description = _id, title, description 349 | 350 | @abc.abstractmethod 351 | def posts(self): 352 | """Posts in this thread.""" 353 | pass 354 | 355 | 356 | class Wiki(metaclass=abc.ABCMeta): 357 | """ 358 | Wiki Abstract Base Class. 359 | 360 | Wiki objects provide wiki-wide functionality not limited to individual 361 | pages or threads. 
362 | """ 363 | 364 | ########################################################################### 365 | # Class Attributes 366 | ########################################################################### 367 | 368 | # should point to the respective Page and Thread classes in each submodule. 369 | 370 | Page = Page 371 | Thread = Thread 372 | 373 | ########################################################################### 374 | # Special Methods 375 | ########################################################################### 376 | 377 | def __init__(self, site): 378 | parsed = urllib.parse.urlparse(site) 379 | netloc = parsed.netloc if parsed.netloc else parsed.path 380 | if '.' not in netloc: 381 | netloc += '.wikidot.com' 382 | self.site = urllib.parse.urlunparse(['http', netloc, '', '', '', '']) 383 | self._title_data = {} 384 | 385 | def __call__(self, name): 386 | url = name if self.site in name else '{}/{}'.format(self.site, name) 387 | url = url.replace(' ', '-').replace('_', '-').lower() 388 | return self.Page(self, url) 389 | 390 | ########################################################################### 391 | 392 | @functools.lru_cache(maxsize=1) 393 | def metadata(self): 394 | """ 395 | List page ownership metadata. 396 | 397 | This method is exclusive to the scp-wiki, and is used to fine-tune 398 | the page ownership information beyond what is possible with Wikidot. 399 | This allows a single page to have an author different from the user 400 | who created the zeroth revision of the page, or even have multiple 401 | users attached to the page in various roles. 
402 | """ 403 | if 'scp-wiki' not in self.site: 404 | return [] 405 | soup = self('attribution-metadata')._soup 406 | results = [] 407 | for row in soup('tr')[1:]: 408 | name, user, type_, date = [i.text.strip() for i in row('td')] 409 | name = name.lower() 410 | url = '{}/{}'.format(self.site, name) 411 | results.append(pyscp.core.Metadata(url, user, type_, date)) 412 | return results 413 | 414 | def _update_titles(self): 415 | for name in ( 416 | 'scp-series', 'scp-series-2', 'scp-series-3', 'scp-series-4', 'scp-series-5', 417 | 'joke-scps', 'scp-ex', 'archived-scps'): 418 | page = self(name) 419 | try: 420 | soup = page._soup 421 | except: 422 | continue 423 | self._title_data[name] = soup 424 | 425 | @functools.lru_cache(maxsize=1) 426 | @pyscp.utils.ignore(value={}) 427 | @pyscp.utils.log_errors(logger=log.error) 428 | def titles(self): 429 | """Dict of url/title pairs for scp articles.""" 430 | if 'scp-wiki' not in self.site: 431 | return {} 432 | 433 | self._update_titles() 434 | 435 | elems = [i.select('ul > li') for i in self._title_data.values()] 436 | elems = list(itertools.chain(*elems)) 437 | try: 438 | elems += list(self('scp-001')._soup(class_='series')[1]('p')) 439 | except: 440 | pass 441 | 442 | titles = {} 443 | for elem in elems: 444 | 445 | sep = ' - ' if ' - ' in elem.text else ', ' 446 | try: 447 | url1 = self.site + elem.a['href'] 448 | skip, title = elem.text.split(sep, maxsplit=1) 449 | except (ValueError, TypeError): 450 | continue 451 | 452 | if title != '[ACCESS DENIED]': 453 | url2 = self.site + '/' + skip.lower() 454 | titles[url1] = titles[url2] = title 455 | 456 | return titles 457 | 458 | def list_pages(self, **kwargs): 459 | """Return pages matching the specified criteria.""" 460 | pages = self._list_pages_parsed(**kwargs) 461 | author = kwargs.pop('author', None) 462 | if not author: 463 | # if 'author' isn't specified, there's no need to check rewrites 464 | return pages 465 | include, exclude = set(), set() 466 | for meta in 
self.metadata(): 467 | if meta.user == author: 468 | # if username matches, include regardless of type 469 | include.add(meta.url) 470 | elif meta.role == 'author': 471 | # exclude only if override type is author. 472 | # if url has author and rewrite author, 473 | # it will appear in list_pages for both. 474 | exclude.add(meta.url) 475 | urls = {p.url for p in pages} | include - exclude 476 | # if no other options beside author were specified, 477 | # just return everything we can 478 | if not kwargs: 479 | return map(self, sorted(urls)) 480 | # otherwise, retrieve the list of urls without the author parameter 481 | # to check which urls we should return and in which order 482 | pages = self._list_pages_parsed(**kwargs) 483 | return [p for p in pages if p.url in urls] 484 | 485 | ############################################################################### 486 | # Named Tuple Containers 487 | ############################################################################### 488 | 489 | nt = collections.namedtuple 490 | Revision = nt('Revision', 'id number user time comment') 491 | Vote = nt('Vote', 'user value') 492 | Post = nt('Post', 'id title content user time parent') 493 | File = nt('File', 'url name filetype size') 494 | Metadata = nt('Metadata', 'url user role date') 495 | Category = nt('Category', 'id title description size') 496 | Image = nt('Image', 'url source status notes data') 497 | del nt 498 | -------------------------------------------------------------------------------- /pyscp/orm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | import concurrent.futures 8 | import logging 9 | import peewee 10 | import queue 11 | 12 | from itertools import islice 13 | 14 | 
############################################################################### 15 | # Global Constants And Variables 16 | ############################################################################### 17 | 18 | log = logging.getLogger('pyscp.orm') 19 | pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) 20 | queue = queue.Queue() 21 | 22 | 23 | def queue_execution(fn, args=(), kw={}): 24 | queue.put(dict(fn=fn, args=args, kw=kw)) 25 | pool.submit(async_write) 26 | 27 | ############################################################################### 28 | # Database ORM Classes 29 | ############################################################################### 30 | 31 | db = peewee.Proxy() 32 | 33 | 34 | class BaseModel(peewee.Model): 35 | 36 | class Meta: 37 | database = db 38 | 39 | @classmethod 40 | def create(cls, **kw): 41 | queue_execution(fn=super().create, kw=kw) 42 | 43 | @classmethod 44 | def create_table(cls): 45 | if not hasattr(cls, '_id_cache'): 46 | cls._id_cache = [] 47 | queue_execution(fn=super().create_table, args=(True,)) 48 | 49 | @classmethod 50 | def insert_many(cls, data): 51 | data_iter = iter(data) 52 | chunk = list(islice(data_iter, 500)) 53 | while chunk: 54 | queue_execution( 55 | fn=lambda x: super(BaseModel, cls).insert_many(x).execute(), 56 | args=(chunk, )) 57 | chunk = list(islice(data_iter, 500)) 58 | 59 | @classmethod 60 | def convert_to_id(cls, data, key='user'): 61 | for row in data: 62 | if row[key] not in cls._id_cache: 63 | cls._id_cache.append(row[key]) 64 | row[key] = cls._id_cache.index(row[key]) + 1 65 | yield row 66 | 67 | @classmethod 68 | def write_ids(cls, field_name): 69 | cls.insert_many([ 70 | {'id': cls._id_cache.index(value) + 1, field_name: value} 71 | for value in set(cls._id_cache)]) 72 | cls._id_cache.clear() 73 | 74 | 75 | class ForumCategory(BaseModel): 76 | title = peewee.CharField() 77 | description = peewee.TextField() 78 | 79 | 80 | class ForumThread(BaseModel): 81 | category = 
peewee.ForeignKeyField(ForumCategory, null=True) 82 | title = peewee.CharField(null=True) 83 | description = peewee.TextField(null=True) 84 | 85 | 86 | class Page(BaseModel): 87 | url = peewee.CharField(unique=True) 88 | html = peewee.TextField() 89 | thread = peewee.ForeignKeyField( 90 | ForumThread, related_name='page', null=True) 91 | 92 | 93 | class User(BaseModel): 94 | name = peewee.CharField(unique=True) 95 | 96 | 97 | class Revision(BaseModel): 98 | page = peewee.ForeignKeyField(Page, related_name='revisions', index=True) 99 | user = peewee.ForeignKeyField(User, related_name='revisions', index=True) 100 | number = peewee.IntegerField() 101 | time = peewee.DateTimeField() 102 | comment = peewee.CharField(null=True) 103 | 104 | 105 | class Vote(BaseModel): 106 | page = peewee.ForeignKeyField(Page, related_name='votes', index=True) 107 | user = peewee.ForeignKeyField(User, related_name='votes', index=True) 108 | value = peewee.IntegerField() 109 | 110 | 111 | class ForumPost(BaseModel): 112 | thread = peewee.ForeignKeyField( 113 | ForumThread, related_name='posts', index=True) 114 | user = peewee.ForeignKeyField(User, related_name='posts', index=True) 115 | parent = peewee.ForeignKeyField('self', null=True) 116 | title = peewee.CharField(null=True) 117 | time = peewee.DateTimeField() 118 | content = peewee.TextField() 119 | 120 | 121 | class Tag(BaseModel): 122 | name = peewee.CharField(unique=True) 123 | 124 | 125 | class PageTag(BaseModel): 126 | page = peewee.ForeignKeyField(Page, related_name='tags', index=True) 127 | tag = peewee.ForeignKeyField(Tag, related_name='pages', index=True) 128 | 129 | 130 | class OverrideType(BaseModel): 131 | name = peewee.CharField(unique=True) 132 | 133 | 134 | class Override(BaseModel): 135 | url = peewee.ForeignKeyField(Page, to_field=Page.url, index=True) 136 | user = peewee.ForeignKeyField(User, index=True) 137 | type = peewee.ForeignKeyField(OverrideType) 138 | 139 | 140 | class ImageStatus(BaseModel): 141 | name = 
peewee.CharField(unique=True) 142 | 143 | 144 | class Image(BaseModel): 145 | url = peewee.CharField(unique=True) 146 | source = peewee.CharField() 147 | data = peewee.BlobField() 148 | status = peewee.ForeignKeyField(ImageStatus) 149 | notes = peewee.TextField(null=True) 150 | 151 | ############################################################################### 152 | # Helper Functions 153 | ############################################################################### 154 | 155 | 156 | def async_write(buffer=[]): 157 | item = queue.get() 158 | buffer.append(item) 159 | if len(buffer) > 500 or queue.empty(): 160 | log.debug('Processing {} queue items.'.format(len(buffer))) 161 | with db.transaction(): 162 | write_buffer(buffer) 163 | buffer.clear() 164 | 165 | 166 | def write_buffer(buffer): 167 | for item in buffer: 168 | try: 169 | item['fn'](*item.get('args', ()), **item.get('kw', {})) 170 | except: 171 | log.exception( 172 | 'Exception while processing queue item: {}' 173 | .format(item)) 174 | queue.task_done() 175 | 176 | 177 | def create_tables(*tables): 178 | for table in tables: 179 | eval(table).create_table() 180 | 181 | 182 | def connect(dbpath): 183 | log.info('Connecting to the database at {}'.format(dbpath)) 184 | db.initialize(peewee.SqliteDatabase(dbpath)) 185 | db.connect() 186 | 187 | 188 | ############################################################################### 189 | # Macros 190 | ############################################################################### 191 | 192 | 193 | def votes_by_user(user): 194 | up, down = [], [] 195 | for vote in (Vote.select().join(User).where(User.name == user)): 196 | if vote.value == 1: 197 | up.append(vote.page.url) 198 | else: 199 | down.append(vote.page.url) 200 | return {'+': up, '-': down} 201 | -------------------------------------------------------------------------------- /pyscp/resources/cover.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anqxyr/pyscp/fc85c808495f8f47783db6fb12a79ce7727e919c/pyscp/resources/cover.png -------------------------------------------------------------------------------- /pyscp/resources/pages/cover.xhtml: -------------------------------------------------------------------------------- 1 |
2 | 3 |
-------------------------------------------------------------------------------- /pyscp/resources/pages/intro.xhtml: -------------------------------------------------------------------------------- 1 |
2 |

Mankind in its present state has been around for a quarter of a million years, yet only the last 4,000 have been of any significance.

3 |

So, what did we do for nearly 250,000 years? We huddled in caves and around small fires, fearful of the things that we didn't understand. It was more than explaining why the sun came up, it was the mystery of enormous birds with heads of men and rocks that came to life. So we called them 'gods' and 'demons', begged them to spare us, and prayed for salvation.

4 |

In time, their numbers dwindled and ours rose. The world began to make more sense when there were fewer things to fear, yet the unexplained can never truly go away, as if the universe demands the absurd and impossible.

5 |

Mankind must not go back to hiding in fear. No one else will protect us, and we must stand up for ourselves.

6 |

While the rest of mankind dwells in the light, we must stand in the darkness to fight it, contain it, and shield it from the eyes of the public, so that others may live in a sane and normal world.

7 |
8 |

We secure. We contain. We protect.

9 |

— The Administrator

10 |
11 |
12 | -------------------------------------------------------------------------------- /pyscp/resources/pages/license.xhtml: -------------------------------------------------------------------------------- 1 |

This book contains the collected works of the SCP Foundation, a collaborative fiction writing website. All contents are licensed under the CC-BY-SA 3.0 license. The stories comprising the book are available online at www.scp-wiki.net .

-------------------------------------------------------------------------------- /pyscp/resources/pages/title.xhtml: -------------------------------------------------------------------------------- 1 |
2 |

SCP Foundation

3 |
Ebook edition
4 |
-------------------------------------------------------------------------------- /pyscp/resources/stafflist.txt: -------------------------------------------------------------------------------- 1 | Accelerando 2 | anqxyr 3 | Blaroth 4 | Bouncl 5 | Chubert 6 | Crayne 7 | Devereaux 8 | Dexanote 9 | djkaktus 10 | Doctor Anborough 11 | DrBright 12 | DrClef 13 | DrEverettMann 14 | Drewbear 15 | Eskobar 16 | Faminepulse 17 | Fantem 18 | FlameShirt 19 | FortuneFavorsBold 20 | Gaffney 21 | Kalinin 22 | Kate McTiriss 23 | LurkD 24 | MisterFlames 25 | murphy_slaw 26 | Nioki 27 | Photosynthetic 28 | Pig_catapult 29 | Pixeltasim 30 | ProcyonLotor 31 | pxdnbluesoul 32 | Reject 33 | Riemann 34 | Roget 35 | Rumetzen 36 | Silberescher 37 | Sophia Light 38 | SoullessSingularity 39 | spikebrennan 40 | thattallfellow 41 | thedeadlymoose 42 | TroyL 43 | Tuomey Tombstone 44 | Vincent_Redgrave 45 | Vivax 46 | weizhong 47 | Wogglebug 48 | Zyn 49 | -------------------------------------------------------------------------------- /pyscp/resources/stylesheet.css: -------------------------------------------------------------------------------- 1 | @namespace h "http://www.w3.org/1999/xhtml"; 2 | .title1 { 3 | text-align: center; 4 | } 5 | .title1-bold { 6 | font-size: 250%; 7 | font-weight: bold; 8 | } 9 | .title2 { 10 | font-size: 150%; 11 | font-weight: bold; 12 | text-align: center; 13 | } 14 | .bold { 15 | font-weight: bold; 16 | } 17 | .italic { 18 | font-style: italic; 19 | } 20 | .license { 21 | font-style: italic; 22 | margin-left: 10%; 23 | margin-top: 40%; 24 | max-width: 80%; 25 | text-align: justify; 26 | } 27 | .quote { 28 | background-color: #fafafa; 29 | border: 1px dashed #bbb; 30 | margin: 0.5em 5%; 31 | padding: 0 1em; 32 | } 33 | .collapsible { 34 | background-color: #fafafa; 35 | margin: 0.5em 5%; 36 | padding: 0 1em; 37 | } 38 | .collaps-title { 39 | border-bottom: 1px solid #444; 40 | font-weight: bold; 41 | margin: 0 -1em; 42 | padding: 0.5em 1em; 43 | } 44 | 
.collapsible .quote{ 45 | background-color: #E0E0E0; 46 | } 47 | .scp-title { 48 | font-size: 120%; 49 | font-weight: bold; 50 | margin: 2em 0; 51 | } 52 | .tale-title { 53 | font-size: 120%; 54 | font-style: italic; 55 | margin: 2em 0; 56 | text-align: center; 57 | } 58 | table { 59 | border-bottom: 1px solid #999; 60 | border-collapse: separate; 61 | border-right: 1px solid #999; 62 | border-spacing: 0; 63 | } 64 | table th { 65 | background-color: #e0e0e0; 66 | } 67 | table th, table td { 68 | border-left: 1px solid #999; 69 | border-top: 1px solid #999; 70 | max-width: 1600px; 71 | overflow: hidden; 72 | padding: 10px; 73 | page-break-inside: avoid; 74 | } 75 | .intro { 76 | font-size: 130%; 77 | margin: 5em 5%; 78 | } 79 | .sign { 80 | text-align: center; 81 | } 82 | .attrib { 83 | font-size: 80%; 84 | } 85 | .link { 86 | font-family: monospace; 87 | font-weight: bold; 88 | text-decoration: underline; 89 | } 90 | .footer { 91 | text-align: center; 92 | } 93 | * { 94 | font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif; 95 | } 96 | .scp-image-block { 97 | float: right; 98 | clear: right; 99 | margin: 0 2em 1em 2em; 100 | border: solid 1px #666; 101 | box-shadow: 0 1px 6px rgba(0,0,0,.25); 102 | width: 300px; 103 | } 104 | .scp-image-block.block-left { 105 | float: left; 106 | clear: left; 107 | margin: 0 2em 1em 0; 108 | } 109 | .scp-image-block img { 110 | border: 0; 111 | width: 300px; 112 | } 113 | .scp-image-block .scp-image-caption { 114 | background-color: #eee; 115 | border-top: solid 1px #666; 116 | padding: 2px 0; 117 | font-size: 80%; 118 | font-weight: bold; 119 | text-align: center; 120 | width: 300px; 121 | } 122 | .scp-image-block > p { 123 | margin: 0; 124 | } 125 | .scp-image-block .scp-image-caption > p { 126 | margin: 0; 127 | padding: 0 10px; 128 | } 129 | -------------------------------------------------------------------------------- 
/pyscp/resources/templates/container.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /pyscp/resources/templates/content.opf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pyscp/resources/templates/page.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | <link href="../stylesheet.css" rel="stylesheet" type="text/css"/> 7 | </head> 8 | <body/> 9 | </html> -------------------------------------------------------------------------------- /pyscp/resources/templates/toc.ncx: -------------------------------------------------------------------------------- 1 | <?xml version='1.0' encoding='UTF-8'?> 2 | <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> 3 | <head> 4 | <meta content="" name="dtb:uid"/> 5 | <meta content="0" name="dtb:depth"/> 6 | <meta content="0" name="dtb:totalPageCount"/> 7 | <meta content="0" name="dtb:maxPageNumber"/> 8 | </head> 9 | <docTitle> 10 | <text/> 11 | </docTitle> 12 | <navMap/> 13 | </ncx> -------------------------------------------------------------------------------- /pyscp/snapshot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Snapshot access classes. 5 | 6 | This module contains the classes that facilitate information extraction 7 | and communication with the sqlite Snapshots. 
8 | """ 9 | 10 | ############################################################################### 11 | # Module Imports 12 | ############################################################################### 13 | 14 | import bs4 15 | import concurrent.futures 16 | import functools 17 | import itertools 18 | import logging 19 | import operator 20 | import pathlib 21 | import re 22 | import requests 23 | 24 | from pyscp import core, orm, utils 25 | 26 | ############################################################################### 27 | # Global Constants And Variables 28 | ############################################################################### 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | ############################################################################### 33 | 34 | 35 | class Page(core.Page): 36 | """Page object.""" 37 | 38 | ########################################################################### 39 | # Internal Methods 40 | ########################################################################### 41 | 42 | def _query(self, ptable, stable='User'): 43 | """Generate SQL queries used to retrieve data.""" 44 | pt, st = [getattr(orm, i) for i in (ptable, stable)] 45 | return pt.select(pt, st.name).join(st).where(pt.page == self._id) 46 | 47 | @utils.cached_property 48 | def _pdata(self): 49 | """Preload the ids and contents of the page.""" 50 | pdata = orm.Page.get(orm.Page.url == self.url) 51 | return pdata.id, pdata._data['thread'], pdata.html 52 | 53 | ########################################################################### 54 | # Properties 55 | ########################################################################### 56 | 57 | @property 58 | def html(self): 59 | """Return HTML contents of the page.""" 60 | return self._pdata[2] 61 | 62 | @utils.cached_property 63 | def history(self): 64 | """Return the revisions of the page.""" 65 | revs = self._query('Revision') 66 | revs = sorted(revs, key=lambda x: x.number) 67 | return 
[core.Revision( 68 | r.id, r.number, r.user.name, str(r.time), r.comment) 69 | for r in revs] 70 | 71 | @utils.cached_property 72 | def votes(self): 73 | """Return all votes made on the page.""" 74 | return [core.Vote(v.user.name, v.value) 75 | for v in self._query('Vote')] 76 | 77 | @utils.cached_property 78 | def tags(self): 79 | """Return the set of tags with which the page is tagged.""" 80 | return {pt.tag.name for pt in self._query('PageTag', 'Tag')} 81 | 82 | 83 | class Thread(core.Thread): 84 | """Discussion/forum thread.""" 85 | 86 | @utils.cached_property 87 | def posts(self): 88 | """Post objects belonging to this thread.""" 89 | fp = orm.ForumPost 90 | us = orm.User 91 | query = fp.select(fp, us.name).join(us).where(fp.thread == self._id) 92 | return [core.Post( 93 | p.id, p.title, p.content, p.user.name, 94 | str(p.time), p._data['parent']) 95 | for p in query] 96 | 97 | 98 | class Wiki(core.Wiki): 99 | """Snapshot of a Wikidot website.""" 100 | 101 | Page = Page 102 | Thread = Thread 103 | # Tautology = Tautology 104 | 105 | ########################################################################### 106 | # Special Methods 107 | ########################################################################### 108 | 109 | def __init__(self, site, dbpath): 110 | """Create wiki instance.""" 111 | super().__init__(site) 112 | if not pathlib.Path(dbpath).exists(): 113 | raise FileNotFoundError(dbpath) 114 | self.dbpath = dbpath 115 | orm.connect(dbpath) 116 | 117 | def __repr__(self): 118 | """Pretty-print current instance.""" 119 | return '{}.{}({}, {})'.format( 120 | self.__module__, 121 | self.__class__.__qualname__, 122 | repr(self.site), 123 | repr(self.dbpath)) 124 | 125 | ########################################################################### 126 | # Internal Methods 127 | ########################################################################### 128 | 129 | @staticmethod 130 | def _filter_author(author): 131 | return (orm.Page.select(orm.Page.url) 
132 | .join(orm.Revision).join(orm.User) 133 | .where(orm.Revision.number == 0) 134 | .where(orm.User.name == author)) 135 | 136 | @staticmethod 137 | def _filter_tag(tag): 138 | return (orm.Page.select(orm.Page.url) 139 | .join(orm.PageTag).join(orm.Tag) 140 | .where(orm.Tag.name == tag)) 141 | 142 | @staticmethod 143 | def _get_operator(string): 144 | symbol, *values = re.split(r'(\d+)', string) 145 | opdict = { 146 | '>': 'gt', '<': 'lt', '>=': 'ge', '<=': 'le', '=': 'eq', '': 'eq'} 147 | if symbol not in opdict: 148 | raise ValueError 149 | return getattr(operator, opdict[symbol]), values 150 | 151 | def _filter_rating(self, rating): 152 | compare, values = self._get_operator(rating) 153 | rating = int(values[0]) 154 | return (orm.Page.select(orm.Page.url) 155 | .join(orm.Vote).group_by(orm.Page.url) 156 | .having(compare(orm.peewee.fn.sum(orm.Vote.value), rating))) 157 | 158 | def _filter_created(self, created): 159 | compare, values = self._get_operator(created) 160 | date = '-'.join(values[::2]) 161 | return (orm.Page.select(orm.Page.url) 162 | .join(orm.Revision).where(orm.Revision.number == 0) 163 | .group_by(orm.Page.url) 164 | .having(compare( 165 | orm.peewee.fn.substr(orm.Revision.time, 1, len(date)), 166 | date))) 167 | 168 | def _list_pages_parsed(self, **kwargs): 169 | query = orm.Page.select(orm.Page.url) 170 | keys = ('author', 'tag', 'rating', 'created') 171 | keys = [k for k in keys if k in kwargs] 172 | for k in keys: 173 | query = query & getattr(self, '_filter_' + k)(kwargs[k]) 174 | if 'limit' in kwargs: 175 | query = query.limit(kwargs['limit']) 176 | return map(self, [p.url for p in query]) 177 | 178 | ########################################################################### 179 | # SCP-Wiki Specific Methods 180 | ########################################################################### 181 | 182 | @functools.lru_cache(maxsize=1) 183 | def list_images(self): 184 | """Image metadata.""" 185 | query = ( 186 | orm.Image.select(orm.Image, 
orm.ImageStatus.name) 187 | .join(orm.ImageStatus)) 188 | return [core.Image(r.url, r.source, r.status.name, r.notes, r.data) 189 | for r in query] 190 | 191 | ############################################################################### 192 | 193 | 194 | class SnapshotCreator: 195 | """ 196 | Create a snapshot of a wikidot site. 197 | 198 | This class uses WikidotConnector to iterate over all the pages of a site, 199 | and save the html content, revision history, votes, and the discussion 200 | of each to a sqlite database. Optionally, standalone forum threads can be 201 | saved too. 202 | 203 | In case of the scp-wiki, some additional information is saved: 204 | images for which their CC status has been confirmed, and info about 205 | overwriting page authorship. 206 | 207 | In general, this class will not save images hosted on the site that is 208 | being saved. Only the html content, discussions, and revision/vote 209 | metadata is saved. 210 | """ 211 | 212 | def __init__(self, dbpath): 213 | """Create an instance.""" 214 | if pathlib.Path(dbpath).exists(): 215 | raise FileExistsError(dbpath) 216 | orm.connect(dbpath) 217 | self.pool = concurrent.futures.ThreadPoolExecutor(max_workers=20) 218 | 219 | def take_snapshot(self, wiki, forums=False): 220 | """Take new snapshot.""" 221 | self.wiki = wiki 222 | self._save_all_pages() 223 | if forums: 224 | self._save_forums() 225 | if 'scp-wiki' in self.wiki.site: 226 | self._save_meta() 227 | orm.queue.join() 228 | self._save_cache() 229 | orm.queue.join() 230 | log.info('Snapshot succesfully taken.') 231 | 232 | def _save_all_pages(self): 233 | """Iterate over the site pages, call _save_page for each.""" 234 | orm.create_tables( 235 | 'Page', 'Revision', 'Vote', 'ForumPost', 236 | 'PageTag', 'ForumThread', 'User', 'Tag') 237 | count = next( 238 | self.wiki.list_pages(body='total', limit=1))._body['total'] 239 | bar = utils.ProgressBar('SAVING PAGES'.ljust(20), int(count)) 240 | bar.start() 241 | for _ in 
self.pool.map(self._save_page, self.wiki.list_pages()): 242 | bar.value += 1 243 | bar.stop() 244 | 245 | @utils.ignore(requests.HTTPError) 246 | def _save_page(self, page): 247 | """Download contents, revisions, votes and discussion of the page.""" 248 | orm.Page.create( 249 | id=page._id, url=page.url, thread=page._thread._id, html=page.html) 250 | 251 | revisions = orm.User.convert_to_id(i._asdict() for i in page.history) 252 | votes = orm.User.convert_to_id(i._asdict() for i in page.votes) 253 | tags = [{'tag': t} for t in page.tags] 254 | tags = orm.Tag.convert_to_id(tags, key='tag') 255 | 256 | def _insert(table, data): 257 | table.insert_many(dict(i, page=page._id) for i in data) 258 | 259 | _insert(orm.Revision, revisions) 260 | _insert(orm.Vote, votes) 261 | _insert(orm.PageTag, tags) 262 | 263 | self._save_thread(page._thread) 264 | 265 | def _save_forums(self): 266 | """Download and save standalone forum threads.""" 267 | orm.create_tables( 268 | 'ForumPost', 'ForumThread', 'ForumCategory', 'User') 269 | cats = self.wiki.list_categories() 270 | cats = [i for i in cats if i.title != 'Per page discussions'] 271 | orm.ForumCategory.insert_many(dict( 272 | id=c.id, 273 | title=c.title, 274 | description=c.description) for c in cats) 275 | total_size = sum(c.size for c in cats) 276 | bar = utils.ProgressBar('SAVING FORUM THREADS', total_size) 277 | bar.start() 278 | for cat in cats: 279 | threads = set(self.wiki.list_threads(cat.id)) 280 | c_id = itertools.repeat(cat.id) 281 | for _ in self.pool.map(self._save_thread, threads, c_id): 282 | bar.value += 1 283 | bar.stop() 284 | 285 | def _save_thread(self, thread, c_id=None): 286 | orm.ForumThread.create( 287 | category=c_id, id=thread._id, 288 | title=thread.title, description=thread.description) 289 | posts = orm.User.convert_to_id([i._asdict() for i in thread.posts]) 290 | orm.ForumPost.insert_many( 291 | dict(p, thread=thread._id) for p in posts) 292 | 293 | def _save_meta(self): 294 | orm.create_tables( 
295 | 'Image', 'ImageStatus') 296 | licenses = { 297 | 'PERMISSION GRANTED', 'BY-NC-SA CC', 'BY-SA CC', 'PUBLIC DOMAIN'} 298 | images = [i for i in self.wiki.list_images() if i.status in licenses] 299 | self.ibar = utils.ProgressBar( 300 | 'SAVING IMAGES'.ljust(20), len(images)) 301 | self.ibar.start() 302 | data = list(self.pool.map(self._save_image, images)) 303 | self.ibar.stop() 304 | images = orm.ImageStatus.convert_to_id( 305 | [i._asdict() for i in images], key='status') 306 | orm.Image.insert_many( 307 | dict(i, data=d) for i, d in zip(images, data) if d) 308 | 309 | @utils.ignore(requests.RequestException) 310 | def _save_image(self, image): 311 | self.ibar.value += 1 312 | if not image.source: 313 | log.info('Image source not specified: ' + image.url) 314 | return 315 | return self.wiki.req.get(image.url, allow_redirects=True).content 316 | 317 | def _save_cache(self): 318 | for table in orm.User, orm.Tag, orm.OverrideType, orm.ImageStatus: 319 | if hasattr(table, '_id_cache') and table._id_cache: 320 | table.write_ids('name') 321 | -------------------------------------------------------------------------------- /pyscp/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anqxyr/pyscp/fc85c808495f8f47783db6fb12a79ce7727e919c/pyscp/stats/__init__.py -------------------------------------------------------------------------------- /pyscp/stats/counters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Counters. 5 | 6 | Take a list of pages and a scalar, and return a collections.Counter instance. 
7 | """ 8 | 9 | ############################################################################### 10 | # Imports 11 | ############################################################################### 12 | 13 | import collections 14 | import re 15 | 16 | ############################################################################### 17 | 18 | 19 | def make_counter(pages, func, key): 20 | """Generic counter factory.""" 21 | subgroups = collections.defaultdict(list) 22 | for p in pages: 23 | key_value = key(p) 24 | if key_value: 25 | subgroups[key_value].append(p) 26 | return collections.Counter({k: func(v) for k, v in subgroups.items()}) 27 | 28 | 29 | def author(pages, func): 30 | """Group per page author.""" 31 | return make_counter(pages, func, lambda p: p.author) 32 | 33 | 34 | def month(pages, func): 35 | """Group per month the page was posted on.""" 36 | return make_counter(pages, func, lambda p: p.created[:7]) 37 | 38 | 39 | def page(pages, func): 40 | """Each page into its own group.""" 41 | return make_counter(pages, func, lambda p: p.url) 42 | 43 | 44 | def block(pages, func): 45 | """Group skips based on which 100-block they're in.""" 46 | def key(page): 47 | if 'scp' not in page.tags: 48 | return 49 | match = re.search(r'[0-9]{3,4}$', page.url) 50 | if not match: 51 | return 52 | match = int(match.group()) 53 | if match == 1: 54 | return 55 | return str((match // 100) * 100).zfill(3) 56 | return make_counter(pages, func, key) 57 | 58 | 59 | def chain(pages, func, *counters): 60 | """Apply counters one after another.""" 61 | if len(counters) == 1: 62 | return counters[0](pages, func) 63 | results = collections.Counter() 64 | for key, val in counters[0](pages, lambda x: x).items(): 65 | for ikey, ival in chain(val, func, *counters[1:]).items(): 66 | results['%s, %s' % (key, ikey)] = ival 67 | return results 68 | -------------------------------------------------------------------------------- /pyscp/stats/filters.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Filters. 5 | 6 | Take a list of pages and return a subset of the list. 7 | """ 8 | 9 | ############################################################################### 10 | # Imports 11 | ############################################################################### 12 | 13 | import pyscp.stats.counters as cn 14 | import pyscp.stats.scalars as sc 15 | 16 | ############################################################################### 17 | 18 | 19 | def tag(pages, tag): 20 | """Pages with a given tag.""" 21 | if not tag: 22 | return pages 23 | return [p for p in pages if tag in p.tags] 24 | 25 | 26 | def user(pages, user): 27 | """Pages by a certain user.""" 28 | return [p for p in pages if p.author == user] 29 | 30 | 31 | # TODO: needs more indicative name. 32 | def min_authored(pages, min_val=3): 33 | """Pages by authors who have at least min_val pages.""" 34 | authors = cn.author(pages, sc.count) 35 | return [p for p in pages if authors[p.author] >= min_val] 36 | 37 | 38 | def filter_rating(pages, min_val=20): 39 | """Pages with rating above min_val.""" 40 | return [p for p in pages if p.rating > min_val] 41 | -------------------------------------------------------------------------------- /pyscp/stats/scalars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Scalars. 5 | 6 | Take a list of pages and return a single value. 
7 | """ 8 | 9 | def upvotes(pages): 10 | """Upvotes.""" 11 | return sum([v.value for v in p.votes].count(1) for p in pages) 12 | 13 | 14 | def rating(pages): 15 | """Net rating.""" 16 | return sum(p.rating for p in pages) 17 | 18 | 19 | def rating_average(pages): 20 | """Average rating.""" 21 | return rating(pages) / len(pages) 22 | 23 | 24 | def divided(pages): 25 | """Controversy score.""" 26 | return sum(len(p.votes) / p.rating for p in pages) 27 | 28 | 29 | def redactions(pages): 30 | """Redaction score.""" 31 | return sum( 32 | p.text.count('█') + 33 | 20 * sum(map(p.text.count, ('REDACTED', 'EXPUNGED'))) 34 | for p in pages) 35 | 36 | 37 | def wordcount(pages): 38 | return sum(p.wordcount for p in pages) 39 | 40 | 41 | def wordcount_average(pages): 42 | return wordcount(pages) / len(pages) 43 | -------------------------------------------------------------------------------- /pyscp/stats/updater.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Stat Updater. 5 | 6 | Calculate stats from one wiki and write it into another. 
7 | """ 8 | 9 | ############################################################################### 10 | # Module Imports 11 | ############################################################################### 12 | 13 | import logging 14 | 15 | from pyscp import snapshot, wikidot, utils 16 | from pyscp.stats import scalars, counters, filters 17 | 18 | ############################################################################### 19 | # Global Constants And Variables 20 | ############################################################################### 21 | 22 | log = logging.getLogger(__name__) 23 | 24 | ############################################################################### 25 | 26 | 27 | class Updater: 28 | 29 | scalars_author = ( 30 | ('Pages Created', len), 31 | ('Net Rating', scalars.rating), 32 | ('Average Rating', scalars.rating_average), 33 | ('Wordcount', scalars.wordcount), 34 | ('Average Wordcount', scalars.wordcount_average)) 35 | 36 | def __init__(self, source, target): 37 | self.pages = list(source.list_pages()) 38 | self.target = target 39 | self.exist = [p.url for p in target.list_pages()] 40 | 41 | @staticmethod 42 | def source_counter(counter): 43 | """Build wikidot markup source for ranking pages.""" 44 | source = ['||~ Rank||~ User||~ Score||'] 45 | # sort by score, then alphabetically by user 46 | items = sorted(counter.items(), key=lambda x: x[0].lower()) 47 | items = sorted(items, key=lambda x: x[1], reverse=True) 48 | template = '||{}||[[[user:{}]]]||{}||' 49 | for idx, (user, score) in enumerate(items): 50 | source.append(template.format(idx + 1, user, score)) 51 | return '\n'.join(source) 52 | 53 | def source_author(self, user): 54 | """Build source code for the user's authorship stats.""" 55 | pages = filters.user(self.pages, user) 56 | source = ['++ Authorship Statistics'] 57 | if not pages: 58 | source.append('This user have not authored any pages.') 59 | return '\n'.join(source) 60 | for descr, func in self.scalars_author: 61 | text = 
'[[[ranking:{}]]]:@@{}@@**{}**'.format( 62 | descr, ' ' * (40 - len(descr)), round(func(pages), 2)) 63 | source.append('{{%s}}' % text) 64 | return '\n'.join(source) 65 | 66 | def post(self, name, source): 67 | """Update if exists; create if not; retry if failed.""" 68 | p = self.target(name) 69 | for _ in range(10): # retry ten times max 70 | if p.url in self.exist: 71 | response = p.edit(source) 72 | else: 73 | title = name.split(':') 74 | response = p.create(source, title) 75 | if response['status'] == 'ok': 76 | return 77 | log.error('Failed to post: %s', name) 78 | 79 | def update_users(self): 80 | """Update the stats wiki with the author stats.""" 81 | users = {p.author for p in self.pages} 82 | for user in utils.pbar(users, 'UPDATING USER STATS'): 83 | self.post('user:' + user, self.source_author(user)) 84 | 85 | def update_rankings(self): 86 | for descr, func in utils.pbar( 87 | self.scalars_author, 'UPDATING RANKINGS'): 88 | value = self.source_counter(counters.author(self.pages, func)) 89 | self.post('ranking:' + descr, round(value, 2)) 90 | 91 | 92 | ############################################################################### 93 | 94 | if __name__ == "__main__": 95 | source = snapshot.Wiki( 96 | 'www.scp-wiki.net', '/home/anqxyr/heap/_scp/scp-wiki.2015-06-23.db') 97 | target = wikidot.Wiki('scp-stats') 98 | target.auth('placeholder', 'placeholder') 99 | up = Updater(source, target) 100 | up.update_rankings() 101 | up.update_users() 102 | -------------------------------------------------------------------------------- /pyscp/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | import logging 8 | import re 9 | import time 10 | import threading 11 | import signal 12 | import functools 13 | 
import inspect

###############################################################################
# Decorators
###############################################################################

###############################################################################
# Decorator decorator is a simplified version of the code from the funcy lib.
# https://github.com/Suor/funcy
###############################################################################


class Call:
    """Delayed function call: stores func and args, executed on demand."""

    def __init__(self, func, args, kwargs):
        self.func, self.args, self.kwargs = func, args, kwargs

    def __call__(self):
        return self.func(*self.args, **self.kwargs)


def decorator(deco):
    """
    Turn a one-argument function into a decorator (funcy-style).

    If *deco* takes extra arguments besides the call, it becomes a
    decorator factory; otherwise it is usable as a plain decorator.
    """
    # BUG FIX: inspect.getargspec was deprecated and removed in
    # Python 3.11; getfullargspec (available since 3.3) is a drop-in
    # replacement here ('keywords' is named 'varkw' on the full spec).
    spec = inspect.getfullargspec(deco)
    if len(spec.args) > 1 or spec.varargs or spec.varkw:
        @functools.wraps(deco)
        def _fab(*dargs, **dkwargs):
            return make_decorator(deco, *dargs, **dkwargs)
        return _fab
    else:
        return functools.wraps(deco)(make_decorator(deco))


def make_decorator(deco, *dargs, **dkwargs):
    """Wrap *deco* so the decorated function is passed in as a Call."""
    def _decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            call = Call(func, args, kwargs)
            return deco(call, *dargs, **dkwargs)
        return wrapper
    return _decorator

###############################################################################


@decorator
def listify(call, wrapper=list):
    """Convert the return value of the function into a list (or *wrapper*)."""
    return wrapper(call())


@decorator
def morph(call, catch_exc, raise_exc):
    """Re-raise exceptions of type *catch_exc* as *raise_exc*."""
    try:
        return call()
    except catch_exc as error:
        raise raise_exc(error) from error


@decorator
def ignore(call, error=Exception, value=None):
    """Return *value* instead of raising when *error* occurs."""
    try:
        return call()
    except error:
        return value


@decorator
def log_errors(call, logger=print):
    """Log exceptions with *logger*, then re-raise them."""
    try:
        return call()
    except Exception as error:
        logger(error)
        # Bare raise re-raises the active exception with its original
        # traceback intact.
        raise


@decorator
def decochain(call, *decs):
    """Apply several decorators at once, in the given order."""
    fn = call.func
    for dec in reversed(decs):
        fn = dec(fn)
    return fn(*call.args, **call.kwargs)


class cached_property:
    """Compute the property once, then cache it in the instance's _cache."""

    def __init__(self, func):
        self.func = func
        functools.update_wrapper(self, func)

    def __get__(self, obj, cls):
        # BUG FIX: accessing the attribute on the class itself passes
        # obj=None; return the descriptor instead of crashing.
        if obj is None:
            return self
        if not hasattr(obj, '_cache'):
            obj._cache = {}
        if self.func.__name__ not in obj._cache:
            obj._cache[self.func.__name__] = self.func(obj)
        return obj._cache[self.func.__name__]

###############################################################################


def split(text, delimeters):
    """Split *text* on any of the given delimiter strings."""
    pattern = '|'.join(map(re.escape, delimeters))
    return re.split(pattern, text)


class ProgressBar:
    """Terminal progress bar that redraws itself once a second."""

    def __init__(self, title, max_value):
        self.title = title
        self.max_value = max_value
        self.value = 0
        # Make Ctrl-C stop the redraw thread before propagating.
        signal.signal(signal.SIGINT, self.exit)

    def start(self):
        self.finished = False
        self.time_started = time.time()
        threading.Thread(target=self.run).start()

    def update(self):
        print(self.line() + '\r', end='')

    def line(self):
        """Render the bar: title, block characters, percentage, elapsed."""
        # BUG FIX: guard against max_value == 0 (empty iterables) which
        # previously raised ZeroDivisionError; report 100% instead.
        done = self.value / self.max_value if self.max_value else 1
        filled = 40 * done
        parts = ' ▏▎▍▌▋▊▉'
        current = int(filled * len(parts)) % len(parts)
        bar = '█' * int(filled) + parts[current] + ' ' * 40
        tm = time.gmtime(time.time() - self.time_started)
        return '{} |{}| {:>3}% ({}:{:02}:{:02}) '.format(
            self.title,
            bar[:40],
            int(100 * done),
            tm.tm_hour, tm.tm_min, tm.tm_sec)

    def run(self):
        while not self.finished:
            self.update()
            time.sleep(1)

    def stop(self):
        self.finished = True
        print(self.line())

    def exit(self, signum, frame):
        self.stop()
        raise KeyboardInterrupt


def pbar(it, title=None, max=None):
    """Yield items from *it* while displaying a progress bar."""
    # 'max' shadows the builtin, but the name is part of the public
    # signature (callers may pass it by keyword), so it is kept.
    max = len(it) if max is None else max
    title = '' if title is None else title + ' '
    bar = ProgressBar(title, max)
    bar.start()
    for i in it:
        yield i
        bar.value += 1
        bar.update()
    bar.stop()

###############################################################################


class LogCount:
    """Logging filter that numbers the records passing through it."""

    def __init__(self):
        self.count = 1

    def filter(self, record):
        record.count = self.count
        self.count += 1
        return True


def log_sql_debug():
    """Dump numbered peewee SQL queries to the terminal."""
    logger = logging.getLogger('peewee')
    logger.setLevel(logging.DEBUG)
    logger.addFilter(LogCount())
    term = logging.StreamHandler()
    term.setFormatter(logging.Formatter('{count} {message}', style='{'))
    logger.addHandler(term)


def default_logging(debug=False):
    """Set up the default terminal + file logging for pyscp."""
    term = logging.StreamHandler()
    file = logging.FileHandler('pyscp.log', mode='a', delay=True)
    if debug:
        term.setLevel(logging.DEBUG)
        file.setLevel(logging.DEBUG)
    else:
        term.setLevel(logging.INFO)
        file.setLevel(logging.INFO)
    term.setFormatter(logging.Formatter('{message}', style='{'))
    file.setFormatter(
        logging.Formatter('{asctime} {levelname:8s} {message}', style='{'))
    logger = logging.getLogger('pyscp')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(term)
    logger.addHandler(file)

###############################################################################

# -----------------------------------------------------------------------------
# pyscp/wikidot.py
# -----------------------------------------------------------------------------

#!/usr/bin/env python3

"""
Wikidot access classes.

This module contains the classes that facilitate information extraction
and communication with the Wikidot-hosted sites.
"""
8 | """ 9 | 10 | ############################################################################### 11 | # Module Imports 12 | ############################################################################### 13 | 14 | import arrow 15 | import bs4 16 | import functools 17 | import itertools 18 | import logging 19 | import pyscp 20 | import re 21 | import requests 22 | 23 | ############################################################################### 24 | # Global Constants And Variables 25 | ############################################################################### 26 | 27 | log = logging.getLogger(__name__) 28 | 29 | 30 | ############################################################################### 31 | # Utility Classes 32 | ############################################################################### 33 | 34 | class InsistentRequest(requests.Session): 35 | """Make an auto-retrying request that handles connection loss.""" 36 | 37 | def __init__(self, max_attempts=10): 38 | super().__init__() 39 | self.max_attempts = max_attempts 40 | 41 | def __repr__(self): 42 | return '{}(max_attempts={})'.format( 43 | self.__class__.__name__, self.max_attempts) 44 | 45 | def request(self, method, url, **kwargs): 46 | logged_kwargs = hide_pass(kwargs) 47 | logged_kwargs = repr(logged_kwargs) if logged_kwargs else '' 48 | log.debug('%s: %s %s', method, url, logged_kwargs) 49 | 50 | kwargs.setdefault('timeout', 60) 51 | kwargs.setdefault('allow_redirects', False) 52 | for _ in range(self.max_attempts): 53 | try: 54 | resp = super().request(method=method, url=url, **kwargs) 55 | except ( 56 | requests.ConnectionError, 57 | requests.Timeout, 58 | requests.exceptions.ChunkedEncodingError): 59 | continue 60 | if 200 <= resp.status_code < 300: 61 | return resp 62 | elif 300 <= resp.status_code < 400: 63 | raise requests.HTTPError( 64 | 'Redirect attempted with url: {}'.format(url)) 65 | elif 400 <= resp.status_code < 600: 66 | continue 67 | raise requests.ConnectionError( 68 | 
'Max retries exceeded with url: {}'.format(url)) 69 | 70 | def get(self, url, **kwargs): 71 | return self.request('GET', url, **kwargs) 72 | 73 | def post(self, url, **kwargs): 74 | return self.request('POST', url, **kwargs) 75 | 76 | 77 | def hide_pass(nested_dict): 78 | result = {} 79 | for k, v in nested_dict.items(): 80 | if k in ('pass', 'password', 'pasw'): 81 | result[k] = '********' 82 | elif isinstance(v, dict): 83 | result[k] = hide_pass(v) 84 | else: 85 | result[k] = v 86 | return result 87 | 88 | 89 | ############################################################################### 90 | 91 | 92 | class Page(pyscp.core.Page): 93 | """Create Page object.""" 94 | 95 | def __init__(self, wiki, url): 96 | super().__init__(wiki, url) 97 | self._body = {} 98 | 99 | ########################################################################### 100 | # Internal Methods 101 | ########################################################################### 102 | 103 | def _module(self, *args, **kwargs): 104 | """Call Wikidot module.""" 105 | return self._wiki._module(*args, page_id=self._id, **kwargs) 106 | 107 | def _action(self, event, **kwargs): 108 | """Execute WikiPageAction.""" 109 | return self._module( 110 | 'Empty', action='WikiPageAction', event=event, **kwargs) 111 | 112 | def _vote(self, value): 113 | """Vote on the page.""" 114 | return self._action( 115 | 'RateAction', 116 | event='ratePage' if value else 'cancelVote', 117 | points=value, 118 | force=True) 119 | 120 | def _flush(self, *names): 121 | if not hasattr(self, '_cache'): 122 | return 123 | self._cache = {k: v for k, v in self._cache.items() if k not in names} 124 | 125 | @pyscp.utils.cached_property 126 | def _pdata(self): 127 | data = self._wiki.req.get(self.url).text 128 | soup = bs4.BeautifulSoup(data, 'lxml') 129 | return (int(re.search('pageId = ([0-9]+);', data).group(1)), 130 | parse_element_id(soup.find(id='discuss-button')), 131 | str(soup.find(id='main-content')), 132 | {e.text for e in 
soup.select('.page-tags a')}) 133 | 134 | @property 135 | def _raw_title(self): 136 | if 'title' in self._body: 137 | return self._body['title'] 138 | return super()._raw_title 139 | 140 | @property 141 | def _raw_author(self): 142 | if 'created_by' in self._body: 143 | return self._body['created_by'] 144 | return super()._raw_author 145 | 146 | ########################################################################### 147 | # Properties 148 | ########################################################################### 149 | 150 | @property 151 | def html(self): 152 | return self._pdata[2] 153 | 154 | @pyscp.utils.cached_property 155 | @pyscp.utils.listify() 156 | def history(self): 157 | """Return the revision history of the page.""" 158 | data = self._module( 159 | 'history/PageRevisionListModule', page=1, perpage=99999)['body'] 160 | soup = bs4.BeautifulSoup(data, 'lxml') 161 | for row in reversed(soup('tr')[1:]): 162 | rev_id = int(row['id'].split('-')[-1]) 163 | cells = row('td') 164 | number = int(cells[0].text.strip('.')) 165 | user = cells[4].text 166 | time = parse_element_time(cells[5]) 167 | comment = cells[6].text if cells[6].text else None 168 | yield pyscp.core.Revision(rev_id, number, user, time, comment) 169 | 170 | @pyscp.utils.cached_property 171 | def votes(self): 172 | """Return all votes made on the page.""" 173 | data = self._module('pagerate/WhoRatedPageModule')['body'] 174 | soup = bs4.BeautifulSoup(data, 'lxml') 175 | spans = [i.text.strip() for i in soup('span')] 176 | pairs = zip(spans[::2], spans[1::2]) 177 | return [pyscp.core.Vote(u, 1 if v == '+' else -1) for u, v in pairs] 178 | 179 | @property 180 | def tags(self): 181 | if 'tags' in self._body: 182 | return set(self._body['tags'].split()) 183 | return self._pdata[3] 184 | 185 | @property 186 | def source(self): 187 | data = self._module('viewsource/ViewSourceModule')['body'] 188 | soup = bs4.BeautifulSoup(data, 'lxml') 189 | return soup.text[11:].strip().replace(chr(160), ' ') 190 
| 191 | @property 192 | def created(self): 193 | if 'created_at' in self._body: 194 | time = arrow.get(self._body['created_at'], 'DD MMM YYYY HH:mm') 195 | return time.format('YYYY-MM-DD HH:mm:ss') 196 | return super().created 197 | 198 | @property 199 | def rating(self): 200 | if 'rating' in self._body: 201 | return int(self._body['rating']) 202 | return super().rating 203 | 204 | @pyscp.utils.cached_property 205 | def files(self): 206 | """List all files attached to the page.""" 207 | data = self._module('files/PageFilesModule')['body'] 208 | soup = bs4.BeautifulSoup(data, 'lxml') 209 | if not soup.select('table.page-files'): 210 | return [] 211 | files = soup.select('table.page-files')[0]('tr')[1:] 212 | parsed = [] 213 | for file in files: 214 | url = self._wiki.site + file.find('a')['href'] 215 | name = file.find('a').text.strip() 216 | filetype = file('td')[1].text.strip() 217 | size = file('td')[2].text.strip() 218 | parsed.append(pyscp.core.File(url, name, filetype, size)) 219 | return parsed 220 | 221 | ########################################################################### 222 | # Page-Modifying Methods 223 | ########################################################################### 224 | 225 | def edit(self, source, title=None, comment=None): 226 | """Overwrite the page with the new source and title.""" 227 | if title is None: 228 | title = self._raw_title 229 | self._flush('html', 'history', 'source') 230 | wiki_page = self.url.split('/')[-1] 231 | lock = self._module( 232 | 'edit/PageEditModule', 233 | mode='page', 234 | wiki_page=wiki_page, 235 | force_lock=True) 236 | return self._action( 237 | 'savePage', 238 | source=source, 239 | title=title, 240 | comments=comment, 241 | wiki_page=wiki_page, 242 | lock_id=lock['lock_id'], 243 | lock_secret=lock['lock_secret'], 244 | revision_id=lock.get('page_revision_id', None)) 245 | 246 | def create(self, source, title, comment=None): 247 | if not hasattr(self, '_cache'): 248 | self._cache = {} 249 | 
self._cache['_pdata'] = (None, None, None) 250 | response = self.edit(source, title, comment) 251 | del self._cache['_pdata'] 252 | return response 253 | 254 | def revert(self, rev_n): 255 | """Revert the page to a previous revision.""" 256 | self._flush('html', 'history', 'source', 'tags') 257 | return self._action('revert', revisionId=self.history[rev_n].id) 258 | 259 | def set_tags(self, tags): 260 | """Replace the tags of the page.""" 261 | res = self._action('saveTags', tags=' '.join(tags)) 262 | self._flush('history', '_pdata') 263 | return res 264 | 265 | def upload(self, name, data): 266 | url = self._wiki.site + '/default--flow/files__UploadTarget' 267 | kwargs = dict( 268 | pageId=self._id, 269 | page_id=self._id, 270 | action='FileAction', 271 | event='uploadFile', 272 | MAX_FILE_SIZE=52428800) 273 | response = self._wiki.req.post( 274 | url, 275 | data=kwargs, 276 | files={'userfile': (name, data)}, 277 | cookies={'wikidot_token7': '123456'}) 278 | response = bs4.BeautifulSoup(response.text, 'lxml') 279 | status = response.find(id='status').text 280 | message = response.find(id='message').text 281 | if status != 'ok': 282 | raise RuntimeError(message) 283 | return response 284 | 285 | ########################################################################### 286 | # Voting Methods 287 | ########################################################################### 288 | 289 | def upvote(self): 290 | self._vote(1) 291 | self._flush('votes') 292 | 293 | def downvote(self): 294 | self._vote(-1) 295 | self._flush('votes') 296 | 297 | def cancel_vote(self): 298 | self._vote(0) 299 | self._flush('votes') 300 | 301 | 302 | class Thread(pyscp.core.Thread): 303 | 304 | @pyscp.utils.cached_property 305 | @pyscp.utils.listify() 306 | def posts(self): 307 | if self._id is None: 308 | return 309 | pages = self._wiki._pager( 310 | 'forum/ForumViewThreadPostsModule', _key='pageNo', t=self._id) 311 | pages = (bs4.BeautifulSoup(p['body'], 'lxml').body for p in pages) 312 
| pages = (p for p in pages if p) 313 | posts = (p(class_='post-container', recursive=False) for p in pages) 314 | posts = itertools.chain.from_iterable(posts) 315 | for post, parent in crawl_posts(posts): 316 | post_id = int(post['id'].split('-')[1]) 317 | title = post.find(class_='title').text.strip() 318 | title = title if title else None 319 | content = post.find(class_='content') 320 | content.attrs.clear() 321 | content = str(content) 322 | user = post.find(class_='printuser').text 323 | time = parse_element_time(post) 324 | yield pyscp.core.Post(post_id, title, content, user, time, parent) 325 | 326 | def new_post(self, source, title=None, parent_id=None): 327 | return self._wiki._module( 328 | 'Empty', 329 | threadId=self._id, 330 | parentId=parent_id, 331 | title=title, 332 | source=source, 333 | action='ForumAction', 334 | event='savePost') 335 | 336 | 337 | class Wiki(pyscp.core.Wiki): 338 | """ 339 | Create a Wiki object. 340 | 341 | This class does not use any of the official Wikidot API, and instead 342 | relies on sending http post/get requests to internal Wikidot pages and 343 | parsing the returned data. 344 | """ 345 | 346 | Page = Page 347 | Thread = Thread 348 | # Tautology = Tautology 349 | 350 | ########################################################################### 351 | # Special Methods 352 | ########################################################################### 353 | 354 | def __init__(self, site): 355 | super().__init__(site) 356 | self.req = InsistentRequest() 357 | 358 | def __repr__(self): 359 | return '{}.{}({})'.format( 360 | self.__module__, 361 | self.__class__.__name__, 362 | repr(self.site)) 363 | 364 | ########################################################################### 365 | # Internal Methods 366 | ########################################################################### 367 | 368 | @pyscp.utils.log_errors(log.warning) 369 | def _module(self, _name, **kwargs): 370 | """ 371 | Call a Wikidot module. 
372 | 373 | This method is responsible for most of the class' functionality. 374 | Almost all other methods of the class are using _module in one way 375 | or another. 376 | """ 377 | response = self.req.post( 378 | self.site + '/ajax-module-connector.php', 379 | data=dict( 380 | pageId=kwargs.get('page_id', None), # fuck wikidot 381 | moduleName=_name, 382 | # token7 can be any 6-digit number, as long as it's the same 383 | # in the payload and in the cookie 384 | wikidot_token7='123456', 385 | **kwargs), 386 | headers={'Content-Type': 'application/x-www-form-urlencoded;'}, 387 | cookies={'wikidot_token7': '123456'}).json() 388 | if response['status'] != 'ok': 389 | log.error(response) 390 | raise RuntimeError(response.get('message') or response['status']) 391 | return response 392 | 393 | def _pager(self, _name, _key, _update=None, **kwargs): 394 | """Iterate over multi-page module results.""" 395 | first_page = self._module(_name, **kwargs) 396 | yield first_page 397 | counter = bs4.BeautifulSoup( 398 | first_page['body'], 'lxml').find(class_='pager-no') 399 | if not counter: 400 | return 401 | for idx in range(2, int(counter.text.split(' ')[-1]) + 1): 402 | kwargs.update({_key: idx if _update is None else _update(idx)}) 403 | yield self._module(_name, **kwargs) 404 | 405 | def _list_pages_raw(self, **kwargs): 406 | """ 407 | Call ListPages module. 408 | 409 | Wikidot's ListPages is an extremely versatile php module that can be 410 | used to retrieve all sorts of interesting informations, from urls of 411 | pages created by a given user, and up to full html contents of every 412 | page on the site. 413 | """ 414 | yield from self._pager( 415 | 'list/ListPagesModule', 416 | _key='offset', 417 | _update=lambda x: 250 * (x - 1), 418 | perPage=250, 419 | **kwargs) 420 | 421 | def _list_pages_parsed(self, **kwargs): 422 | """ 423 | Call ListPages module and parse the results. 424 | 425 | Sets default arguments, parses ListPages body into a namedtuple. 
426 | Returns Page instances with a _body grafted in. 427 | """ 428 | keys = set(kwargs.pop('body', '').split() + ['fullname']) 429 | kwargs['module_body'] = '\n'.join( 430 | map('||{0}||%%{0}%% ||'.format, keys)) 431 | kwargs['created_by'] = kwargs.pop('author', None) 432 | lists = self._list_pages_raw(**kwargs) 433 | soups = (bs4.BeautifulSoup(p['body'], 'lxml') for p in lists) 434 | pages = (s.select('div.list-pages-item') for s in soups) 435 | pages = itertools.chain.from_iterable(pages) 436 | for page in pages: 437 | data = { 438 | r('td')[0].text: r('td')[1].text.strip() for r in page('tr')} 439 | page = self(data['fullname']) 440 | page._body = data 441 | yield page 442 | 443 | ########################################################################### 444 | # Public Methods 445 | ########################################################################### 446 | 447 | def auth(self, username, password): 448 | """Login to wikidot with the given username/password pair.""" 449 | return self.req.post( 450 | 'https://www.wikidot.com/default--flow/login__LoginPopupScreen', 451 | data=dict( 452 | login=username, 453 | password=password, 454 | action='Login2Action', 455 | event='login')) 456 | 457 | def list_categories(self): 458 | """Return forum categories.""" 459 | data = self._module('forum/ForumStartModule')['body'] 460 | soup = bs4.BeautifulSoup(data, 'lxml') 461 | for elem in [e.parent for e in soup(class_='name')]: 462 | cat_id = parse_element_id(elem.select('.title a')[0]) 463 | title, description, size = [ 464 | elem.find(class_=i).text.strip() 465 | for i in ('title', 'description', 'threads')] 466 | yield pyscp.core.Category( 467 | cat_id, title, description, int(size)) 468 | 469 | def list_threads(self, category_id): 470 | """Return threads in the given category.""" 471 | pages = self._pager( 472 | 'forum/ForumViewCategoryModule', _key='p', c=category_id) 473 | soups = (bs4.BeautifulSoup(p['body'], 'lxml') for p in pages) 474 | elems = (s(class_='name') 
for s in soups) 475 | for elem in itertools.chain(*elems): 476 | thread_id = parse_element_id(elem.select('.title a')[0]) 477 | title, description = [ 478 | elem.find(class_=i).text.strip() 479 | for i in ('title', 'description')] 480 | yield self.Thread(self, thread_id, title, description) 481 | 482 | def send_pm(self, username, text, title=None): 483 | lookup = self.req.get( 484 | 'https://www.wikidot.com/quickmodule.php?' 485 | 'module=UserLookupQModule&q=' + username).json() 486 | if not lookup['users'] or lookup['users'][0]['name'] != username: 487 | raise ValueError('Username Not Found') 488 | user_id = lookup['users'][0]['user_id'] 489 | return self.req.post( 490 | 'https://www.wikidot.com/ajax-module-connector.php', 491 | data=dict( 492 | moduleName='Empty', 493 | source=text, 494 | subject=title, 495 | to_user_id=user_id, 496 | action='DashboardMessageAction', 497 | event='send', 498 | wikidot_token7='123456'), 499 | headers={'Content-Type': 'application/x-www-form-urlencoded;'}, 500 | cookies={'wikidot_token7': '123456'}).json() 501 | 502 | ########################################################################### 503 | # SCP-Wiki Specific Methods 504 | ########################################################################### 505 | 506 | @functools.lru_cache(maxsize=1) 507 | @pyscp.utils.listify() 508 | def list_images(self): 509 | if 'scp-wiki' not in self.site: 510 | return 511 | base = 'http://scpsandbox2.wikidot.com/image-review-{}' 512 | urls = [base.format(i) for i in range(1, 36)] 513 | pages = [self.req.get(u).text for u in urls] 514 | soups = [bs4.BeautifulSoup(p, 'lxml') for p in pages] 515 | elems = [s('tr') for s in soups] 516 | elems = itertools.chain(*elems) 517 | elems = [e('td') for e in elems] 518 | elems = [e for e in elems if e] 519 | for elem in elems: 520 | url = elem[0].find('img')['src'] 521 | source = elem[2].a['href'] if elem[2]('a') else None 522 | status, notes = [elem[i].text for i in (3, 4)] 523 | status, notes = [i if i 
else None for i in (status, notes)] 524 | yield pyscp.core.Image(url, source, status, notes, None) 525 | 526 | ############################################################################### 527 | 528 | 529 | @pyscp.utils.ignore((IndexError, TypeError)) 530 | def parse_element_id(element): 531 | """Extract the id number from the link.""" 532 | return int(element['href'].split('/')[2].split('-')[1]) 533 | 534 | 535 | def parse_element_time(element): 536 | """Extract and format time from an html element.""" 537 | unixtime = element.find(class_='odate')['class'][1].split('_')[1] 538 | return arrow.get(unixtime).format('YYYY-MM-DD HH:mm:ss') 539 | 540 | 541 | def crawl_posts(post_containers, parent=None): 542 | """ 543 | Retrieve posts from the comment tree. 544 | 545 | For each post-container in the given list, returns a tuple of 546 | (post, parent). Then recurses onto all the post-container children 547 | of the current post-container. 548 | """ 549 | for container in post_containers: 550 | yield container.find(class_='post'), parent 551 | yield from crawl_posts( 552 | container(class_='post-container', recursive=False), 553 | int(container['id'].split('-')[1])) 554 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', encoding="utf8") as f: 4 | readme = f.read() 5 | 6 | setuptools.setup( 7 | name='pyscp', 8 | version='1.0.18', 9 | description='Python API and utilities for the scp-wiki.net website.', 10 | long_description=readme, 11 | url='https://github.com/anqxyr/pyscp/', 12 | author='anqxyr', 13 | author_email='anqxyr@gmail.com', 14 | license='MIT', 15 | classifiers=[ 16 | 'Development Status :: 4 - Beta', 17 | 'Intended Audience :: Other Audience', 18 | 'License :: OSI Approved :: MIT License', 19 | 'Operating System :: OS Independent', 20 | 'Programming Language :: Python :: 3.4'], 21 | 
packages=['pyscp'], 22 | install_requires=[ 23 | 'arrow', 24 | 'beautifulsoup4', 25 | 'blessings', 26 | 'lxml==3.3.3', 27 | 'requests', 28 | 'peewee==2.8.0'], 29 | ) 30 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ############################################################################### 4 | # Module Imports 5 | ############################################################################### 6 | 7 | from pyscp.core import WikidotConnector, SnapshotConnector 8 | import pytest 9 | import random 10 | 11 | ############################################################################### 12 | 13 | DBPATH = '/home/anqxyr/heap/_scp/scp-wiki.2015-03-16.db' 14 | USERNAME = '' 15 | PASSWORD = ("""""") 16 | 17 | 18 | @pytest.mark.parametrize('cn', [ 19 | WikidotConnector('www.scp-wiki.net'), 20 | SnapshotConnector('www.scp-wiki.net', DBPATH)]) 21 | class TestSCPWikiConnectors: 22 | 23 | def test_revision(self, cn): 24 | revision = cn('scp-1511').history[0] 25 | assert revision.revision_id == 39167223 26 | assert revision.page_id == 18578010 27 | assert revision.number == 0 28 | assert revision.user == 'anqxyr' 29 | assert revision.time == '2013-06-30 16:34:37' 30 | assert revision.comment == 'INITIATE HEAVEN SUBROUTINE' 31 | 32 | def test_post(self, cn): 33 | post = cn('SCP-1511').comments[0] 34 | assert post.post_id == 1806664 35 | assert post.thread_id == 666715 36 | assert post.parent is None 37 | assert post.title is None 38 | assert post.user == 'FlameShirt' 39 | assert post.time == '2013-06-30 16:47:22' 40 | assert post.wordcount == 26 41 | 42 | def test_list_pages(self, cn): 43 | pages = list(cn.list_pages(author='anqxyr', tag='crystalline')) 44 | assert pages == ['http://www.scp-wiki.net/scp-1511'] 45 | 46 | def test_list_pages_rewrites(self, cn): 47 | pages = 
list(cn.list_pages(author='thedeadlymoose', tag='thermal')) 48 | assert 'http://www.scp-wiki.net/scp-003' in pages 49 | 50 | 51 | class TestActiveMethods: 52 | 53 | @pytest.fixture 54 | def wiki(self, cache=[]): 55 | if cache: 56 | return cache[0] 57 | if not USERNAME or not PASSWORD: 58 | pytest.skip('need authentication data') 59 | wiki = WikidotConnector('testwiki2') 60 | wiki.auth(USERNAME, PASSWORD) 61 | cache.append(wiki) 62 | return wiki 63 | 64 | def test_edit_page(self, wiki): 65 | value = random.randint(0, 1000000) 66 | p = wiki('page1') 67 | p.edit(value, comment='automated test') 68 | assert p.source == str(value) 69 | 70 | def test_revert(self, wiki): 71 | p = wiki('page1') 72 | p.revert_to(24) 73 | assert p.source == 'no source here' 74 | 75 | def test_set_tags(self, wiki): 76 | value = random.randint(0, 1000000) 77 | p = wiki('page1') 78 | p.set_tags(p.tags + [str(value)]) 79 | assert str(value) in p.tags 80 | 81 | 82 | if __name__ == '__main__': 83 | pytest.main() 84 | --------------------------------------------------------------------------------