├── .gitignore ├── API-NOTES.md ├── LICENSE ├── README.md ├── examples ├── basic.py └── basic_gevent.py ├── setup.py ├── tox.ini └── wapiti ├── NOTES.rst ├── __init__.py ├── client.py ├── compat.py ├── operations ├── __init__.py ├── _test_tmpls │ └── regr_moctezuma_parser_funcs.txt ├── base.py ├── category.py ├── conftest.py ├── feedback.py ├── files.py ├── links.py ├── meta.py ├── misc.py ├── models.py ├── params.py ├── protection.py ├── query_operations.py ├── rand.py ├── revisions.py ├── template_parser.py ├── templates.py ├── test_basic.py ├── test_operations.py ├── user.py └── utils.py ├── ransom.py └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc* 2 | *.pyo* 3 | *.sublime-project 4 | *.sublime-workspace 5 | *.egg-info 6 | dist/* 7 | .tox/* 8 | .DS_Store 9 | .*.swp 10 | *~ 11 | ._* 12 | .\#* 13 | \#*\# 14 | *.csv 15 | *.json 16 | log* 17 | build/* 18 | *.swp 19 | -------------------------------------------------------------------------------- /API-NOTES.md: -------------------------------------------------------------------------------- 1 | # API improvement notes # 2 | Some MediaWiki API queries are inconsistent, broken, or otherwise in want of improvement. Similar notes are recorded at [Mediawiki:Requests for comment/API roadmap](http://www.mediawiki.org/wiki/Requests_for_comment/API_roadmap). 3 | 4 | * list=usercontribs 5 | 6 | - Docs mention `uccontinue`, but the query uses `ucstart` for continue functionality. 7 | - `ucprop` should be consistent with `rvprop`: it is missing `flags`, `sha1`, `ids` (parentid) 8 | 9 | * missing title 10 | - Throws a warning `unrecognized parameter`, but it should throw an error. 11 | 12 | * prop=extracts supports multiple titles, but requires a continue to 13 | get each individual extract, while returning the IDs/titles of all 14 | the requested pages each time. 
15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Mahmoud Hashemi and Stephen LaPorte 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * The names of the contributors may not be used to endorse or 16 | promote products derived from this software without specific 17 | prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | wapiti 2 | ====== 3 | 4 | ![Wapiti](http://upload.wikimedia.org/wikipedia/commons/thumb/5/59/Elk_1_%28PSF%29.png/212px-Elk_1_%28PSF%29.png) 5 | 6 | A MediaWiki API wrapper in Python for humans and elk. 7 | 8 | Wapiti makes it simple for python scripts to retrieve data from the 9 | [Wikipedia API](https://en.wikipedia.org/w/api.php). No more worries 10 | about query limits, continue strings, or formatting. Just ask for data 11 | and get structured results. 12 | 13 | Example 14 | ------- 15 | 16 | Let's get the members of Wikipedia's [Category:Lists of 17 | superlatives](http://en.wikipedia.org/wiki/Category:Lists_of_superlatives). First, 18 | initialize a `WapitiClient` and change any settings. Next, run the 19 | operation `get_category_articles_recursive` on the category `'Lists of 20 | superlatives'`, with a limit of `10`: 21 | 22 | ```python 23 | >>> import wapiti 24 | >>> client = wapiti.WapitiClient('domas@mituzas.lt') 25 | >>> client.get_category_articles_recursive('Lists of superlatives', 10) 26 | [PageInfo(title=u'The Fifty Worst Films of All Time', page_id=1820513, ns=0), 27 | PageInfo(title=u"World's busiest city airport systems by passenger traffic", page_id=33167241, ns=0), 28 | PageInfo(title=u'List of oldest Major League Baseball players', page_id=1947309, ns=0), 29 | PageInfo(title=u'List of firsts in India', page_id=3752148, ns=0), 30 | PageInfo(title=u'List of the first female holders of political offices in Europe', page_id=18904865, ns=0), 31 | PageInfo(title=u'List of the busiest airports in the Republic of Ireland', page_id=26712480, ns=0), 32 | PageInfo(title=u'List of longest bridges above water in India', page_id=32312925, ns=0), 33 | PageInfo(title=u'List of the busiest airports in China', page_id=33396262, ns=0), 34 | PageInfo(title=u'List of most common surnames in Asia', 
page_id=26810011, ns=0), 35 | PageInfo(title=u'List of largest mosques', page_id=20897194, ns=0)] 36 | ``` 37 | 38 | This returns a list of `PageInfo` objects for the category's members. 39 | 40 | Operations 41 | ---------- 42 | 43 | Operations usually take two positional arguments: the `query_param` 44 | (page, category, template, etc.), and `limit` (maximum number of 45 | results). 46 | 47 | - `get_random(limit)` : returns a list of `PageIdentifiers` for random pages. 48 | - `get_category_articles(category, limit)` : returns a list of `PageIdentifiers` for the articles or talk pages in a category. If you are interested in getting pages beyond of the main and talk namespace, try `get_category`. 49 | - `get_category_articles_recursive(category, limit)` : returns a list of `PageInfos` for the articles in a category and its subcategories. If you are interested in getting pages beyond of the main and talk namespace, try `get_category_recursive`. 50 | - `get_transcludes(page, limit)` : returns a list of `PageIdentifiers` for the articles that embed (transclude) a page. For example, see the pages that embed [Template:Infobox](http://en.wikipedia.org/wiki/Special:WhatLinksHere/Template:Infobox) with `client.get_transcludes('Infobox')`. 51 | - `get_backlinks(page, limit)` : returns a list of `PageIdentifiers` for pages that internally link back to a page. For example, see the pages that [link to 'Coffee'](http://en.wikipedia.org/wiki/Special:WhatLinksHere/Coffee) with `client.get_backlinks('Coffee')`. 52 | - `get_revision_infos(page, limit)` : returns a list of `RevisionInfos` for a page's revisions. 53 | - `get_current_content(page, limit)` : returns a list of `Revisions` (including text content) for the page's most recent revisions. 54 | 55 | Other operations are available: see wapiti/operations 56 | 57 | Models 58 | ------ 59 | 60 | Models describe the structure for result data. 
For the full list of 61 | models, see wapiti/operations/models.py 62 | 63 | ### PageIdentifier ### 64 | 65 | A `PageIdentifier` describes the standard information available for a page. 66 | 67 | - **Title** : unique name of the page 68 | - **ID** : the primary key for the page 69 | - **Namespace** : the [namespace](http://en.wikipedia.org/wiki/Wikipedia:Namespace) number, which can indicate whether the page is an article, discussion page, user page, template, category, etc. 70 | - **Source** : the MediaWiki API where this page was retrieved 71 | - **Normalized title** : the title may have been normalized by MediaWiki, for example, by resolving a redirect 72 | - **Subject ID** : the ID of the corresponding page in the basic namespace 73 | - **Talk page ID** : the ID of the corresponding page in the [talk namespace](http://en.wikipedia.org/wiki/Help:Using_talk_pages) 74 | 75 | ### RevisionInfo ### 76 | 77 | A `RevisionInfo` describes the standard information for a revision. 78 | 79 | * **PageIdentifier** : the page's `PageIdentifier` 80 | * **Subject revision ID** : the primary key for a revision 81 | * **Parent revision ID** : the previous revision to the page 82 | * **User text** : the editor's username, or IP address for an unregistered user 83 | * **User ID** : the unique id of the user who submitted this revision. It may be 0 for an unregistered user. 84 | * **Size** : the length of the article at this revision 85 | * **Timestamp** : timestamp in UTC when this revision was submitted 86 | * **SHA1** : the SHA-1 hash of revision text in base-36. 87 | * **Edit summary** : the [edit summary](http://meta.wikimedia.org/wiki/Help:Edit_summary) (or 'comment') for a contribution. In some cases, it may have been deleted (or 'oversighted') and unavailable through the API. 88 | * **Tags** : brief messages that MediaWiki (or an extension) may automatically place next to certain edits. 
[Tags](http://en.wikipedia.org/wiki/Wikipedia:Tags) are not common, usually placed by Edit Filter or VisualEditor extensions. 89 | * **Parsed** : whether the page is parsed (html) or not (wikitext) 90 | 91 | ### Revision ### 92 | 93 | A `Revision` includes the same data as `RevisionInfo`, plus full text content. 94 | 95 | TODO 96 | ---- 97 | - Logging 98 | - Client settings 99 | - Port more API calls 100 | - Retry and timeout behaviors 101 | - Get my shit together and continue work on the HTTP client. 102 | - Underscoring args 103 | - Pause/resume 104 | - Better differentiation between the following error groups: 105 | * Network/connectivity 106 | * Logic 107 | * Actual Mediawiki API errors ('no such category', etc.) 108 | - Relatedly: Save MediaWiki API warnings 109 | - Types of API calls: 110 | * single argument -> multiple results (get category) 111 | * many arguments -> up to one result per argument (get protections) 112 | * multiple arguments -> multiple results per argument (get language links) 113 | * TODO: establish return format convention for this 114 | - Need generic support for: 115 | * APIs which support both pageid and title lookup 116 | * Redirect following 117 | - Full docs 118 | -------------------------------------------------------------------------------- /examples/basic.py: -------------------------------------------------------------------------------- 1 | from wapiti import WapitiClient 2 | 3 | client = WapitiClient('you@example.com') 4 | 5 | res = [] 6 | cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics') 7 | for cat in cats: 8 | res.append(client.get_category_recursive(cat, 1000)) 9 | 10 | print res[0][0] 11 | 12 | import pdb;pdb.set_trace() 13 | -------------------------------------------------------------------------------- /examples/basic_gevent.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent import monkey 3 | monkey.patch_all() 4 | 5 | from wapiti 
import WapitiClient 6 | 7 | client = WapitiClient('you@example.com') 8 | 9 | cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics') 10 | tasks = [gevent.spawn(client.get_category_recursive, x, 1000) for x in cats] 11 | gevent.wait(tasks) 12 | 13 | print tasks[0].value[0] 14 | 15 | import pdb;pdb.set_trace() 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wapiti 3 | ~~~~~~ 4 | 5 | Wapiti is a Wikipedia API client focused on providing a consistent 6 | and performant abstraction around the widely varying Mediawiki API 7 | endpoints and data models. Read-only APIs are first priority, but 8 | write operations are on the way. See `the Github project 9 | <https://github.com/mahmoud/wapiti>`_ for more info. 10 | 11 | :copyright: (c) 2013 by Mahmoud Hashemi and Stephen LaPorte 12 | :license: BSD, see LICENSE for more details. 13 | 14 | """ 15 | 16 | import sys 17 | from setuptools import setup 18 | 19 | 20 | __author__ = 'Mahmoud Hashemi' 21 | __version__ = '0.1' 22 | __contact__ = 'mahmoudrhashemi@gmail.com' 23 | __url__ = 'https://github.com/mahmoud/wapiti' 24 | __license__ = 'BSD' 25 | 26 | 27 | if sys.version_info >= (3,): 28 | raise NotImplementedError("wapiti Python 3 support en route to your location") 29 | 30 | 31 | setup(name='wapiti', 32 | version=__version__, 33 | description="A Wikipedia API client for humans and elk.", 34 | long_description=__doc__, 35 | author=__author__, 36 | author_email=__contact__, 37 | url=__url__, 38 | packages=['wapiti', 'wapiti.operations'], 39 | include_package_data=True, 40 | zip_safe=False, 41 | license=__license__, 42 | platforms='any', 43 | classifiers=[ 44 | 'Intended Audience :: Developers', 45 | 'Topic :: Software Development :: Libraries', 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 2.7', 48 | 'Topic :: Internet :: WWW/HTTP', 49 | 'Topic :: Education', 50 | 
'Development Status :: 3 - Alpha'] 51 | ) 52 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | [testenv] 4 | commands=python ./wapiti/tests.py 5 | -------------------------------------------------------------------------------- /wapiti/NOTES.rst: -------------------------------------------------------------------------------- 1 | Notes 2 | ===== 3 | 4 | Notes on "multiargument" and "bijective": 5 | ----------------------------------------- 6 | 7 | There are lots of ways to classify operations, and these are just a 8 | couple. 9 | 10 | "Multiargument" operations can take more than one search parameter 11 | at once, such as the GetProtections operation. Others can only take 12 | one argument at a time, like GetCategory. 13 | 14 | "Bijective" operations return at most one result per argument. GetProtections 15 | is an example of a bijective query. Bijective queries do not require an 16 | explicit limit on the number of results to be set by the user. 17 | 18 | Going forward, these attributes can be determined as follows: 19 | 20 | - Multiargument: determined by looking at an operation's 21 | `input_field`. If it is a SingleParam, then multiargument is false, 22 | if it's a MultiParam, then multiargument is true. 23 | 24 | - Bijective: determined by looking at an operation's `output_type`, 25 | which more accurately describes the *per-parameter* return type. If 26 | it is a list, then bijective is false, if it's a bare type, then 27 | bijective is true. 
28 | 29 | 30 | Fodder from DSL/dataflow refactor 31 | --------------------------------- 32 | 33 | GetCategoryPagesRecursive 34 | (FlattenCategory -> GetCategoryPages -> Wikipedia API call -> URL fetch ) 35 | (PageInfos <- PageInfos <- MediaWikiCall <- RansomResponse) 36 | 37 | operation's input_field = explicit or first field of chain 38 | 39 | def process(op): 40 | res = op.process() 41 | return self.store_results(res) 42 | 43 | what about producing subops? 44 | 45 | def process(): 46 | task = self.get_current_task() 47 | res = task.process() 48 | if res and isinstance(res[0], Operation): 49 | self.store_subops(res) 50 | return # return subops? 51 | return self.store_results(res) # returns *new* results 52 | 53 | GetCategoryPagesRecursive 54 | (FlattenCategory --(CatInfos)-> 55 | GetCategoryPages --("APIParamStructs")-> 56 | MediawikiCall [--(url)-> URL fetch]) 57 | 58 | An "APIParamStruct" is really just something with the API url and param 59 | dictionary, so QueryOperations themselves could be viewed as 60 | APIParamStructs. In other words, hopefully no new model type needed 61 | just for that. 62 | 63 | At its most basic level, an Operation is something which: 64 | 65 | - Has a type-declared input field, and a declared return type 66 | - Has a process() function that returns results (of the output type) 67 | or raises NoMoreResults 68 | - Most likely takes a WapitiClient as a 'client' keyword 69 | argument in its __init__() 70 | - Provides a uniform way of checking progress (checking if it's done) 71 | 72 | Some notes on Operation design/usage: 73 | 74 | - An Operation typically keeps a copy of its results internally, 75 | most likely a unique list of some sort, and should return only 76 | new results. 77 | - Calling an Operation directly calls process() repeatedly until the 78 | operation is complete, then returns the internally tracked results. 
79 | -------------------------------------------------------------------------------- /wapiti/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | from client import WapitiClient 4 | -------------------------------------------------------------------------------- /wapiti/client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | 5 | ''' 6 | The beginnings of a better Mediawiki API library (with certain builtin 7 | affordances for the more popular wikis and extensions). Most of what 8 | you see below is implementation internals, the public API isn't set yet, 9 | but check back soon. 10 | 11 | # TODO 12 | * Create client class 13 | * Port more API calls 14 | * Retry and timeout behaviors 15 | * Get my shit together and continue work on the HTTP client. 16 | * Underscoring args 17 | * pause/resume 18 | * better differentiation between the following error groups: 19 | * Network/connectivity 20 | * Logic 21 | * Actual Mediawiki API errors ('no such category', etc.) 
22 | * Relatedly: Save MediaWiki API warnings 23 | 24 | Types of API calls: 25 | * single argument -> multiple results (get category) 26 | * many arguments -> up to one result per argument (get protections) 27 | * multiple arguments -> multiple results per argument (get language links) 28 | * TODO: establish return format convention for this 29 | 30 | Need generic support for: 31 | * APIs which support both pageid and title lookup 32 | * Redirect following 33 | ''' 34 | import re 35 | 36 | from operations import ALL_OPERATIONS, DEFAULT_API_URL 37 | 38 | DEFAULT_TIMEOUT = 15 39 | import socket 40 | socket.setdefaulttimeout(DEFAULT_TIMEOUT) # TODO: better timeouts for reqs 41 | 42 | 43 | _camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))') 44 | 45 | 46 | def camel2under(string): 47 | return _camel2under_re.sub(r'_\1', string).lower() 48 | 49 | 50 | def under2camel(string): 51 | return ''.join(w.capitalize() or '_' for w in string.split('_')) 52 | 53 | 54 | class BoundOperation(object): # TODO: Operation subtype? 55 | def __init__(self, op_type, client): 56 | self.client = client 57 | self.op_type = op_type 58 | self.op_inst = None 59 | 60 | def __call__(self, *a, **kw): 61 | if not self.op_inst: 62 | kw.setdefault('client', self.client) 63 | self.op_inst = self.op_type(*a, **kw) 64 | kw.pop('client') 65 | return self.op_inst() 66 | 67 | def __repr__(self): 68 | cn = self.__class__.__name__ 69 | if self.op_inst: 70 | return '<%s %r bound to %r>' % (cn, self.op_inst, self.client) 71 | op_cn = self.op_type.__name__ 72 | return '<%s %s bound to %r>' % (cn, op_cn, self.client) 73 | 74 | 75 | class UnboundOperation(object): # TODO: Operation subtype? 
76 | def __init__(self, op_type): 77 | self.op_type = op_type 78 | 79 | def bind(self, client): 80 | return BoundOperation(self.op_type, client) 81 | 82 | def __get__(self, obj, obj_type=None): 83 | if obj_type and isinstance(obj, WapitiClient): 84 | return BoundOperation(self.op_type, obj) 85 | return self 86 | 87 | def __repr__(self): 88 | cn = self.__class__.__name__ 89 | return '<%s %r>' % (cn, self.op_type) 90 | 91 | 92 | class WapitiClient(object): 93 | """ 94 | Provides logging, caching, settings, and a convenient interface 95 | to most (all?) operations. 96 | """ 97 | def __init__(self, 98 | user_email, 99 | api_url=None, 100 | is_bot=False, 101 | init_source=True, 102 | debug=False): 103 | # set settings obj 104 | # set up source (from api_url in settings) 105 | # then you're ready to call ops 106 | self.user_email = user_email 107 | self.api_url = api_url or DEFAULT_API_URL 108 | self.is_bot = is_bot 109 | self.debug = debug 110 | 111 | if init_source: 112 | self._init_source() 113 | 114 | def _init_source(self): 115 | # TODO: no input_field and single respones 116 | self.source_info = self.get_source_info()[0] 117 | 118 | @property 119 | def op_names(self): 120 | return list(sorted(self.op_map.keys())) 121 | 122 | def print_usage(self, query=None): 123 | op_names = self.op_names 124 | if query: 125 | op_names = [o for o in self.op_names if query.lower() in o.lower()] 126 | print '\n'.join(self.op_map[name].help_str for name in op_names) 127 | 128 | # TODO: configurable operations 129 | op_map = dict([(op.__name__, op) for op in ALL_OPERATIONS]) 130 | unbound_op_map = dict([(camel2under(op_name), UnboundOperation(op)) 131 | for op_name, op in op_map.items()]) 132 | unbound_op_set = set(unbound_op_map.values()) 133 | locals().update(unbound_op_map) 134 | -------------------------------------------------------------------------------- /wapiti/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | import sys 3 | 4 | is_py2 = sys.version_info[0] == 2 5 | is_py3 = sys.version_info[0] == 3 6 | 7 | from collections import OrderedDict # TODO 8 | 9 | if is_py2: 10 | from urllib import quote, unquote, quote_plus, unquote_plus, urlencode 11 | from urlparse import urlparse, urlunparse, urljoin, urlsplit, urldefrag 12 | from urllib2 import parse_http_list 13 | import cookielib 14 | from Cookie import Morsel 15 | from StringIO import StringIO 16 | 17 | unicode, str, bytes, basestring = unicode, str, str, basestring 18 | elif is_py3: 19 | from urllib.parse import (urlparse, urlunparse, urljoin, urlsplit, 20 | urlencode, quote, unquote, quote_plus, 21 | unquote_plus, urldefrag) 22 | from urllib.request import parse_http_list 23 | from http import cookiejar as cookielib 24 | from http.cookies import Morsel 25 | from io import StringIO 26 | 27 | unicode, str, bytes, basestring = str, bytes, bytes, str 28 | else: 29 | raise NotImplementedError('welcome to the future, I guess. (report this)') 30 | 31 | 32 | # The unreserved URI characters (RFC 3986) 33 | UNRESERVED_SET = frozenset( 34 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 35 | + "0123456789-._~") 36 | 37 | 38 | def unquote_unreserved(uri): 39 | """Un-escape any percent-escape sequences in a URI that are unreserved 40 | characters. This leaves all reserved, illegal and non-ASCII bytes encoded. 
41 | """ 42 | parts = uri.split('%') 43 | for i in range(1, len(parts)): 44 | h = parts[i][0:2] 45 | if len(h) == 2 and h.isalnum(): 46 | c = chr(int(h, 16)) 47 | if c in UNRESERVED_SET: 48 | parts[i] = c + parts[i][2:] 49 | else: 50 | parts[i] = '%' + parts[i] 51 | else: 52 | parts[i] = '%' + parts[i] 53 | return ''.join(parts) 54 | 55 | 56 | def requote(uri): 57 | return quote(unquote_unreserved(uri), safe="!#$%&'()*+,/:;=?@[]~") 58 | -------------------------------------------------------------------------------- /wapiti/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from base import WapitiException, DEFAULT_API_URL, OperationMeta 5 | from models import PageIdentifier, CategoryInfo, RevisionInfo 6 | 7 | import category 8 | import feedback 9 | import files 10 | import links 11 | import meta 12 | import misc 13 | import protection 14 | import rand 15 | import revisions 16 | import templates 17 | import user 18 | import query_operations 19 | 20 | for op in OperationMeta._all_ops: 21 | globals()[op.__name__] = op 22 | 23 | ALL_OPERATIONS = tuple(OperationMeta._all_ops) 24 | -------------------------------------------------------------------------------- /wapiti/operations/_test_tmpls/regr_moctezuma_parser_funcs.txt: -------------------------------------------------------------------------------- 1 | {{ArticleHistory 2 | |action1=GAN 3 | |action1date=14:08, 30 March 2008 (UTC) 4 | |action1result=not listed 5 | |action1oldid=201814944 6 | |currentstatus=FGAN 7 | }} 8 | {{WikiProjectBannerShell|1= 9 | {{WikiProject Biography|living=no|class=C|listas=Moctezuma Ii}} 10 | {{WikiProject Mesoamerica|aztec=yes|class=C|importance=Top}} 11 | {{WikiProject Mexico|class=c|importance=Mid}}}} 12 | 13 | =="Last king"== 14 | On a page entitled just "Moctezuma," (which I have redirected here) someone had written 15 | "'''Moctezuma''' was the last 
[[:Aztecs|Aztec]] king." 16 | Please note that that is both incomplete and false -- there were two Moctezumas, neither of which were "last." 17 | 18 | If we're not careful, the term "Wikipedia article" will come to be used to mean "authoritive-sounding nonsense." 19 | 20 | :In a way [[Motecuhzoma Xocoyotzin]] was the last true [[Mexica]] king of Tenochtitlan since during his rein, Tenochtitlan was raided by Cortez. [[User:Myke209|Myke]] 13:04, 12 April 2006 (UTC) 21 | 22 | 23 | The last [[Mexica]] [[tlatoani]] was [[Cuauhtémoc]]. And it's "Cortés" not "Cortez" --[[User:Sukozo|Sukozo]] ([[User talk:Sukozo|talk]]) 11:57, 3 November 2011 (UTC) 24 | 25 | ==No source== 26 | I cut the anonamously added "Some historians point out however, that the spaniards, realizing that Moctezuma had lost his influence and rule over his aztec subjects, found him useless and killed him right after the stoning incident." Source? As far as I remember, this isn't how Diaz de Castillo nor Prescott tell it. Wondering simply, [[User:Infrogmation|Infrogmation]] 04:49 3 Jul 2003 (UTC) 27 | : if i remember the original source of this is Gomarra (and confirmed by the florentine codex). This declaration from Gomarra wa the reazon why Castillo wrote "The Real history...". 28 | ::The florentine codex doesn't mention how motecuzoma died - it says that nahuas found his body on the lakebed early in the morning (i think after the noche triste)[[User:Maunus|·Maunus·ƛ·]] 21:16, 11 May 2009 (UTC) 29 | 30 | ==Warrior, not scholar== 31 | According to the ABC-CLIO world history database, Montezuma was a warrior, not a scholar, which directly contradicts what is said in the article. perhaps this should be changed. 32 | 33 | : There is no contradiction, all aztec upper classes were warriors. To have the title of Tlatoani, Moctezuma must have been responsible of capturing about a dozen war prissioners. Also as a tlatoani, he continue with the aztec military expansion. But he was more interested in religion. 
A warrior like his uncle would have not care about the divinity of Cortez. But intead of continue his military acounts he prefered to be the Head of the Calmecac, insted of the Telpochcalli, and live the life of a priest. He did not wanted to be elected tlatoani. . [[User:Nanahuatzin|Nanahuatzin]] 34 | 35 | ==Contact with the Spanish, sources== 36 | You all should really reconsider the portion of the article concerning Spanish contact. The problems with the sources you cite are enormous. Some modern research by those who are not as biased as the authors you've read might do this article some good. It's pretty much been decided in the past 20 years or so that Moctezuma didn't believe Cortés was Quetzalcoatl, and there's no native evidence that Quetzalcoatl was even supposed to return. Report that "the sources say that..." if you want, but don't present what the sources say as if it's fact. That's just lazy. 37 | 38 | : mmhh decided by whom?... Most of the info here comes from primary sources, like Alva ixtlixochitl, the mexicayotl cronicle, the ramirez codex, Camargo, Sahagun etc. The legend of the return of queztalcoatl, (or more correctly, the return of Ce Acatl Topiltzin Queztalcoatl, great priest of queztalcoatl) is recorded By Sahagun. Is it true that there is a hotly debate on why Moctezuma reacted the way he did, maybe we shoudl expand that... [[User:Nanahuatzin|Nanahuatzin]] 18:04, 5 December 2005 (UTC) 39 | 40 | ==Black legend== 41 | The whole paragraph starting from "They gave the Spanish gold flags" and ending with "they crave gold" is clearly an example of black legend. The writer pretend it has been written by an Aztec but in fact it's a fictional extract from Carl Sagan's Cosmos, therefore it's a subjective point of view which should not be included in a serious article. 42 | 43 | : Sorry , i have never read that book. Original Source Florentine Codex. Book XII, by Bernardino de Sahagun. 
I used the spanish translation from "The other side of the conquest", by Leon Portilla, the main scholar on nahuatl language. Yes it,s a bit colorful, but I included to show that not al aztecs view spaniards as gods o else... [[User:Nanahuatzin|Nanahuatzin]] 08:02, 13 March 2006 (UTC) 44 | 45 | Estoy de acuerdo de con tu explicación, se puede decir lo mismo (que no todos los aztecas consideraban a los españoles como dioses) con otras palabras, pero es menester señalar que lo anterioremente citado podía ofender y ofendía la sensibilidad de un español. 46 | 47 | :(Attempt at English translation of above comment) 48 | :I agree with your explanation. We could say the same thing (that not all Aztecs considered the Spaniards to be gods) using other words, but it is (menester) to note that the earlier text could offend and has offended the sensibility of a Spaniard. 49 | :(end translation) 50 | :[[User:Richardshusr|Richard]] 18:53, 1 April 2006 (UTC) 51 | 52 | :Entendido el punto. Procurare ser mas cuidadoso, sin embargo, tambien me gustaria comentar que es por esa razon que recurri a citar una fuente primaria, en lugar de solo referme a ella. EL publico en general esta acostumbrado solo a escuchar los testimonios europeos... [[User:Nanahuatzin|Nanahuatzin]] 16:16, 13 March 2006 (UTC) 53 | 54 | ::(Attempt at English translation of above comment) 55 | ::I get the point. I will be more careful 56 | ::(translator's note: the previous sentence is a loose translation, my Spanish is not that strong, please correct the translation if you can). 57 | ::However, I also prefer to comment that it is for this reason that I went back to cite a primary source instead of just referring to it. The general public is used to only hearing the European testimonies (translator's note: i.e. European point of view) 58 | ::[[User:Richardshusr|Richard]] 18:53, 1 April 2006 (UTC) 59 | 60 | ::Please offer English translations for Spanish comments. 
[[User:Pietdesomere|Piet]] 09:03, 1 April 2006 (UTC) 61 | ::: Ok.. [[User:Nanahuatzin|Nanahuatzin]] 20:52, 22 April 2006 (UTC) 62 | 63 | ==Moctezuma's father== 64 | Moctezuma was Axayácatl's son, not Ahuízotl's. Same goes for Cuitláhuac. Ahuízotl was Cuauhtémoc's father. 65 | 66 | == Name == 67 | 68 | The article claims that Moctezuma is ostensibly the preferred name. I cannot confirm or deny this. I am however quite certain that Montezuma is the more commonly used name, so I propose a change to that title. Relevant Wikipedia guidelines: 69 | *[[Wikipedia:Naming conventions (common names)]]. Quote: ''Wikipedia is not a place to advocate a title change in order to reflect recent scholarship. The articles themselves reflect recent scholarship but the titles should represent common usage.'' 70 | *[[Wikipedia:Naming conventions (use English)]]. Quote: ''If a native spelling uses different letters than the most common English spelling (eg, Wien vs. Vienna), only use the native spelling as an article title if it is more commonly used in English than the anglicized form. If you are talking about a person, country, town, movie or book, use the most commonly used English version of the name for the article, as you would find it in other encyclopedias and reference works.'' 71 | Awaiting reactions of course... [[User:Pietdesomere|Piet]] 09:13, 1 April 2006 (UTC) 72 | 73 | :Montezuma sounds awfull !!!!!!! (sorry you ask for my reaction...) I know this is an english enclyclopedia. But in the last years i have seen a trend to try to respect the original names: Sri-lanka, instead of Ceylan, Beijing instead of Pekin. So.. can we try to use Moctezuna, instead of the spaniard version of the name... Specially that even in Spain Montezuma is no longer used, (and not to mention that most mexicans would find montezuma offensive).. Please :) 74 | :Also.. 
if you look at Google you will find that most references to "Montezuma" do not refer to the Aztec Tlatoani
However, as you see above, we were convinced to use "Moctezuma" instead on the basis that
Some scholars prefer to use the genuinely accurate spellings ''Motecuhzoma'' or ''Moteuczoma'', but they're yet to really catch on. --[[User:Ptcamn|Ptcamn]] ([[User talk:Ptcamn|talk]]) 03:52, 4 January 2008 (UTC) 99 | :::Agreeing with Ptcamn. Reposting what i have recently written at [[Wikipedia_talk:WikiProject_Aztec/Terminology#Montezuma_vs._Moctezuma]]: 100 | :::The prefferred spelling in scholarly articles is '''Motecuhzoma''' if using Richard J. Andrews orthography which is becoming the most accepted in aztec studies. Another transliteration that is accpetable is Moteuczoma or Moteczoma but this is not commonly used. This is because unlike the two other forms ''moctezuma'' and ''montezuma'' it reflects his actual name in Nahuatl. It is composed of the three parts "mo" the reflexvive pronoun , "tecuh/teuc" "lord" and "zōma" "frown" - the other forms introduce spurious letters like "n" or turn "tecu" into "cte" for no good reason. [[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:03, 4 January 2008 (UTC) 101 | 102 | == Cortes leaving to meet Narvaez == 103 | 104 | The sentence about Cortes leaving to meet Narvaez does not express the core point that Narvaez had been sent to arrest Cortes. When I have time I will try to fix this. 105 | [[User:Richardshusr|Richard]] 18:43, 1 April 2006 (UTC) 106 | 107 | It's fixed. I fixed it a few weeks ago but forgot to leave a note here. 108 | --[[User:Richardshusr|Richard]] 06:51, 5 May 2006 (UTC) 109 | 110 | == Date of Moctezuma's death and of La Noche Triste == 111 | It seems to me hard to reconcile the date of M's death as given in this article (July 1) with the date of [[La Noche Triste]] (also July 1). Wasn't the latter some days after the former? [[User:Alpheus|Alpheus]] 112 | 113 | :: right, la "noche triste" was 30 of june, 1520 . Cortez delayed to run out of the city, because he still had the hight priest has hostage, and the aztec wanted to make Cuitlahuac a Tlatoani. 
The Aztecs offered peace in exchange, but they resumed the attack as soon as Cuitlahuac was made Tlatoani
Knowing what legends are passed on about a person gives us some understanding of how his contemporaries and subsequent generations viewed the person. A person is not just what he does but what others think about what he does. 134 | 135 | ::: Also, I have mixed feelings about [[User:Maunus|Maunus]]'s recommendation to cut out the "contact with the spanish" part entirely. I did something similar with the [[Hernan Cortes]] article (i.e. I cut out the "conquest of Mexico" part and put it in the [[Spanish Conquest of Mexico]] article. Another editor objected saying that I had removed the most important part of Cortes' life. As a compromise, I put a short summary of the [[Spanish conquest of Mexico]] article back into the [[Hernan Cortes]] article. 136 | 137 | ::: On the one hand, we should not replicate too much material between articles. On the other hand, we must not eviscerate the [[Montezuma II]] article by taking out the single episode of his life that makes him famous. I trust [[User:Madman2001]] to do the righ thing. 138 | 139 | ::: --[[User:Richardshusr|Richard]] 23:36, 7 August 2006 (UTC) 140 | 141 | :::: It is very dificult so separate the facts and the legend, but most history books in Mexico, include the omens that it is said, happened before the conquest as part of the biography of Moctezuma. They reflect the feelings and fears of the population of Tenochtitlan, and are part of the contradictions of his caracter. Proud to his people, and humble to the spanish. Brave in battle, and fearfull to the gods. All this has result in a hotly debate on his motives. Maybe all this has to moved to a section and leave the known facts apart.... [[User:Nanahuatzin|Nanahuatzin]] 142 | 143 | :::: I dont see the Omens relevance for the personality or history of Moteczuma. As argued by James Lockhart in "we people here" the omens are most likely an aztec hindsight addition and has snothing to do with actual history. 
And everything that is known about the personality of Moteczoma whether being "brave" or "humble" and to whom we have from biased sources that should not be mistaken for real biographic information. Of course it can be included but crtitically please, and stating which sources say what and what might be their reasons to do so. But yes I think there could be a section on the persona of Motecuzoma as he has been depicted in legends and hearsay. --[[User:Maunus|Maunus]] 12:09, 8 August 2006 (UTC) 144 | :::::I think the article is taking shape and I move that the cleanup tag be removed. If someone helps me giving a few finishsing touches I think we can get it to GA status within the month.[[User:Maunus|Maunus]] 14:29, 14 August 2006 (UTC) 145 | 146 | Thanks to the editors, particularly Maunus, for cleaning up the article. It has moved from awful to good, and I have removed the clean-up tag. 147 | 148 | Could I also ask the editors, particularly Maunus, to please check your spelling and links in your articles? As just one example, there were numerous spellings of "Moctezuma" scattered throughout the article (all from one editor). There were also numerous redlinks in the article, red only because of apelling errors (e.g. "Fransican" instead of "Franciscan" and many others). '''''Please''''' check your work before saving. Thanks, [[User:Madman2001|Madman]] 15:24, 26 August 2006 (UTC) 149 | 150 | Right you are. Sorry, I will do better checking. [[User:Maunus|Maunus]] 15:39, 26 August 2006 (UTC) 151 | 152 | == Azcapotzalco or Tlacopan?== 153 | The Aztec Triple Alliance is being described with Tenochtitlan, Texcoco and Azcapotzalco, shouldn't it be Tlacopan instead? Even once being subordinated to Azcapotzalco, Tlacopan sided with the other two cities in their conquest over Azcapotzalco. Then, Totoquihuaztli, Tlacopan's ruler, claimed the title of Tepaneca tecuhtli, "Lord of the Tepanecs".
154 | Do you agree on changing it? [[User:201.37.6.195|201.37.6.195]] 01:42, 9 August 2006 (UTC) 155 | 156 | :I hadn't checked [[Aztec Triple Alliance]] article. Info there matches what I said above. I've already fixed it. [[User:201.37.6.195|201.37.6.195]] 04:11, 9 August 2006 (UTC) 157 | 158 | 159 | 160 | ==Pulled Trivia section from article== 161 | 162 | Pulled this Trivia section from article per [[Wikipedia:Avoid trivia sections in articles]]. If you feel any line below belongs in the article, please insert it in the proper place. 163 | 164 | *[[Montezuma's Revenge]] is the colloquial term for any episodes of [[travelers' diarrhea]] or other sicknesses contracted by tourists visiting [[Mexico]]. 165 | *The [[Mexico City metro]] system has a station named [[Metro Moctezuma]] in honour of the ''tlatoani''. 166 | *[[Antonio Vivaldi]] also wrote an [[opera]] called "[[Motezuma]]"; it has little to do with the historical character. 167 | * Moctezuma was not allowed to be looked at unless it was a festival. A person that looked at him would receive the death penalty. 168 | * He was so holy that he was carried around everywhere so that his feet would not touch common ground. 169 | *This Emperor Moctezuma may possibly have influenced the semi-divine figure of [[Montezuma (mythology)|Montezuma]] common to the 19th century folklore of native tribes living in Arizona and New Mexico. 170 | *There is a reference to Montezuma in the song [[Cortez The Killer]] by [[Neil Young]] and [[Crazy Horse]] off of the Album [[Zuma]](1975). The verse is as follows: "On the shore lay Montezuma, With his coca leaves and pearls, In his halls he often wandered with the secrets of the world." 171 | 172 | ::This is a perfect illustration of why that guideline should not exist, and the guideline was apparently railroaded in by a select few without the knowledge of most editors... 
[[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 15:45, 26 August 2006 (UTC) 173 | :: I agree with the guideline, and have long had a mind to remove the trivia section myself. It is wholly non-encyclopedic and of little to no relevance.[[User:Maunus|Maunus]] 15:57, 26 August 2006 (UTC) 174 | 175 | ::: The trivia section has bothered me for some time so I am glad to see it go. I was not aware of the guideline and am glad to have a solid basis for getting rid of trivia sections in other articles. 176 | 177 | ::: However, I do think there is value in acknowledging historical legacy and modern perceptions. Montezuma's Revenge deserves some mention since that phrase and the "From the shores of Tripoli to the halls of Montezuma" from the Marine Corps hymn were the two most well known mentions of Montezuma in my generation although admittedly this has probably changed in the younger generation of today. 178 | 179 | ::: Similarly, the influence of Moctezuma on the semi-divine figure in folkore of native tribes deserves mention IF it can be sourced to a [[WP:RS|reliable source]]. 180 | 181 | ::: The rest can probably go. 182 | 183 | ::: --[[User:Richardshusr|Richard]] 16:14, 26 August 2006 (UTC) 184 | 185 | ::::Richard, I would agree with you on mentioning Montezuma's Revenge and the Marine Corps hymn. Typically they are in a section entitled '''Modern legacy''' or some such. As mentioned above, editors are welcome to incorporate them into the article, just not in a section entitled '''Trivia''' which becomes a trash-magnet. 186 | 187 | :::[[User:Codex Sinaiticus]], what parts of this are relevant to an '''encyclopedia''' article on Moctezuma? Certainly the Neil Young song and Montezuma's revenga are just plain silly. The Vivaldi sentence is almost self-defeating ("it has little to do with the historical character"). The two tidbits about how special he was sound like legends more than facts. 
If they belong in the article, they need to be referenced and put into the article itself. Interested in your thoughts, [[User:Codex Sinaiticus]]. Thanks, [[User:Madman2001|Madman]] 16:20, 26 August 2006 (UTC) 188 | 189 | ::This new "no trivia" rule that seems to have come out of nowhere, if applied site wide, will radically transform the entire face of wikipedia as it currently exists, into something quite different, and far less enjoyable. I guess there is a minority of editors who decided behinbd closed doors that they wanted a carbon copy of Encyclopedia Britannica, and are now presenting this as a 'fait accomplis' "guideline" because no one knew about it. It feels like a hijacking. If this had been proposed in the open, it would NEVER have received a support from Wikipedia's editors. 190 | 191 | ::If you seriously need "reliable sources" that Montezuma has ever been connected with Motecuzoma, try taking a look at the Montezuma article. The other items are mostly relevant links to other related articles, why are they being suppressed and whom does this benefit??? [[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 17:15, 26 August 2006 (UTC) 192 | 193 | :::Hot issue for you, eh? [[User:Madman2001|Madman]] 17:24, 26 August 2006 (UTC) 194 | 195 | :::Yes. What is going on here with deleting links to related information from the article onder the pretense of a misguided guideline is just plain wrong. I will resist this on any article where I see this being done. [[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 17:38, 26 August 2006 (UTC) 196 | 197 | == This article needs some reorganization == 198 | 199 | I haven't ever looked at this article closely. There's a lot of information but, based on just a quick look, the section/subsection organization needs work. I don't have time to work on it today but I figured I'd drop the cleanup tag on it and try to get back to it later. 
200 | 201 | --[[User:Richardshusr|Richard]] 17:34, 26 August 2006 (UTC) 202 | 203 | 204 | ==Expert tag?== 205 | An anonymous editor added the expert tag to the article without stating a reason. If no reason is introduced here on the talk page within the next days I will remove it.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 21:08, 29 May 2007 (UTC) 206 | 207 | == Moctezuma before 1519 == 208 | 209 | What about Moctezuma's life and reign ''before'' the arrival of the Spaniards? A description of the last two years of an about fifty to sixty year-old person is a pretty unfinished biography, isn't it? --[[User:88.64.212.26|88.64.212.26]] 17:15, 16 July 2007 (UTC) 210 | 211 | == Translation == 212 | 213 | I removed: 214 | :"he who angers himself."{{cite book |last=Thomas |first=Hugh |year=1995 |title=Conquest: Montezuma, Cortés, and the Fall of Old Mexico}} 215 | Despite being sourced, [[Hugh Thomas]] is a historian, not a nahuatlato. This translation both ignores the "lord" morpheme, treating it as though the name was simply ''Mozoma'', and fails to recognize that Nahuatl often uses reflexives with a passive meaning — it's like translating Spanish ''no se sabe'' as "it doesn't know itself" when the correct translation is "it is unknown". --[[User:Ptcamn|Ptcamn]] ([[User talk:Ptcamn|talk]]) 03:39, 4 January 2008 (UTC) 216 | :::Agreeing.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:05, 4 January 2008 (UTC) 217 | 218 | == NPOV in the sources section == 219 | 220 | The paucity of indigenous written records and as well as the sometimes biased descriptions of the man by chroniclers can easily lead to no small amount of friction about how Moctezuma really was. Interpretations of the biographical accounts that we do possess vary widely. 
Therefore, I think it best to present actual quotations from them, with brief summaries of the general descrription BY THE WRITER, in the article, rather than fill the sources section with conjecture.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 16:49, 19 January 2008 (UTC) 221 | ::I agree that the section needs less conjecture. However I am not sure that the right way about it is to insert large chunks of primary sources. Primary sources needs to be interpreted in order to be correctly understood - some sources are more reliable than others, some have one kind of bias others have the opposite good scholars have studied the sources and written about how to best understand them. Therefore the section in my opinion should build on what good scholars have said about how to best understand the soruces. I think that the right way to do this is by having the "conjecture" and interpretations of historians be fully sourced to the works of the which historians who have written about it. For example James Lockhart and Matthew Restall - the article as it is anow is not well sourced and only briefly mentions those scholars, while not pointing to any specific texts by them: this should be change. Some statements are completely unsourced and seems to be lose conjecture by previous editors - these should be removed. I think it is ok to include smaller pieces of quoted primary sources to illustrate salient point and to show the style of the sources - but they should not be made to look like being the "truth". The truth is not in the sources, but can only be approximated through their interpretation.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 17:07, 19 January 2008 (UTC) 222 | 223 | In the case of Bernal Diaz, I think the quotes offered are reliable. I agree that the opinions of good, peer-reviewed, publications to support different interpretations are needed (e.g. Restall and Lockhart). 
I have made some edits, and I do believe that the Wikipedia reader can evaluate the actual quotation on their own. Nothing is lost by maintaining the description quote I've entered, though I will agree that the second quote about their reaction to his death could be superfluous and maybe belongs on the [[True History of the Conquest of New Spain]] page. [[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 17:53, 19 January 2008 (UTC) 224 | 225 | ==Neutrality== 226 | 227 | Until we can get some sources to support the arguments in the tagged section, I think we need to POV check. Once we get that, then the article will be much stronger. Also, since it is unsourced, without even mentioning a potential source, I have removed this section: 228 | 229 | ''As Aztec ruler, he expanded the Aztec Empire the most; warfare expanded the territory as far south as [[Soconusco|Xoconosco]] in [[Chiapas]] and the [[Isthmus of Tehuantepec]]. He elaborated the [[Templo Mayor]] and revolutionized the tribute system. He also increased Tenochtitlán's power over its allied cities to a dominant position in the [[Aztec Triple Alliance]]. He created a special temple, dedicated to the gods of the conquered towns, inside the temple of [[Huitzilopochtli]]. He also built a monument dedicated to the Tlatoani [[Tízoc]].'' 230 | 231 | until someone can provide sources. [[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 18:08, 19 January 2008 (UTC) 232 | ::I'll source that right away.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:55, 20 January 2008 (UTC) 233 | 234 | Perfect! 
I like the new positioning too; it fits better where you've put it.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 19:04, 20 January 2008 (UTC) 235 | 236 | 237 | ==A section o his life and times?== 238 | I think in focusing on the sources we have neglected making a section about the actual life of Moctezuma - I think we should make a section before the source section describing what is known about his lifes main events, and it should probably inforporate the "contact with spanish" section.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:49, 19 February 2008 (UTC) 239 | :Why is everyone focused on Moctezuma's life after the Spaniards' arrival? At that time he was more than 60 years old and he had already reigned for seventeen years! --[[Special:Contributions/88.64.57.246|88.64.57.246]] ([[User talk:88.64.57.246|talk]]) 21:22, 27 February 2008 (UTC) 240 | 241 | ==GA review== 242 | {{#if:|{{#ifeq:{{NAMESPACE}}|Talk||{{error:not substituted|GAList}}
}}}} 243 | :'''[[Wikipedia:Good article nominations|GA]] review''' (see [[Wikipedia:What is a good article?|here]] for criteria) 244 | {{#if:Needs some serious work on being consistent with references. Also needs a number of citations.|
Needs some serious work on being consistent with references. Also needs a number of citations.|}} 245 | #It is '''reasonably well written'''. 246 | #:a ''(prose)'': {{GAList/check|}} b ''([[Wikipedia:Manual of Style|MoS]])'': {{GAList/check|}} 247 | #:: {{#if:|{{{1com}}}|}} 248 | #It is '''factually accurate''' and '''[[Wikipedia:Verifiability|verifiable]]'''. 249 | #:a ''(references)'': {{GAList/check|}} b ''(citations to [[WP:RS|reliable sources]])'': {{GAList/check|}} c ''([[Wikipedia:No original research|OR]])'': {{GAList/check|}} 250 | #:: {{#if:|{{{2com}}}|}} 251 | #It is '''broad in its coverage'''. 252 | #:a ''(major aspects)'': {{GAList/check|}} b ''(focused)'': {{GAList/check|aye}} 253 | #:: {{#if:|{{{3com}}}|}} 254 | #It follows the '''[[WP:NPOV|neutral point of view]] policy'''. 255 | #:''Fair representation without bias'': {{GAList/check|aye}} 256 | #:: {{#if:|{{{4com}}}|}} 257 | #It is '''stable'''. 258 | #:''No edit wars etc.'': {{GAList/check|aye}} 259 | #:: {{#if:|{{{5com}}}|}} 260 | #It is illustrated by '''[[Wikipedia:Images|images]]''', where possible and appropriate. 261 | #:a ''(images are tagged and non-free images have [[Wikipedia:Image_description_page#Use_rationale|fair use rationales]])'': {{GAList/check|aye}} b ''(appropriate use with [[WP:CAP|suitable captions]])'': {{GAList/check|aye}} 262 | #:: {{#if:|{{{6com}}}|}} 263 | #'''Overall''': 264 | #:''Pass/Fail'': {{GAList/check|nay}} 265 | #:: {{#if:|{{{7com}}}|}}
266 | 267 | Details: 268 | 269 | * Consistency in formating the references and sources. There are a couple of websites used as references that are just bare urls, a few references are used that aren't listed in the sources, and the formatting of a couple of the sources isn't consistent with the format of the other sources. You also use a mix of Harvard citations (the (Diaz del Castillo 1568/1963 224-25)) and regular footnotes. It doesn't matter which style you use, it just needs to be consistent. 270 | * Reference 26 is a self-published website and wouldn't be considered a reliable source. Would be fine if it used the source listed at the bottom of the website as a source. 271 | * Formatting the quotations should be not in italics. See [[WP:MOS#Italics]]. 272 | * A number of spots need citations. I've added citation needed tags at the spots. I also marked with hidden text a few other spots that while not needing citations wouldn't be hurt by having them. 273 | * Also in the consistency part - footnotes after the punctuation. A number of spots have the footnoes after the punctuation. 274 | * Direct quotations need citations attachted to them, I've marked those spots also with citation needed tags. 275 | * Consider changing the Native American mythology, Symbol of indigenous leadership, Spanish noble family, and references in modern culture sections into subsections under the Legacy section. Also many of these sections could use some expansion to them, they feel kinda skimpy for someone who has had so much impact. 276 | * Also, consider changing the data in References in modern culture from a list into a paragraph or two. Myself, I'd nix the video game stuff, but that's just me. 277 | * See also sections usually go right before the References section. 278 | 279 | I haven't really read through the prose for anything grammatically wrong or awkward. Mainly, it needs a ruthless run through for consistency in referencing, puncutation, and other issues. 
Like I said, I haven't read the prose deeply, which would need to be done before passing it to GA also. I'll do that after the issues above are dealt with, it'll be easier then, after the kinks are worked out with the other concerns. 280 | 281 | I've put the article on hold for seven days to allow folks to address the issues I've brought up. Feel free to contact me on my talk page, or here with any concerns, and let me know one of those places when the issues have been addressed. If I may suggest that you strike out, check mark, or otherwise mark the items I've detailed, that will make it possible for me to see what's been addressed, and you can keep track of what's been done and what still needs to be worked on. [[User:Ealdgyth|Ealdgyth]] | [[User talk:Ealdgyth|Talk]] 19:27, 8 March 2008 (UTC) 282 | 283 | ::Not sure if I'll be able to get to all this stuff in the next week, but this will give us a starting point from which to better the article, if we don't get it fixed in time. Thanks for your review.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 20:34, 11 March 2008 (UTC) 284 | :::I'm heading out of town in the next few days and won't be able to reliably check in. Any chance of progress being made soon? [[User:Ealdgyth|Ealdgyth]] - [[User talk:Ealdgyth|Talk]] 02:47, 17 March 2008 (UTC) 285 | 286 | Given that another week has passed with no movement, I'm failing this article's GA nomination. [[User:Ealdgyth|Ealdgyth]] - [[User talk:Ealdgyth|Talk]] 14:08, 30 March 2008 (UTC) 287 | 288 | == Year of birth == 289 | 290 | Given that secondary sources commonly differ wrt M's approx year of birth (c.1466 vs c.1480), it would be good to track down and identify in the article the primary sources by which each of these alternatives are calculated. Anyone know offhand what might be the original basis for either? 
--[[User:CJLL Wright|cjllw]] ʘ ''[[User talk:CJLL Wright|TALK]]'' 03:33, 28 October 2008 (UTC) 291 | 292 | == The Feather Crown of Montezuma == 293 | 294 | I have looked just about everywhere for an article on Moctezuma's 'penacho', or feather crown. Frankly, I'm very surprised it doesn't exist (If this is the case). There is a Spanish article named 'Penacho de Moctezuma', and I would love to start an English article on this very important historical artifact. However, I have little knowledge of the Spanish langauge, and therefore can't translate effectively. Would someone mind creating an artical on this very interesting topic? Or help to find someone who can? I find little info in English on the subject anywhere, accept that it is kept in Austria and the Mexican government has asked for it back. Just an idea! Send a message. Thank You! [[User:C.Kent87|C.Kent87]] ([[User talk:C.Kent87|talk]]) 07:23, 18 December 2008 (UTC) 295 | 296 | :You're right, that artefact would make an interesting and valid basis for an article, which we don't have here on en.wiki yet (actually we may not even mention it in passing on related articles, AFAIK). Even tho' the headdress almost certainly is not Moctezuma's, I believe it's regarded as plausibly coming from the court at the time. It's subsequent provenance would also make for some interesting research & reading. 297 | :I or others could help on the spanish translations, tho I'd prefer not to just translate the es.wiki article as-is. Can't say when or if will be able to start something up, but as first step at least will put it on the [[WP:MESO/REQ|WP Mesoamerica's new requests lists]] as a reminder to be done, when time & inclination suits. Cheers, --[[User:CJLL Wright|cjllw]] ʘ ''[[User talk:CJLL Wright|TALK]]'' 04:47, 11 March 2009 (UTC) 298 | 299 | :: In the spansih article we have commented, Posibly it was one of the gifts that moctezuma gave to Cortes. Posibly not a headress.. 
but a part of the Quetzalcoatl dress (maybe a cape, since no tlatoani had worn anything like that)
— Preceding [[Wikipedia:Signatures|unsigned]] comment added by [[Special:Contributions/108.129.90.185|108.129.90.185]] ([[User talk:108.129.90.185|talk]]) 03:39, 30 September 2011 (UTC) 315 | -------------------------------------------------------------------------------- /wapiti/operations/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import json 5 | from abc import ABCMeta 6 | 7 | from collections import OrderedDict 8 | from functools import wraps 9 | 10 | import sys 11 | from os.path import dirname, abspath 12 | # just until ransom becomes its own package 13 | sys.path.append(dirname(dirname(abspath(__file__)))) 14 | import ransom 15 | 16 | from params import SingleParam, StaticParam 17 | from models import get_unique_func, get_priority_func 18 | from utils import (PriorityQueue, 19 | MaxInt, 20 | chunked_iter, 21 | make_type_wrapper, 22 | OperationExample) 23 | 24 | # TODO: handle automatic redirecting better 25 | # TODO: support batching and optimization limits 26 | # TODO: concurrency. get_current_task() -> get_current_tasks() 27 | # TODO: wrap exceptions 28 | # TODO: separate structure for saving completed subops (for debugging?) 29 | # TODO: WebRequestOperation: accepts URL, action (default: GET) 30 | # TODO: Model links (url attribute) 31 | # TODO: support field param_type (for cases with ints and strs) 32 | # TODO: use source descriptor instead of api_url? (for op.source) 33 | # TODO: check that subop_chain types match up 34 | # TODO: check that priority attribute exists on output_type where applicable 35 | 36 | """ 37 | - what if operations were iterable over their results and process() 38 | returned the operation itself? (more expensive to iterate and find 39 | non-dupe results, would set ops help?) 40 | - client -> root_owner. parent operation (client 41 | if no parent op) -> owner. 
def get_unwrapped_options(wr_type):
    """Return an ``(options, type)`` pair for a possibly-wrapped type.

    Operation types wrapped by ``make_type_wrapper`` (e.g. ``Tune``,
    ``Recursive``) carry ``_wrapped_dict`` (the wrapper's option map) and
    ``_wrapped`` (the underlying type). Plain, unwrapped types yield an
    empty option dict and are returned as-is.
    """
    try:
        opts, unwrapped = wr_type._wrapped_dict, wr_type._wrapped
    except AttributeError:
        # not a wrapper type: no options to report
        return {}, wr_type
    return dict(opts), unwrapped
def get_field_str(field):
    """Render one option field as ``key (required, multi)`` help text."""
    modifiers = [label for label, flagged in (('required', field.required),
                                              ('multi', field.multi))
                 if flagged]
    if modifiers:
        return '%s (%s)' % (field.key, ', '.join(modifiers))
    return field.key


def operation_signature_doc(operation):
    """Build the Input/Output/Options/Examples summary that
    OperationMeta appends to every operation class's docstring.
    """
    if operation.input_field is None:
        input_desc = 'None'
    else:
        input_desc = operation.input_field.key
    output_desc = operation.singular_output_type.__name__

    template = 'Input: %s\n'
    # bijective ops return one output per input; others return a list
    if operation.is_bijective:
        template += 'Output: %s\n'
    else:
        template += 'Output: List of %s\n'

    # static params are fixed internals, not user-facing options
    option_fields = [f for f in getattr(operation, 'fields', [])
                     if not isinstance(f, StaticParam)]
    if option_fields:
        template += 'Options: '
        template += ','.join([get_field_str(f) for f in option_fields]) + '\n'

    if hasattr(operation, 'examples'):
        template += 'Examples: \n\t'
        template += '\n\t'.join([repr(x) for x in operation.examples]) + '\n'

    return template % (input_desc, output_desc)
class OperationQueue(object):
    """
    A deduplicating priority queue of suboperations, all of one
    operation type. Input params are keyed by ``unique_func`` so each
    distinct param spawns at most one suboperation; ``priority_func``
    orders the resulting suboperations for execution.
    """
    # TODO: chunking/batching should probably happen here
    # with the assistance of another queue for prioritized params
    # (i.e., don't create subops so eagerly)
    def __init__(self, qid, op_type, default_limit=ALL):
        # qid: index of this queue in the owning Operation's
        # subop_queues list; used to route results between queues
        self.qid = qid
        # op_type may be wrapped (Tune/Recursive); unwrap to recover
        # the plain operation type plus the wrapper's options
        options, unwrapped = get_unwrapped_options(op_type)
        self.op_type = op_type
        self.unwrapped_type = unwrapped
        self.options = options

        self.unique_key = options.get('unique_key', 'unique_key')
        self.unique_func = get_unique_func(self.unique_key)
        self.priority = options.get('priority', 0)
        self.priority_func = get_priority_func(self.priority)
        self.default_limit = default_limit

        self.param_set = set()  # unique keys already enqueued
        self.op_queue = PriorityQueue()
        self._dup_params = []  # rejected duplicate keys (debugging aid)

    def enqueue(self, param, **kw):
        """
        Create a suboperation for *param* and queue it, unless an
        equivalent param has already been enqueued.
        """
        unique_key = self.unique_func(param)
        if unique_key in self.param_set:
            self._dup_params.append(unique_key)
            return
        priority = self.priority_func(param)
        kwargs = {'limit': self.default_limit}
        kwargs.update(kw)
        new_subop = self.op_type(param, **kwargs)
        # tag the subop with its source queue so Operation.store_results
        # can forward its output to the next queue in the chain
        new_subop._origin_queue = self.qid
        self.op_queue.add(new_subop, priority)
        self.param_set.add(unique_key)

    def enqueue_many(self, param_list, **kw):
        """enqueue() every param in *param_list*."""
        for param in param_list:
            self.enqueue(param, **kw)
        return

    def __len__(self):
        return len(self.op_queue)

    def peek(self, *a, **kw):
        return self.op_queue.peek(*a, **kw)

    def pop(self, *a, **kw):
        return self.op_queue.pop(*a, **kw)
    def __init__(self, input_param, limit=None, **kw):
        """
        input_param: the operation's input (normalized by input_field).
        limit: max results; may be an int, ALL, or another Operation
        (dynamic limiting -- see set_limit).
        """
        # the client supplies api_url/is_bot; DEFAULT_CLIENT (a
        # MockClient) is the standalone fallback
        self.client = kw.pop('client', None)
        if self.client is None:
            self.client = DEFAULT_CLIENT
        self.api_url = self.client.api_url
        self.is_bot_op = self.client.is_bot

        self.set_input_param(input_param)
        self.set_limit(limit)

        self.kwargs = kw
        self.started = False
        # keyed by result unique_key to dedupe across suboperations
        self.results = OrderedDict()

        # queue 0 always holds this operation's own type (used by
        # QueryOperation multiplexing); queues 1..n mirror subop_chain,
        # and the first chain queue is seeded with the input params
        subop_queues = [OperationQueue(0, type(self))]
        if self.subop_chain:
            subop_queues.extend([OperationQueue(i + 1, st) for i, st
                                 in enumerate(self.subop_chain)])
            subop_queues[1].enqueue_many(self.input_param_list,
                                         client=self.client)
        self.subop_queues = subop_queues

    def get_progress(self):
        # absolute progress: number of results collected so far
        return len(self.results)

    def get_relative_progress(self):
        # fraction complete in [0.0, 1.0]; 0.0 when the limit is
        # unbounded (ALL) or unset
        if self.limit and self.limit is not ALL:
            return len(self.results) / float(self.limit)
        return 0.0

    def set_input_param(self, param):
        """Normalize *param* through the operation's input_field."""
        self._orig_input_param = self._input_param = param
        if self.input_field:
            self._input_param = self.input_field.get_value(param)
            self._input_param_list = self.input_field.get_value_list(param)
        else:
            self._input_param = None
            self._input_param_list = []  # TODO: necessary?

    @property
    def input_param(self):
        return self._input_param

    @property
    def input_param_list(self):
        return self._input_param_list

    @property
    def source(self):
        # the API endpoint this operation reads from
        return self.api_url
    @property
    def limit(self):
        # a limit may itself be an Operation (dynamic limiting); then
        # this op only produces as much as its parent still needs
        if isinstance(self._limit, Operation):
            return self._limit.remaining
        return self._limit

    @property
    def remaining(self):
        # number of results still wanted; a None limit means unbounded
        limit = self.limit
        if limit is None:
            limit = ALL
        return max(0, limit - len(self.results))

    def process(self):
        """
        Execute one task (a suboperation or API call) and store its
        results. Raises NoMoreResults when the operation is exhausted.
        """
        self.started = True
        task = self.get_current_task()
        if self.client.debug:
            print self.__class__.__name__, self.remaining
        if task is None:
            raise NoMoreResults()
        elif isinstance(task, Operation):
            results = task.process()
        elif callable(task):  # not actually used
            results = task()
        else:
            msg = 'task expected as Operation or callable, not: %r' % task
            raise TypeError(msg)
        # TODO: check resp for api errors/warnings
        # TODO: check for unrecognized parameter values
        new_results = self.store_results(task, results)
        return new_results

    def get_current_task(self):
        """
        Return the next runnable suboperation, discarding exhausted
        ones, or None when nothing remains. Downstream (later) queues
        are serviced first so results flow through the chain.
        """
        if not self.remaining:
            return None
        for subop_queue in reversed(self.subop_queues):
            while subop_queue:
                subop = subop_queue.peek()
                if subop.remaining:
                    return subop
                else:
                    subop_queue.pop()
        return None
    def _update_results(self, results):
        """
        Merge *results* into self.results, skipping duplicates and
        (optionally) entries whose existence doesn't match the 'exists'
        keyword filter. Returns only the newly-added results.
        """
        ret = []
        filt_exists = self.kwargs.get('exists')
        # normalize to strict True/False, preserving None = "no filter"
        filt_exists = filt_exists if filt_exists is None else bool(filt_exists)
        for res in results:
            if not self.remaining:
                break
            if filt_exists is not None and res.exists is not filt_exists:
                continue
            # results without a unique_key attribute dedupe on identity
            unique_key = getattr(res, 'unique_key', res)
            if unique_key in self.results:
                continue
            self.results[unique_key] = res
            ret.append(res)
        return ret

    def process_all(self):
        """Drive process() until exhaustion; return all results."""
        while 1:  # TODO: +retry behavior
            try:
                self.process()
            except NoMoreResults:
                break
        return self.results.values()

    __call__ = process_all

    def __repr__(self):
        cn = self.__class__.__name__
        if self.input_field is None:
            return '%s(limit=%r)' % (cn, self.limit)
        tmpl = '%s(%s, limit=%r)'  # add dynamic-limity stuff
        try:
            ip_disp = repr(self.input_param)
        except:
            # never let a broken __repr__ on the param break ours
            ip_disp = "'(unprintable param)'"
        return tmpl % (cn, ip_disp, self.limit)
    def _setup_multiplexing(self):
        # a bijective op given more params than fit in one request is
        # split into per-request chunks, queued on this op's own queue 0
        subop_queue = self.subop_queues[0]
        chunk_size = self.per_query_param_limit
        for chunk in chunked_iter(self.input_param_list, chunk_size):
            subop_queue.enqueue(tuple(chunk), client=self.client)  # TODO
        return

    @property
    def current_limit(self):
        # the 'limit' parameter for the next API request; non-bijective
        # ops ask for at least DEFAULT_MIN to avoid tiny queries
        ret = self.remaining
        if not self.is_bijective:
            ret = max(DEFAULT_MIN, ret)
        ret = min(ret, self.per_query_limit)
        return ret

    @property
    def remaining(self):
        # once the API stops returning continue tokens, nothing remains
        # regardless of the requested limit
        if self.is_depleted:
            return 0
        return super(QueryOperation, self).remaining

    @property
    def last_cont_str(self):
        # most recent continue token; None before the first response
        if not self.cont_strs:
            return None
        return self.cont_strs[-1]

    @property
    def is_depleted(self):
        # a recorded None token means the last response had no
        # continue section, i.e. the result set is exhausted
        if self.cont_strs and self.last_cont_str is None:
            return True
        return False

    @classmethod
    def get_field_dict(cls):
        """Map prefixed parameter keys to their field objects."""
        ret = dict([(f.get_key(cls.field_prefix), f) for f in cls.fields])
        if cls.input_field:
            query_key = cls.input_field.get_key(cls.field_prefix)
            ret[query_key] = cls.input_field
        return ret
    def get_current_task(self):
        # multiplexed ops delegate to the generic queue machinery;
        # otherwise each process() step issues one MediaWikiCall
        if self.is_multiplexing:
            return super(QueryOperation, self).get_current_task()
        if not self.remaining:
            return None
        params = self.prepare_params(**self.kwargs)
        mw_call = MediaWikiCall(params, client=self.client)
        return mw_call

    def prepare_params(self, **kw):
        """Assemble the final request parameter dict for this query."""
        params = dict(self.params)
        if not self.is_bijective:
            params[self.field_prefix + 'limit'] = self.current_limit
        if self.last_cont_str:
            params[self.cont_str_key] = self.last_cont_str
        params['action'] = self.api_action
        return params

    def post_process_response(self, response):
        """
        Used to rectify inconsistencies in API responses (looking at
        you, Feedback API)
        """
        return response.results.get(self.api_action)

    def extract_results(self, resp):
        raise NotImplementedError('inheriting classes should return'
                                  ' a list of results from the response')

    def get_cont_str(self, resp):
        """
        Pull this query's continue token out of the response's
        '<action>-continue' section; None means the result set is
        exhausted.
        """
        qc_val = resp.results.get(self.api_action + '-continue')
        if qc_val is None:
            return None
        # the continue section is keyed by the query's module name
        for key in ('generator', 'prop', 'list'):
            if key in self.params:
                next_key = self.params[key]
                break
        else:
            raise KeyError("couldn't find contstr")
        if not self.cont_str_key:
            # lazily discover the continue key name (e.g. 'gcmcontinue')
            self.cont_str_key = qc_val[next_key].keys()[0]
        return qc_val[next_key][self.cont_str_key]
    def __init__(self, params, **kw):
        """
        params: dict of API request parameters; must include 'action'.
        Recognized keywords: raise_exc, raise_err, raise_warn, client.
        """
        # These settings will all go on the WapitiClient
        self.raise_exc = kw.pop('raise_exc', True)
        self.raise_err = kw.pop('raise_err', True)
        self.raise_warn = kw.pop('raise_warn', False)
        self.client = kw.pop('client')
        self.web_client = getattr(self.client,
                                  'web_client',
                                  DEFAULT_WEB_CLIENT)
        if kw:
            raise ValueError('got unexpected keyword arguments: %r'
                             % kw.keys())
        self.api_url = self.client.api_url
        params = params or {}
        self.params = dict(BASE_API_PARAMS)
        self.params.update(params)
        # intentionally raises KeyError when 'action' is missing
        self.action = params['action']

        self.url = ''
        self.results = None  # parsed JSON dict; set by process()
        self.servedby = None
        self.exception = None
        self.error = None
        self.error_code = None
        self.warnings = []

        self._input_param = params
try: 685 | self.results = json.loads(resp.text) 686 | except Exception as e: 687 | self.exception = e # TODO: wrap 688 | if self.raise_exc: 689 | raise 690 | return self 691 | self.servedby = self.results.get('servedby') 692 | 693 | error = self.results.get('error') 694 | if error: 695 | self.error = error.get('info') 696 | self.error_code = error.get('code') 697 | 698 | warnings = self.results.get('warnings', {}) 699 | for mod_name, warn_dict in warnings.items(): 700 | warn_str = '%s: %s' % (mod_name, warn_dict.get('*', warn_dict)) 701 | self.warnings.append(warn_str) 702 | 703 | if self.error and self.raise_err: 704 | raise WapitiException(self.error_code) 705 | if self.warnings and self.raise_warn: 706 | raise WapitiException('warnings: %r' % self.warnings) 707 | return self 708 | 709 | @property 710 | def notices(self): 711 | ret = [] 712 | if self.exception: 713 | ret.append(self.exception) 714 | if self.error: 715 | ret.append(self.error) 716 | if self.warnings: 717 | ret.extend(self.warnings) 718 | return ret 719 | 720 | @property 721 | def remaining(self): 722 | if self.done: 723 | return 0 724 | return 1 725 | 726 | 727 | class WebRequestOperation(Operation): 728 | input_field = SingleParam('url') 729 | output_type = Operation 730 | _limit = 1 731 | 732 | def __init__(self, input_param, **kw): 733 | self.client = kw.pop('client', None) 734 | self.web_client = getattr(self.client, 735 | 'web_client', 736 | DEFAULT_WEB_CLIENT) 737 | self.action = kw.pop('action', 'get') 738 | self.raise_exc = kw.pop('raise_exc', True) 739 | if kw: 740 | raise ValueError('got unexpected keyword arguments: %r' 741 | % kw.keys()) 742 | self.set_input_param(input_param) 743 | self.url = self._input_param 744 | self.kwargs = kw 745 | self.results = {} 746 | 747 | def process(self): 748 | resp = None 749 | try: 750 | resp = self.web_client.req(self.action, self.url) 751 | except Exception as e: 752 | self.exception = e 753 | if self.raise_exc: 754 | raise 755 | return self 756 | 
class GetPageHTML(Operation):
    """
    Fetch a page's rendered HTML by requesting its article URL
    (base_url + title) directly, rather than going through the API.
    """
    input_field = SingleParam('title')
    examples = [OperationExample('Africa', limit=1)]
    output_type = Operation
    _limit = 1

    def __init__(self, *a, **kw):
        super(GetPageHTML, self).__init__(*a, **kw)
        self.web_client = getattr(self.client,
                                  'web_client',
                                  DEFAULT_WEB_CLIENT)
        # NOTE(review): this pop happens after super().__init__ has
        # already stored kw into self.kwargs, so 'raise_exc' also
        # lingers there -- confirm whether that is intended
        self.raise_exc = kw.pop('raise_exc', True)
        # derive the article base URL from the client's source info,
        # e.g. 'http://.../wiki/Main_Page' -> 'http://.../wiki/'
        source_info = getattr(self.client, 'source_info', None)
        if source_info:
            main_title = source_info.mainpage
            main_url = source_info.base
            self.base_url = main_url[:-len(main_title)]
        else:
            self.base_url = DEFAULT_BASE_URL
        self.url = self.base_url + self.input_param
        self.results = {}

    def process(self):
        """Fetch the page once, store its HTML, and finish."""
        try:
            resp = self.web_client.get(self.url)
        except Exception as e:
            self.exception = e
            if self.raise_exc:
                raise
            return self
        self.results[self.url] = resp.text
        # single-shot operation: signal completion to process_all()
        raise NoMoreResults()
class GetCategory(QueryOperation):
    """
    Fetch the members in category.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [StaticParam('generator', 'categorymembers'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              MultiParam('namespace')]
    output_type = [PageInfo]
    examples = [OperationExample('Featured_articles')]

    def extract_results(self, query_resp):
        # one PageInfo per entry in the API's 'pages' mapping;
        # the page-id keys themselves are not needed
        return [PageInfo.from_query(member_dict, source=self.source)
                for member_dict in query_resp['pages'].itervalues()]
class GetAllCategoryInfos(GetSubcategoryInfos):
    """
    Fetch all categories on the source wiki.
    """
    field_prefix = 'gac'
    input_field = None  # no input: enumerates the whole wiki
    fields = [StaticParam('generator', 'allcategories'),
              StaticParam('prop', 'categoryinfo')]
    examples = [OperationExample(doc='basic allcats')]


class GetFlattenedCategory(Operation):
    """
    Fetch all category's sub-categories, recursively.
    """
    # Recursive: each fetched subcategory is fed back in as new input;
    # Tune: categories with more subcategories are fetched first
    subop_chain = [Tune(Recursive(GetSubcategoryInfos),
                        priority='subcat_count')]
    examples = [OperationExample('Africa', 100)]
import pytest


def pytest_addoption(parser):
    """Register the --mag option controlling operation limit magnitude."""
    # Fix: modern pytest forwards these kwargs to argparse, which
    # requires `type` to be a callable; the optparse-style string alias
    # type="int" was removed in pytest 4.0 and raises an error.
    parser.addoption("--mag", action="store", type=int, default=1,
                     help="magnitude of the operation limits")


@pytest.fixture
def mag(request):
    """Magnitude multiplier for test operation limits (see --mag)."""
    return request.config.getoption("--mag")
class GetFeedbackV5(QueryOperation):
    """
    Fetch ArticleFeedback v5 entries for a page.

    article feedback v5 breaks standards in a couple ways.
    * the various v5 APIs use different prefixes (af/afvf)
    * it doesn't put its results under 'query', requiring a custom
      post_process_response()
    """
    field_prefix = 'afvf'
    input_field = SingleParam('pageid')
    fields = [StaticParam('list', 'articlefeedbackv5-view-feedback'),
              SingleParam('filter', default='featured')]  # see _FV5_KNOWN_FILTERS
    output_type = list
    examples = [OperationExample('604727')]

    def post_process_response(self, response):
        # v5 puts results at the top level, not under 'query'
        if not response.results:
            return {}
        return dict(response.results)

    def extract_results(self, query_resp):
        # TODO: individual feedback entries are not parsed yet; only
        # the reported count is reflected back as placeholders
        count = query_resp['articlefeedbackv5-view-feedback']['count']
        return ['TODO'] * int(count)
PageInfo, ImageInfo 7 | from utils import OperationExample 8 | 9 | 10 | DEFAULT_IMAGE_PROPS = ['timestamp', 'user', 'userid', 'comment', 'parsedcomment', 11 | 'url', 'size', 'dimensions', 'sha1', 'mime', 'mediatype', 12 | 'metadata', 'bitdepth'] 13 | IMAGE_INFO_PROPS = DEFAULT_IMAGE_PROPS + ['thumbmime', 'archivename'] 14 | 15 | 16 | class GetImages(QueryOperation): 17 | """ 18 | Fetch the images embedded on pages. 19 | """ 20 | field_prefix = 'gim' 21 | input_field = MultiParam('titles', key_prefix=False) 22 | fields = [StaticParam('generator', 'images'), 23 | StaticParam('prop', 'info'), 24 | StaticParam('inprop', 'subjectid|talkid|protection')] 25 | output_type = [PageInfo] 26 | examples = [OperationExample('Coffee')] 27 | 28 | def extract_results(self, query_resp): 29 | ret = [] 30 | for pid, pid_dict in query_resp['pages'].iteritems(): 31 | if pid.startswith('-'): 32 | pid_dict['pageid'] = None # TODO: breaks consistency :/ 33 | page_ident = PageInfo.from_query(pid_dict, 34 | source=self.source) 35 | ret.append(page_ident) 36 | return ret 37 | 38 | 39 | class GetImageInfos(QueryOperation): 40 | field_prefix = 'ii' 41 | input_field = MultiParam('titles', key_prefix=False) 42 | fields = [StaticParam('prop', 'imageinfo'), 43 | StaticParam('iiprop', IMAGE_INFO_PROPS)] 44 | output_type = [ImageInfo] 45 | 46 | def extract_results(self, query_resp): 47 | ret = [] 48 | for k, pid_dict in query_resp['pages'].iteritems(): 49 | if int(k) < 0 and pid_dict['imagerepository'] != 'local': 50 | pid_dict['pageid'] = 'shared' 51 | pid_dict['revid'] = 'shared' 52 | try: 53 | pid_dict.update(pid_dict.get('imageinfo', [{}])[0]) 54 | image_info = ImageInfo.from_query(pid_dict, 55 | source=self.source) 56 | except ValueError as e: 57 | print e 58 | continue 59 | ret.append(image_info) 60 | return ret 61 | 62 | 63 | class GetAllImageInfos(GetImageInfos): 64 | field_prefix = 'gai' 65 | input_field = None 66 | fields = [StaticParam('generator', 'allimages'), 67 | StaticParam('prop', 
class GetBacklinks(QueryOperation):
    """
    Fetch the pages on the source wiki that link to the given page.
    """
    field_prefix = 'gbl'
    input_field = SingleParam('title')
    fields = [StaticParam('generator', 'backlinks'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        # pages may be absent entirely when there are no backlinks
        pages = query_resp.get('pages', {})
        return [PageInfo.from_query(page_dict, source=self.source)
                for _, page_dict in pages.iteritems()]
34 | """ 35 | field_prefix = 'gpl' 36 | input_field = SingleParam('titles', key_prefix=False) 37 | fields = [StaticParam('generator', 'links'), 38 | StaticParam('prop', 'info'), 39 | StaticParam('inprop', 'subjectid|talkid|protection'), 40 | MultiParam('namespace')] 41 | output_type = [PageInfo] 42 | examples = [OperationExample('Coffee'), 43 | OperationExample('Aabach')] 44 | 45 | def extract_results(self, query_resp): 46 | ret = [] 47 | for pid, pid_dict in query_resp['pages'].iteritems(): 48 | page_info = PageInfo.from_query(pid_dict, 49 | source=self.source) 50 | ret.append(page_info) 51 | return ret 52 | 53 | 54 | class GetExternalLinks(QueryOperation): 55 | """ 56 | Fetch page outgoing links to URLs outside of source wiki. 57 | """ 58 | field_prefix = 'el' 59 | input_field = SingleParam('titles', key_prefix=False) 60 | fields = [StaticParam('prop', 'extlinks')] 61 | output_type = [ExternalLink] 62 | examples = [OperationExample('Croatian War of Independence')] 63 | 64 | def extract_results(self, query_resp): 65 | ret = [] 66 | for pid_dict in query_resp.get('pages', {}).values(): 67 | for el in pid_dict.get('extlinks', []): 68 | cur_dict = dict(pid_dict) 69 | cur_dict['source'] = self.source 70 | cur_dict['url'] = el.get('*') 71 | link = ExternalLink.from_query(cur_dict) 72 | ret.append(link) 73 | return ret 74 | 75 | def prepare_params(self, **kw): 76 | params = super(GetExternalLinks, self).prepare_params(**kw) 77 | if params.get('elcontinue'): 78 | params['eloffset'] = params.pop('elcontinue') 79 | return params 80 | 81 | 82 | class GetLanguageLinks(QueryOperation): 83 | """ 84 | Fetch pages' interlanguage links (aka "Language Links" in the MediaWiki 85 | API). Interlanguage links should correspond to pages on another language 86 | wiki. Mostly useful on a source wiki with a family of similar multilingual 87 | projects, such as Wikipedias. 
88 | """ 89 | field_prefix = 'll' 90 | input_field = MultiParam('titles', key_prefix=False) 91 | fields = [StaticParam('prop', 'langlinks'), 92 | SingleParam('url', True)] 93 | output_type = [LanguageLink] 94 | examples = [OperationExample('Coffee')] 95 | 96 | def extract_results(self, query_resp): 97 | ret = [] 98 | for pid_dict in query_resp.get('pages', {}).values(): 99 | for ld in pid_dict.get('langlinks', []): 100 | cur_dict = dict(pid_dict) 101 | cur_dict['source'] = self.source 102 | cur_dict['url'] = ld.get('*') 103 | cur_dict['language'] = ld.get('lang') 104 | link = LanguageLink.from_query(cur_dict) 105 | ret.append(link) 106 | return ret 107 | 108 | 109 | class GetInterwikiLinks(QueryOperation): 110 | """ 111 | Fetch pages' interwiki links. 112 | """ 113 | field_prefix = 'iw' 114 | input_field = MultiParam('titles', key_prefix=False) 115 | fields = [StaticParam('prop', 'iwlinks'), 116 | SingleParam('url', True)] 117 | output_type = [InterwikiLink] 118 | examples = [OperationExample('Coffee')] 119 | 120 | def extract_results(self, query_resp): 121 | ret = [] 122 | for pid_dict in query_resp.get('pages', {}).values(): 123 | for iwd in pid_dict.get('iwlinks', []): 124 | cur_dict = dict(pid_dict) 125 | cur_dict['source'] = self.source 126 | cur_dict['url'] = iwd.get('url') 127 | cur_dict['prefix'] = iwd.get('prefix') 128 | link = InterwikiLink.from_query(cur_dict) 129 | ret.append(link) 130 | return ret 131 | -------------------------------------------------------------------------------- /wapiti/operations/meta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from base import QueryOperation 5 | from params import MultiParam, StaticParam 6 | from models import NamespaceDescriptor, InterwikiDescriptor, SourceInfo 7 | 8 | 9 | DEFAULT_PROPS = ('general', 10 | 'namespaces', 11 | 'namespacealiases', 12 | 'statistics', 13 | 'interwikimap') 14 | """ 
class GetSourceInfo(QueryOperation):
    """
    Fetch meta site information about the source wiki
    (action=query&meta=siteinfo).

    The default properties include:

    - General source information: Main Page, base, sitename, generator,
      phpversion, dbtype, dbversion, case, rights, lang, writeapi,
      timezone, timeoffset, articlepath, scriptpath, server, wikiid,
      time, misermode, maxuploadsize, etc.
    - Namespace map
    - Interwiki map
    - Statistics: pages, articles, edits, images, users, activeusers,
      admins, jobs
    """
    field_prefix = 'si'
    input_field = None
    fields = [StaticParam('meta', 'siteinfo'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = SourceInfo

    def extract_results(self, query_resp):
        """Return a single-element list holding a SourceInfo model."""
        ret = query_resp['general']
        namespaces = query_resp.get('namespaces', {})
        # fix: interwikimap is returned as a *list* of dicts, so the
        # fallback should be a list, not a dict (iterating a dict here
        # would yield keys, not the dicts expected below)
        interwikis = query_resp.get('interwikimap', [])
        ns_map = []
        for ns, ns_dict in namespaces.iteritems():
            ns_map.append(NamespaceDescriptor(ns_dict.get('id'),
                                              ns_dict.get('*'),
                                              ns_dict.get('canonical')))
        iw_map = []
        for iw in interwikis:
            iw_map.append(InterwikiDescriptor(iw.get('prefix'),
                                              iw.get('url'),
                                              iw.get('language')))
        ret['namespace_map'] = tuple(ns_map)
        ret['interwiki_map'] = tuple(iw_map)
        ret.update(query_resp['statistics'])
        source_info = SourceInfo(**ret)
        return [source_info]
# TODO: These operations should be moved to the proper file
# TODO: convert to real model(s)
QueryPageInfo = namedtuple('QueryPageInfo', 'title ns value querypage cache')

# coordinate properties requested from the GeoData extension
DEFAULT_COORD_PROPS = ['type', 'name', 'dim', 'country', 'region']


class GetPageInfo(QueryOperation):
    """Fetch basic page info (ids, protection, subject/talk links)."""
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    # the second 'prop' gains the 'in' field_prefix, becoming 'inprop'
    fields = [StaticParam('prop', 'info'),
              MultiParam('prop', 'subjectid|talkid|protection')]
    output_type = PageInfo
    examples = [OperationExample(['Coffee', 'Category:Africa'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret


class GetCoordinates(QueryOperation):
    """Fetch geographic coordinates attached to the given pages."""
    field_prefix = 'co'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'coordinates'),
              SingleParam('primary', 'all'),  # primary, secondary, all
              MultiParam('prop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(['White House', 'Mount Everest'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_ident = PageIdentifier.from_query(pid_dict,
                                                   source=self.source)
            # fix: pages without coordinates simply lack the
            # 'coordinates' key; indexing directly raised KeyError
            for coord in pid_dict.get('coordinates', []):
                coord_ident = CoordinateIdentifier(coord, page_ident)
                ret.append(coord_ident)
        return ret
class GeoSearch(QueryOperation):
    """
    Search for pages with coordinates near a given (lat, lon) point.
    """
    field_prefix = 'gs'
    input_field = MultiParam('coord')
    fields = [StaticParam('list', 'geosearch'),
              SingleParam('radius', 10000),  # must be within 10 and 10000
              #SingleParam('maxdim', 1000),  # does not work?
              SingleParam('globe', 'earth'),  # which planet? donno...
              SingleParam('namespace'),
              StaticParam('gsprop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(('37.8197', '-122.479'), 1)]

    def extract_results(self, query_resp):
        found = []
        for result_dict in query_resp['geosearch']:
            ident = PageIdentifier.from_query(result_dict,
                                              source=self.source)
            found.append(CoordinateIdentifier(result_dict, ident))
        return found


class GetRecentChanges(QueryOperation):
    """
    Fetch the pages touched by the wiki's most recent changes.
    """
    field_prefix = 'grc'
    input_field = None
    fields = [StaticParam('generator', 'recentchanges'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample()]

    def extract_results(self, query_resp):
        changed = []
        for page_id, page_dict in query_resp['pages'].iteritems():
            # negative ids are placeholders for missing pages
            if page_id.startswith('-'):
                continue
            changed.append(PageInfo.from_query(page_dict,
                                               source=self.source))
        return changed
page such as thumbnail and presence of photos. 111 | * prop=flagged * 112 | Get information about the flagging status of the given pages. 113 | 114 | * list=alllinks (al) * 115 | Enumerate all links that point to a given namespace 116 | * list=allpages (ap) * 117 | Enumerate all pages sequentially in a given namespace 118 | * list=allusers (au) * 119 | Enumerate all registered users 120 | * list=blocks (bk) * 121 | List all blocked users and IP addresses 122 | 123 | * list=exturlusage (eu) * 124 | Enumerate pages that contain a given URL 125 | * list=filearchive (fa) * 126 | Enumerate all deleted files sequentially 127 | * list=iwbacklinks (iwbl) * 128 | Find all pages that link to the given interwiki link. 129 | * list=langbacklinks (lbl) * 130 | Find all pages that link to the given language link. 131 | 132 | * list=logevents (le) * 133 | Get events from logs 134 | * list=protectedtitles (pt) * 135 | List all titles protected from creation 136 | 137 | 138 | * list=search (sr) * 139 | Perform a full text search 140 | * list=tags (tg) * 141 | List change tags 142 | * list=users (us) * 143 | Get information about a list of users 144 | * list=abuselog (afl) * 145 | Show events that were caught by one of the abuse filters. 146 | * list=abusefilters (abf) * 147 | Show details of the abuse filters. 148 | 149 | ''' 150 | -------------------------------------------------------------------------------- /wapiti/operations/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | wapiti.operations.models 4 | ~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | This module provides structures and abstractions for creating consistent 7 | Operation interfaces, regardless of underlying Mediawiki API response 8 | types. 
9 | 10 | For example the ``prop=revisions`` and ``list=usercontribs`` APIs 11 | both return lists of revision information, however not all of the 12 | attributes afforded by ``prop=revisions`` are available from 13 | ``list=usercontribs``. Wapiti models and operations strive to 14 | resolve and abstract this fact away from the user as sanely as 15 | possible. 16 | """ 17 | from __future__ import unicode_literals 18 | 19 | from datetime import datetime 20 | from collections import namedtuple, OrderedDict 21 | 22 | 23 | def parse_timestamp(timestamp): 24 | return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ') 25 | 26 | 27 | NamespaceDescriptor = namedtuple('NamespaceDescriptor', 'id title canonical') 28 | InterwikiDescriptor = namedtuple('InterwikiDescriptor', 'alias url language') 29 | 30 | _MISSING = object() 31 | 32 | 33 | class NamespaceDescriptor(object): 34 | def __init__(self, id, title, canonical, **kw): 35 | self.id = id 36 | self.title = title 37 | self.canonical = canonical 38 | 39 | 40 | class WapitiModelAttr(object): 41 | def __init__(self, name, **kw): 42 | self.name = name 43 | self.mw_name = kw.pop('mw_name', name) 44 | self.display = kw.pop('display', False) 45 | try: 46 | self.type = kw.pop('type') 47 | if not isinstance(self.type, type): 48 | raise TypeError("WapitiModelAttr kwarg 'type' expected type") 49 | except KeyError: 50 | self.type = _MISSING 51 | try: 52 | self.default = kw.pop('default') 53 | except KeyError: 54 | self.default = _MISSING 55 | if kw: 56 | raise ValueError('got unexpected keyword arguments: %r' 57 | % kw.keys()) 58 | 59 | def __repr__(self): 60 | ret = [self.__class__.__name__, '(', repr(self.name)] 61 | if self.mw_name != self.name: 62 | ret.extend([', mw_name=', repr(self.mw_name)]) 63 | if self.type is not _MISSING: 64 | ret.extend([', type=', self.type.__name__]) 65 | if self.default is not _MISSING: 66 | ret.extend([', default=', repr(self.default)]) 67 | if self.display: 68 | ret.extend([', display=', 
repr(self.display)]) 69 | ret.append(')') 70 | return ''.join(ret) 71 | 72 | def __iter__(self): 73 | for attr in ('name', 'mw_name', 'type', 'default', 'display'): 74 | yield getattr(self, attr) 75 | 76 | 77 | WMA = WapitiModelAttr # Windows Media Audio 78 | 79 | 80 | def title_talk2subject(title): 81 | talk_pref, _, title_suf = title.partition(':') 82 | subj_pref, _, _ = talk_pref.rpartition('talk') 83 | subj_pref = subj_pref.strip() 84 | new_title = subj_pref + ':' + title_suf 85 | new_title = new_title.lstrip(':') 86 | return new_title 87 | 88 | 89 | def title_subject2talk(title): 90 | subj_pref, _, title_suf = title.partition(':') 91 | subj_pref = subj_pref.strip() 92 | if not title_suf: 93 | talk_pref = 'Talk' 94 | title_suf = subj_pref 95 | elif subj_pref.endswith('talk'): 96 | talk_pref = subj_pref 97 | else: 98 | talk_pref = subj_pref + ' talk' 99 | new_title = talk_pref + ':' + title_suf 100 | return new_title 101 | 102 | 103 | def get_unique_func(val): 104 | if callable(val): 105 | return val 106 | elif isinstance(val, basestring): 107 | return lambda obj: getattr(obj, val, obj) 108 | try: 109 | if all([isinstance(v, basestring) for v in val]): 110 | return lambda obj: tuple([getattr(obj, v, obj) for v in val]) 111 | except TypeError: 112 | pass 113 | raise TypeError('could not derive uniqueification function from %r' % val) 114 | 115 | 116 | def get_priority_func(val, default=0): 117 | if val is None: 118 | val = default 119 | if callable(val): 120 | return val 121 | elif isinstance(val, basestring): 122 | return lambda obj: getattr(obj, val, default) 123 | try: 124 | int_val = int(val) 125 | return lambda obj: int_val 126 | except TypeError: 127 | pass 128 | try: 129 | if all([isinstance(v, basestring) for v in val]): 130 | return lambda obj: tuple([getattr(obj, v, default) for v in val]) 131 | except TypeError: 132 | pass 133 | raise TypeError('could not derive priority function from %r' % val) 134 | 135 | 136 | class WapitiModelMeta(type): 137 | """ 
class WapitiModelMeta(type):
    """
    The foundation of Wapiti's data models, which attempt to add
    consistency and order to the wide variety of return types used
    across different Mediawiki APIs. This metaclass enables certain
    inheritance-like usage patterns in models. See WapitiModelBase's
    docstring for more information.

    The `attributes` dictionary is a mapping of Python class attribute
    names to Mediawiki API result keys (e.g., `pageid` becomes
    `page_id` on the Python object).

    The `defaults` dictionary is a mapping of Python attribute name to
    default value, if allowed. If an attribute does not have a default
    value, and is missing upon instantiation of a model, an exception
    will be raised.
    """
    attributes = []

    def __new__(cls, name, bases, attrs):
        # merge the attribute lists of all bases, in order, so that
        # later bases and the subclass itself override earlier entries
        # with the same attribute name (order is preserved)
        all_attributes = OrderedDict()
        for base in bases:
            base_attr_list = getattr(base, 'attributes', [])
            base_attr_dict = OrderedDict([(a.name, a) for a in base_attr_list])
            all_attributes.update(base_attr_dict)
        attr_dict = OrderedDict([(a.name, a) for a
                                 in attrs.get('attributes', [])])
        all_attributes.update(attr_dict)
        attrs['attributes'] = all_attributes.values()
        # a class-level 'unique_on' spec (name/names/callable) is
        # compiled into a 'unique_key' property used for deduplication
        if 'unique_on' in attrs:
            unique_func = get_unique_func(attrs['unique_on'])
            attrs['unique_key'] = property(unique_func)
        ret = super(WapitiModelMeta, cls).__new__(cls, name, bases, attrs)
        return ret
class WapitiModelBase(object):
    """
    The more-concrete counterpart of WapitiModelMeta, which primarily
    provides generic initialization and display logic.

    There are two methods for instantiation, the standard
    ``__init__()`` (e.g., ``CategoryInfo()``), which takes attributes
    as keyword arguments, and ``from_query()``, which usually takes a
    dictionary deserialized from JSON, as returned by the Mediawiki
    API. For information on `attributes` and `defaults`, see
    WapitiModelMeta.

    ``__repr__()`` and ``__str__()`` are powered by
    ``get_display_str()``.
    """

    __metaclass__ = WapitiModelMeta
    attributes = []
    unique_on = lambda self: self
    exists = True  # default; instances can represent non-existent pages

    def __init__(self, **kw):
        missing = []
        for attr in self.attributes:
            try:
                val = kw.pop(attr.name)
            except KeyError:
                if attr.default is _MISSING:
                    missing.append(attr.name)
                    continue
                val = attr.default
            # coerce to the declared type when one is set
            # (fix: a stray debug ``print val`` was removed here; it
            # printed every coerced value on every model construction)
            if attr.type is not _MISSING and not isinstance(val, attr.type):
                val = attr.type(val)
            setattr(self, attr.name, val)
        if missing:
            raise ValueError('missing expected keyword arguments: %r'
                             % missing)
        # TODO: raise on unexpected keyword arguments?
        return

    @classmethod
    def from_query(cls, q_dict, **kw):
        """Build an instance from a Mediawiki API response dict,
        translating each attribute's mw_name key to its Python name.
        Extra keyword arguments act as fallback values."""
        kwargs = {}
        all_q_dict = dict(kw)
        all_q_dict.update(q_dict)
        for name, mw_name, _, _, _ in cls.attributes:
            if mw_name is None:
                continue
            try:
                kwargs[name] = all_q_dict[mw_name]
            except KeyError:
                pass
        return cls(**kwargs)

    def get_display_str(self, raise_exc=True):
        """Render a repr-like string of the display-enabled attributes,
        falling back to the default object repr on error when
        *raise_exc* is false."""
        attr_list = []
        try:
            for (name, _, _, _, do_disp) in self.attributes:
                if not do_disp:
                    continue
                # TODO: don't display values if equal to default?
                val = getattr(self, name)
                attr_list.append('%s=%r' % (name, val))
        except:
            if raise_exc:
                raise
            return super(WapitiModelBase, self).__str__()
        attr_str = ', '.join(attr_list)
        return ''.join([self.__class__.__name__, '(', attr_str, ')'])

    __str__ = get_display_str

    def __repr__(self):
        try:
            return self.get_display_str()
        except:
            return super(WapitiModelBase, self).__repr__()


class SourceInfo(WapitiModelBase):
    """Catch-all model for meta=siteinfo results."""
    attributes = [WMA('namespace_map'),
                  WMA('interwiki_map')]

    def __init__(self, **kw):
        # siteinfo returns an open-ended set of keys; store them all
        for k, v in kw.iteritems():
            attr = WMA(k)
            setattr(self, attr.name, v)


class PageIdentifier(WapitiModelBase):
    """Identifies a page (title, id, namespace) on a source wiki."""
    attributes = [WMA('title', display=True),
                  WMA('page_id', mw_name='pageid', display=True, default=None),
                  WMA('ns', display=True),
                  WMA('source')]

    unique_on = 'title'

    @property
    def exists(self):
        # missing pages carry no page id
        return self.page_id is not None

    @property
    def is_subject_page(self):
        # subject (content) namespaces are the non-negative even ones
        return (self.ns >= 0 and self.ns % 2 == 0)

    @property
    def is_talk_page(self):
        # talk namespaces are the non-negative odd ones
        return (self.ns >= 0 and self.ns % 2 == 1)


# NOTE: namedtuple versions of LanguageLink/InterwikiLink/ExternalLink
# were formerly defined here, but were dead code -- the class
# definitions below immediately shadowed them -- so they were removed.


class Link(WapitiModelBase):
    unique_on = 'url'
    attributes = [WMA('url', display=True)]


class LanguageLink(Link, PageIdentifier):
    # TODO: URL is really the other language's title
    unique_on = ('url', 'language')
    attributes = [WMA('language', display=True)]


class InterwikiLink(Link, PageIdentifier):
    attributes = [WMA('prefix')]


class ExternalLink(Link, PageIdentifier):
    pass
class PageInfo(PageIdentifier):
    """A PageIdentifier plus its paired subject/talk page ids."""
    attributes = [WMA('subject_id', mw_name='subjectid', default=None),
                  WMA('talk_id', mw_name='talkid', default=None)]

    def __init__(self, **kw):
        # 'req_title' records the title as originally requested; it may
        # differ from the canonical title (e.g. after normalization)
        req_title = kw.pop('req_title', None)
        super(PageInfo, self).__init__(**kw)
        self.req_title = req_title or self.title

        # a page is its own subject or talk page, depending on namespace
        if self.is_subject_page:
            self.subject_id = self.page_id
        elif self.is_talk_page:
            self.talk_id = self.page_id
        else:
            raise ValueError('special or nonexistent namespace: %r' % self.ns)

    def get_subject_info(self):
        """Return the PageInfo of the corresponding subject page.

        Raises ValueError when the subject page id is unknown.
        """
        if self.is_subject_page:
            return self
        if self.subject_id is None:
            raise ValueError('subject_id not set')
        subj_title = title_talk2subject(self.title)
        subj_ns = self.ns - 1  # subject namespaces are one below talk
        kwargs = dict(self.__dict__)
        kwargs['title'] = subj_title
        kwargs['ns'] = subj_ns
        return PageInfo(**kwargs)

    def get_talk_info(self):
        """Return the PageInfo of the corresponding talk page.

        Raises ValueError when the talk page id is unknown.
        """
        if self.is_talk_page:
            return self
        if self.talk_id is None:
            raise ValueError('talk_id not set')
        talk_title = title_subject2talk(self.title)
        talk_ns = self.ns + 1  # talk namespaces are one above subject
        kwargs = dict(self.__dict__)
        kwargs['title'] = talk_title
        kwargs['ns'] = talk_ns
        return PageInfo(**kwargs)


class CategoryInfo(PageInfo):
    """PageInfo for a category page, with member counts."""
    # shared keyword defaults for the count attributes below
    kw = {'default': 0, 'type': int}
    attributes = [WMA('total_count', mw_name='size', display=True, **kw),
                  WMA('page_count', mw_name='pages', **kw),
                  WMA('file_count', mw_name='files', **kw),
                  WMA('subcat_count', mw_name='subcats', display=True, **kw)]
    del kw  # keep the helper dict out of the class namespace
class Revision(RevisionInfo):
    """A RevisionInfo that additionally carries the revision content."""
    attributes = [WMA('parent_rev_id', mw_name='parentid', display=True),
                  WMA('content', mw_name='*', default=''),  # default=''?
                  WMA('is_parsed')]


class ImageInfo(PageIdentifier):
    """Metadata for a file/image page.

    Most fields default to empty strings; they may be absent for files
    hosted on a shared (non-local) repository.
    """
    attributes = [WMA('image_repo', mw_name='imagerepository'),
                  WMA('missing', default=False),
                  WMA('url', default=''),  # will only exist if non-local repo
                  WMA('dimensions', default=''),
                  WMA('mime', default=''),
                  WMA('thumbmime', default=''),
                  WMA('media_type', mw_name='mediatype', default=''),
                  WMA('metadata', default=''),
                  WMA('archive_name', mw_name='archivename', default=''),
                  WMA('bitdepth', default='')]


#TODO: class ParsedTemplate(object):


#
# Protections
#
# protection level constants, from least to most restrictive
NEW = 'NEW'
AUTOCONFIRMED = 'AUTOCONFIRMED'
SYSOP = 'SYSOP'
# page actions that can carry a protection setting
PROTECTION_ACTIONS = ('create', 'edit', 'move', 'upload')


# level is one of the constants above; expiry is a datetime or 'infinity'
Protection = namedtuple('Protection', 'level, expiry')
class CoordinateIdentifier(object):
    """Pairs one coordinate record from the API with its page identifier."""
    def __init__(self, coord, page_ident=None):
        self.page_ident = page_ident
        # copy the standard GeoData fields, defaulting to None
        for field in ('lat', 'lon', 'type', 'name',
                      'dim', 'country', 'region'):
            setattr(self, field, coord.get(field))
        # 'primary' is normalized to a strict True/False
        self.primary = True if coord.get('primary', False) else False
        return


class QueryPageInfo(object):
    """Simple record for one list=querypage result row."""
    def __init__(self, title, ns, value, querypage, cache):
        self.title = title
        self.ns = ns
        self.value = value
        self.querypage = querypage
        self.cache = cache
/wapiti/operations/params.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | """ 5 | Fields, parameters, and coercion 6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 7 | 8 | Fields are typed slots which normalize and validate values passed to 9 | them, facilitating much more robust Operation usage. 10 | 11 | Parameters (aka params) are the values passed to a field. Another 12 | benefit of fields is that Operations will not initialize successfully 13 | without proper parameters, allowing earlier error detection (and in 14 | some cases correction). 15 | 16 | Coercion refers a field's [limited] ability to transform certain 17 | values into a parameters usable by the owning Operation. For instance, 18 | to get all members of 'Category:Africa', one can use the GetCategory 19 | operation like this, ``GetCategory('Category:Africa')``, or this, 20 | ``GetCategory('Africa')``. The ``query_field`` on the GetCategory 21 | Operation will automatically prepend the 'Category:' prefix, as all 22 | Wikipedia categories start with 'Category:'. 23 | 24 | The normalized value can be retrieved with ``field.get_value()``, 25 | which (currently) always returns a single string (as would be used in 26 | an API call URL). ``field.get_value_list()`` also exists, which 27 | returns a list of strings. 28 | 29 | Fields also encapsulate a ``key``, which typically corresponds the URL 30 | query argument name used in API request URLs, as well as key 31 | preparation logic, like prefixing (e.g., 'title' -> 'gcmtitle'). 32 | 33 | Here are some notes on various aspects of coercion: 34 | 35 | - By default, fields allow submission of plain strings (or iterables 36 | of strings), which are then normalized and subject to cardinality 37 | constraints. This behavior can be disabled with allow_string=False. 
from collections import Sequence, Set
from utils import is_scalar, prefixed


def param_list2str(p_list, prefix=None, keep_empty=False):
    """Join a parameter list into MediaWiki's bar-separated form.

    Scalars are first split on '|'; each element is optionally
    prefixed, and empty elements are dropped unless *keep_empty*.
    """
    if is_scalar(p_list):
        p_list = param_str2list(p_list, keep_empty)
    u_p_list = [unicode(p) for p in p_list]
    ret = "|".join([prefixed(t, prefix)
                    for t in u_p_list if (t or keep_empty)])
    return unicode(ret)


def param_str2list(p, keep_empty=False):
    """Split a bar-separated parameter string into a list of strings."""
    p = p or ''
    if is_scalar(p):
        p = unicode(p)
    else:
        p = param_list2str(p)
    p_list = p.split('|')
    if not keep_empty:
        p_list = [sp for sp in p_list if sp]
    return p_list


def normalize_param(p, prefix=None, multi=None):
    """Normalize *p* to a single prefixed, bar-separated string.

    Raises ValueError if *multi* is False but *p* holds several values.
    """
    p_list = param_str2list(p)
    if multi is False:
        if len(p_list) > 1:
            tmpl = 'expected singular query parameter, not %r'
            raise ValueError(tmpl % p)
    return param_list2str(p_list, prefix)


# unacceptablllllllle
PLURAL_MAP = {'titles': 'title', 'revids': 'revid'}


def make_param_attr_getter(attr_name):
    """Return a getter reading *attr_name* off non-string param objects."""
    def param_attr_getter(obj):
        ret = getattr(obj, attr_name)
        if callable(ret):
            # bound methods etc. are not usable parameter values
            raise AttributeError('unsuitable attribute value %r' % ret)
        return ret

    return param_attr_getter


class Param(object):
    """A typed field describing one API query parameter.

    :param key: URL query argument name (possibly prefixed later).
    :param default: default value, normalized at construction time.
    :param val_prefix: prefix applied to each value (e.g. 'Category:').

    Keyword options: ``required``, ``multi``, ``accept_str``,
    ``key_prefix``, ``limit``, ``attr`` (attribute name used to coerce
    model objects), and ``coerce`` (an explicit coercion callable).
    """
    def __init__(self, key, default=None, val_prefix=None, **kw):
        if not key:
            raise ValueError('expected key, not %r' % key)
        self.key = unicode(key)
        self.val_prefix = val_prefix
        self.required = kw.pop('required', False)
        self.multi = kw.pop('multi', None)
        self.accept_str = kw.pop('accept_str', True)
        self.key_prefix = kw.pop('key_prefix', True)  # True = filled in later
        self.limit = kw.pop('limit', None)

        param_attr = kw.pop('attr', None)
        coerce_func = kw.pop('coerce', None)
        if coerce_func is None:
            if param_attr is None:
                # derive the attribute name from the key, singularized
                param_attr = self.key
                if self.multi:
                    param_attr = PLURAL_MAP.get(param_attr, param_attr)
            if isinstance(param_attr, basestring):
                coerce_func = make_param_attr_getter(param_attr)
            elif param_attr is None:
                coerce_func = lambda x: x
            else:
                raise TypeError("'attr' expected string")
        elif not callable(coerce_func):
            raise TypeError("'coerce' expected callable")
        self.coerce_func = coerce_func
        if kw:
            raise ValueError('unexpected keyword argument(s): %r' % kw)
        if default is not None:
            default = normalize_param(default, self.val_prefix, self.multi)
        self.default = default

    def get_key(self, key_prefix=None):
        """Return the full query key, applying the configured prefix."""
        if self.key_prefix:
            prefix = key_prefix
            if prefix is None:
                prefix = self.key_prefix
            if isinstance(prefix, basestring):
                prefix = unicode(prefix)
            else:
                raise TypeError('expected valid string prefix')
        else:
            prefix = ''
        return prefix + self.key

    def _coerce_value(self, value):
        # TODO: it's real late and this is a bit of a sty
        # also, in some cases the bar-split normalization
        # should not occur (e.g., on a URL)
        if value is None:
            return value
        try:
            return self.coerce_func(value)
        except AttributeError:
            pass

        if is_scalar(value):
            if isinstance(value, basestring):
                return value
        else:
            # some models are iterable, but none are sequences/sets (yet)
            coerced = []
            for v in value:
                if isinstance(v, basestring):
                    coerced.append(v)
                else:
                    coerced.append(self.coerce_func(v))
            return coerced
        raise TypeError('could not coerce value %r to %r' % (value, self.key))

    def get_value(self, value, prefix=None):
        """Coerce and normalize *value*; fall back to the default.

        Raises ValueError if no value results and the param is required.
        """
        if prefix is None:
            prefix = self.val_prefix
        value = self._coerce_value(value)
        norm_val = normalize_param(value, prefix, self.multi)
        val = norm_val or self.default
        if val is None and self.required:
            raise ValueError('%r param is required' % self.key)
        return val

    def get_value_list(self, value, prefix=None):
        """Like get_value(), but returns a list of strings."""
        return param_str2list(self.get_value(value, prefix))

    def get_tuple(self):
        # BUG FIX: Param never sets a 'value' attribute, so the old
        # (self.key, self.value) raised AttributeError on every call;
        # the stored (normalized) default is the value we actually hold
        return (self.key, self.default)

    def get_tuple_from_kwargs(self, **kwargs):
        """
        Picks up appropriate values from kwargs,
        returns the defaults if nothing matches.
        """
        pass

    __call__ = get_value


class SingleParam(Param):
    """A Param constrained to exactly one value."""
    def __init__(self, *a, **kw):
        kw['multi'] = False
        super(SingleParam, self).__init__(*a, **kw)


class MultiParam(Param):
    """A Param accepting multiple bar-separated values."""
    def __init__(self, *a, **kw):
        kw['multi'] = True
        super(MultiParam, self).__init__(*a, **kw)


class StaticParam(Param):
    """A fixed key/value pair, immune to prefixing and user values."""
    def __init__(self, key, value):
        super(StaticParam, self).__init__(key, value)

    def get_key(self, *a, **kw):
        return self.key

    def get_value(self, *a, **kw):
        return self.default


class PassthroughParam(Param):
    """A Param that performs no coercion or normalization at all."""
    def __init__(self, *a, **kw):
        super(PassthroughParam, self).__init__(*a, **kw)

    def get_value(self, value, prefix=None):
        return value

    def get_value_list(self, value, prefix=None):
        if is_scalar(value):
            return [value]
        return value
class GetQueryPage(QueryOperation):
    """Run a Special:QueryPage-backed maintenance report via the API.

    The base class takes the report name as input; subclasses pin the
    ``gqppage`` value with a StaticParam and take no input.
    """
    field_prefix = 'gqp'
    input_field = SingleParam('page')
    fields = [StaticParam('generator', 'querypage'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = PageInfo

    def extract_results(self, query_resp):
        """Build one PageInfo per page dict in the query response."""
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page = PageInfo.from_query(pid_dict,
                                       source=self.source)
            ret.append(page)
        return ret

    def prepare_params(self, **kw):
        """Translate the generic continue parameter into the offset
        parameter that list=querypage actually uses.
        """
        params = super(GetQueryPage, self).prepare_params(**kw)
        # BUG FIX: the popped key must match the checked key; the old
        # code popped 'ggqpcontinue' (extra 'g'), raising KeyError on
        # every continuation request
        if params.get('gqpcontinue'):
            params['gqpoffset'] = params.pop('gqpcontinue')
        return params


class GetAncientPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Ancientpages')]
class GetDeadendPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Deadendpages')]


class GetDisambiguations(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Disambiguations')]


class GetDoubleRedirects(GetQueryPage):
    input_field = None
    # BUG FIX: the report name was misspelled 'Doulberedirects', which
    # the API rejects; the real special page is Special:DoubleRedirects
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'DoubleRedirects')]


class GetListRedirects(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Listredirects')]


class GetLonelyPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Lonelypages')]


class GetLongPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Longpages')]


class GetMostCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostcategories')]


class GetMostImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostimages')]


class GetMostInterwikiLinks(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostinterwikis')]


class GetMostLinkedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinkedcategories')]


class GetMostLinkedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinkedtemplates')]


class GetMostLinked(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinked')]
# Concrete Special:QueryPage reports: each subclass simply pins the
# 'gqppage' parameter to one of MediaWiki's built-in maintenance
# reports and takes no input of its own.
class GetFewestRevisions(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Fewestrevisions')]


class GetShortPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Shortpages')]


class GetUncategorizedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedcategories')]


class GetUncategorizedPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedpages')]


class GetUncategorizedImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedimages')]


class GetUncategorizedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedtemplates')]


class GetUnusedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedcategories')]


class GetUnusedImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedimages')]


class GetWantedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedcategories')]


class GetWantedFiles(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedfiles')]


class GetWantedPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedpages')]
class GetUnusedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedtemplates')]


class GetWithoutInterwikiLinks(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Withoutinterwiki')]

# 'Unwatchedpages' requires being logged in


# --- /wapiti/operations/rand.py ---
from base import QueryOperation, QueryLimit
from params import StaticParam, SingleParam
from models import PageInfo
from utils import OperationExample, coerce_namespace


class GetRandom(QueryOperation):
    """
    Fetch random pages using MediaWiki's Special:Random.
    """
    field_prefix = 'grn'
    fields = [StaticParam('generator', 'random'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              SingleParam('namespace', default='', coerce=coerce_namespace)]
    input_field = None
    output_type = [PageInfo]
    # random queries are capped lower than most list queries
    per_query_limit = QueryLimit(10, 20)
    examples = [OperationExample(doc='basic random')]

    def extract_results(self, query_resp):
        # build one PageInfo per page dict in the response
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret

    def get_cont_str(self, *a, **kw):
        # random results cannot be continued/resumed; always restart
        return ''


class GetRandomArticles(GetRandom):
    # namespace 0 = mainspace articles
    def __init__(self, *a, **kw):
        kw['namespace'] = 0
        super(GetRandomArticles, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random articles')]


class GetRandomCategories(GetRandom):
    # namespace 14 = Category:
    def __init__(self, *a, **kw):
        kw['namespace'] = 14
        super(GetRandomCategories, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random categories')]


class GetRandomFilePages(GetRandom):
    # namespace 6 = File:
    def __init__(self, *a, **kw):
        kw['namespace'] = 6
        super(GetRandomFilePages, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random file pages')]


# --- /wapiti/operations/revisions.py ---
from base import QueryOperation
from params import StaticParam, MultiParam, SingleParam
from models import RevisionInfo, Revision
from utils import OperationExample

# default rvprop value: all revision metadata, but no content
DEFAULT_PROPS = 'ids|flags|timestamp|user|userid|size|sha1|comment|parsedcomment|tags'


class GetPageRevisionInfos(QueryOperation):
    """
    Fetch revisions for pages.
    """
    field_prefix = 'rv'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Coffee', 10)]

    def extract_results(self, query_resp):
        ret = []
        # skip pages the API flagged as 'missing'
        pages = [p for p in query_resp.get('pages', {}).values()
                 if 'missing' not in p]
        for pid_dict in pages:
            for rev in pid_dict.get('revisions', []):
                # merge page-level info into each revision's dict
                rev_dict = dict(pid_dict)
                rev_dict.update(rev)
                rev_info = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
                ret.append(rev_info)
        return ret


class GetRevisionInfos(GetPageRevisionInfos):
    """
    Fetch information about specific revision.
    """
    input_field = MultiParam('revids', attr='rev_id', key_prefix=False)
    output_type = RevisionInfo
    examples = [OperationExample(['538903663', '539916351', '531458383'])]

    def prepare_params(self, *a, **kw):
        # NOTE(review): the API appears to reject rvlimit when explicit
        # revids are given, hence the pop -- confirm against the API docs
        ret = super(GetRevisionInfos, self).prepare_params()
        ret.pop(self.field_prefix + 'limit', None)
        return ret
class GetRevisionContent(GetCurrentContent):
    """Fetch full content for a single revision, selected by rev id."""
    input_field = SingleParam('revids', key_prefix=False, attr='rev_id')
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS + '|content'),
              SingleParam('parse', False)]
    examples = [OperationExample('539916351')]
def is_iterable(obj):
    """Return True if *obj* can produce an iterator."""
    try:
        iter(obj)
        return True
    except TypeError:
        return False


def is_scalar(obj):
    """Treat non-iterables, plus strings, as single values."""
    if not is_iterable(obj):
        return True
    return isinstance(obj, basestring)
def get_page_templates(source, raise_exc=True):
    """Tokenize and parse *source*, returning only TemplateReferences."""
    tokens = tokenize(source)
    parsed = parse(tokens, raise_exc=raise_exc)
    return [t for t in parsed if isinstance(t, TemplateReference)]


class Token(object):
    """One lexed span of template source text."""
    def __init__(self, start_index, text):
        self.start_index = start_index
        self.text = text

    @classmethod
    def from_match(cls, match):
        """Alternate constructor from a regex match object."""
        return cls(start_index=match.start(), text=match.group())

    def __repr__(self):
        cn = self.__class__.__name__
        return '%s(%r)' % (cn, self.text)


class BufferToken(Token):
    # raw text that gets buffered into the current key/value
    pass


class CommentToken(BufferToken):
    # an HTML comment; ignored entirely by the parser
    pass


class LinkToken(BufferToken):
    # a [[wiki link]]
    pass


class TableToken(BufferToken):
    # a {| wiki table |}
    pass


class TemplateLogicToken(BufferToken):
    # {{{param}}} and {{#parserfunction|}} constructs
    pass


class SepToken(Token):
    # '=' and '|' separators inside a template
    pass


class StartTemplateToken(Token):
    # '{{'
    pass


class EndTemplateToken(SepToken):
    # '}}' -- also acts as a separator (flushes the pending value)
    pass


LEXICON = \
    [(r'(\[\[.+?\]\])', lambda m, t: LinkToken.from_match(m)),
     (r'(\{\|.+?\|\})', lambda m, t: TableToken.from_match(m)),
     (r'(\{\{\{.+?\}\}\})', lambda m, t: TemplateLogicToken.from_match(m)),
     (r'(\{\{#.+?\|\}\})', lambda m, t: TemplateLogicToken.from_match(m)),
     # BUG FIX: this pattern had degraded to the empty regex r'()',
     # which matches zero-width at every position and starves all the
     # later branches (so '{{' would never tokenize); per the module
     # notes, "everything inside html comments is ignored", which is
     # what CommentToken exists for
     (r'(<!--.*?-->)', lambda m, t: CommentToken.from_match(m)),
     (r'\{\{', lambda m, t: StartTemplateToken.from_match(m)),
     (r'\}\}', lambda m, t: EndTemplateToken.from_match(m)),
     (r'=', lambda m, t: SepToken.from_match(m)),
     (r'\|', lambda m, t: SepToken.from_match(m))]


def build_scanner(lexicon, flags=0):
    """Compile *lexicon* into a single alternation-based scanner regex.

    Each lexicon entry becomes a numbered group so the winning branch
    can be recovered from match.lastindex.
    """
    import sre_parse
    import sre_compile
    from sre_constants import BRANCH, SUBPATTERN
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p)
    return scanner


def tokenize(source, lexicon=None):
    """Lex *source* into a flat Token list, preserving all text.

    Text between lexicon matches is emitted as plain BufferTokens, so
    the concatenation of all token texts reproduces the input.
    """
    lexicon = lexicon or LEXICON
    lex = build_scanner(lexicon, re.DOTALL)
    all_tokens = []
    start, end, prev_end = 0, 0, 0
    for match in lex.finditer(source):
        start, end = match.start(), match.end()
        if prev_end < start:
            # unmatched text between tokens becomes a plain buffer
            all_tokens.append(BufferToken(start, source[prev_end:start]))
        action = lexicon[match.lastindex - 1][1]
        if callable(action):
            # TODO: what should the callbacks want?
            cur_token = action(match, match.group())
            all_tokens.append(cur_token)
        else:
            raise TypeError('expected callable callback, not %r' % (action,))
        prev_end = end
    if prev_end < len(source):
        all_tokens.append(BufferToken(prev_end, source[prev_end:]))
    return all_tokens


def cond_join(items, sep='', cond=None):
    """Join adjacent items satisfying *cond* (strings by default),
    leaving other items (e.g. nested TemplateReferences) in place.
    """
    # TODO: messsss
    if cond is None:
        cond = lambda s: isinstance(s, basestring)
    ret, tmp_buffer = [], []
    for item in items:
        if cond(item):
            tmp_buffer.append(item.strip())  # TODO: remove strip()
        else:
            if tmp_buffer:
                ret.append(sep.join(tmp_buffer))
                tmp_buffer = []
            ret.append(item)
    if tmp_buffer:
        ret.append(sep.join(tmp_buffer))
    return ret


def process_korv(korv):
    """Normalize a raw key-or-value buffer: join string runs, strip,
    and attempt int/float conversion before falling back to unicode.
    """
    if not korv:
        return ''
    # TODO: need fancy split() (for <!-- --> comments)
    korv = [_kv for _kv in cond_join(korv) if _kv]
    if not korv:
        return ''
    elif len(korv) == 1:
        korv = korv[0]
    if isinstance(korv, basestring):
        korv = korv.strip()
        converters = [int, float, unicode]
        for convert in converters:
            try:
                return convert(korv)
            except ValueError:
                pass
        else:
            raise ValueError('unknown key or value {0}'.format(korv))
    return korv
def parse(tokens, raise_exc=True):
    """Parse a token stream into a list of TemplateReferences
    interleaved with plain interstitial text strings.

    A stack of ProtoTemplateRefs tracks in-progress templates, so a
    nested template becomes a value inside its parent. With
    raise_exc=False, malformed templates are silently dropped.
    """
    ret = []
    pts = []  # ProtoTemplate stack
    interstish = []  # text accumulated outside of any template
    for token in tokens:
        if isinstance(token, CommentToken):
            continue  # TODO: save comments?
        if isinstance(token, StartTemplateToken):
            # flush pending interstitial text, open a new proto-template
            if interstish:
                ret.append(''.join(interstish))
                interstish = []
            pts.append(ProtoTemplateRef(token))
            continue
        elif not pts:
            # not inside any template; just buffer the raw text
            interstish.append(token.text)
            continue
        else:
            cpt = pts[-1]

        if isinstance(token, SepToken):
            tmp_key, cur_val = cpt.tmp_key, cpt.cur_val
            #''.join(cpt.cur_buff).strip()
            # NOTE: EndTemplateToken is a SepToken, so '}}' also
            # flushes the pending key/value pair
            if token.text == '|' or token.text == '}}':
                if tmp_key is None:
                    # cur_val is a value for a positional arg
                    cpt.args.append(cur_val)
                else:
                    # cur_val is a value for a keyword arg
                    cpt.kwargs.append((tmp_key, cur_val))
                cpt.tmp_key = None
                cpt.cur_val = []
            elif token.text == '=' and tmp_key is None:
                # cur_val is a key ('=' past the first is literal text)
                try:
                    cpt.tmp_key = ''.join(cur_val).strip()  # TODO: int()s?
                except Exception as e:
                    if raise_exc:
                        raise

                cpt.cur_val = []
            else:
                cpt.cur_val.append(token.text)
        else:
            # links and tables
            cpt.cur_val.append(token.text)

        if isinstance(token, EndTemplateToken):
            # create real Template
            pts.pop()
            cpt.end_token = token
            try:
                comp_tmpl = cpt.to_template_ref()
            except Exception as e:
                if raise_exc:
                    raise
                continue
            if pts:
                # nested: completed template becomes part of the
                # parent's current value
                pts[-1].cur_val.append(comp_tmpl)
            else:
                ret.append(comp_tmpl)
    # end loop

    return ret
335 | | accessdate = January 5, 2013 336 | }}''' 337 | 338 | _SF_CLIMATE_TEST = '''{{climate chart 339 | | San Francisco 340 | |46.2|56.9|4.5 341 | |48.1|60.2|4.61 342 | |49.1|62.9|3.26 343 | |49.9|64.3|1.46 344 | |51.6|65.6|0.7 345 | |53.3|67.9|0.16 346 | |54.6|68.2|0 347 | |55.6|69.4|0.06 348 | |55.7|71.3|0.21 349 | |54.3|70.4|1.13 350 | |50.7|63.2|3.16 351 | |46.7|57.3|4.56 352 | |float=right 353 | |clear=none 354 | |units=imperial}}''' 355 | 356 | _SF_INFOBOX = '''{{Infobox settlement 357 | |name = San Francisco 358 | |official_name = City and County of San Francisco 359 | |nickname = ''The City by the Bay''; ''Fog City''; ''S.F.''; ''Frisco''; ''The City that Knows How'' (''antiquated''); ''Baghdad by the Bay'' (''antiquated''); ''The Paris of the West'' 360 | | settlement_type = [[Consolidated city-county|City and county]] 361 | | motto = ''Oro en Paz, Fierro en Guerra''
(English: "Gold in Peace, Iron in War") 362 | | image_skyline = SF From Marin Highlands3.jpg 363 | | imagesize = 280px 364 | | image_caption = San Francisco from the Marin Headlands, with the Golden Gate Bridge in the foreground 365 | | image_flag = Flag of San Francisco.svg 366 | | flag_size = 100px 367 | | image_seal = Sfseal.png 368 | | seal_size = 100px 369 | | image_map = California county map (San Francisco County enlarged).svg 370 | | mapsize = 200px 371 | | map_caption = Location of San Francisco in California 372 | | pushpin_map = USA2 373 | | pushpin_map_caption = Location in the United States 374 | 375 | | coordinates_region = US-CA 376 | | subdivision_type = [[List of countries|Country]] 377 | | subdivision_name = {{USA}} 378 | | subdivision_type1 = [[Political divisions of the United States|State]] 379 | | subdivision_name1 = {{flag|California}} 380 | 381 | 382 | | government_type = [[Mayor-council government|Mayor-council]] 383 | | governing_body = [[San Francisco Board of Supervisors|Board of Supervisors]] 384 | | leader_title = [[Mayor of San Francisco]] 385 | | leader_name = [[Ed Lee (politician)|Ed Lee]] 386 | | leader_title1 = [[San Francisco Board of Supervisors|Board of Supervisors]] 387 | | leader_name1 = {{Collapsible list 388 | | title = Supervisors 389 | | frame_style = border:none; padding: 0; 390 | | list_style = text-align:left; 391 | | 1 = [[Eric Mar]] 392 | | 2 = [[Mark Farrell (politician)|Mark Farrell]] 393 | | 3 = [[David Chiu (politician)|David Chiu]] 394 | | 4 = [[Katy Tang]] 395 | | 5 = [[London Breed]] 396 | | 6 = [[Jane Kim]] 397 | | 7 = [[Norman Yee]] 398 | | 8 = [[Scott Wiener]] 399 | | 9 = [[David Campos]] 400 | | 10 = [[Malia Cohen]] 401 | | 11 = [[John Avalos]]}} 402 | | leader_title2 = [[California State Assembly]] 403 | | leader_name2 = [[Tom Ammiano]] ([[California Democratic Party|D]])
[[Phil Ting]] ([[California Democratic Party|D]]) 404 | | leader_title3 = [[California State Senate]] 405 | | leader_name3 = [[Leland Yee]] ([[California Democratic Party|D]])
[[Mark Leno]] ([[California Democratic Party|D]]) 406 | | leader_title4 = [[United States House of Representatives]] 407 | | leader_name4 = [[Nancy Pelosi]] ([[Democratic Party (United States)|D]])
[[Jackie Speier]] ([[Democratic Party (United States)|D]]) 408 | | established_title = Founded 409 | | established_date = June 29, 1776 410 | | established_title1 = [[Municipal incorporation|Incorporated]] 411 | | established_date1 = April 15, 1850{{cite web 412 | | url = http://www6.sfgov.org/index.aspx?page=4 413 | | title = San Francisco: Government 414 | | publisher = SFGov.org 415 | | accessdate =March 8, 2012 416 | | quote = San Francisco was incorporated as a City on April 15th, 1850 by act of the Legislature.}} 417 | | founder = Lieutenant [[José Joaquin Moraga]] and [[Francisco Palóu]] 418 | | named_for = [[St. Francis of Assisi]] 419 | 420 | 421 | |area_magnitude = 422 | | unit_pref = US 423 | | area_footnotes = 424 | | area_total_sq_mi = 231.89 425 | | area_land_sq_mi = 46.87 426 | | area_water_sq_mi = 185.02 427 | | area_water_percent = 79.79 428 | | area_note = 429 | | area_metro_sq_mi = 3524.4 430 | 431 | 432 | | elevation_ft = 52 433 | | elevation_max_ft = 925 434 | | elevation_min_ft = 0 435 | 436 | 437 | | population_as_of = 2012 438 | | population_footnotes = 439 | | population_total = 815358 http://voices.yahoo.com/largest-us-cities-population-size-2012-6453656.html?cat=16 440 | | population_density_sq_mi = 17179.2 441 | | population = [[Combined statistical area|CSA]]: 8371000 442 | | population_metro = 4335391 443 | | population_urban = 3273190 444 | | population_demonym = San Franciscan 445 | 446 | 447 | | timezone = [[Pacific Time Zone|Pacific Standard Time]] 448 | | utc_offset = -8 449 | | timezone_DST = [[Pacific Time Zone|Pacific Daylight Time]] 450 | | utc_offset_DST = -7 451 | | latd = 37 452 | | latm = 47 453 | | latNS = N 454 | | longd = 122 455 | | longm = 25 456 | | longEW = W 457 | | coordinates_display = 8 458 | 459 | 460 | | postal_code_type = [[ZIP Code]] 461 | | postal_code = 94101–94112, 94114–94147, 94150–94170, 94172, 94175, 94177 462 | | area_code = [[Area code 415|415]] 463 | | blank_name = [[Federal Information Processing 
_ALL_TEST_STRS = [_BASIC_CITE_TEST,
                  _BIGGER_CITE_TEST,
                  _SF_CLIMATE_TEST,
                  _SF_INFOBOX]


def _main():
    """Ad-hoc smoke-test harness: parse every canned wikitext sample,
    exercise the template accessor, then parse a regression file.

    Returns the list of parsed TemplateReferences; drops into a
    post-mortem debugger (and re-raises) on any failure.
    """
    import os
    import pprint
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    ret = []
    try:
        for test in _ALL_TEST_STRS:
            ret.append(TemplateReference.from_string(test))
            pprint.pprint(ret[-1])
        sf_infobox_tmpl = TemplateReference.from_string(_SF_INFOBOX)
        # print() call form works under both py2 and py3
        print('Testing accessor: %s' % sf_infobox_tmpl['leader_name1']['title'])

        test_path = os.path.join(cur_dir, '_test_tmpls',
                                 'regr_moctezuma_parser_funcs.txt')
        # close the regression file promptly instead of leaking the handle
        with open(test_path) as f:
            tmpl_test = f.read().decode('utf-8')
        # parsed for side effects only: a crash here is the test failure
        # (removed a leftover `import pdb;pdb.set_trace()` breakpoint)
        get_page_templates(tmpl_test)
    except Exception as e:
        # deliberate debug harness behavior: report, post-mortem, re-raise
        print(e)
        import pdb
        pdb.post_mortem()
        raise

    return ret


if __name__ == '__main__':
    _main()
class GetTemplates(QueryOperation):
    """Generator query: PageInfos for every template used on the given
    page(s)."""
    field_prefix = 'gtl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'templates'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        # .get() + .items() for parity with GetTranscludes below: a
        # response without a 'pages' key yields [] instead of raising
        # KeyError, and .items() (vs the old .iteritems()) keeps this
        # py3-portable.
        ret = []
        for k, pid_dict in query_resp.get('pages', {}).items():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetTranscludes(QueryOperation):
    """Generator query: PageInfos for every page that embeds
    (transcludes) the given template."""
    input_field = SingleParam('title', val_prefix='Template:')
    field_prefix = 'gei'
    fields = [StaticParam('generator', 'embeddedin'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Template:ArticleHistory')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp.get('pages', {}).items():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetParsedTemplates(Operation):
    """Parse TemplateReferences out of a blob of wikitext (no network)."""
    input_field = PassthroughParam('content')
    output_type = [TemplateReference]
    examples = [OperationExample(_BASIC_CITE_TEST, limit=1)]

    @property
    def remaining(self):
        # single-shot operation: done as soon as any result is recorded
        if self.results:
            return 0
        return 1  # TODO: fix

    def process(self):
        # the lone result set is keyed under None; its presence means done
        if None in self.results:
            raise NoMoreResults()
        content = getattr(self.input_param, 'content', self.input_param)
        res = get_page_templates(content, raise_exc=False)
        self.results[None] = res
        return list(res)


class GetParsedTemplatesPage(Operation):
    """Fetch a page's current wikitext, then parse its templates."""
    subop_chain = [GetCurrentContent,
                   GetParsedTemplates]

    examples = [OperationExample('Coffee', limit=1)]


class GetParsedTranscludes(Operation):
    '''
    Template names may redirect, but this operation doesn't handle that yet
    '''
    subop_chain = [GetTranscludes,
                   GetCurrentContent,
                   GetParsedTemplates]
    examples = [OperationExample('ArticleHistory', 10)]

    def _update_results(self, results):
        # keep only references to the requested template; the input may
        # carry a 'Template:' prefix, so compare bare names without case
        _, _, tmpl_name = self.input_param.rpartition(':')
        filt_res = [res for res in results
                    if res.name.lower() == tmpl_name.lower()]
        return super(GetParsedTranscludes, self)._update_results(filt_res)
def tmpl_text_to_odict(text):
    """Parse the pipe-delimited body of a template reference into an
    OrderedDict of stripped key/value strings.

    Blank keys and duplicate keys are reported and skipped rather than
    raising, since wikitext in the wild is full of both.
    """
    ret = OrderedDict()
    for pair in text.split('|'):
        pair = pair.strip()
        if not pair:
            continue
        key, _, value = pair.partition('=')
        key = key.strip()
        value = value.strip()
        if not key:
            # print() call form works under both py2 and py3
            print('blank key error %r' % key)
            continue
        if key in ret:
            # first occurrence wins
            print('duplicate key error %r' % key)
            continue
        ret[key] = value
    return ret


def extract_template(tmpl_name, text):
    """Return a list of OrderedDicts, one per occurrence of the named
    template in *text* (case-insensitive, non-greedy, spans newlines).

    Note: a naive regex scan -- nested templates will confuse it.
    """
    # raw string: '\{' in a plain literal is an invalid escape under py3
    tmpl_re = re.compile(r'\{\{(\s*' + tmpl_name + r'.*?)\}\}',
                         flags=(re.DOTALL | re.IGNORECASE | re.M))
    return [tmpl_text_to_odict(txt) for txt in tmpl_re.findall(text)]


#class GetAllTranscludes(GetTranscludes):
#    field_prefix = 'at'
#    input_field = None
#    fields = [StaticParam('list', 'alltransclusions'),
#              #StaticParam('prop', 'info'),
#              StaticParam('atprop', 'ids|title')]  # 'subjectid|talkid|protection')]
def test_unicode_title():
    # non-ASCII titles must round-trip through the API layer intact
    content = GetCurrentContent("Beyoncé Knowles")()
    assert content


def test_coercion_basic():
    # a PageIdentifier input is coerced to its namespaced title string
    pid = PageIdentifier(title='Africa', page_id=123, ns=4, source='enwp')
    assert GetSubcategoryInfos(pid).input_param == 'Category:Africa'


def test_web_request():
    # raw (non-API) web fetch of a known image
    url = 'http://upload.wikimedia.org/wikipedia/commons/d/d2/Mcgregor.jpg'
    res = base.WebRequestOperation(url)()
    assert len(res[0]) == 16408


def test_get_html():
    # rendered-HTML fetch; the Africa article is comfortably large
    res = base.GetPageHTML('Africa')()
    assert len(res[0]) > 350000


def test_missing_revisions():
    # a nonexistent page reports 'missing' and a negative pageid
    # upstream, which surfaces here as an empty revision list
    assert len(GetPageRevisionInfos('Coffee_lololololol')()) == 0


def test_get_meta():
    assert GetSourceInfo()()


def test_client_passed_to_subops():
    # The client given to the initial operation must propagate to its
    # sub-operations. Use just enough titles to force multiplexing so
    # that sub-ops exist to inspect.
    titles = ['a'] * (base.DEFAULT_QUERY_LIMIT.get_limit() + 1)
    client = base.MockClient()
    op = GetPageInfo(titles, client=client)
    assert id(op.subop_queues[0].peek().client) == id(client)
61 | titles = ['a'] * (base.DEFAULT_QUERY_LIMIT.get_limit() + 1) 62 | 63 | client = base.MockClient() 64 | op = GetPageInfo(titles, client=client) 65 | assert id(op.subop_queues[0].peek().client) == id(client) 66 | -------------------------------------------------------------------------------- /wapiti/operations/test_operations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import base 5 | import category 6 | import feedback 7 | import files 8 | import links 9 | import meta 10 | import misc 11 | import protection 12 | import rand 13 | import revisions 14 | import templates 15 | import user 16 | 17 | from revisions import GetRevisionInfos 18 | 19 | MAGNITUDE = 1 20 | 21 | 22 | def limit_equal_or_depleted(op): 23 | if getattr(op, '_notices', None): 24 | return False 25 | elif getattr(op, 'is_depleted', None): 26 | return True 27 | elif len(op.results) == op.limit: 28 | return True 29 | return False 30 | 31 | 32 | def get_op_examples(): 33 | ops = list(base.OperationMeta._all_ops) 34 | ret = [] 35 | for op in ops: 36 | examples = getattr(op, 'examples', None) 37 | if not examples: 38 | continue 39 | ret.extend(op.examples) 40 | return ret 41 | 42 | 43 | def pytest_generate_tests(metafunc): 44 | #if 'limit' in metafunc.fixturenames: # TODO 45 | # keyword = metafunc.config.option.keyword 46 | # it's also too hard to override matching behavior 47 | if metafunc.function is test_op_example: 48 | mag = metafunc.config.getoption('--mag') 49 | op_examples = get_op_examples() 50 | #op_examples = [ex for ex in op_examples 51 | # if keyword.lower() in ex.op_name.lower()] 52 | ops = [op_ex.make_op(mag=mag) for op_ex in op_examples] 53 | _test_tuples = [(repr(op), op) for op in ops] 54 | metafunc.parametrize(('op_repr', 'op'), _test_tuples) 55 | pass 56 | 57 | 58 | #def pytest_funcarg__mag(request): 59 | # # TODO: switch to command line argument 60 | # return 
#    return MAGNITUDE


#def pytest_funcarg__limit(request):
#    wish there was a good way to compose this with mag and the current
#    value of the function's "limit" keyword argument to make the final
#    limit return 1


def test_multiplexing(mag):
    # Requests `mag * 100` consecutive revision IDs ending just below
    # 543184935; GetRevisionInfos splits them across API calls internally.
    # NOTE(review): hits the live wiki, hence the 90% tolerance below.
    limit = mag * 100
    rev_ids = [str(x) for x in range(543184935 - limit, 543184935)]
    get_rev_infos = GetRevisionInfos(rev_ids)
    rev_infos = get_rev_infos()
    assert len(rev_infos) > (0.9 * limit)  # a couple might be missing


def test_op_example(op_repr, op):
    # One parametrized case per OperationExample; see
    # pytest_generate_tests for how op_repr/op are produced.
    op.process_all()
    assert limit_equal_or_depleted(op)


# wapiti/operations/user.py -- user-scoped query operations

from base import QueryOperation
from params import SingleParam, StaticParam
from models import RevisionInfo
from utils import OperationExample


# revision properties requested for each contribution record
DEFAULT_PROPS = 'ids|flags|timestamp|size|comment|tags|title'


class GetUserContribs(QueryOperation):
    """list=usercontribs: RevisionInfos for a single user's edits."""
    field_prefix = 'uc'
    input_field = SingleParam('user')
    fields = [StaticParam('list', 'usercontribs'),
              StaticParam('ucprop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Jimbo Wales')]

    def extract_results(self, query_resp):
        # one RevisionInfo per contribution record in the response;
        # missing 'usercontribs' key simply yields no results
        ret = []
        for rev_dict in query_resp.get('usercontribs', []):
            user_contrib = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
            ret.append(user_contrib)
        return ret


#class GetUserContribRevisions(Operation):
#    subop_chain = (GetUserContribs, GetRevision)
import sys
from heapq import heappush, heappop
import itertools
from functools import total_ordering


def is_scalar(obj):
    # "scalar" = not iterable, except strings, which are treated as atoms
    return not hasattr(obj, '__iter__') or isinstance(obj, basestring)


def prefixed(arg, prefix=None):
    """Return *arg* with *prefix* prepended, unless it already starts
    with it (or no prefix was given)."""
    if prefix and not arg.startswith(prefix):
        arg = prefix + arg
    return arg


@total_ordering
class MaxInt(long):
    """
    A quite-large integer type that tries to be like float('inf')
    (Infinity), but can be used for slicing and other integer
    operations. float('inf') is generally more correct, except that
    mixing a float and integer in arithmetic operations will result in
    a float, which will raise an error on slicing.
    """
    def __new__(cls, *a, **kw):
        # always one past sys.maxint, regardless of constructor args
        return super(MaxInt, cls).__new__(cls, sys.maxint + 1)

    def __init__(self, name='MAX'):
        self._name = str(name)

    def __repr__(self):
        return self._name

    def __str__(self):
        return repr(self)

    # TODO: better math
    # absorb arithmetic: any operation on a MaxInt yields the same MaxInt
    for func in ('__add__', '__sub__', '__mul__', '__floordiv__', '__div__',
                 '__mod__', '__divmod__', '__pow__', '__lshift__',
                 '__rshift__'):
        locals()[func] = lambda self, other: self

    def __gt__(self, other):
        # greater than everything except another MaxInt
        return not self == other

    def __eq__(self, other):
        return isinstance(other, MaxInt)

    def __int__(self):
        return self


class OperationExample(object):
    """
    Sort of like a partial, but specialer.

    Bundles an Operation type with a sample parameter and limit so that
    tests can instantiate representative operations generically.

    # other types of tests?
    """
    def __init__(self,
                 param=None,
                 limit=None,
                 op_type=None,
                 **kw):
        self.op_type = op_type
        self.param = param
        self.limit = limit

        self.doc = kw.pop('doc', '')
        self.test = kw.pop('test', None)
        # test defaults to limit_equal_or_depleted in test_ops.py
        if kw:
            raise TypeError('got unexpected keyword arguments: %r' % kw)

    @property
    def op_name(self):
        if self.op_type is None:
            return None
        return self.op_type.__name__

    @property
    def disp_name(self):
        """Human-readable call-style description of the example."""
        if not self.op_type:
            return '(unbound OperationExample)'
        tmpl = '%(type)s(%(param)r, limit=%(limit)s)'
        if self.op_type.input_field is None:
            tmpl = '%(type)s(limit=%(limit)s)'

        return tmpl % {'type': self.op_type.__name__,
                       'param': self.param,
                       'limit': self.limit}

    def bind_op_type(self, op_type):
        """Late-bind the Operation type (and derive a default limit from
        its per-query limit) if neither was set at construction."""
        if self.op_type is None:
            self.op_type = op_type
        if self.limit is None:
            try:
                pql = op_type.per_query_limit
            except AttributeError:
                # chained operations: take the first sub-op's limit
                pql = op_type.subop_chain[0].per_query_limit
            self.limit = pql.get_limit()
        return

    def make_op(self, mag=None):
        """Instantiate the bound Operation, scaling the limit by *mag*."""
        if not self.op_type:
            raise TypeError('no Operation type assigned')
        mag = int(mag or 1)
        limit = self.limit * mag
        if self.op_type.input_field is None:
            return self.op_type(limit=limit)
        return self.op_type(self.param, limit=limit)

    def __repr__(self):
        cn = self.__class__.__name__
        kw_parts = ['op_type=%s' % self.op_name]
        # pair each attribute with its own value; the previous version
        # filtered falsy values *before* zipping names to values, which
        # mislabeled the survivors (e.g. param=None, limit=5 rendered as
        # "param=5")
        for attr in ('param', 'limit', 'test', 'doc'):
            val = getattr(self, attr)
            if val:
                kw_parts.append('%s=%r' % (attr, val))
        return '%s(%s)' % (cn, ', '.join(kw_parts))

    __str__ = __repr__


"""
TypeWrapper and MetaTypeWrapper are a pair of what are technically
metaclasses, but really just a very overwrought way of enabling
customized versions of types floating around in some
locations. Because Wapiti is a DSL, but also just a bunch of Python,
we have to deal with the fact that if you modify a type/class, it will
be modified everywhere that references it.

TL;DR: This overblown thing lets Operations use something like
Prioritized(GetCategory, key='total_count'), which sets a priority for
better queueing, without modifying the GetCategory Operation
itself. (Different operations will want to prioritiez different
things.)

(There is almost certainly a better way, but this was a bit of
fun. Ever made an object that is an instance and a subclass of
itself?)
"""


def make_type_wrapper(name, init_args=None):
    """Build a Wrapper subclass named *name* whose constructor accepts
    *init_args* (strings, or (string, default) pairs)."""
    init_args = init_args or []
    args, defaults = [], {}
    for ia in init_args:
        try:
            arg, _default = ia
            defaults[arg] = _default
        except ValueError:
            arg = ia
        if not isinstance(arg, basestring):
            raise TypeError('expected string arg name, not %r' % arg)
        args.append(arg)

    attrs = {'_args': args, '_defaults': defaults}
    return WrapperType(str(name), (Wrapper,), attrs)


class WrapperType(type):
    @property
    def _repr_args(self):
        # arg names, with (name, default) pairs for defaulted ones
        ret = []
        for a in self._args:
            try:
                ret.append((a, self._defaults[a]))
            except KeyError:
                ret.append(a)
        return ret

    def __repr__(cls):
        name, cname = cls.__name__, cls.__class__.__name__
        if cls._repr_args:
            return '%s(%r, %r)' % (cname, name, cls._repr_args)
        else:
            return '%s(%r)' % (cname, name)


class Wrapper(object):
    """Transparent proxy around a wrapped type: attribute reads fall
    through to the wrapped object, writes are recorded locally so the
    wrapped type itself is never mutated."""
    __metaclass__ = WrapperType
    _args, _defaults = [], {}

    def __init__(self, to_wrap, *args, **kwargs):
        wrapped_dict = {}
        if isinstance(to_wrap, Wrapper):
            # re-wrapping: inherit the inner wrapper's overrides
            wrapped_dict = dict(to_wrap._wrapped_dict)
            to_wrap = to_wrap._wrapped
        # write via __dict__ to dodge our own __setattr__ bookkeeping
        self.__dict__['_wrapped'] = to_wrap
        self.__dict__['_wrapped_dict'] = wrapped_dict

        cn = self.__name__
        for arg_i, arg_name in enumerate(self._args):
            try:
                val = args[arg_i]
                if arg_name in kwargs:
                    raise TypeError('%s got multiple values for arg %r'
                                    % (cn, arg_name))
            except IndexError:
                try:
                    val = kwargs.pop(arg_name)
                except KeyError:
                    try:
                        val = self._defaults[arg_name]
                    except KeyError:
                        raise TypeError('%s expected required arg %r'
                                        % (cn, arg_name))
            setattr(self, arg_name, val)
        return

    def __repr__(self):
        kv = ', '.join(['%s=%r' % (k, v) for k, v
                        in self._wrapped_dict.items()])
        # NOTE(review): the original template here was an empty string,
        # making '"" % (a, b)' raise TypeError; format reconstructed
        tmpl = "<%r wrapped: %s>"
        return tmpl % (self._wrapped, kv)

    def __getattr__(self, name):
        return getattr(self._wrapped, name)

    def __setattr__(self, name, val):
        super(Wrapper, self).__setattr__(name, val)
        self._wrapped_dict[name] = val

    def __delattr__(self, name):
        # the delattr protocol passes only the attribute name; the
        # previous (self, name, val) signature made `del obj.attr` a
        # TypeError
        super(Wrapper, self).__delattr__(name)
        self._wrapped_dict.pop(name, None)

    def __call__(self, *a, **kw):
        return self._wrapped(*a, **kw)
# sentinel marking removed tasks still sitting in the heap; a real object
# (per the heapq docs recipe) so identity checks can't collide with a
# legitimate empty-string task the way the old '' sentinel could
REMOVED = '<removed-task>'


class PriorityQueue(object):
    """
    Real quick type based on the heapq docs.

    Entries are [neg_priority, insertion_count, task]; removal marks the
    entry with REMOVED and it is lazily discarded on the next access.
    """
    def __init__(self):
        self._pq = []
        self._entry_map = {}
        self.counter = itertools.count()

    def add(self, task, priority=None):
        # larger numbers = higher priority (heapq is a min-heap, so store
        # the negated priority)
        priority = -int(priority or 0)
        if task in self._entry_map:
            # re-adding replaces the old entry; this previously called the
            # nonexistent self.remove_task() and raised AttributeError
            self.remove(task)
        count = next(self.counter)
        entry = [priority, count, task]
        self._entry_map[task] = entry
        heappush(self._pq, entry)

    def remove(self, task):
        entry = self._entry_map.pop(task)
        entry[-1] = REMOVED

    def _cull(self):
        # discard stale (removed) entries at the front of the heap
        while self._pq:
            priority, count, task = self._pq[0]
            if task is REMOVED:
                heappop(self._pq)
                continue
            return
        raise IndexError('empty priority queue')

    def peek(self, default=REMOVED):
        """Return the highest-priority task without removing it; raise
        IndexError when empty unless *default* is given."""
        try:
            self._cull()
            _, _, task = self._pq[0]
        except IndexError:
            if default is not REMOVED:
                return default
            raise IndexError('peek on empty queue')
        return task

    def pop(self, default=REMOVED):
        """Remove and return the highest-priority task; raise IndexError
        when empty unless *default* is given."""
        try:
            self._cull()
            _, _, task = heappop(self._pq)
            del self._entry_map[task]
        except IndexError:
            if default is not REMOVED:
                return default
            raise IndexError('pop on empty queue')
        return task

    def __len__(self):
        # _entry_map, not _pq: the heap may hold stale REMOVED entries
        return len(self._entry_map)


def chunked_iter(src, size, **kw):
    """
    Generates 'size'-sized chunks from 'src' iterable. Unless
    the optional 'fill' keyword argument is provided, iterables
    not even divisible by 'size' will have a final chunk that is
    smaller than 'size'.

    Note that fill=None will in fact use None as the fill value.

    >>> list(chunked_iter(range(10), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> list(chunked_iter(range(10), 3, fill=None))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
    """
    size = int(size)
    if size <= 0:
        raise ValueError('expected a positive integer chunk size')
    # 'fill' is pulled from **kw so that fill=None is distinguishable
    # from "no fill requested"
    do_fill = True
    try:
        fill_val = kw.pop('fill')
    except KeyError:
        do_fill = False
        fill_val = None
    if kw:
        raise ValueError('got unexpected keyword arguments: %r' % kw.keys())
    if not src:
        return
    cur_chunk = []
    i = 0
    for item in src:
        cur_chunk.append(item)
        i += 1
        if i % size == 0:
            yield cur_chunk
            cur_chunk = []
    if cur_chunk:
        if do_fill:
            lc = len(cur_chunk)
            cur_chunk[lc:] = [fill_val] * (size - lc)
        yield cur_chunk
    return


# From http://en.wikipedia.org/wiki/Wikipedia:Namespace
NAMESPACES = {
    'Main': 0,
    'Talk': 1,
    'User': 2,
    'User talk': 3,
    'Wikipedia': 4,
    'Wikipedia talk': 5,
    'File': 6,
    'File talk': 7,
    'MediaWiki': 8,
    'MediaWiki talk': 9,
    'Template': 10,
    'Template talk': 11,
    'Help': 12,
    'Help talk': 13,
    'Category': 14,
    'Category talk': 15,
    'Portal': 100,
    'Portal talk': 101,
    'Book': 108,
    'Book talk': 109,
    'Special': -1,
    'Media': -2}


def bucketize(src, keyfunc=None):
    """
    Group values in 'src' iterable by value returned by 'keyfunc'.
    keyfunc defaults to bool, which will group the values by
    truthiness; at most there will be two keys, True and False, and
    each key will have a list with at least one item.

    >>> bucketize(range(5))
    {False: [0], True: [1, 2, 3, 4]}
    >>> is_odd = lambda x: x % 2 == 1
    >>> bucketize(range(5), is_odd)
    {False: [0, 2, 4], True: [1, 3]}

    Value lists are not deduplicated:

    >>> bucketize([None, None, None, 'hello'])
    {False: [None, None, None], True: ['hello']}
    """
    # guard via iter(): the original called is_iterable(), which is not
    # defined anywhere in this module and raised NameError at call time
    try:
        iter(src)
    except TypeError:
        raise TypeError('expected an iterable')
    if keyfunc is None:
        keyfunc = bool
    if not callable(keyfunc):
        raise TypeError('expected callable key function')

    ret = {}
    for val in src:
        key = keyfunc(val)
        ret.setdefault(key, []).append(val)
    return ret


def bucketize_bool(src, keyfunc=None):
    """
    Like bucketize, but for added convenience returns a tuple of
    (truthy_values, falsy_values).

    >>> nonempty, empty = bucketize_bool(['', '', 'hi', '', 'bye'])
    >>> nonempty
    ['hi', 'bye']

    keyfunc defaults to bool, but can be carefully overridden to
    use any function that returns either True or False.

    >>> import string
    >>> is_digit = lambda x: x in string.digits
    >>> decimal_digits, hexletters = bucketize_bool(string.hexdigits, is_digit)
    >>> ''.join(decimal_digits), ''.join(hexletters)
    ('0123456789', 'abcdefABCDEF')
    """
    bucketized = bucketize(src, keyfunc)
    return bucketized.get(True, []), bucketized.get(False, [])


def coerce_namespace(ns_arg):
    """Map a namespace name (any case) to its numeric ID; unknown names
    are returned capitalized, unmapped."""
    ns_str = str(ns_arg).capitalize()
    return NAMESPACES.get(ns_str, ns_str)
24 | """ 25 | def __init__(self, status_code=None, text=None, headers=None, error=None): 26 | self.status_code = status_code 27 | self.text = text 28 | self.headers = headers 29 | self.error = error 30 | 31 | 32 | def get_items(iterable): 33 | if not iterable: 34 | return [] 35 | return OrderedDict(iterable).items() 36 | 37 | 38 | def get_keys(iterable): 39 | if not iterable: 40 | return [] 41 | return OrderedDict(iterable).keys() 42 | 43 | 44 | def is_scalar(obj): 45 | return not hasattr(obj, '__iter__') or isinstance(obj, basestring) 46 | 47 | 48 | def get_encoded(val): 49 | if not isinstance(val, (unicode, bytes)): 50 | val = unicode(val) 51 | return val.encode('utf-8') 52 | 53 | 54 | def ordered_yield(mapping, keys): 55 | for k in keys: 56 | try: 57 | yield (k, mapping.pop(k)) 58 | except KeyError: 59 | pass 60 | for k in mapping: 61 | yield (k, mapping.pop(k)) 62 | 63 | 64 | def parse_url(url): 65 | try: 66 | url = unicode(url) 67 | except UnicodeDecodeError: 68 | pass 69 | 70 | parsed = urlparse(url) 71 | if not (parsed.scheme and parsed.netloc): 72 | raise ValueError("invalid URL, no schema supplied: %r" % url) 73 | 74 | try: 75 | dec_netloc = parsed.netloc.encode('idna').decode('utf-8') 76 | parsed = parsed._replace(netloc=dec_netloc) 77 | except UnicodeError: 78 | raise ValueError('invalid characters in url: %r' % parsed.netloc) 79 | 80 | if not parsed.path: 81 | parsed = parsed._replace(path=u'/') 82 | 83 | for k, v in parsed._asdict().items(): 84 | parsed = parsed._replace(**{k: get_encoded(v)}) 85 | 86 | return parsed 87 | 88 | 89 | def encode_url_params(params, keep_blank=False): 90 | # TODO: handle case where params is just a string 91 | res = [] 92 | for k, vs in get_items(params): 93 | if is_scalar(vs): 94 | vs = [vs] 95 | for v in vs: 96 | if not v: 97 | if keep_blank: 98 | v = '' 99 | else: 100 | continue 101 | res.append((get_encoded(k), get_encoded(v))) 102 | return urlencode(res, doseq=True) 103 | 104 | 105 | # TODO: merging url params 106 | 
""" 107 | , keep_order=True): 108 | if keep_order: 109 | existing_params = parse_qsl(parsed_url.query, 110 | keep_blank_values=True) 111 | params = list(ordered_yield(params, get_keys(existing_params))) 112 | query = encode_url_params(params) 113 | else: 114 | """ 115 | 116 | 117 | def construct_url(url, params): 118 | parsed_url = parse_url(url) 119 | 120 | query = parsed_url.query 121 | encoded_params = encode_url_params(params) 122 | if encoded_params: 123 | if query: 124 | query = query + '&' + encoded_params 125 | else: 126 | query = encoded_params 127 | new_parsed_url = parsed_url._replace(query=query) 128 | new_url = requote(urlunparse(new_parsed_url)) 129 | return new_url 130 | 131 | 132 | def gunzip(text): 133 | buf = StringIO(text) 134 | f = gzip.GzipFile(fileobj=buf) 135 | return f.read() 136 | 137 | 138 | class Client(object): 139 | def __init__(self, config=None): # among other things 140 | self.config = dict(DEFAULT_CONFIG) 141 | if config: 142 | self.config.update(config) 143 | 144 | def req(self, method, url, params=None, headers=None, use_gzip=True): 145 | _headers = dict(self.config.get('headers', {})) 146 | if headers: 147 | _headers.update(headers) 148 | headers = _headers 149 | if use_gzip and not headers.get('Accept-encoding'): 150 | headers['Accept-encoding'] = 'gzip' 151 | 152 | full_url = construct_url(url, params) 153 | ret = Response() 154 | ret.url = full_url 155 | resp_text = None 156 | resp_status = None 157 | resp_headers = {} 158 | try: 159 | req = urllib2.Request(full_url, headers=headers) 160 | resp = urllib2.urlopen(req) 161 | resp_text = resp.read() 162 | resp.close() 163 | if 'gzip' in resp.info().get('Content-Encoding', ''): # TODO 164 | resp_text = gunzip(resp_text) 165 | resp_status = resp.getcode() 166 | resp_headers = resp.headers 167 | except Exception as e: 168 | raise 169 | ret.text = resp_text 170 | ret.status_code = resp_status 171 | ret.headers = resp_headers 172 | return ret 173 | 174 | def get(self, url, 
DEFAULT_MAGNITUDE = 'norm'


def magnitude(norm, big=None, huge=None):
    """Decorator factory for size-parameterized tests.

    The wrapped test accepts either a concrete integer limit or one of
    the magnitude names ('norm'/'big'/'huge'), which resolve to the
    values configured here; each unset tier defaults to the one below
    it. With no argument, 'norm' is used.
    """
    big = norm if big is None else big
    huge = big if huge is None else huge

    def mag_dec(func):

        @wraps(func)
        def wrapped(limit_or_mag=None):
            selector = wrapped.norm if limit_or_mag is None else limit_or_mag
            try:
                limit = int(selector)
            except ValueError:
                # a magnitude name: look up the tier stored on the wrapper
                limit = int(wrapped.__dict__[selector])
            return func(limit)

        wrapped.norm = norm
        wrapped.big = big
        wrapped.huge = huge
        return wrapped

    return mag_dec
        print repr(disp)[:74] + '...'
    return ret


# NOTE(review): the tests below hit the live wiki through WapitiClient
# and return True/False rather than asserting -- presumably a custom
# runner checks return values; confirm before converting to pytest.

def test_client_basic(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    # a populated namespace_map implies source_info loaded successfully
    return len(client.source_info.namespace_map) > 10


@magnitude(norm=20, big=550, huge=2000)
def test_cat(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_africa = partial(client.get_category_recursive, 'Africa', limit)
    cat_pages = call_and_ret(get_africa)
    return len(cat_pages) == limit


def test_unicode_title(limit):
    # non-ASCII title must survive the round trip
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_beyonce = partial(client.get_current_content, "Beyoncé Knowles")
    beyonce = call_and_ret(get_beyonce)
    return bool(beyonce)


@magnitude(norm=20, big=550, huge=2000)
def test_category_basic(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_2k_featured = partial(client.get_category, 'Featured_articles', limit)
    pages = call_and_ret(get_2k_featured)
    return len(pages) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_subcategory_infos(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_subcats = partial(client.get_subcategory_infos, 'FA-Class_articles', limit)
    subcats = call_and_ret(get_subcats)
    return len(subcats) == limit


def test_all_category_infos(limit):
    # note: fixed count of 501; the limit argument is unused here
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_all_cats = partial(client.get_all_category_infos, 501)
    all_cats = call_and_ret(get_all_cats)
    return len(all_cats) == 501


@magnitude(norm=10, big=1000, huge=10000)
def test_category_recursive(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_limit_recursive = partial(client.get_category_recursive, 'Africa', limit)
    pages = call_and_ret(get_limit_recursive)
    return len(pages) == limit


def test_single_prot(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_prot = partial(client.get_protections, 'Coffee')
    prots = call_and_ret(get_coffee_prot)
    return len(prots) == 1


def test_multi_prots_list(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_prots = partial(client.get_protections, ['Coffee', 'House'])
    prots = call_and_ret(get_prots)
    return len(prots) == 2


def test_multi_prots_str(limit):
    # pipe-delimited multi-title string form of the same query
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_prots = partial(client.get_protections, 'Coffee|House')
    prots = call_and_ret(get_prots)
    return len(prots) == 2


@magnitude(norm=20, big=550, huge=2000)
def test_backlinks(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_bls = partial(client.get_backlinks, 'Coffee', limit)
    bls = call_and_ret(get_bls)
    '''
    Nonexistent title returns []
    '''
    return len(bls) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_random(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_fifty_random = partial(client.get_random, limit)
    pages = call_and_ret(get_fifty_random)
    return len(pages) == limit


@magnitude(norm=5, big=550, huge=2000)
def test_lang_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_langs = partial(client.get_language_links, 'Coffee', limit)
    lang_list = call_and_ret(get_coffee_langs)
    return len(lang_list) == limit


@magnitude(norm=5, big=550, huge=2000)
def test_interwiki_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_iwlinks = partial(client.get_interwiki_links, 'Coffee', limit)
    iw_list = call_and_ret(get_coffee_iwlinks)
    return len(iw_list) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_external_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_elinks = partial(client.get_external_links, 'Croatian War of Independence', limit)
    el_list = call_and_ret(get_coffee_elinks)
    # external links are expected to be unique
    assert len(set(el_list)) == len(el_list)
    return len(el_list) == limit


#def test_feedback_v4(limit):  # no longer available, see feedback.py for info
#    get_v4 = GetFeedbackV4('604727')
#    v4_list = call_and_ret(get_v4)
#    return len(v4_list) > 1


def test_feedback_v5(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_v5 = partial(client.get_feedback_v5, '604727')  # TODO: support ints
    v5_list = call_and_ret(get_v5)
    return isinstance(v5_list, list)


@magnitude(norm=10, big=550, huge=2000)
def test_revisions(limit):
    # note: hardcoded to 10 revisions; the decorator's limit is unused
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revs = partial(client.get_page_revision_infos, 'Coffee', 10)
    rev_list = call_and_ret(get_revs)
    return len(rev_list) == 10


def test_missing_revisions(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revs = partial(client.get_page_revision_infos, 'Coffee_lololololol')
    rev_list = call_and_ret(get_revs)
    '''
    Should return 'missing' and negative pageid
    '''
    return len(rev_list) == 0


@magnitude(norm=20, big=550, huge=2000)
def test_transclusions(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_transcludes = partial(client.get_transcludes, 'Template:ArticleHistory', limit)
    tr_list = call_and_ret(get_transcludes)
    '''
    Nonexistent title returns []
    '''
    return len(tr_list) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_resolve_subjects(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_res_transcludes = partial(client.get_transcludes, 'Template:ArticleHistory',
                                  limit,
resolve_to_subject=True) 220 | tr_list = call_and_ret(get_res_transcludes) 221 | tr_list = [t.get_subject_info() for t in tr_list] 222 | return len(tr_list) == limit and all([t.is_subject_page for t in tr_list]) 223 | 224 | 225 | def test_current_content(limit): 226 | client = WapitiClient('mahmoudrhashemi@gmail.com') 227 | get_page = partial(client.get_current_content, 'Coffee') 228 | page = call_and_ret(get_page) 229 | return page[0].title == 'Coffee' 230 | 231 | 232 | def test_current_content_redirect(limit): 233 | client = WapitiClient('mahmoudrhashemi@gmail.com') 234 | get_page = partial(client.get_current_content, 'Obama') 235 | page = call_and_ret(get_page) 236 | return page[0].title == 'Barack Obama' 237 | 238 | 239 | def test_current_talk_content(limit): 240 | client = WapitiClient('mahmoudrhashemi@gmail.com') 241 | get_talk_page = partial(client.get_current_talk_content, 'Obama') 242 | page = call_and_ret(get_talk_page) 243 | return page[0].title == 'Talk:Barack Obama' 244 | 245 | 246 | @magnitude(norm=20, big=550, huge=2000) 247 | def test_flatten_category(limit): 248 | client = WapitiClient('mahmoudrhashemi@gmail.com') 249 | get_flat_cat = partial(client.get_flattened_category, 'History', limit) 250 | cat_infos = call_and_ret(get_flat_cat) 251 | assert len(set([ci.title for ci in cat_infos])) == len(cat_infos) 252 | return len(cat_infos) == limit 253 | 254 | 255 | @magnitude(norm=10, big=550, huge=2000) 256 | def test_cat_mem_namespace(limit): 257 | client = WapitiClient('mahmoudrhashemi@gmail.com') 258 | get_star_portals = partial(client.get_category, 259 | 'Astronomy_portals', 260 | limit, 261 | namespace=['100']) 262 | portals = call_and_ret(get_star_portals) 263 | return len(portals) == limit 264 | 265 | 266 | @magnitude(norm=20, big=550, huge=2000) 267 | def test_cat_pages_recursive(limit): 268 | client = WapitiClient('mahmoudrhashemi@gmail.com') 269 | get_cat_pages_rec = partial(client.get_category_articles_recursive, 270 | 'Africa', 271 | limit, 
272 | resolve_to_subject=True) 273 | pages = call_and_ret(get_cat_pages_rec) 274 | return len(pages) == limit 275 | 276 | 277 | @magnitude(norm=11, big=550, huge=2000) 278 | def test_cat_list(limit): 279 | client = WapitiClient('mahmoudrhashemi@gmail.com') 280 | get_cat_list = partial(client.get_category_list, 'Physics', limit) 281 | pages = call_and_ret(get_cat_list) 282 | return len(pages) == limit 283 | 284 | 285 | @magnitude(norm=4, big=550, huge=2000) 286 | def test_get_images(limit): 287 | client = WapitiClient('mahmoudrhashemi@gmail.com') 288 | get_imgs = partial(client.get_images, 'Coffee', limit) 289 | imgs = call_and_ret(get_imgs) 290 | return len(imgs) == limit 291 | 292 | 293 | @magnitude(norm=5, big=550, huge=2000) 294 | def test_get_links(limit): 295 | client = WapitiClient('mahmoudrhashemi@gmail.com') 296 | get_links = partial(client.get_links, 'Coffee', limit) 297 | links = call_and_ret(get_links) 298 | return len(links) == limit 299 | 300 | 301 | def test_coordinates(limit): 302 | client = WapitiClient('mahmoudrhashemi@gmail.com') 303 | get_coordinates = partial(client.get_coordinates, ['White House', 'Golden Gate Bridge', 'Mount Everest']) 304 | coords = call_and_ret(get_coordinates) 305 | return len(coords) == 3 306 | 307 | 308 | def test_geosearch(limit): 309 | client = WapitiClient('mahmoudrhashemi@gmail.com') 310 | geosearch = partial(client.geo_search, ('37.8197', '-122.479')) 311 | geo = call_and_ret(geosearch) 312 | return len(geo) > 1 313 | 314 | 315 | @magnitude(norm=20, big=550, huge=2000) 316 | def test_get_user_contribs(limit): 317 | client = WapitiClient('mahmoudrhashemi@gmail.com') 318 | get_contribs = partial(client.get_user_contribs, 'Jimbo Wales', limit) 319 | contribs = call_and_ret(get_contribs) 320 | return len(contribs) == limit 321 | 322 | 323 | def test_get_meta(limit): 324 | client = WapitiClient('mahmoudrhashemi@gmail.com') 325 | get_source_info = client.get_source_info 326 | meta = call_and_ret(get_source_info) 327 | 
    return len(meta[0].interwiki_map) > 1


def test_get_revision_infos(limit):
    # Batch revision lookup by revision id; expects one info per id.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revisions = partial(client.get_revision_infos, ['538903663', '539916351', '531458383'])
    rev_infos = call_and_ret(get_revisions)
    return len(rev_infos) == 3


def test_get_image_info(limit):
    # Single image-info lookup; pins the exact upload.wikimedia.org URL.
    # NOTE(review): depends on live site data -- the URL may change.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_image_info = partial(client.get_image_infos, 'File:Logo.gif')
    image_info = call_and_ret(get_image_info)
    return image_info[0].url == 'http://upload.wikimedia.org/wikipedia/en/e/ea/Logo.gif'


# Disabled test kept for reference (commented out as a string literal).
"""
@magnitude(norm=20, big=550, huge=2000)
def test_get_all_image_infos(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_all_img = partial(client.get_all_image_infos, limit)
    all_imgs = call_and_ret(get_all_img)
    return len(all_imgs) == limit
"""


@magnitude(norm=20, big=550, huge=2000)
def test_get_templates(limit):
    # Templates transcluded on 'Coffee', capped at *limit*.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_templates = partial(client.get_templates, 'Coffee', limit)
    tmpl = call_and_ret(get_templates)
    return len(tmpl) == limit


# Disabled test kept for reference (commented out as a string literal).
"""
@magnitude(norm=1, big=5, huge=600)
def test_query_pages(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    from operations.misc import GetQueryPage as gqp
    qp_types = gqp.known_qps
    ret = []
    for qpt in qp_types:
        get_pages = partial(client.get_query_page, qpt, limit)
        ret.extend(call_and_ret(get_pages))
    return len(ret) == limit * len(qp_types)
"""

# Disabled test kept for reference (commented out as a string literal).
"""
def test_nonexistent_query_page(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    try:
        non_existent_qp = partial(client.get_query_page, 'FakeQueryPage')
        call_and_ret(non_existent_qp)
    except ValueError:
        return True
"""
385 | 386 | @magnitude(norm=20, big=550, huge=2000) 387 | def test_recent_changes(limit): 388 | client = WapitiClient('mahmoudrhashemi@gmail.com') 389 | get_recent_changes = partial(client.get_recent_changes, limit) 390 | recent_changes = call_and_ret(get_recent_changes) 391 | return len(recent_changes) == limit 392 | 393 | 394 | def create_parser(): 395 | parser = ArgumentParser(description='Test operations') 396 | parser.add_argument('functions', nargs='*') 397 | parser.add_argument('--pdb_all', '-a', action='store_true') 398 | parser.add_argument('--pdb_error', '-e', action='store_true') 399 | parser.add_argument('--do_print', '-p', action='store_true') 400 | parser.add_argument('--magnitude', '-m', 401 | default=DEFAULT_MAGNITUDE) 402 | return parser 403 | 404 | 405 | def main(): 406 | global PDB_ALL, PDB_ERROR, DO_PRINT 407 | parser = create_parser() 408 | args = parser.parse_args() 409 | PDB_ALL = args.pdb_all 410 | PDB_ERROR = args.pdb_error 411 | DO_PRINT = args.do_print 412 | if args.functions: 413 | tests = {} 414 | for func in args.functions: 415 | try: 416 | tests[func] = globals()[func] 417 | except KeyError: 418 | print func, 'is not a valid test function' 419 | continue 420 | else: 421 | tests = dict([(k, v) for k, v in globals().items() 422 | if callable(v) and k.startswith('test_')]) 423 | results = {} 424 | for k, v in tests.items(): 425 | results[k] = v(args.magnitude) 426 | print k, results[k] 427 | return results 428 | 429 | if __name__ == '__main__': 430 | from pprint import pprint 431 | pprint(main()) 432 | --------------------------------------------------------------------------------