├── .gitignore ├── API-NOTES.md ├── LICENSE ├── README.md ├── examples ├── basic.py └── basic_gevent.py ├── setup.py ├── tox.ini └── wapiti ├── NOTES.rst ├── __init__.py ├── client.py ├── compat.py ├── operations ├── __init__.py ├── _test_tmpls │ └── regr_moctezuma_parser_funcs.txt ├── base.py ├── category.py ├── conftest.py ├── feedback.py ├── files.py ├── links.py ├── meta.py ├── misc.py ├── models.py ├── params.py ├── protection.py ├── query_operations.py ├── rand.py ├── revisions.py ├── template_parser.py ├── templates.py ├── test_basic.py ├── test_operations.py ├── user.py └── utils.py ├── ransom.py └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc* 2 | *.pyo* 3 | *.sublime-project 4 | *.sublime-workspace 5 | *.egg-info 6 | dist/* 7 | .tox/* 8 | .DS_Store 9 | .*.swp 10 | *~ 11 | ._* 12 | .\#* 13 | \#*\# 14 | *.csv 15 | *.json 16 | log* 17 | build/* 18 | *.swp 19 | -------------------------------------------------------------------------------- /API-NOTES.md: -------------------------------------------------------------------------------- 1 | # API improvement notes # 2 | Some MediaWiki API queries are inconsistent, broken, or otherwise in want of improvement. Similar notes are recorded at [Mediawiki:Requests for comment/API roadmap](http://www.mediawiki.org/wiki/Requests_for_comment/API_roadmap). 3 | 4 | * list=usercontribs 5 | 6 | - Docs mention `uccontinue`, but the query uses `ucstart` for continue functionality. 7 | - `ucprop` should be consistent with `rvprop`: it is missing `flags`, `sha1`, `ids` (parentid) 8 | 9 | * missing title 10 | - Throws a warning `unrecognized parameter`, but it should throw an error. 11 | 12 | * prop=extracts supports multiple titles, but requires a continue to 13 | get each individual extract, while returning the IDs/titles of all 14 | the requested pages each time. 
15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Mahmoud Hashemi and Stephen LaPorte 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * The names of the contributors may not be used to endorse or 16 | promote products derived from this software without specific 17 | prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | wapiti 2 | ====== 3 | 4 | ![Wapiti](http://upload.wikimedia.org/wikipedia/commons/thumb/5/59/Elk_1_%28PSF%29.png/212px-Elk_1_%28PSF%29.png) 5 | 6 | A MediaWiki API wrapper in Python for humans and elk. 7 | 8 | Wapiti makes it simple for python scripts to retrieve data from the 9 | [Wikipedia API](https://en.wikipedia.org/w/api.php). No more worries 10 | about query limits, continue strings, or formatting. Just ask for data 11 | and get structured results. 12 | 13 | Example 14 | ------- 15 | 16 | Let's get the members of Wikipedia's [Category:Lists of 17 | superlatives](http://en.wikipedia.org/wiki/Category:Lists_of_superlatives). First, 18 | initialize a `WapitiClient` and change any settings. Next, run the 19 | operation `get_category_articles_recursive` on the category `'Lists of 20 | superlatives'`, with a limit of `10`: 21 | 22 | ```python 23 | >>> import wapiti 24 | >>> client = wapiti.WapitiClient('domas@mituzas.lt') 25 | >>> client.get_category_articles_recursive('Lists of superlatives', 10) 26 | [PageInfo(title=u'The Fifty Worst Films of All Time', page_id=1820513, ns=0), 27 | PageInfo(title=u"World's busiest city airport systems by passenger traffic", page_id=33167241, ns=0), 28 | PageInfo(title=u'List of oldest Major League Baseball players', page_id=1947309, ns=0), 29 | PageInfo(title=u'List of firsts in India', page_id=3752148, ns=0), 30 | PageInfo(title=u'List of the first female holders of political offices in Europe', page_id=18904865, ns=0), 31 | PageInfo(title=u'List of the busiest airports in the Republic of Ireland', page_id=26712480, ns=0), 32 | PageInfo(title=u'List of longest bridges above water in India', page_id=32312925, ns=0), 33 | PageInfo(title=u'List of the busiest airports in China', page_id=33396262, ns=0), 34 | PageInfo(title=u'List of most common surnames in Asia', 
page_id=26810011, ns=0), 35 | PageInfo(title=u'List of largest mosques', page_id=20897194, ns=0)] 36 | ``` 37 | 38 | This returns a list of `PageInfo` objects for the category's members. 39 | 40 | Operations 41 | ---------- 42 | 43 | Operations usually take two positional arguments: the `query_param` 44 | (page, category, template, etc.), and `limit` (maximum number of 45 | results). 46 | 47 | - `get_random(limit)` : returns a list of `PageIdentifiers` for random pages. 48 | - `get_category_articles(category, limit)` : returns a list of `PageIdentifiers` for the articles or talk pages in a category. If you are interested in getting pages beyond of the main and talk namespace, try `get_category`. 49 | - `get_category_articles_recursive(category, limit)` : returns a list of `PageInfos` for the articles in a category and its subcategories. If you are interested in getting pages beyond of the main and talk namespace, try `get_category_recursive`. 50 | - `get_transcludes(page, limit)` : returns a list of `PageIdentifiers` for the articles that embed (transclude) a page. For example, see the pages that embed [Template:Infobox](http://en.wikipedia.org/wiki/Special:WhatLinksHere/Template:Infobox) with `client.get_transcludes('Infobox')`. 51 | - `get_backlinks(page, limit)` : returns a list of `PageIdentifiers` for pages that internally link back to a page. For example, see the pages that [link to 'Coffee'](http://en.wikipedia.org/wiki/Special:WhatLinksHere/Coffee) with `client.get_backlinks('Coffee')`. 52 | - `get_revision_infos(page, limit)` : returns a list of `RevisionInfos` for a page's revisions. 53 | - `get_current_content(page, limit)` : returns a list of `Revisions` (including text content) for the page's most recent revisions. 54 | 55 | Other operations are available: see wapiti/operations 56 | 57 | Models 58 | ------ 59 | 60 | Models describe the structure for result data. 
For the full list of 61 | models, see wapiti/operations/models.py 62 | 63 | ### PageIdentifier ### 64 | 65 | A `PageIdentifier` describes the standard information available for a page. 66 | 67 | - **Title** : unique name of the page 68 | - **ID** : the primary key for the page 69 | - **Namespace** : the [namespace](http://en.wikipedia.org/wiki/Wikipedia:Namespace) number, which can indicate whether the page is an article, discussion page, user page, template, category, etc. 70 | - **Source** : the MediaWiki API where this page was retrieved 71 | - **Normalized title** : the title may have been normalized by MediaWiki, for example, by resolving a redirect 72 | - **Subject ID** : the ID of the corresponding page in the basic namespace 73 | - **Talk page ID** : the ID of the corresponding page in the [talk namespace](http://en.wikipedia.org/wiki/Help:Using_talk_pages) 74 | 75 | ### RevisionInfo ### 76 | 77 | A `RevisionInfo` describes the standard information for a revision. 78 | 79 | * **PageIdentifier** : the page's `PageIdentifier` 80 | * **Subject revision ID** : the primary key for a revision 81 | * **Parent revision ID** : the previous revision to the page 82 | * **User text** : the editor's username, or IP address for an unregistered user 83 | * **User ID** : the unique id of the user who submitted this revision. It may be 0 for an unregistered user. 84 | * **Size** : the length of the article at this revision 85 | * **Timestamp** : timestamp in UTC when this revision was submitted 86 | * **SHA1** : the SHA-1 hash of revision text in base-36. 87 | * **Edit summary** : the [edit summary](http://meta.wikimedia.org/wiki/Help:Edit_summary) (or 'comment') for a contribution. In some cases, it may have been deleted (or 'oversighted') and unavailable through the API. 88 | * **Tags** : brief messages that MediaWiki (or an extension) may automatically place next to certain edits. 
[Tags](http://en.wikipedia.org/wiki/Wikipedia:Tags) are not common, usually placed by Edit Filter or VisualEditor extensions. 89 | * **Parsed** : whether the page is parsed (html) or not (wikitext) 90 | 91 | ### Revision ### 92 | 93 | A `Revision` includes the same data as `RevisionInfo`, plus full text content. 94 | 95 | TODO 96 | ---- 97 | - Logging 98 | - Client settings 99 | - Port more API calls 100 | - Retry and timeout behaviors 101 | - Get my shit together and continue work on the HTTP client. 102 | - Underscoring args 103 | - Pause/resume 104 | - Better differentiation between the following error groups: 105 | * Network/connectivity 106 | * Logic 107 | * Actual Mediawiki API errors ('no such category', etc.) 108 | - Relatedly: Save MediaWiki API warnings 109 | - Types of API calls: 110 | * single argument -> multiple results (get category) 111 | * many arguments -> up to one result per argument (get protections) 112 | * multiple arguments -> multiple results per argument (get language links) 113 | * TODO: establish return format convention for this 114 | - Need generic support for: 115 | * APIs which support both pageid and title lookup 116 | * Redirect following 117 | - Full docs 118 | -------------------------------------------------------------------------------- /examples/basic.py: -------------------------------------------------------------------------------- 1 | from wapiti import WapitiClient 2 | 3 | client = WapitiClient('you@example.com') 4 | 5 | res = [] 6 | cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics') 7 | for cat in cats: 8 | res.append(client.get_category_recursive(cat, 1000)) 9 | 10 | print res[0][0] 11 | 12 | import pdb;pdb.set_trace() 13 | -------------------------------------------------------------------------------- /examples/basic_gevent.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent import monkey 3 | monkey.patch_all() 4 | 5 | from wapiti 
import WapitiClient 6 | 7 | client = WapitiClient('you@example.com') 8 | 9 | cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics') 10 | tasks = [gevent.spawn(client.get_category_recursive, x, 1000) for x in cats] 11 | gevent.wait(tasks) 12 | 13 | print tasks[0].value[0] 14 | 15 | import pdb;pdb.set_trace() 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wapiti 3 | ~~~~~~ 4 | 5 | Wapiti is a Wikipedia API client focused on providing a consistent 6 | and performant abstraction around the widely varying Mediawiki API 7 | endpoints and data models. Read-only APIs are first priority, but 8 | write operations are on the way. See `the Github project 9 | <https://github.com/mahmoud/wapiti>`_ for more info. 10 | 11 | :copyright: (c) 2013 by Mahmoud Hashemi and Stephen LaPorte 12 | :license: BSD, see LICENSE for more details. 13 | 14 | """ 15 | 16 | import sys 17 | from setuptools import setup 18 | 19 | 20 | __author__ = 'Mahmoud Hashemi' 21 | __version__ = '0.1' 22 | __contact__ = 'mahmoudrhashemi@gmail.com' 23 | __url__ = 'https://github.com/mahmoud/wapiti' 24 | __license__ = 'BSD' 25 | 26 | 27 | if sys.version_info >= (3,): 28 | raise NotImplementedError("wapiti Python 3 support en route to your location") 29 | 30 | 31 | setup(name='wapiti', 32 | version=__version__, 33 | description="A Wikipedia API client for humans and elk.", 34 | long_description=__doc__, 35 | author=__author__, 36 | author_email=__contact__, 37 | url=__url__, 38 | packages=['wapiti', 'wapiti.operations'], 39 | include_package_data=True, 40 | zip_safe=False, 41 | license=__license__, 42 | platforms='any', 43 | classifiers=[ 44 | 'Intended Audience :: Developers', 45 | 'Topic :: Software Development :: Libraries', 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 2.7', 48 | 'Topic :: Internet :: WWW/HTTP', 49 | 'Topic :: Education', 50 | 
'Development Status :: 3 - Alpha'] 51 | ) 52 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | [testenv] 4 | commands=python ./wapiti/tests.py 5 | -------------------------------------------------------------------------------- /wapiti/NOTES.rst: -------------------------------------------------------------------------------- 1 | Notes 2 | ===== 3 | 4 | Notes on "multiargument" and "bijective": 5 | ----------------------------------------- 6 | 7 | There are lots of ways to classify operations, and these are just a 8 | couple. 9 | 10 | "Multiargument" operations can take more than one search parameter 11 | at once, such as the GetProtections operation. Others can only take 12 | one argument at a time, like GetCategory. 13 | 14 | "Bijective" operations return at most one result per argument. GetProtections 15 | is an example of a bijective query. Bijective queries do not require an 16 | explicit limit on the number of results to be set by the user. 17 | 18 | Going forward, these attributes can be determined as follows: 19 | 20 | - Multiargument: determined by looking at an operation's 21 | `input_field`. If it is a SingleParam, then multiargument is false, 22 | if it's a MultiParam, then multiargument is true. 23 | 24 | - Bijective: determined by looking at an operation's `output_type`, 25 | which more accurately describes the *per-parameter* return type. If 26 | it is a list, then bijective is false, if it's a bare type, then 27 | bijective is true. 
28 | 29 | 30 | Fodder from DSL/dataflow refactor 31 | --------------------------------- 32 | 33 | GetCategoryPagesRecursive 34 | (FlattenCategory -> GetCategoryPages -> Wikipedia API call -> URL fetch ) 35 | (PageInfos <- PageInfos <- MediaWikiCall <- RansomResponse) 36 | 37 | operation's input_field = explicit or first field of chain 38 | 39 | def process(op): 40 | res = op.process() 41 | return self.store_results(res) 42 | 43 | what about producing subops? 44 | 45 | def process(): 46 | task = self.get_current_task() 47 | res = task.process() 48 | if res and isinstance(res[0], Operation): 49 | self.store_subops(res) 50 | return # return subops? 51 | return self.store_results(res) # returns *new* results 52 | 53 | GetCategoryPagesRecursive 54 | (FlattenCategory --(CatInfos)-> 55 | GetCategoryPages --("APIParamStructs")-> 56 | MediawikiCall [--(url)-> URL fetch]) 57 | 58 | An "APIParamStruct" is really just something with the API url and param 59 | dictionary, so QueryOperations themselves could be viewed as 60 | APIParamStructs. In other words, hopefully no new model type needed 61 | just for that. 62 | 63 | At its most basic level, an Operation is something which: 64 | 65 | - Has a type-declared input field, and a declared return type 66 | - Has a process() function that returns results (of the output type) 67 | or raises NoMoreResults 68 | - Most likely takes a WapitiClient as a 'client' keyword 69 | argument in its __init__() 70 | - Provides a uniform way of checking progress (checking if it's done) 71 | 72 | Some notes on Operation design/usage: 73 | 74 | - An Operation typically keeps a copy of its results internally, 75 | most likely a unique list of some sort, and should return only 76 | new results. 77 | - Calling an Operation directly calls process() repeatedly until the 78 | operation is complete, then returns the internally tracked results. 
79 | -------------------------------------------------------------------------------- /wapiti/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | from client import WapitiClient 4 | -------------------------------------------------------------------------------- /wapiti/client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | 5 | ''' 6 | The beginnings of a better Mediawiki API library (with certain builtin 7 | affordances for the more popular wikis and extensions). Most of what 8 | you see below is implementation internals, the public API isn't set yet, 9 | but check back soon. 10 | 11 | # TODO 12 | * Create client class 13 | * Port more API calls 14 | * Retry and timeout behaviors 15 | * Get my shit together and continue work on the HTTP client. 16 | * Underscoring args 17 | * pause/resume 18 | * better differentiation between the following error groups: 19 | * Network/connectivity 20 | * Logic 21 | * Actual Mediawiki API errors ('no such category', etc.) 
22 | * Relatedly: Save MediaWiki API warnings 23 | 24 | Types of API calls: 25 | * single argument -> multiple results (get category) 26 | * many arguments -> up to one result per argument (get protections) 27 | * multiple arguments -> multiple results per argument (get language links) 28 | * TODO: establish return format convention for this 29 | 30 | Need generic support for: 31 | * APIs which support both pageid and title lookup 32 | * Redirect following 33 | ''' 34 | import re 35 | 36 | from operations import ALL_OPERATIONS, DEFAULT_API_URL 37 | 38 | DEFAULT_TIMEOUT = 15 39 | import socket 40 | socket.setdefaulttimeout(DEFAULT_TIMEOUT) # TODO: better timeouts for reqs 41 | 42 | 43 | _camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))') 44 | 45 | 46 | def camel2under(string): 47 | return _camel2under_re.sub(r'_\1', string).lower() 48 | 49 | 50 | def under2camel(string): 51 | return ''.join(w.capitalize() or '_' for w in string.split('_')) 52 | 53 | 54 | class BoundOperation(object): # TODO: Operation subtype? 55 | def __init__(self, op_type, client): 56 | self.client = client 57 | self.op_type = op_type 58 | self.op_inst = None 59 | 60 | def __call__(self, *a, **kw): 61 | if not self.op_inst: 62 | kw.setdefault('client', self.client) 63 | self.op_inst = self.op_type(*a, **kw) 64 | kw.pop('client') 65 | return self.op_inst() 66 | 67 | def __repr__(self): 68 | cn = self.__class__.__name__ 69 | if self.op_inst: 70 | return '<%s %r bound to %r>' % (cn, self.op_inst, self.client) 71 | op_cn = self.op_type.__name__ 72 | return '<%s %s bound to %r>' % (cn, op_cn, self.client) 73 | 74 | 75 | class UnboundOperation(object): # TODO: Operation subtype? 
76 | def __init__(self, op_type): 77 | self.op_type = op_type 78 | 79 | def bind(self, client): 80 | return BoundOperation(self.op_type, client) 81 | 82 | def __get__(self, obj, obj_type=None): 83 | if obj_type and isinstance(obj, WapitiClient): 84 | return BoundOperation(self.op_type, obj) 85 | return self 86 | 87 | def __repr__(self): 88 | cn = self.__class__.__name__ 89 | return '<%s %r>' % (cn, self.op_type) 90 | 91 | 92 | class WapitiClient(object): 93 | """ 94 | Provides logging, caching, settings, and a convenient interface 95 | to most (all?) operations. 96 | """ 97 | def __init__(self, 98 | user_email, 99 | api_url=None, 100 | is_bot=False, 101 | init_source=True, 102 | debug=False): 103 | # set settings obj 104 | # set up source (from api_url in settings) 105 | # then you're ready to call ops 106 | self.user_email = user_email 107 | self.api_url = api_url or DEFAULT_API_URL 108 | self.is_bot = is_bot 109 | self.debug = debug 110 | 111 | if init_source: 112 | self._init_source() 113 | 114 | def _init_source(self): 115 | # TODO: no input_field and single respones 116 | self.source_info = self.get_source_info()[0] 117 | 118 | @property 119 | def op_names(self): 120 | return list(sorted(self.op_map.keys())) 121 | 122 | def print_usage(self, query=None): 123 | op_names = self.op_names 124 | if query: 125 | op_names = [o for o in self.op_names if query.lower() in o.lower()] 126 | print '\n'.join(self.op_map[name].help_str for name in op_names) 127 | 128 | # TODO: configurable operations 129 | op_map = dict([(op.__name__, op) for op in ALL_OPERATIONS]) 130 | unbound_op_map = dict([(camel2under(op_name), UnboundOperation(op)) 131 | for op_name, op in op_map.items()]) 132 | unbound_op_set = set(unbound_op_map.values()) 133 | locals().update(unbound_op_map) 134 | -------------------------------------------------------------------------------- /wapiti/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | import sys 3 | 4 | is_py2 = sys.version_info[0] == 2 5 | is_py3 = sys.version_info[0] == 3 6 | 7 | from collections import OrderedDict # TODO 8 | 9 | if is_py2: 10 | from urllib import quote, unquote, quote_plus, unquote_plus, urlencode 11 | from urlparse import urlparse, urlunparse, urljoin, urlsplit, urldefrag 12 | from urllib2 import parse_http_list 13 | import cookielib 14 | from Cookie import Morsel 15 | from StringIO import StringIO 16 | 17 | unicode, str, bytes, basestring = unicode, str, str, basestring 18 | elif is_py3: 19 | from urllib.parse import (urlparse, urlunparse, urljoin, urlsplit, 20 | urlencode, quote, unquote, quote_plus, 21 | unquote_plus, urldefrag) 22 | from urllib.request import parse_http_list 23 | from http import cookiejar as cookielib 24 | from http.cookies import Morsel 25 | from io import StringIO 26 | 27 | unicode, str, bytes, basestring = str, bytes, bytes, str 28 | else: 29 | raise NotImplementedError('welcome to the future, I guess. (report this)') 30 | 31 | 32 | # The unreserved URI characters (RFC 3986) 33 | UNRESERVED_SET = frozenset( 34 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 35 | + "0123456789-._~") 36 | 37 | 38 | def unquote_unreserved(uri): 39 | """Un-escape any percent-escape sequences in a URI that are unreserved 40 | characters. This leaves all reserved, illegal and non-ASCII bytes encoded. 
41 | """ 42 | parts = uri.split('%') 43 | for i in range(1, len(parts)): 44 | h = parts[i][0:2] 45 | if len(h) == 2 and h.isalnum(): 46 | c = chr(int(h, 16)) 47 | if c in UNRESERVED_SET: 48 | parts[i] = c + parts[i][2:] 49 | else: 50 | parts[i] = '%' + parts[i] 51 | else: 52 | parts[i] = '%' + parts[i] 53 | return ''.join(parts) 54 | 55 | 56 | def requote(uri): 57 | return quote(unquote_unreserved(uri), safe="!#$%&'()*+,/:;=?@[]~") 58 | -------------------------------------------------------------------------------- /wapiti/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from base import WapitiException, DEFAULT_API_URL, OperationMeta 5 | from models import PageIdentifier, CategoryInfo, RevisionInfo 6 | 7 | import category 8 | import feedback 9 | import files 10 | import links 11 | import meta 12 | import misc 13 | import protection 14 | import rand 15 | import revisions 16 | import templates 17 | import user 18 | import query_operations 19 | 20 | for op in OperationMeta._all_ops: 21 | globals()[op.__name__] = op 22 | 23 | ALL_OPERATIONS = tuple(OperationMeta._all_ops) 24 | -------------------------------------------------------------------------------- /wapiti/operations/_test_tmpls/regr_moctezuma_parser_funcs.txt: -------------------------------------------------------------------------------- 1 | {{ArticleHistory 2 | |action1=GAN 3 | |action1date=14:08, 30 March 2008 (UTC) 4 | |action1result=not listed 5 | |action1oldid=201814944 6 | |currentstatus=FGAN 7 | }} 8 | {{WikiProjectBannerShell|1= 9 | {{WikiProject Biography|living=no|class=C|listas=Moctezuma Ii}} 10 | {{WikiProject Mesoamerica|aztec=yes|class=C|importance=Top}} 11 | {{WikiProject Mexico|class=c|importance=Mid}}}} 12 | 13 | =="Last king"== 14 | On a page entitled just "Moctezuma," (which I have redirected here) someone had written 15 | "'''Moctezuma''' was the last 
[[:Aztecs|Aztec]] king." 16 | Please note that that is both incomplete and false -- there were two Moctezumas, neither of which were "last." 17 | 18 | If we're not careful, the term "Wikipedia article" will come to be used to mean "authoritive-sounding nonsense." 19 | 20 | :In a way [[Motecuhzoma Xocoyotzin]] was the last true [[Mexica]] king of Tenochtitlan since during his rein, Tenochtitlan was raided by Cortez. [[User:Myke209|Myke]] 13:04, 12 April 2006 (UTC) 21 | 22 | 23 | The last [[Mexica]] [[tlatoani]] was [[Cuauhtémoc]]. And it's "Cortés" not "Cortez" --[[User:Sukozo|Sukozo]] ([[User talk:Sukozo|talk]]) 11:57, 3 November 2011 (UTC) 24 | 25 | ==No source== 26 | I cut the anonamously added "Some historians point out however, that the spaniards, realizing that Moctezuma had lost his influence and rule over his aztec subjects, found him useless and killed him right after the stoning incident." Source? As far as I remember, this isn't how Diaz de Castillo nor Prescott tell it. Wondering simply, [[User:Infrogmation|Infrogmation]] 04:49 3 Jul 2003 (UTC) 27 | : if i remember the original source of this is Gomarra (and confirmed by the florentine codex). This declaration from Gomarra wa the reazon why Castillo wrote "The Real history...". 28 | ::The florentine codex doesn't mention how motecuzoma died - it says that nahuas found his body on the lakebed early in the morning (i think after the noche triste)[[User:Maunus|·Maunus·ƛ·]] 21:16, 11 May 2009 (UTC) 29 | 30 | ==Warrior, not scholar== 31 | According to the ABC-CLIO world history database, Montezuma was a warrior, not a scholar, which directly contradicts what is said in the article. perhaps this should be changed. 32 | 33 | : There is no contradiction, all aztec upper classes were warriors. To have the title of Tlatoani, Moctezuma must have been responsible of capturing about a dozen war prissioners. Also as a tlatoani, he continue with the aztec military expansion. But he was more interested in religion. 
A warrior like his uncle would have not care about the divinity of Cortez. But intead of continue his military acounts he prefered to be the Head of the Calmecac, insted of the Telpochcalli, and live the life of a priest. He did not wanted to be elected tlatoani. . [[User:Nanahuatzin|Nanahuatzin]] 34 | 35 | ==Contact with the Spanish, sources== 36 | You all should really reconsider the portion of the article concerning Spanish contact. The problems with the sources you cite are enormous. Some modern research by those who are not as biased as the authors you've read might do this article some good. It's pretty much been decided in the past 20 years or so that Moctezuma didn't believe Cortés was Quetzalcoatl, and there's no native evidence that Quetzalcoatl was even supposed to return. Report that "the sources say that..." if you want, but don't present what the sources say as if it's fact. That's just lazy. 37 | 38 | : mmhh decided by whom?... Most of the info here comes from primary sources, like Alva ixtlixochitl, the mexicayotl cronicle, the ramirez codex, Camargo, Sahagun etc. The legend of the return of queztalcoatl, (or more correctly, the return of Ce Acatl Topiltzin Queztalcoatl, great priest of queztalcoatl) is recorded By Sahagun. Is it true that there is a hotly debate on why Moctezuma reacted the way he did, maybe we shoudl expand that... [[User:Nanahuatzin|Nanahuatzin]] 18:04, 5 December 2005 (UTC) 39 | 40 | ==Black legend== 41 | The whole paragraph starting from "They gave the Spanish gold flags" and ending with "they crave gold" is clearly an example of black legend. The writer pretend it has been written by an Aztec but in fact it's a fictional extract from Carl Sagan's Cosmos, therefore it's a subjective point of view which should not be included in a serious article. 42 | 43 | : Sorry , i have never read that book. Original Source Florentine Codex. Book XII, by Bernardino de Sahagun. 
I used the spanish translation from "The other side of the conquest", by Leon Portilla, the main scholar on nahuatl language. Yes it,s a bit colorful, but I included to show that not al aztecs view spaniards as gods o else... [[User:Nanahuatzin|Nanahuatzin]] 08:02, 13 March 2006 (UTC) 44 | 45 | Estoy de acuerdo de con tu explicación, se puede decir lo mismo (que no todos los aztecas consideraban a los españoles como dioses) con otras palabras, pero es menester señalar que lo anterioremente citado podía ofender y ofendía la sensibilidad de un español. 46 | 47 | :(Attempt at English translation of above comment) 48 | :I agree with your explanation. We could say the same thing (that not all Aztecs considered the Spaniards to be gods) using other words, but it is (menester) to note that the earlier text could offend and has offended the sensibility of a Spaniard. 49 | :(end translation) 50 | :[[User:Richardshusr|Richard]] 18:53, 1 April 2006 (UTC) 51 | 52 | :Entendido el punto. Procurare ser mas cuidadoso, sin embargo, tambien me gustaria comentar que es por esa razon que recurri a citar una fuente primaria, en lugar de solo referme a ella. EL publico en general esta acostumbrado solo a escuchar los testimonios europeos... [[User:Nanahuatzin|Nanahuatzin]] 16:16, 13 March 2006 (UTC) 53 | 54 | ::(Attempt at English translation of above comment) 55 | ::I get the point. I will be more careful 56 | ::(translator's note: the previous sentence is a loose translation, my Spanish is not that strong, please correct the translation if you can). 57 | ::However, I also prefer to comment that it is for this reason that I went back to cite a primary source instead of just referring to it. The general public is used to only hearing the European testimonies (translator's note: i.e. European point of view) 58 | ::[[User:Richardshusr|Richard]] 18:53, 1 April 2006 (UTC) 59 | 60 | ::Please offer English translations for Spanish comments. 
[[User:Pietdesomere|Piet]] 09:03, 1 April 2006 (UTC) 61 | ::: Ok.. [[User:Nanahuatzin|Nanahuatzin]] 20:52, 22 April 2006 (UTC) 62 | 63 | ==Moctezuma's father== 64 | Moctezuma was Axayácatl's son, not Ahuízotl's. Same goes for Cuitláhuac. Ahuízotl was Cuauhtémoc's father. 65 | 66 | == Name == 67 | 68 | The article claims that Moctezuma is ostensibly the preferred name. I cannot confirm or deny this. I am however quite certain that Montezuma is the more commonly used name, so I propose a change to that title. Relevant Wikipedia guidelines: 69 | *[[Wikipedia:Naming conventions (common names)]]. Quote: ''Wikipedia is not a place to advocate a title change in order to reflect recent scholarship. The articles themselves reflect recent scholarship but the titles should represent common usage.'' 70 | *[[Wikipedia:Naming conventions (use English)]]. Quote: ''If a native spelling uses different letters than the most common English spelling (eg, Wien vs. Vienna), only use the native spelling as an article title if it is more commonly used in English than the anglicized form. If you are talking about a person, country, town, movie or book, use the most commonly used English version of the name for the article, as you would find it in other encyclopedias and reference works.'' 71 | Awaiting reactions of course... [[User:Pietdesomere|Piet]] 09:13, 1 April 2006 (UTC) 72 | 73 | :Montezuma sounds awfull !!!!!!! (sorry you ask for my reaction...) I know this is an english enclyclopedia. But in the last years i have seen a trend to try to respect the original names: Sri-lanka, instead of Ceylan, Beijing instead of Pekin. So.. can we try to use Moctezuna, instead of the spaniard version of the name... Specially that even in Spain Montezuma is no longer used, (and not to mention that most mexicans would find montezuma offensive).. Please :) 74 | :Also.. 
if you look at Google you will find that most references to "Montezuma" do not refer to the Aztec Tlatoani
However, as you see above, we were convinced to use "Moctezuma" instead on the basis that
Some scholars prefer to use the genuinely accurate spellings ''Motecuhzoma'' or ''Moteuczoma'', but they're yet to really catch on. --[[User:Ptcamn|Ptcamn]] ([[User talk:Ptcamn|talk]]) 03:52, 4 January 2008 (UTC) 99 | :::Agreeing with Ptcamn. Reposting what i have recently written at [[Wikipedia_talk:WikiProject_Aztec/Terminology#Montezuma_vs._Moctezuma]]: 100 | :::The prefferred spelling in scholarly articles is '''Motecuhzoma''' if using Richard J. Andrews orthography which is becoming the most accepted in aztec studies. Another transliteration that is accpetable is Moteuczoma or Moteczoma but this is not commonly used. This is because unlike the two other forms ''moctezuma'' and ''montezuma'' it reflects his actual name in Nahuatl. It is composed of the three parts "mo" the reflexvive pronoun , "tecuh/teuc" "lord" and "zōma" "frown" - the other forms introduce spurious letters like "n" or turn "tecu" into "cte" for no good reason. [[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:03, 4 January 2008 (UTC) 101 | 102 | == Cortes leaving to meet Narvaez == 103 | 104 | The sentence about Cortes leaving to meet Narvaez does not express the core point that Narvaez had been sent to arrest Cortes. When I have time I will try to fix this. 105 | [[User:Richardshusr|Richard]] 18:43, 1 April 2006 (UTC) 106 | 107 | It's fixed. I fixed it a few weeks ago but forgot to leave a note here. 108 | --[[User:Richardshusr|Richard]] 06:51, 5 May 2006 (UTC) 109 | 110 | == Date of Moctezuma's death and of La Noche Triste == 111 | It seems to me hard to reconcile the date of M's death as given in this article (July 1) with the date of [[La Noche Triste]] (also July 1). Wasn't the latter some days after the former? [[User:Alpheus|Alpheus]] 112 | 113 | :: right, la "noche triste" was 30 of june, 1520 . Cortez delayed to run out of the city, because he still had the hight priest has hostage, and the aztec wanted to make Cuitlahuac a Tlatoani. 
The Aztecs offered peace in exchange, but they resumed the attack as soon as Cuitlahuac was made Tlatoani
Knowing what legends are passed on about a person gives us some understanding of how his contemporaries and subsequent generations viewed the person. A person is not just what he does but what others think about what he does. 134 | 135 | ::: Also, I have mixed feelings about [[User:Maunus|Maunus]]'s recommendation to cut out the "contact with the spanish" part entirely. I did something similar with the [[Hernan Cortes]] article (i.e. I cut out the "conquest of Mexico" part and put it in the [[Spanish Conquest of Mexico]] article. Another editor objected saying that I had removed the most important part of Cortes' life. As a compromise, I put a short summary of the [[Spanish conquest of Mexico]] article back into the [[Hernan Cortes]] article. 136 | 137 | ::: On the one hand, we should not replicate too much material between articles. On the other hand, we must not eviscerate the [[Montezuma II]] article by taking out the single episode of his life that makes him famous. I trust [[User:Madman2001]] to do the righ thing. 138 | 139 | ::: --[[User:Richardshusr|Richard]] 23:36, 7 August 2006 (UTC) 140 | 141 | :::: It is very dificult so separate the facts and the legend, but most history books in Mexico, include the omens that it is said, happened before the conquest as part of the biography of Moctezuma. They reflect the feelings and fears of the population of Tenochtitlan, and are part of the contradictions of his caracter. Proud to his people, and humble to the spanish. Brave in battle, and fearfull to the gods. All this has result in a hotly debate on his motives. Maybe all this has to moved to a section and leave the known facts apart.... [[User:Nanahuatzin|Nanahuatzin]] 142 | 143 | :::: I dont see the Omens relevance for the personality or history of Moteczuma. As argued by James Lockhart in "we people here" the omens are most likely an aztec hindsight addition and has snothing to do with actual history. 
And everything that is known about the personality of Moteczoma whether being "brave" or "humble" and to whom we have from biased sources that should not be mistaken for real biographic information. Of course it can be included but crtitically please, and stating which sources say what and what might be their reasons to do so. But yes I think there could be a section on the persona of Motecuzoma as he has been depicted in legends and hearsay. --[[User:Maunus|Maunus]] 12:09, 8 August 2006 (UTC) 144 | :::::I think the article is taking shape and I move that the cleanup tag be removed. If someone helps me giving a few finishsing touches I think we can get it to GA status within the month.[[User:Maunus|Maunus]] 14:29, 14 August 2006 (UTC) 145 | 146 | Thanks to the editors, particularly Maunus, for cleaning up the article. It has moved from awful to good, and I have removed the clean-up tag. 147 | 148 | Could I also ask the editors, particularly Maunus, to please check your spelling and links in your articles? As just one example, there were numerous spellings of "Moctezuma" scattered throughout the article (all from one editor). There were also numerous redlinks in the article, red only because of apelling errors (e.g. "Fransican" instead of "Franciscan" and many others). '''''Please''''' check your work before saving. Thanks, [[User:Madman2001|Madman]] 15:24, 26 August 2006 (UTC) 149 | 150 | Right you are. Sorry, I will do better checking. [[User:Maunus|Maunus]] 15:39, 26 August 2006 (UTC) 151 | 152 | == Azcapotzalco or Tlacopan?== 153 | The Aztec Triple Alliance is being described with Tenochtitlan, Texcoco and Azcapotzalco, shouldn't it be Tlacopan instead? Even once being subordinated to Azcapotzalco, Tlacopan sided with the other two cities in their conquest over Azcapotzalco. Then, Totoquihuaztli, Tlacopan's ruler, claimed the title of Tepaneca tecuhtli, "Lord of the Tepanecs".
154 | Do you agree on changing it? [[User:201.37.6.195|201.37.6.195]] 01:42, 9 August 2006 (UTC) 155 | 156 | :I hadn't checked [[Aztec Triple Alliance]] article. Info there matches what I said above. I've already fixed it. [[User:201.37.6.195|201.37.6.195]] 04:11, 9 August 2006 (UTC) 157 | 158 | 159 | 160 | ==Pulled Trivia section from article== 161 | 162 | Pulled this Trivia section from article per [[Wikipedia:Avoid trivia sections in articles]]. If you feel any line below belongs in the article, please insert it in the proper place. 163 | 164 | *[[Montezuma's Revenge]] is the colloquial term for any episodes of [[travelers' diarrhea]] or other sicknesses contracted by tourists visiting [[Mexico]]. 165 | *The [[Mexico City metro]] system has a station named [[Metro Moctezuma]] in honour of the ''tlatoani''. 166 | *[[Antonio Vivaldi]] also wrote an [[opera]] called "[[Motezuma]]"; it has little to do with the historical character. 167 | * Moctezuma was not allowed to be looked at unless it was a festival. A person that looked at him would receive the death penalty. 168 | * He was so holy that he was carried around everywhere so that his feet would not touch common ground. 169 | *This Emperor Moctezuma may possibly have influenced the semi-divine figure of [[Montezuma (mythology)|Montezuma]] common to the 19th century folklore of native tribes living in Arizona and New Mexico. 170 | *There is a reference to Montezuma in the song [[Cortez The Killer]] by [[Neil Young]] and [[Crazy Horse]] off of the Album [[Zuma]](1975). The verse is as follows: "On the shore lay Montezuma, With his coca leaves and pearls, In his halls he often wandered with the secrets of the world." 171 | 172 | ::This is a perfect illustration of why that guideline should not exist, and the guideline was apparently railroaded in by a select few without the knowledge of most editors... 
[[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 15:45, 26 August 2006 (UTC) 173 | :: I agree with the guideline, and have long had a mind to remove the trivia section myself. It is wholly non-encyclopedic and of little to no relevance.[[User:Maunus|Maunus]] 15:57, 26 August 2006 (UTC) 174 | 175 | ::: The trivia section has bothered me for some time so I am glad to see it go. I was not aware of the guideline and am glad to have a solid basis for getting rid of trivia sections in other articles. 176 | 177 | ::: However, I do think there is value in acknowledging historical legacy and modern perceptions. Montezuma's Revenge deserves some mention since that phrase and the "From the shores of Tripoli to the halls of Montezuma" from the Marine Corps hymn were the two most well known mentions of Montezuma in my generation although admittedly this has probably changed in the younger generation of today. 178 | 179 | ::: Similarly, the influence of Moctezuma on the semi-divine figure in folkore of native tribes deserves mention IF it can be sourced to a [[WP:RS|reliable source]]. 180 | 181 | ::: The rest can probably go. 182 | 183 | ::: --[[User:Richardshusr|Richard]] 16:14, 26 August 2006 (UTC) 184 | 185 | ::::Richard, I would agree with you on mentioning Montezuma's Revenge and the Marine Corps hymn. Typically they are in a section entitled '''Modern legacy''' or some such. As mentioned above, editors are welcome to incorporate them into the article, just not in a section entitled '''Trivia''' which becomes a trash-magnet. 186 | 187 | :::[[User:Codex Sinaiticus]], what parts of this are relevant to an '''encyclopedia''' article on Moctezuma? Certainly the Neil Young song and Montezuma's revenga are just plain silly. The Vivaldi sentence is almost self-defeating ("it has little to do with the historical character"). The two tidbits about how special he was sound like legends more than facts. 
If they belong in the article, they need to be referenced and put into the article itself. Interested in your thoughts, [[User:Codex Sinaiticus]]. Thanks, [[User:Madman2001|Madman]] 16:20, 26 August 2006 (UTC) 188 | 189 | ::This new "no trivia" rule that seems to have come out of nowhere, if applied site wide, will radically transform the entire face of wikipedia as it currently exists, into something quite different, and far less enjoyable. I guess there is a minority of editors who decided behinbd closed doors that they wanted a carbon copy of Encyclopedia Britannica, and are now presenting this as a 'fait accomplis' "guideline" because no one knew about it. It feels like a hijacking. If this had been proposed in the open, it would NEVER have received a support from Wikipedia's editors. 190 | 191 | ::If you seriously need "reliable sources" that Montezuma has ever been connected with Motecuzoma, try taking a look at the Montezuma article. The other items are mostly relevant links to other related articles, why are they being suppressed and whom does this benefit??? [[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 17:15, 26 August 2006 (UTC) 192 | 193 | :::Hot issue for you, eh? [[User:Madman2001|Madman]] 17:24, 26 August 2006 (UTC) 194 | 195 | :::Yes. What is going on here with deleting links to related information from the article onder the pretense of a misguided guideline is just plain wrong. I will resist this on any article where I see this being done. [[User:Codex Sinaiticus|ፈቃደ]] ([[User talk:Codex Sinaiticus|ውይይት]]) 17:38, 26 August 2006 (UTC) 196 | 197 | == This article needs some reorganization == 198 | 199 | I haven't ever looked at this article closely. There's a lot of information but, based on just a quick look, the section/subsection organization needs work. I don't have time to work on it today but I figured I'd drop the cleanup tag on it and try to get back to it later. 
200 | 201 | --[[User:Richardshusr|Richard]] 17:34, 26 August 2006 (UTC) 202 | 203 | 204 | ==Expert tag?== 205 | An anonymous editor added the expert tag to the article without stating a reason. If no reason is introduced here on the talk page within the next days I will remove it.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 21:08, 29 May 2007 (UTC) 206 | 207 | == Moctezuma before 1519 == 208 | 209 | What about Moctezuma's life and reign ''before'' the arrival of the Spaniards? A description of the last two years of an about fifty to sixty year-old person is a pretty unfinished biography, isn't it? --[[User:88.64.212.26|88.64.212.26]] 17:15, 16 July 2007 (UTC) 210 | 211 | == Translation == 212 | 213 | I removed: 214 | :"he who angers himself."{{cite book |last=Thomas |first=Hugh |year=1995 |title=Conquest: Montezuma, Cortés, and the Fall of Old Mexico}} 215 | Despite being sourced, [[Hugh Thomas]] is a historian, not a nahuatlato. This translation both ignores the "lord" morpheme, treating it as though the name was simply ''Mozoma'', and fails to recognize that Nahuatl often uses reflexives with a passive meaning — it's like translating Spanish ''no se sabe'' as "it doesn't know itself" when the correct translation is "it is unknown". --[[User:Ptcamn|Ptcamn]] ([[User talk:Ptcamn|talk]]) 03:39, 4 January 2008 (UTC) 216 | :::Agreeing.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:05, 4 January 2008 (UTC) 217 | 218 | == NPOV in the sources section == 219 | 220 | The paucity of indigenous written records and as well as the sometimes biased descriptions of the man by chroniclers can easily lead to no small amount of friction about how Moctezuma really was. Interpretations of the biographical accounts that we do possess vary widely. 
Therefore, I think it best to present actual quotations from them, with brief summaries of the general descrription BY THE WRITER, in the article, rather than fill the sources section with conjecture.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 16:49, 19 January 2008 (UTC) 221 | ::I agree that the section needs less conjecture. However I am not sure that the right way about it is to insert large chunks of primary sources. Primary sources needs to be interpreted in order to be correctly understood - some sources are more reliable than others, some have one kind of bias others have the opposite good scholars have studied the sources and written about how to best understand them. Therefore the section in my opinion should build on what good scholars have said about how to best understand the soruces. I think that the right way to do this is by having the "conjecture" and interpretations of historians be fully sourced to the works of the which historians who have written about it. For example James Lockhart and Matthew Restall - the article as it is anow is not well sourced and only briefly mentions those scholars, while not pointing to any specific texts by them: this should be change. Some statements are completely unsourced and seems to be lose conjecture by previous editors - these should be removed. I think it is ok to include smaller pieces of quoted primary sources to illustrate salient point and to show the style of the sources - but they should not be made to look like being the "truth". The truth is not in the sources, but can only be approximated through their interpretation.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 17:07, 19 January 2008 (UTC) 222 | 223 | In the case of Bernal Diaz, I think the quotes offered are reliable. I agree that the opinions of good, peer-reviewed, publications to support different interpretations are needed (e.g. Restall and Lockhart). 
I have made some edits, and I do believe that the Wikipedia reader can evaluate the actual quotation on their own. Nothing is lost by maintaining the description quote I've entered, though I will agree that the second quote about their reaction to his death could be superfluous and maybe belongs on the [[True History of the Conquest of New Spain]] page. [[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 17:53, 19 January 2008 (UTC) 224 | 225 | ==Neutrality== 226 | 227 | Until we can get some sources to support the arguments in the tagged section, I think we need to POV check. Once we get that, then the article will be much stronger. Also, since it is unsourced, without even mentioning a potential source, I have removed this section: 228 | 229 | ''As Aztec ruler, he expanded the Aztec Empire the most; warfare expanded the territory as far south as [[Soconusco|Xoconosco]] in [[Chiapas]] and the [[Isthmus of Tehuantepec]]. He elaborated the [[Templo Mayor]] and revolutionized the tribute system. He also increased Tenochtitlán's power over its allied cities to a dominant position in the [[Aztec Triple Alliance]]. He created a special temple, dedicated to the gods of the conquered towns, inside the temple of [[Huitzilopochtli]]. He also built a monument dedicated to the Tlatoani [[Tízoc]].'' 230 | 231 | until someone can provide sources. [[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 18:08, 19 January 2008 (UTC) 232 | ::I'll source that right away.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:55, 20 January 2008 (UTC) 233 | 234 | Perfect! 
I like the new positioning too; it fits better where you've put it.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 19:04, 20 January 2008 (UTC) 235 | 236 | 237 | ==A section o his life and times?== 238 | I think in focusing on the sources we have neglected making a section about the actual life of Moctezuma - I think we should make a section before the source section describing what is known about his lifes main events, and it should probably inforporate the "contact with spanish" section.[[User:Maunus|·Maunus·]] [[User talk:Maunus|·ƛ·]] 10:49, 19 February 2008 (UTC) 239 | :Why is everyone focused on Moctezuma's life after the Spaniards' arrival? At that time he was more than 60 years old and he had already reigned for seventeen years! --[[Special:Contributions/88.64.57.246|88.64.57.246]] ([[User talk:88.64.57.246|talk]]) 21:22, 27 February 2008 (UTC) 240 | 241 | ==GA review== 242 | {{#if:|{{#ifeq:{{NAMESPACE}}|Talk||{{error:not substituted|GAList}}
}}}} 243 | :'''[[Wikipedia:Good article nominations|GA]] review''' (see [[Wikipedia:What is a good article?|here]] for criteria) 244 | {{#if:Needs some serious work on being consistent with references. Also needs a number of citations.|
Needs some serious work on being consistent with references. Also needs a number of citations.|}} 245 | #It is '''reasonably well written'''. 246 | #:a ''(prose)'': {{GAList/check|}} b ''([[Wikipedia:Manual of Style|MoS]])'': {{GAList/check|}} 247 | #:: {{#if:|{{{1com}}}|}} 248 | #It is '''factually accurate''' and '''[[Wikipedia:Verifiability|verifiable]]'''. 249 | #:a ''(references)'': {{GAList/check|}} b ''(citations to [[WP:RS|reliable sources]])'': {{GAList/check|}} c ''([[Wikipedia:No original research|OR]])'': {{GAList/check|}} 250 | #:: {{#if:|{{{2com}}}|}} 251 | #It is '''broad in its coverage'''. 252 | #:a ''(major aspects)'': {{GAList/check|}} b ''(focused)'': {{GAList/check|aye}} 253 | #:: {{#if:|{{{3com}}}|}} 254 | #It follows the '''[[WP:NPOV|neutral point of view]] policy'''. 255 | #:''Fair representation without bias'': {{GAList/check|aye}} 256 | #:: {{#if:|{{{4com}}}|}} 257 | #It is '''stable'''. 258 | #:''No edit wars etc.'': {{GAList/check|aye}} 259 | #:: {{#if:|{{{5com}}}|}} 260 | #It is illustrated by '''[[Wikipedia:Images|images]]''', where possible and appropriate. 261 | #:a ''(images are tagged and non-free images have [[Wikipedia:Image_description_page#Use_rationale|fair use rationales]])'': {{GAList/check|aye}} b ''(appropriate use with [[WP:CAP|suitable captions]])'': {{GAList/check|aye}} 262 | #:: {{#if:|{{{6com}}}|}} 263 | #'''Overall''': 264 | #:''Pass/Fail'': {{GAList/check|nay}} 265 | #:: {{#if:|{{{7com}}}|}}
266 | 267 | Details: 268 | 269 | * Consistency in formating the references and sources. There are a couple of websites used as references that are just bare urls, a few references are used that aren't listed in the sources, and the formatting of a couple of the sources isn't consistent with the format of the other sources. You also use a mix of Harvard citations (the (Diaz del Castillo 1568/1963 224-25)) and regular footnotes. It doesn't matter which style you use, it just needs to be consistent. 270 | * Reference 26 is a self-published website and wouldn't be considered a reliable source. Would be fine if it used the source listed at the bottom of the website as a source. 271 | * Formatting the quotations should be not in italics. See [[WP:MOS#Italics]]. 272 | * A number of spots need citations. I've added citation needed tags at the spots. I also marked with hidden text a few other spots that while not needing citations wouldn't be hurt by having them. 273 | * Also in the consistency part - footnotes after the punctuation. A number of spots have the footnoes after the punctuation. 274 | * Direct quotations need citations attachted to them, I've marked those spots also with citation needed tags. 275 | * Consider changing the Native American mythology, Symbol of indigenous leadership, Spanish noble family, and references in modern culture sections into subsections under the Legacy section. Also many of these sections could use some expansion to them, they feel kinda skimpy for someone who has had so much impact. 276 | * Also, consider changing the data in References in modern culture from a list into a paragraph or two. Myself, I'd nix the video game stuff, but that's just me. 277 | * See also sections usually go right before the References section. 278 | 279 | I haven't really read through the prose for anything grammatically wrong or awkward. Mainly, it needs a ruthless run through for consistency in referencing, puncutation, and other issues. 
Like I said, I haven't read the prose deeply, which would need to be done before passing it to GA also. I'll do that after the issues above are dealt with, it'll be easier then, after the kinks are worked out with the other concerns. 280 | 281 | I've put the article on hold for seven days to allow folks to address the issues I've brought up. Feel free to contact me on my talk page, or here with any concerns, and let me know one of those places when the issues have been addressed. If I may suggest that you strike out, check mark, or otherwise mark the items I've detailed, that will make it possible for me to see what's been addressed, and you can keep track of what's been done and what still needs to be worked on. [[User:Ealdgyth|Ealdgyth]] | [[User talk:Ealdgyth|Talk]] 19:27, 8 March 2008 (UTC) 282 | 283 | ::Not sure if I'll be able to get to all this stuff in the next week, but this will give us a starting point from which to better the article, if we don't get it fixed in time. Thanks for your review.[[User:Wuapinmon|Wuapinmon]] ([[User talk:Wuapinmon|talk]]) 20:34, 11 March 2008 (UTC) 284 | :::I'm heading out of town in the next few days and won't be able to reliably check in. Any chance of progress being made soon? [[User:Ealdgyth|Ealdgyth]] - [[User talk:Ealdgyth|Talk]] 02:47, 17 March 2008 (UTC) 285 | 286 | Given that another week has passed with no movement, I'm failing this article's GA nomination. [[User:Ealdgyth|Ealdgyth]] - [[User talk:Ealdgyth|Talk]] 14:08, 30 March 2008 (UTC) 287 | 288 | == Year of birth == 289 | 290 | Given that secondary sources commonly differ wrt M's approx year of birth (c.1466 vs c.1480), it would be good to track down and identify in the article the primary sources by which each of these alternatives are calculated. Anyone know offhand what might be the original basis for either? 
--[[User:CJLL Wright|cjllw]] ʘ ''[[User talk:CJLL Wright|TALK]]'' 03:33, 28 October 2008 (UTC) 291 | 292 | == The Feather Crown of Montezuma == 293 | 294 | I have looked just about everywhere for an article on Moctezuma's 'penacho', or feather crown. Frankly, I'm very surprised it doesn't exist (If this is the case). There is a Spanish article named 'Penacho de Moctezuma', and I would love to start an English article on this very important historical artifact. However, I have little knowledge of the Spanish langauge, and therefore can't translate effectively. Would someone mind creating an artical on this very interesting topic? Or help to find someone who can? I find little info in English on the subject anywhere, accept that it is kept in Austria and the Mexican government has asked for it back. Just an idea! Send a message. Thank You! [[User:C.Kent87|C.Kent87]] ([[User talk:C.Kent87|talk]]) 07:23, 18 December 2008 (UTC) 295 | 296 | :You're right, that artefact would make an interesting and valid basis for an article, which we don't have here on en.wiki yet (actually we may not even mention it in passing on related articles, AFAIK). Even tho' the headdress almost certainly is not Moctezuma's, I believe it's regarded as plausibly coming from the court at the time. It's subsequent provenance would also make for some interesting research & reading. 297 | :I or others could help on the spanish translations, tho I'd prefer not to just translate the es.wiki article as-is. Can't say when or if will be able to start something up, but as first step at least will put it on the [[WP:MESO/REQ|WP Mesoamerica's new requests lists]] as a reminder to be done, when time & inclination suits. Cheers, --[[User:CJLL Wright|cjllw]] ʘ ''[[User talk:CJLL Wright|TALK]]'' 04:47, 11 March 2009 (UTC) 298 | 299 | :: In the spansih article we have commented, Posibly it was one of the gifts that moctezuma gave to Cortes. Posibly not a headress.. 
but a part of the Quetzalcoatl dress (maybe a cape, since no tlatoani had worn anything like that)
— Preceding [[Wikipedia:Signatures|unsigned]] comment added by [[Special:Contributions/108.129.90.185|108.129.90.185]] ([[User talk:108.129.90.185|talk]]) 03:39, 30 September 2011 (UTC) 315 | -------------------------------------------------------------------------------- /wapiti/operations/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import json 5 | from abc import ABCMeta 6 | 7 | from collections import OrderedDict 8 | from functools import wraps 9 | 10 | import sys 11 | from os.path import dirname, abspath 12 | # just until ransom becomes its own package 13 | sys.path.append(dirname(dirname(abspath(__file__)))) 14 | import ransom 15 | 16 | from params import SingleParam, StaticParam 17 | from models import get_unique_func, get_priority_func 18 | from utils import (PriorityQueue, 19 | MaxInt, 20 | chunked_iter, 21 | make_type_wrapper, 22 | OperationExample) 23 | 24 | # TODO: handle automatic redirecting better 25 | # TODO: support batching and optimization limits 26 | # TODO: concurrency. get_current_task() -> get_current_tasks() 27 | # TODO: wrap exceptions 28 | # TODO: separate structure for saving completed subops (for debugging?) 29 | # TODO: WebRequestOperation: accepts URL, action (default: GET) 30 | # TODO: Model links (url attribute) 31 | # TODO: support field param_type (for cases with ints and strs) 32 | # TODO: use source descriptor instead of api_url? (for op.source) 33 | # TODO: check that subop_chain types match up 34 | # TODO: check that priority attribute exists on output_type where applicable 35 | 36 | """ 37 | - what if operations were iterable over their results and process() 38 | returned the operation itself? (more expensive to iterate and find 39 | non-dupe results, would set ops help?) 40 | - client -> root_owner. parent operation (client 41 | if no parent op) -> owner. 
def get_unwrapped_options(wr_type):
    """Return an ``(options, type)`` pair for a possibly-wrapped type.

    Operation types wrapped by ``make_type_wrapper`` (e.g. ``Tune``,
    ``Recursive``) carry ``_wrapped_dict`` (the wrapper's option map) and
    ``_wrapped`` (the underlying type). Plain, unwrapped types yield an
    empty option dict and are returned as-is.
    """
    try:
        opts, unwrapped = wr_type._wrapped_dict, wr_type._wrapped
    except AttributeError:
        # not a wrapper type: no options to report
        return {}, wr_type
    return dict(opts), unwrapped
def get_field_str(field):
    """Render one option field as ``key (required, multi)`` help text."""
    modifiers = [label for label, flagged in (('required', field.required),
                                              ('multi', field.multi))
                 if flagged]
    if modifiers:
        return '%s (%s)' % (field.key, ', '.join(modifiers))
    return field.key


def operation_signature_doc(operation):
    """Build the Input/Output/Options/Examples summary that
    OperationMeta appends to every operation class's docstring.
    """
    if operation.input_field is None:
        input_desc = 'None'
    else:
        input_desc = operation.input_field.key
    output_desc = operation.singular_output_type.__name__

    template = 'Input: %s\n'
    # bijective ops return one output per input; others return a list
    if operation.is_bijective:
        template += 'Output: %s\n'
    else:
        template += 'Output: List of %s\n'

    # static params are fixed internals, not user-facing options
    option_fields = [f for f in getattr(operation, 'fields', [])
                     if not isinstance(f, StaticParam)]
    if option_fields:
        template += 'Options: '
        template += ','.join([get_field_str(f) for f in option_fields]) + '\n'

    if hasattr(operation, 'examples'):
        template += 'Examples: \n\t'
        template += '\n\t'.join([repr(x) for x in operation.examples]) + '\n'

    return template % (input_desc, output_desc)
class OperationQueue(object):
    """
    A deduplicating priority queue of suboperations, all of one
    operation type. Input params are keyed by ``unique_func`` so each
    distinct param spawns at most one suboperation; ``priority_func``
    orders the resulting suboperations for execution.
    """
    # TODO: chunking/batching should probably happen here
    # with the assistance of another queue for prioritized params
    # (i.e., don't create subops so eagerly)
    def __init__(self, qid, op_type, default_limit=ALL):
        # qid: index of this queue in the owning Operation's
        # subop_queues list; used to route results between queues
        self.qid = qid
        # op_type may be wrapped (Tune/Recursive); unwrap to recover
        # the plain operation type plus the wrapper's options
        options, unwrapped = get_unwrapped_options(op_type)
        self.op_type = op_type
        self.unwrapped_type = unwrapped
        self.options = options

        self.unique_key = options.get('unique_key', 'unique_key')
        self.unique_func = get_unique_func(self.unique_key)
        self.priority = options.get('priority', 0)
        self.priority_func = get_priority_func(self.priority)
        self.default_limit = default_limit

        self.param_set = set()  # unique keys already enqueued
        self.op_queue = PriorityQueue()
        self._dup_params = []  # rejected duplicate keys (debugging aid)

    def enqueue(self, param, **kw):
        """
        Create a suboperation for *param* and queue it, unless an
        equivalent param has already been enqueued.
        """
        unique_key = self.unique_func(param)
        if unique_key in self.param_set:
            self._dup_params.append(unique_key)
            return
        priority = self.priority_func(param)
        kwargs = {'limit': self.default_limit}
        kwargs.update(kw)
        new_subop = self.op_type(param, **kwargs)
        # tag the subop with its source queue so Operation.store_results
        # can forward its output to the next queue in the chain
        new_subop._origin_queue = self.qid
        self.op_queue.add(new_subop, priority)
        self.param_set.add(unique_key)

    def enqueue_many(self, param_list, **kw):
        """enqueue() every param in *param_list*."""
        for param in param_list:
            self.enqueue(param, **kw)
        return

    def __len__(self):
        return len(self.op_queue)

    def peek(self, *a, **kw):
        return self.op_queue.peek(*a, **kw)

    def pop(self, *a, **kw):
        return self.op_queue.pop(*a, **kw)
    def __init__(self, input_param, limit=None, **kw):
        """
        input_param: the operation's input (normalized by input_field).
        limit: max results; may be an int, ALL, or another Operation
        (dynamic limiting -- see set_limit).
        """
        # the client supplies api_url/is_bot; DEFAULT_CLIENT (a
        # MockClient) is the standalone fallback
        self.client = kw.pop('client', None)
        if self.client is None:
            self.client = DEFAULT_CLIENT
        self.api_url = self.client.api_url
        self.is_bot_op = self.client.is_bot

        self.set_input_param(input_param)
        self.set_limit(limit)

        self.kwargs = kw
        self.started = False
        # keyed by result unique_key to dedupe across suboperations
        self.results = OrderedDict()

        # queue 0 always holds this operation's own type (used by
        # QueryOperation multiplexing); queues 1..n mirror subop_chain,
        # and the first chain queue is seeded with the input params
        subop_queues = [OperationQueue(0, type(self))]
        if self.subop_chain:
            subop_queues.extend([OperationQueue(i + 1, st) for i, st
                                 in enumerate(self.subop_chain)])
            subop_queues[1].enqueue_many(self.input_param_list,
                                         client=self.client)
        self.subop_queues = subop_queues

    def get_progress(self):
        # absolute progress: number of results collected so far
        return len(self.results)

    def get_relative_progress(self):
        # fraction complete in [0.0, 1.0]; 0.0 when the limit is
        # unbounded (ALL) or unset
        if self.limit and self.limit is not ALL:
            return len(self.results) / float(self.limit)
        return 0.0

    def set_input_param(self, param):
        """Normalize *param* through the operation's input_field."""
        self._orig_input_param = self._input_param = param
        if self.input_field:
            self._input_param = self.input_field.get_value(param)
            self._input_param_list = self.input_field.get_value_list(param)
        else:
            self._input_param = None
            self._input_param_list = []  # TODO: necessary?

    @property
    def input_param(self):
        return self._input_param

    @property
    def input_param_list(self):
        return self._input_param_list

    @property
    def source(self):
        # the API endpoint this operation reads from
        return self.api_url
    @property
    def limit(self):
        # a limit may itself be an Operation (dynamic limiting); then
        # this op only produces as much as its parent still needs
        if isinstance(self._limit, Operation):
            return self._limit.remaining
        return self._limit

    @property
    def remaining(self):
        # number of results still wanted; a None limit means unbounded
        limit = self.limit
        if limit is None:
            limit = ALL
        return max(0, limit - len(self.results))

    def process(self):
        """
        Execute one task (a suboperation or API call) and store its
        results. Raises NoMoreResults when the operation is exhausted.
        """
        self.started = True
        task = self.get_current_task()
        if self.client.debug:
            print self.__class__.__name__, self.remaining
        if task is None:
            raise NoMoreResults()
        elif isinstance(task, Operation):
            results = task.process()
        elif callable(task):  # not actually used
            results = task()
        else:
            msg = 'task expected as Operation or callable, not: %r' % task
            raise TypeError(msg)
        # TODO: check resp for api errors/warnings
        # TODO: check for unrecognized parameter values
        new_results = self.store_results(task, results)
        return new_results

    def get_current_task(self):
        """
        Return the next runnable suboperation, discarding exhausted
        ones, or None when nothing remains. Downstream (later) queues
        are serviced first so results flow through the chain.
        """
        if not self.remaining:
            return None
        for subop_queue in reversed(self.subop_queues):
            while subop_queue:
                subop = subop_queue.peek()
                if subop.remaining:
                    return subop
                else:
                    subop_queue.pop()
        return None
    def _update_results(self, results):
        """
        Merge *results* into self.results, skipping duplicates and
        (optionally) entries whose existence doesn't match the 'exists'
        keyword filter. Returns only the newly-added results.
        """
        ret = []
        filt_exists = self.kwargs.get('exists')
        # normalize to strict True/False, preserving None = "no filter"
        filt_exists = filt_exists if filt_exists is None else bool(filt_exists)
        for res in results:
            if not self.remaining:
                break
            if filt_exists is not None and res.exists is not filt_exists:
                continue
            # results without a unique_key attribute dedupe on identity
            unique_key = getattr(res, 'unique_key', res)
            if unique_key in self.results:
                continue
            self.results[unique_key] = res
            ret.append(res)
        return ret

    def process_all(self):
        """Drive process() until exhaustion; return all results."""
        while 1:  # TODO: +retry behavior
            try:
                self.process()
            except NoMoreResults:
                break
        return self.results.values()

    __call__ = process_all

    def __repr__(self):
        cn = self.__class__.__name__
        if self.input_field is None:
            return '%s(limit=%r)' % (cn, self.limit)
        tmpl = '%s(%s, limit=%r)'  # add dynamic-limity stuff
        try:
            ip_disp = repr(self.input_param)
        except:
            # never let a broken __repr__ on the param break ours
            ip_disp = "'(unprintable param)'"
        return tmpl % (cn, ip_disp, self.limit)
    def _setup_multiplexing(self):
        # a bijective op given more params than fit in one request is
        # split into per-request chunks, queued on this op's own queue 0
        subop_queue = self.subop_queues[0]
        chunk_size = self.per_query_param_limit
        for chunk in chunked_iter(self.input_param_list, chunk_size):
            subop_queue.enqueue(tuple(chunk), client=self.client)  # TODO
        return

    @property
    def current_limit(self):
        # the 'limit' parameter for the next API request; non-bijective
        # ops ask for at least DEFAULT_MIN to avoid tiny queries
        ret = self.remaining
        if not self.is_bijective:
            ret = max(DEFAULT_MIN, ret)
        ret = min(ret, self.per_query_limit)
        return ret

    @property
    def remaining(self):
        # once the API stops returning continue tokens, nothing remains
        # regardless of the requested limit
        if self.is_depleted:
            return 0
        return super(QueryOperation, self).remaining

    @property
    def last_cont_str(self):
        # most recent continue token; None before the first response
        if not self.cont_strs:
            return None
        return self.cont_strs[-1]

    @property
    def is_depleted(self):
        # a recorded None token means the last response had no
        # continue section, i.e. the result set is exhausted
        if self.cont_strs and self.last_cont_str is None:
            return True
        return False

    @classmethod
    def get_field_dict(cls):
        """Map prefixed parameter keys to their field objects."""
        ret = dict([(f.get_key(cls.field_prefix), f) for f in cls.fields])
        if cls.input_field:
            query_key = cls.input_field.get_key(cls.field_prefix)
            ret[query_key] = cls.input_field
        return ret
    def get_current_task(self):
        # multiplexed ops delegate to the generic queue machinery;
        # otherwise each process() step issues one MediaWikiCall
        if self.is_multiplexing:
            return super(QueryOperation, self).get_current_task()
        if not self.remaining:
            return None
        params = self.prepare_params(**self.kwargs)
        mw_call = MediaWikiCall(params, client=self.client)
        return mw_call

    def prepare_params(self, **kw):
        """Assemble the final request parameter dict for this query."""
        params = dict(self.params)
        if not self.is_bijective:
            params[self.field_prefix + 'limit'] = self.current_limit
        if self.last_cont_str:
            params[self.cont_str_key] = self.last_cont_str
        params['action'] = self.api_action
        return params

    def post_process_response(self, response):
        """
        Used to rectify inconsistencies in API responses (looking at
        you, Feedback API)
        """
        return response.results.get(self.api_action)

    def extract_results(self, resp):
        raise NotImplementedError('inheriting classes should return'
                                  ' a list of results from the response')

    def get_cont_str(self, resp):
        """
        Pull this query's continue token out of the response's
        '<action>-continue' section; None means the result set is
        exhausted.
        """
        qc_val = resp.results.get(self.api_action + '-continue')
        if qc_val is None:
            return None
        # the continue section is keyed by the query's module name
        for key in ('generator', 'prop', 'list'):
            if key in self.params:
                next_key = self.params[key]
                break
        else:
            raise KeyError("couldn't find contstr")
        if not self.cont_str_key:
            # lazily discover the continue key name (e.g. 'gcmcontinue')
            self.cont_str_key = qc_val[next_key].keys()[0]
        return qc_val[next_key][self.cont_str_key]
    def __init__(self, params, **kw):
        """
        params: dict of API request parameters; must include 'action'.
        Recognized keywords: raise_exc, raise_err, raise_warn, client.
        """
        # These settings will all go on the WapitiClient
        self.raise_exc = kw.pop('raise_exc', True)
        self.raise_err = kw.pop('raise_err', True)
        self.raise_warn = kw.pop('raise_warn', False)
        self.client = kw.pop('client')
        self.web_client = getattr(self.client,
                                  'web_client',
                                  DEFAULT_WEB_CLIENT)
        if kw:
            raise ValueError('got unexpected keyword arguments: %r'
                             % kw.keys())
        self.api_url = self.client.api_url
        params = params or {}
        self.params = dict(BASE_API_PARAMS)
        self.params.update(params)
        # intentionally raises KeyError when 'action' is missing
        self.action = params['action']

        self.url = ''
        self.results = None  # parsed JSON dict; set by process()
        self.servedby = None
        self.exception = None
        self.error = None
        self.error_code = None
        self.warnings = []

        self._input_param = params
try: 685 | self.results = json.loads(resp.text) 686 | except Exception as e: 687 | self.exception = e # TODO: wrap 688 | if self.raise_exc: 689 | raise 690 | return self 691 | self.servedby = self.results.get('servedby') 692 | 693 | error = self.results.get('error') 694 | if error: 695 | self.error = error.get('info') 696 | self.error_code = error.get('code') 697 | 698 | warnings = self.results.get('warnings', {}) 699 | for mod_name, warn_dict in warnings.items(): 700 | warn_str = '%s: %s' % (mod_name, warn_dict.get('*', warn_dict)) 701 | self.warnings.append(warn_str) 702 | 703 | if self.error and self.raise_err: 704 | raise WapitiException(self.error_code) 705 | if self.warnings and self.raise_warn: 706 | raise WapitiException('warnings: %r' % self.warnings) 707 | return self 708 | 709 | @property 710 | def notices(self): 711 | ret = [] 712 | if self.exception: 713 | ret.append(self.exception) 714 | if self.error: 715 | ret.append(self.error) 716 | if self.warnings: 717 | ret.extend(self.warnings) 718 | return ret 719 | 720 | @property 721 | def remaining(self): 722 | if self.done: 723 | return 0 724 | return 1 725 | 726 | 727 | class WebRequestOperation(Operation): 728 | input_field = SingleParam('url') 729 | output_type = Operation 730 | _limit = 1 731 | 732 | def __init__(self, input_param, **kw): 733 | self.client = kw.pop('client', None) 734 | self.web_client = getattr(self.client, 735 | 'web_client', 736 | DEFAULT_WEB_CLIENT) 737 | self.action = kw.pop('action', 'get') 738 | self.raise_exc = kw.pop('raise_exc', True) 739 | if kw: 740 | raise ValueError('got unexpected keyword arguments: %r' 741 | % kw.keys()) 742 | self.set_input_param(input_param) 743 | self.url = self._input_param 744 | self.kwargs = kw 745 | self.results = {} 746 | 747 | def process(self): 748 | resp = None 749 | try: 750 | resp = self.web_client.req(self.action, self.url) 751 | except Exception as e: 752 | self.exception = e 753 | if self.raise_exc: 754 | raise 755 | return self 756 | 
class GetPageHTML(Operation):
    """
    Fetch a page's rendered HTML by requesting its article URL
    (base_url + title) directly, rather than going through the API.
    """
    input_field = SingleParam('title')
    examples = [OperationExample('Africa', limit=1)]
    output_type = Operation
    _limit = 1

    def __init__(self, *a, **kw):
        super(GetPageHTML, self).__init__(*a, **kw)
        self.web_client = getattr(self.client,
                                  'web_client',
                                  DEFAULT_WEB_CLIENT)
        # NOTE(review): this pop happens after super().__init__ has
        # already stored kw into self.kwargs, so 'raise_exc' also
        # lingers there -- confirm whether that is intended
        self.raise_exc = kw.pop('raise_exc', True)
        # derive the article base URL from the client's source info,
        # e.g. 'http://.../wiki/Main_Page' -> 'http://.../wiki/'
        source_info = getattr(self.client, 'source_info', None)
        if source_info:
            main_title = source_info.mainpage
            main_url = source_info.base
            self.base_url = main_url[:-len(main_title)]
        else:
            self.base_url = DEFAULT_BASE_URL
        self.url = self.base_url + self.input_param
        self.results = {}

    def process(self):
        """Fetch the page once, store its HTML, and finish."""
        try:
            resp = self.web_client.get(self.url)
        except Exception as e:
            self.exception = e
            if self.raise_exc:
                raise
            return self
        self.results[self.url] = resp.text
        # single-shot operation: signal completion to process_all()
        raise NoMoreResults()
class GetCategory(QueryOperation):
    """
    Fetch the members in category.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [StaticParam('generator', 'categorymembers'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              MultiParam('namespace')]
    output_type = [PageInfo]
    examples = [OperationExample('Featured_articles')]

    def extract_results(self, query_resp):
        # one PageInfo per entry in the API's 'pages' mapping;
        # the page-id keys themselves are not needed
        return [PageInfo.from_query(member_dict, source=self.source)
                for member_dict in query_resp['pages'].itervalues()]
class GetAllCategoryInfos(GetSubcategoryInfos):
    """
    Fetch all categories on the source wiki.
    """
    field_prefix = 'gac'
    input_field = None  # no input: enumerates the whole wiki
    fields = [StaticParam('generator', 'allcategories'),
              StaticParam('prop', 'categoryinfo')]
    examples = [OperationExample(doc='basic allcats')]


class GetFlattenedCategory(Operation):
    """
    Fetch all category's sub-categories, recursively.
    """
    # Recursive: each fetched subcategory is fed back in as new input;
    # Tune: categories with more subcategories are fetched first
    subop_chain = [Tune(Recursive(GetSubcategoryInfos),
                        priority='subcat_count')]
    examples = [OperationExample('Africa', 100)]
import pytest


def pytest_addoption(parser):
    """Register the --mag option controlling operation limit magnitude."""
    # Fix: modern pytest forwards these kwargs to argparse, which
    # requires `type` to be a callable; the optparse-style string alias
    # type="int" was removed in pytest 4.0 and raises an error.
    parser.addoption("--mag", action="store", type=int, default=1,
                     help="magnitude of the operation limits")


@pytest.fixture
def mag(request):
    """Magnitude multiplier for test operation limits (see --mag)."""
    return request.config.getoption("--mag")
class GetFeedbackV5(QueryOperation):
    """
    Fetch ArticleFeedback v5 entries for a page.

    article feedback v5 breaks standards in a couple ways.
    * the various v5 APIs use different prefixes (af/afvf)
    * it doesn't put its results under 'query', requiring a custom
      post_process_response()
    """
    field_prefix = 'afvf'
    input_field = SingleParam('pageid')
    fields = [StaticParam('list', 'articlefeedbackv5-view-feedback'),
              SingleParam('filter', default='featured')]  # see _FV5_KNOWN_FILTERS
    output_type = list
    examples = [OperationExample('604727')]

    def post_process_response(self, response):
        # v5 puts results at the top level, not under 'query'
        if not response.results:
            return {}
        return dict(response.results)

    def extract_results(self, query_resp):
        # TODO: individual feedback entries are not parsed yet; only
        # the reported count is reflected back as placeholders
        count = query_resp['articlefeedbackv5-view-feedback']['count']
        return ['TODO'] * int(count)
PageInfo, ImageInfo 7 | from utils import OperationExample 8 | 9 | 10 | DEFAULT_IMAGE_PROPS = ['timestamp', 'user', 'userid', 'comment', 'parsedcomment', 11 | 'url', 'size', 'dimensions', 'sha1', 'mime', 'mediatype', 12 | 'metadata', 'bitdepth'] 13 | IMAGE_INFO_PROPS = DEFAULT_IMAGE_PROPS + ['thumbmime', 'archivename'] 14 | 15 | 16 | class GetImages(QueryOperation): 17 | """ 18 | Fetch the images embedded on pages. 19 | """ 20 | field_prefix = 'gim' 21 | input_field = MultiParam('titles', key_prefix=False) 22 | fields = [StaticParam('generator', 'images'), 23 | StaticParam('prop', 'info'), 24 | StaticParam('inprop', 'subjectid|talkid|protection')] 25 | output_type = [PageInfo] 26 | examples = [OperationExample('Coffee')] 27 | 28 | def extract_results(self, query_resp): 29 | ret = [] 30 | for pid, pid_dict in query_resp['pages'].iteritems(): 31 | if pid.startswith('-'): 32 | pid_dict['pageid'] = None # TODO: breaks consistency :/ 33 | page_ident = PageInfo.from_query(pid_dict, 34 | source=self.source) 35 | ret.append(page_ident) 36 | return ret 37 | 38 | 39 | class GetImageInfos(QueryOperation): 40 | field_prefix = 'ii' 41 | input_field = MultiParam('titles', key_prefix=False) 42 | fields = [StaticParam('prop', 'imageinfo'), 43 | StaticParam('iiprop', IMAGE_INFO_PROPS)] 44 | output_type = [ImageInfo] 45 | 46 | def extract_results(self, query_resp): 47 | ret = [] 48 | for k, pid_dict in query_resp['pages'].iteritems(): 49 | if int(k) < 0 and pid_dict['imagerepository'] != 'local': 50 | pid_dict['pageid'] = 'shared' 51 | pid_dict['revid'] = 'shared' 52 | try: 53 | pid_dict.update(pid_dict.get('imageinfo', [{}])[0]) 54 | image_info = ImageInfo.from_query(pid_dict, 55 | source=self.source) 56 | except ValueError as e: 57 | print e 58 | continue 59 | ret.append(image_info) 60 | return ret 61 | 62 | 63 | class GetAllImageInfos(GetImageInfos): 64 | field_prefix = 'gai' 65 | input_field = None 66 | fields = [StaticParam('generator', 'allimages'), 67 | StaticParam('prop', 
class GetBacklinks(QueryOperation):
    """
    Fetch the pages on the source wiki that link to the given page.
    """
    field_prefix = 'gbl'
    input_field = SingleParam('title')
    fields = [StaticParam('generator', 'backlinks'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        # pages may be absent entirely when there are no backlinks
        pages = query_resp.get('pages', {})
        return [PageInfo.from_query(page_dict, source=self.source)
                for _, page_dict in pages.iteritems()]
34 | """ 35 | field_prefix = 'gpl' 36 | input_field = SingleParam('titles', key_prefix=False) 37 | fields = [StaticParam('generator', 'links'), 38 | StaticParam('prop', 'info'), 39 | StaticParam('inprop', 'subjectid|talkid|protection'), 40 | MultiParam('namespace')] 41 | output_type = [PageInfo] 42 | examples = [OperationExample('Coffee'), 43 | OperationExample('Aabach')] 44 | 45 | def extract_results(self, query_resp): 46 | ret = [] 47 | for pid, pid_dict in query_resp['pages'].iteritems(): 48 | page_info = PageInfo.from_query(pid_dict, 49 | source=self.source) 50 | ret.append(page_info) 51 | return ret 52 | 53 | 54 | class GetExternalLinks(QueryOperation): 55 | """ 56 | Fetch page outgoing links to URLs outside of source wiki. 57 | """ 58 | field_prefix = 'el' 59 | input_field = SingleParam('titles', key_prefix=False) 60 | fields = [StaticParam('prop', 'extlinks')] 61 | output_type = [ExternalLink] 62 | examples = [OperationExample('Croatian War of Independence')] 63 | 64 | def extract_results(self, query_resp): 65 | ret = [] 66 | for pid_dict in query_resp.get('pages', {}).values(): 67 | for el in pid_dict.get('extlinks', []): 68 | cur_dict = dict(pid_dict) 69 | cur_dict['source'] = self.source 70 | cur_dict['url'] = el.get('*') 71 | link = ExternalLink.from_query(cur_dict) 72 | ret.append(link) 73 | return ret 74 | 75 | def prepare_params(self, **kw): 76 | params = super(GetExternalLinks, self).prepare_params(**kw) 77 | if params.get('elcontinue'): 78 | params['eloffset'] = params.pop('elcontinue') 79 | return params 80 | 81 | 82 | class GetLanguageLinks(QueryOperation): 83 | """ 84 | Fetch pages' interlanguage links (aka "Language Links" in the MediaWiki 85 | API). Interlanguage links should correspond to pages on another language 86 | wiki. Mostly useful on a source wiki with a family of similar multilingual 87 | projects, such as Wikipedias. 
88 | """ 89 | field_prefix = 'll' 90 | input_field = MultiParam('titles', key_prefix=False) 91 | fields = [StaticParam('prop', 'langlinks'), 92 | SingleParam('url', True)] 93 | output_type = [LanguageLink] 94 | examples = [OperationExample('Coffee')] 95 | 96 | def extract_results(self, query_resp): 97 | ret = [] 98 | for pid_dict in query_resp.get('pages', {}).values(): 99 | for ld in pid_dict.get('langlinks', []): 100 | cur_dict = dict(pid_dict) 101 | cur_dict['source'] = self.source 102 | cur_dict['url'] = ld.get('*') 103 | cur_dict['language'] = ld.get('lang') 104 | link = LanguageLink.from_query(cur_dict) 105 | ret.append(link) 106 | return ret 107 | 108 | 109 | class GetInterwikiLinks(QueryOperation): 110 | """ 111 | Fetch pages' interwiki links. 112 | """ 113 | field_prefix = 'iw' 114 | input_field = MultiParam('titles', key_prefix=False) 115 | fields = [StaticParam('prop', 'iwlinks'), 116 | SingleParam('url', True)] 117 | output_type = [InterwikiLink] 118 | examples = [OperationExample('Coffee')] 119 | 120 | def extract_results(self, query_resp): 121 | ret = [] 122 | for pid_dict in query_resp.get('pages', {}).values(): 123 | for iwd in pid_dict.get('iwlinks', []): 124 | cur_dict = dict(pid_dict) 125 | cur_dict['source'] = self.source 126 | cur_dict['url'] = iwd.get('url') 127 | cur_dict['prefix'] = iwd.get('prefix') 128 | link = InterwikiLink.from_query(cur_dict) 129 | ret.append(link) 130 | return ret 131 | -------------------------------------------------------------------------------- /wapiti/operations/meta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from base import QueryOperation 5 | from params import MultiParam, StaticParam 6 | from models import NamespaceDescriptor, InterwikiDescriptor, SourceInfo 7 | 8 | 9 | DEFAULT_PROPS = ('general', 10 | 'namespaces', 11 | 'namespacealiases', 12 | 'statistics', 13 | 'interwikimap') 14 | """ 
class GetSourceInfo(QueryOperation):
    """
    Fetch meta site information about the source wiki
    (action=query&meta=siteinfo).

    The default properties include:

    - General source information: Main Page, base, sitename, generator,
      phpversion, dbtype, dbversion, case, rights, lang, writeapi,
      timezone, timeoffset, articlepath, scriptpath, server, wikiid,
      time, misermode, maxuploadsize, etc.
    - Namespace map
    - Interwiki map
    - Statistics: pages, articles, edits, images, users, activeusers,
      admins, jobs
    """
    field_prefix = 'si'
    input_field = None
    fields = [StaticParam('meta', 'siteinfo'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = SourceInfo

    def extract_results(self, query_resp):
        """Return a single-element list holding a SourceInfo model."""
        ret = query_resp['general']
        namespaces = query_resp.get('namespaces', {})
        # fix: interwikimap is returned as a *list* of dicts, so the
        # fallback should be a list, not a dict (iterating a dict here
        # would yield keys, not the dicts expected below)
        interwikis = query_resp.get('interwikimap', [])
        ns_map = []
        for ns, ns_dict in namespaces.iteritems():
            ns_map.append(NamespaceDescriptor(ns_dict.get('id'),
                                              ns_dict.get('*'),
                                              ns_dict.get('canonical')))
        iw_map = []
        for iw in interwikis:
            iw_map.append(InterwikiDescriptor(iw.get('prefix'),
                                              iw.get('url'),
                                              iw.get('language')))
        ret['namespace_map'] = tuple(ns_map)
        ret['interwiki_map'] = tuple(iw_map)
        ret.update(query_resp['statistics'])
        source_info = SourceInfo(**ret)
        return [source_info]
# TODO: These operations should be moved to the proper file
# TODO: convert to real model(s)
QueryPageInfo = namedtuple('QueryPageInfo', 'title ns value querypage cache')

# coordinate properties requested from the GeoData extension
DEFAULT_COORD_PROPS = ['type', 'name', 'dim', 'country', 'region']


class GetPageInfo(QueryOperation):
    """Fetch basic page info (ids, protection, subject/talk links)."""
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    # the second 'prop' gains the 'in' field_prefix, becoming 'inprop'
    fields = [StaticParam('prop', 'info'),
              MultiParam('prop', 'subjectid|talkid|protection')]
    output_type = PageInfo
    examples = [OperationExample(['Coffee', 'Category:Africa'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret


class GetCoordinates(QueryOperation):
    """Fetch geographic coordinates attached to the given pages."""
    field_prefix = 'co'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'coordinates'),
              SingleParam('primary', 'all'),  # primary, secondary, all
              MultiParam('prop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(['White House', 'Mount Everest'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_ident = PageIdentifier.from_query(pid_dict,
                                                   source=self.source)
            # fix: pages without coordinates simply lack the
            # 'coordinates' key; indexing directly raised KeyError
            for coord in pid_dict.get('coordinates', []):
                coord_ident = CoordinateIdentifier(coord, page_ident)
                ret.append(coord_ident)
        return ret
class GeoSearch(QueryOperation):
    """
    Search for pages with coordinates near a given (lat, lon) point.
    """
    field_prefix = 'gs'
    input_field = MultiParam('coord')
    fields = [StaticParam('list', 'geosearch'),
              SingleParam('radius', 10000),  # must be within 10 and 10000
              #SingleParam('maxdim', 1000),  # does not work?
              SingleParam('globe', 'earth'),  # which planet? donno...
              SingleParam('namespace'),
              StaticParam('gsprop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(('37.8197', '-122.479'), 1)]

    def extract_results(self, query_resp):
        found = []
        for result_dict in query_resp['geosearch']:
            ident = PageIdentifier.from_query(result_dict,
                                              source=self.source)
            found.append(CoordinateIdentifier(result_dict, ident))
        return found


class GetRecentChanges(QueryOperation):
    """
    Fetch the pages touched by the wiki's most recent changes.
    """
    field_prefix = 'grc'
    input_field = None
    fields = [StaticParam('generator', 'recentchanges'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample()]

    def extract_results(self, query_resp):
        changed = []
        for page_id, page_dict in query_resp['pages'].iteritems():
            # negative ids are placeholders for missing pages
            if page_id.startswith('-'):
                continue
            changed.append(PageInfo.from_query(page_dict,
                                               source=self.source))
        return changed
page such as thumbnail and presence of photos. 111 | * prop=flagged * 112 | Get information about the flagging status of the given pages. 113 | 114 | * list=alllinks (al) * 115 | Enumerate all links that point to a given namespace 116 | * list=allpages (ap) * 117 | Enumerate all pages sequentially in a given namespace 118 | * list=allusers (au) * 119 | Enumerate all registered users 120 | * list=blocks (bk) * 121 | List all blocked users and IP addresses 122 | 123 | * list=exturlusage (eu) * 124 | Enumerate pages that contain a given URL 125 | * list=filearchive (fa) * 126 | Enumerate all deleted files sequentially 127 | * list=iwbacklinks (iwbl) * 128 | Find all pages that link to the given interwiki link. 129 | * list=langbacklinks (lbl) * 130 | Find all pages that link to the given language link. 131 | 132 | * list=logevents (le) * 133 | Get events from logs 134 | * list=protectedtitles (pt) * 135 | List all titles protected from creation 136 | 137 | 138 | * list=search (sr) * 139 | Perform a full text search 140 | * list=tags (tg) * 141 | List change tags 142 | * list=users (us) * 143 | Get information about a list of users 144 | * list=abuselog (afl) * 145 | Show events that were caught by one of the abuse filters. 146 | * list=abusefilters (abf) * 147 | Show details of the abuse filters. 148 | 149 | ''' 150 | -------------------------------------------------------------------------------- /wapiti/operations/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | wapiti.operations.models 4 | ~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | This module provides structures and abstractions for creating consistent 7 | Operation interfaces, regardless of underlying Mediawiki API response 8 | types. 
9 | 10 | For example the ``prop=revisions`` and ``list=usercontribs`` APIs 11 | both return lists of revision information, however not all of the 12 | attributes afforded by ``prop=revisions`` are available from 13 | ``list=usercontribs``. Wapiti models and operations strive to 14 | resolve and abstract this fact away from the user as sanely as 15 | possible. 16 | """ 17 | from __future__ import unicode_literals 18 | 19 | from datetime import datetime 20 | from collections import namedtuple, OrderedDict 21 | 22 | 23 | def parse_timestamp(timestamp): 24 | return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ') 25 | 26 | 27 | NamespaceDescriptor = namedtuple('NamespaceDescriptor', 'id title canonical') 28 | InterwikiDescriptor = namedtuple('InterwikiDescriptor', 'alias url language') 29 | 30 | _MISSING = object() 31 | 32 | 33 | class NamespaceDescriptor(object): 34 | def __init__(self, id, title, canonical, **kw): 35 | self.id = id 36 | self.title = title 37 | self.canonical = canonical 38 | 39 | 40 | class WapitiModelAttr(object): 41 | def __init__(self, name, **kw): 42 | self.name = name 43 | self.mw_name = kw.pop('mw_name', name) 44 | self.display = kw.pop('display', False) 45 | try: 46 | self.type = kw.pop('type') 47 | if not isinstance(self.type, type): 48 | raise TypeError("WapitiModelAttr kwarg 'type' expected type") 49 | except KeyError: 50 | self.type = _MISSING 51 | try: 52 | self.default = kw.pop('default') 53 | except KeyError: 54 | self.default = _MISSING 55 | if kw: 56 | raise ValueError('got unexpected keyword arguments: %r' 57 | % kw.keys()) 58 | 59 | def __repr__(self): 60 | ret = [self.__class__.__name__, '(', repr(self.name)] 61 | if self.mw_name != self.name: 62 | ret.extend([', mw_name=', repr(self.mw_name)]) 63 | if self.type is not _MISSING: 64 | ret.extend([', type=', self.type.__name__]) 65 | if self.default is not _MISSING: 66 | ret.extend([', default=', repr(self.default)]) 67 | if self.display: 68 | ret.extend([', display=', 
repr(self.display)]) 69 | ret.append(')') 70 | return ''.join(ret) 71 | 72 | def __iter__(self): 73 | for attr in ('name', 'mw_name', 'type', 'default', 'display'): 74 | yield getattr(self, attr) 75 | 76 | 77 | WMA = WapitiModelAttr # Windows Media Audio 78 | 79 | 80 | def title_talk2subject(title): 81 | talk_pref, _, title_suf = title.partition(':') 82 | subj_pref, _, _ = talk_pref.rpartition('talk') 83 | subj_pref = subj_pref.strip() 84 | new_title = subj_pref + ':' + title_suf 85 | new_title = new_title.lstrip(':') 86 | return new_title 87 | 88 | 89 | def title_subject2talk(title): 90 | subj_pref, _, title_suf = title.partition(':') 91 | subj_pref = subj_pref.strip() 92 | if not title_suf: 93 | talk_pref = 'Talk' 94 | title_suf = subj_pref 95 | elif subj_pref.endswith('talk'): 96 | talk_pref = subj_pref 97 | else: 98 | talk_pref = subj_pref + ' talk' 99 | new_title = talk_pref + ':' + title_suf 100 | return new_title 101 | 102 | 103 | def get_unique_func(val): 104 | if callable(val): 105 | return val 106 | elif isinstance(val, basestring): 107 | return lambda obj: getattr(obj, val, obj) 108 | try: 109 | if all([isinstance(v, basestring) for v in val]): 110 | return lambda obj: tuple([getattr(obj, v, obj) for v in val]) 111 | except TypeError: 112 | pass 113 | raise TypeError('could not derive uniqueification function from %r' % val) 114 | 115 | 116 | def get_priority_func(val, default=0): 117 | if val is None: 118 | val = default 119 | if callable(val): 120 | return val 121 | elif isinstance(val, basestring): 122 | return lambda obj: getattr(obj, val, default) 123 | try: 124 | int_val = int(val) 125 | return lambda obj: int_val 126 | except TypeError: 127 | pass 128 | try: 129 | if all([isinstance(v, basestring) for v in val]): 130 | return lambda obj: tuple([getattr(obj, v, default) for v in val]) 131 | except TypeError: 132 | pass 133 | raise TypeError('could not derive priority function from %r' % val) 134 | 135 | 136 | class WapitiModelMeta(type): 137 | """ 
class WapitiModelMeta(type):
    """
    The foundation of Wapiti's data models, which attempt to add
    consistency and order to the wide variety of return types used
    across different Mediawiki APIs. This metaclass enables certain
    inheritance-like usage patterns in models. See WapitiModelBase's
    docstring for more information.

    The `attributes` dictionary is a mapping of Python class attribute
    names to Mediawiki API result keys (e.g., `pageid` becomes
    `page_id` on the Python object).

    The `defaults` dictionary is a mapping of Python attribute name to
    default value, if allowed. If an attribute does not have a default
    value, and is missing upon instantiation of a model, an exception
    will be raised.
    """
    attributes = []

    def __new__(cls, name, bases, attrs):
        # merge the attribute lists of all bases, in order, so that
        # later bases and the subclass itself override earlier entries
        # with the same attribute name (order is preserved)
        all_attributes = OrderedDict()
        for base in bases:
            base_attr_list = getattr(base, 'attributes', [])
            base_attr_dict = OrderedDict([(a.name, a) for a in base_attr_list])
            all_attributes.update(base_attr_dict)
        attr_dict = OrderedDict([(a.name, a) for a
                                 in attrs.get('attributes', [])])
        all_attributes.update(attr_dict)
        attrs['attributes'] = all_attributes.values()
        # a class-level 'unique_on' spec (name/names/callable) is
        # compiled into a 'unique_key' property used for deduplication
        if 'unique_on' in attrs:
            unique_func = get_unique_func(attrs['unique_on'])
            attrs['unique_key'] = property(unique_func)
        ret = super(WapitiModelMeta, cls).__new__(cls, name, bases, attrs)
        return ret
class WapitiModelBase(object):
    """
    The more-concrete counterpart of WapitiModelMeta, which primarily
    provides generic initialization and display logic.

    There are two methods for instantiation, the standard
    ``__init__()`` (e.g., ``CategoryInfo()``), which takes attributes
    as keyword arguments, and ``from_query()``, which usually takes a
    dictionary deserialized from JSON, as returned by the Mediawiki
    API. For information on `attributes` and `defaults`, see
    WapitiModelMeta.

    ``__repr__()`` and ``__str__()`` are powered by
    ``get_display_str()``.
    """

    __metaclass__ = WapitiModelMeta
    attributes = []
    unique_on = lambda self: self
    exists = True  # default; instances can represent non-existent pages

    def __init__(self, **kw):
        missing = []
        for attr in self.attributes:
            try:
                val = kw.pop(attr.name)
            except KeyError:
                if attr.default is _MISSING:
                    missing.append(attr.name)
                    continue
                val = attr.default
            # coerce to the declared type when one is set
            # (fix: a stray debug ``print val`` was removed here; it
            # printed every coerced value on every model construction)
            if attr.type is not _MISSING and not isinstance(val, attr.type):
                val = attr.type(val)
            setattr(self, attr.name, val)
        if missing:
            raise ValueError('missing expected keyword arguments: %r'
                             % missing)
        # TODO: raise on unexpected keyword arguments?
        return

    @classmethod
    def from_query(cls, q_dict, **kw):
        """Build an instance from a Mediawiki API response dict,
        translating each attribute's mw_name key to its Python name.
        Extra keyword arguments act as fallback values."""
        kwargs = {}
        all_q_dict = dict(kw)
        all_q_dict.update(q_dict)
        for name, mw_name, _, _, _ in cls.attributes:
            if mw_name is None:
                continue
            try:
                kwargs[name] = all_q_dict[mw_name]
            except KeyError:
                pass
        return cls(**kwargs)

    def get_display_str(self, raise_exc=True):
        """Render a repr-like string of the display-enabled attributes,
        falling back to the default object repr on error when
        *raise_exc* is false."""
        attr_list = []
        try:
            for (name, _, _, _, do_disp) in self.attributes:
                if not do_disp:
                    continue
                # TODO: don't display values if equal to default?
                val = getattr(self, name)
                attr_list.append('%s=%r' % (name, val))
        except:
            if raise_exc:
                raise
            return super(WapitiModelBase, self).__str__()
        attr_str = ', '.join(attr_list)
        return ''.join([self.__class__.__name__, '(', attr_str, ')'])

    __str__ = get_display_str

    def __repr__(self):
        try:
            return self.get_display_str()
        except:
            return super(WapitiModelBase, self).__repr__()


class SourceInfo(WapitiModelBase):
    """Catch-all model for meta=siteinfo results."""
    attributes = [WMA('namespace_map'),
                  WMA('interwiki_map')]

    def __init__(self, **kw):
        # siteinfo returns an open-ended set of keys; store them all
        for k, v in kw.iteritems():
            attr = WMA(k)
            setattr(self, attr.name, v)


class PageIdentifier(WapitiModelBase):
    """Identifies a page (title, id, namespace) on a source wiki."""
    attributes = [WMA('title', display=True),
                  WMA('page_id', mw_name='pageid', display=True, default=None),
                  WMA('ns', display=True),
                  WMA('source')]

    unique_on = 'title'

    @property
    def exists(self):
        # missing pages carry no page id
        return self.page_id is not None

    @property
    def is_subject_page(self):
        # subject (content) namespaces are the non-negative even ones
        return (self.ns >= 0 and self.ns % 2 == 0)

    @property
    def is_talk_page(self):
        # talk namespaces are the non-negative odd ones
        return (self.ns >= 0 and self.ns % 2 == 1)


# NOTE: namedtuple versions of LanguageLink/InterwikiLink/ExternalLink
# were formerly defined here, but were dead code -- the class
# definitions below immediately shadowed them -- so they were removed.


class Link(WapitiModelBase):
    unique_on = 'url'
    attributes = [WMA('url', display=True)]


class LanguageLink(Link, PageIdentifier):
    # TODO: URL is really the other language's title
    unique_on = ('url', 'language')
    attributes = [WMA('language', display=True)]


class InterwikiLink(Link, PageIdentifier):
    attributes = [WMA('prefix')]


class ExternalLink(Link, PageIdentifier):
    pass
class PageInfo(PageIdentifier):
    """A PageIdentifier plus its paired subject/talk page ids."""
    attributes = [WMA('subject_id', mw_name='subjectid', default=None),
                  WMA('talk_id', mw_name='talkid', default=None)]

    def __init__(self, **kw):
        # 'req_title' records the title as originally requested; it may
        # differ from the canonical title (e.g. after normalization)
        req_title = kw.pop('req_title', None)
        super(PageInfo, self).__init__(**kw)
        self.req_title = req_title or self.title

        # a page is its own subject or talk page, depending on namespace
        if self.is_subject_page:
            self.subject_id = self.page_id
        elif self.is_talk_page:
            self.talk_id = self.page_id
        else:
            raise ValueError('special or nonexistent namespace: %r' % self.ns)

    def get_subject_info(self):
        """Return the PageInfo of the corresponding subject page.

        Raises ValueError when the subject page id is unknown.
        """
        if self.is_subject_page:
            return self
        if self.subject_id is None:
            raise ValueError('subject_id not set')
        subj_title = title_talk2subject(self.title)
        subj_ns = self.ns - 1  # subject namespaces are one below talk
        kwargs = dict(self.__dict__)
        kwargs['title'] = subj_title
        kwargs['ns'] = subj_ns
        return PageInfo(**kwargs)

    def get_talk_info(self):
        """Return the PageInfo of the corresponding talk page.

        Raises ValueError when the talk page id is unknown.
        """
        if self.is_talk_page:
            return self
        if self.talk_id is None:
            raise ValueError('talk_id not set')
        talk_title = title_subject2talk(self.title)
        talk_ns = self.ns + 1  # talk namespaces are one above subject
        kwargs = dict(self.__dict__)
        kwargs['title'] = talk_title
        kwargs['ns'] = talk_ns
        return PageInfo(**kwargs)


class CategoryInfo(PageInfo):
    """PageInfo for a category page, with member counts."""
    # shared keyword defaults for the count attributes below
    kw = {'default': 0, 'type': int}
    attributes = [WMA('total_count', mw_name='size', display=True, **kw),
                  WMA('page_count', mw_name='pages', **kw),
                  WMA('file_count', mw_name='files', **kw),
                  WMA('subcat_count', mw_name='subcats', display=True, **kw)]
    del kw  # keep the helper dict out of the class namespace
class Revision(RevisionInfo):
    """A RevisionInfo that additionally carries the revision content."""
    attributes = [WMA('parent_rev_id', mw_name='parentid', display=True),
                  WMA('content', mw_name='*', default=''),  # default=''?
                  WMA('is_parsed')]


class ImageInfo(PageIdentifier):
    """Metadata for a file/image page.

    Most fields default to empty strings; they may be absent for files
    hosted on a shared (non-local) repository.
    """
    attributes = [WMA('image_repo', mw_name='imagerepository'),
                  WMA('missing', default=False),
                  WMA('url', default=''),  # will only exist if non-local repo
                  WMA('dimensions', default=''),
                  WMA('mime', default=''),
                  WMA('thumbmime', default=''),
                  WMA('media_type', mw_name='mediatype', default=''),
                  WMA('metadata', default=''),
                  WMA('archive_name', mw_name='archivename', default=''),
                  WMA('bitdepth', default='')]


#TODO: class ParsedTemplate(object):


#
# Protections
#
# protection level constants, from least to most restrictive
NEW = 'NEW'
AUTOCONFIRMED = 'AUTOCONFIRMED'
SYSOP = 'SYSOP'
# page actions that can carry a protection setting
PROTECTION_ACTIONS = ('create', 'edit', 'move', 'upload')


# level is one of the constants above; expiry is a datetime or 'infinity'
Protection = namedtuple('Protection', 'level, expiry')
class CoordinateIdentifier(object):
    """Pairs one coordinate record from the API with its page identifier."""
    def __init__(self, coord, page_ident=None):
        self.page_ident = page_ident
        # copy the standard GeoData fields, defaulting to None
        for field in ('lat', 'lon', 'type', 'name',
                      'dim', 'country', 'region'):
            setattr(self, field, coord.get(field))
        # 'primary' is normalized to a strict True/False
        self.primary = True if coord.get('primary', False) else False
        return


class QueryPageInfo(object):
    """Simple record for one list=querypage result row."""
    def __init__(self, title, ns, value, querypage, cache):
        self.title = title
        self.ns = ns
        self.value = value
        self.querypage = querypage
        self.cache = cache
/wapiti/operations/params.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | """ 5 | Fields, parameters, and coercion 6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 7 | 8 | Fields are typed slots which normalize and validate values passed to 9 | them, facilitating much more robust Operation usage. 10 | 11 | Parameters (aka params) are the values passed to a field. Another 12 | benefit of fields is that Operations will not initialize successfully 13 | without proper parameters, allowing earlier error detection (and in 14 | some cases correction). 15 | 16 | Coercion refers a field's [limited] ability to transform certain 17 | values into a parameters usable by the owning Operation. For instance, 18 | to get all members of 'Category:Africa', one can use the GetCategory 19 | operation like this, ``GetCategory('Category:Africa')``, or this, 20 | ``GetCategory('Africa')``. The ``query_field`` on the GetCategory 21 | Operation will automatically prepend the 'Category:' prefix, as all 22 | Wikipedia categories start with 'Category:'. 23 | 24 | The normalized value can be retrieved with ``field.get_value()``, 25 | which (currently) always returns a single string (as would be used in 26 | an API call URL). ``field.get_value_list()`` also exists, which 27 | returns a list of strings. 28 | 29 | Fields also encapsulate a ``key``, which typically corresponds the URL 30 | query argument name used in API request URLs, as well as key 31 | preparation logic, like prefixing (e.g., 'title' -> 'gcmtitle'). 32 | 33 | Here are some notes on various aspects of coercion: 34 | 35 | - By default, fields allow submission of plain strings (or iterables 36 | of strings), which are then normalized and subject to cardinality 37 | constraints. This behavior can be disabled with allow_string=False. 
from collections import Sequence, Set
from utils import is_scalar, prefixed


def param_list2str(p_list, prefix=None, keep_empty=False):
    """Join a parameter list into MediaWiki's bar-separated form.

    Scalars are first split on '|'; each element is optionally
    prefixed, and empty elements are dropped unless *keep_empty*.
    """
    if is_scalar(p_list):
        p_list = param_str2list(p_list, keep_empty)
    u_p_list = [unicode(p) for p in p_list]
    ret = "|".join([prefixed(t, prefix)
                    for t in u_p_list if (t or keep_empty)])
    return unicode(ret)


def param_str2list(p, keep_empty=False):
    """Split a bar-separated parameter string into a list of strings."""
    p = p or ''
    if is_scalar(p):
        p = unicode(p)
    else:
        p = param_list2str(p)
    p_list = p.split('|')
    if not keep_empty:
        p_list = [sp for sp in p_list if sp]
    return p_list


def normalize_param(p, prefix=None, multi=None):
    """Normalize *p* to a single prefixed, bar-separated string.

    Raises ValueError if *multi* is False but *p* holds several values.
    """
    p_list = param_str2list(p)
    if multi is False:
        if len(p_list) > 1:
            tmpl = 'expected singular query parameter, not %r'
            raise ValueError(tmpl % p)
    return param_list2str(p_list, prefix)


# unacceptablllllllle
PLURAL_MAP = {'titles': 'title', 'revids': 'revid'}


def make_param_attr_getter(attr_name):
    """Return a getter reading *attr_name* off non-string param objects."""
    def param_attr_getter(obj):
        ret = getattr(obj, attr_name)
        if callable(ret):
            # bound methods etc. are not usable parameter values
            raise AttributeError('unsuitable attribute value %r' % ret)
        return ret

    return param_attr_getter


class Param(object):
    """A typed field describing one API query parameter.

    :param key: URL query argument name (possibly prefixed later).
    :param default: default value, normalized at construction time.
    :param val_prefix: prefix applied to each value (e.g. 'Category:').

    Keyword options: ``required``, ``multi``, ``accept_str``,
    ``key_prefix``, ``limit``, ``attr`` (attribute name used to coerce
    model objects), and ``coerce`` (an explicit coercion callable).
    """
    def __init__(self, key, default=None, val_prefix=None, **kw):
        if not key:
            raise ValueError('expected key, not %r' % key)
        self.key = unicode(key)
        self.val_prefix = val_prefix
        self.required = kw.pop('required', False)
        self.multi = kw.pop('multi', None)
        self.accept_str = kw.pop('accept_str', True)
        self.key_prefix = kw.pop('key_prefix', True)  # True = filled in later
        self.limit = kw.pop('limit', None)

        param_attr = kw.pop('attr', None)
        coerce_func = kw.pop('coerce', None)
        if coerce_func is None:
            if param_attr is None:
                # derive the attribute name from the key, singularized
                param_attr = self.key
                if self.multi:
                    param_attr = PLURAL_MAP.get(param_attr, param_attr)
            if isinstance(param_attr, basestring):
                coerce_func = make_param_attr_getter(param_attr)
            elif param_attr is None:
                coerce_func = lambda x: x
            else:
                raise TypeError("'attr' expected string")
        elif not callable(coerce_func):
            raise TypeError("'coerce' expected callable")
        self.coerce_func = coerce_func
        if kw:
            raise ValueError('unexpected keyword argument(s): %r' % kw)
        if default is not None:
            default = normalize_param(default, self.val_prefix, self.multi)
        self.default = default

    def get_key(self, key_prefix=None):
        """Return the full query key, applying the configured prefix."""
        if self.key_prefix:
            prefix = key_prefix
            if prefix is None:
                prefix = self.key_prefix
            if isinstance(prefix, basestring):
                prefix = unicode(prefix)
            else:
                raise TypeError('expected valid string prefix')
        else:
            prefix = ''
        return prefix + self.key

    def _coerce_value(self, value):
        # TODO: it's real late and this is a bit of a sty
        # also, in some cases the bar-split normalization
        # should not occur (e.g., on a URL)
        if value is None:
            return value
        try:
            return self.coerce_func(value)
        except AttributeError:
            pass

        if is_scalar(value):
            if isinstance(value, basestring):
                return value
        else:
            # some models are iterable, but none are sequences/sets (yet)
            coerced = []
            for v in value:
                if isinstance(v, basestring):
                    coerced.append(v)
                else:
                    coerced.append(self.coerce_func(v))
            return coerced
        raise TypeError('could not coerce value %r to %r' % (value, self.key))

    def get_value(self, value, prefix=None):
        """Coerce and normalize *value*; fall back to the default.

        Raises ValueError if no value results and the param is required.
        """
        if prefix is None:
            prefix = self.val_prefix
        value = self._coerce_value(value)
        norm_val = normalize_param(value, prefix, self.multi)
        val = norm_val or self.default
        if val is None and self.required:
            raise ValueError('%r param is required' % self.key)
        return val

    def get_value_list(self, value, prefix=None):
        """Like get_value(), but returns a list of strings."""
        return param_str2list(self.get_value(value, prefix))

    def get_tuple(self):
        # BUG FIX: Param never sets a 'value' attribute, so the old
        # (self.key, self.value) raised AttributeError on every call;
        # the stored (normalized) default is the value we actually hold
        return (self.key, self.default)

    def get_tuple_from_kwargs(self, **kwargs):
        """
        Picks up appropriate values from kwargs,
        returns the defaults if nothing matches.
        """
        pass

    __call__ = get_value


class SingleParam(Param):
    """A Param constrained to exactly one value."""
    def __init__(self, *a, **kw):
        kw['multi'] = False
        super(SingleParam, self).__init__(*a, **kw)


class MultiParam(Param):
    """A Param accepting multiple bar-separated values."""
    def __init__(self, *a, **kw):
        kw['multi'] = True
        super(MultiParam, self).__init__(*a, **kw)


class StaticParam(Param):
    """A fixed key/value pair, immune to prefixing and user values."""
    def __init__(self, key, value):
        super(StaticParam, self).__init__(key, value)

    def get_key(self, *a, **kw):
        return self.key

    def get_value(self, *a, **kw):
        return self.default


class PassthroughParam(Param):
    """A Param that performs no coercion or normalization at all."""
    def __init__(self, *a, **kw):
        super(PassthroughParam, self).__init__(*a, **kw)

    def get_value(self, value, prefix=None):
        return value

    def get_value_list(self, value, prefix=None):
        if is_scalar(value):
            return [value]
        return value
class GetQueryPage(QueryOperation):
    """Run a Special:QueryPage-backed maintenance report via the API.

    The base class takes the report name as input; subclasses pin the
    ``gqppage`` value with a StaticParam and take no input.
    """
    field_prefix = 'gqp'
    input_field = SingleParam('page')
    fields = [StaticParam('generator', 'querypage'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = PageInfo

    def extract_results(self, query_resp):
        """Build one PageInfo per page dict in the query response."""
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page = PageInfo.from_query(pid_dict,
                                       source=self.source)
            ret.append(page)
        return ret

    def prepare_params(self, **kw):
        """Translate the generic continue parameter into the offset
        parameter that list=querypage actually uses.
        """
        params = super(GetQueryPage, self).prepare_params(**kw)
        # BUG FIX: the popped key must match the checked key; the old
        # code popped 'ggqpcontinue' (extra 'g'), raising KeyError on
        # every continuation request
        if params.get('gqpcontinue'):
            params['gqpoffset'] = params.pop('gqpcontinue')
        return params


class GetAncientPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Ancientpages')]
class GetDeadendPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Deadendpages')]


class GetDisambiguations(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Disambiguations')]


class GetDoubleRedirects(GetQueryPage):
    input_field = None
    # BUG FIX: the report name was misspelled 'Doulberedirects', which
    # the API rejects; the real special page is Special:DoubleRedirects
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'DoubleRedirects')]


class GetListRedirects(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Listredirects')]


class GetLonelyPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Lonelypages')]


class GetLongPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Longpages')]


class GetMostCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostcategories')]


class GetMostImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostimages')]


class GetMostInterwikiLinks(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostinterwikis')]


class GetMostLinkedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinkedcategories')]


class GetMostLinkedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinkedtemplates')]


class GetMostLinked(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Mostlinked')]
# Concrete Special:QueryPage reports: each subclass simply pins the
# 'gqppage' parameter to one of MediaWiki's built-in maintenance
# reports and takes no input of its own.
class GetFewestRevisions(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Fewestrevisions')]


class GetShortPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Shortpages')]


class GetUncategorizedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedcategories')]


class GetUncategorizedPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedpages')]


class GetUncategorizedImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedimages')]


class GetUncategorizedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Uncategorizedtemplates')]


class GetUnusedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedcategories')]


class GetUnusedImages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedimages')]


class GetWantedCategories(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedcategories')]


class GetWantedFiles(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedfiles')]


class GetWantedPages(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Wantedpages')]
class GetUnusedTemplates(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Unusedtemplates')]


class GetWithoutInterwikiLinks(GetQueryPage):
    input_field = None
    fields = GetQueryPage.fields + [StaticParam('gqppage', 'Withoutinterwiki')]

# 'Unwatchedpages' requires being logged in


# --- /wapiti/operations/rand.py ---
from base import QueryOperation, QueryLimit
from params import StaticParam, SingleParam
from models import PageInfo
from utils import OperationExample, coerce_namespace


class GetRandom(QueryOperation):
    """
    Fetch random pages using MediaWiki's Special:Random.
    """
    field_prefix = 'grn'
    fields = [StaticParam('generator', 'random'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              SingleParam('namespace', default='', coerce=coerce_namespace)]
    input_field = None
    output_type = [PageInfo]
    # random queries are capped lower than most list queries
    per_query_limit = QueryLimit(10, 20)
    examples = [OperationExample(doc='basic random')]

    def extract_results(self, query_resp):
        # build one PageInfo per page dict in the response
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret

    def get_cont_str(self, *a, **kw):
        # random results cannot be continued/resumed; always restart
        return ''


class GetRandomArticles(GetRandom):
    # namespace 0 = mainspace articles
    def __init__(self, *a, **kw):
        kw['namespace'] = 0
        super(GetRandomArticles, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random articles')]


class GetRandomCategories(GetRandom):
    # namespace 14 = Category:
    def __init__(self, *a, **kw):
        kw['namespace'] = 14
        super(GetRandomCategories, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random categories')]


class GetRandomFilePages(GetRandom):
    # namespace 6 = File:
    def __init__(self, *a, **kw):
        kw['namespace'] = 6
        super(GetRandomFilePages, self).__init__(*a, **kw)
    examples = [OperationExample(doc='random file pages')]


# --- /wapiti/operations/revisions.py ---
from base import QueryOperation
from params import StaticParam, MultiParam, SingleParam
from models import RevisionInfo, Revision
from utils import OperationExample

# default rvprop value: all revision metadata, but no content
DEFAULT_PROPS = 'ids|flags|timestamp|user|userid|size|sha1|comment|parsedcomment|tags'


class GetPageRevisionInfos(QueryOperation):
    """
    Fetch revisions for pages.
    """
    field_prefix = 'rv'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Coffee', 10)]

    def extract_results(self, query_resp):
        ret = []
        # skip pages the API flagged as 'missing'
        pages = [p for p in query_resp.get('pages', {}).values()
                 if 'missing' not in p]
        for pid_dict in pages:
            for rev in pid_dict.get('revisions', []):
                # merge page-level info into each revision's dict
                rev_dict = dict(pid_dict)
                rev_dict.update(rev)
                rev_info = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
                ret.append(rev_info)
        return ret


class GetRevisionInfos(GetPageRevisionInfos):
    """
    Fetch information about specific revision.
    """
    input_field = MultiParam('revids', attr='rev_id', key_prefix=False)
    output_type = RevisionInfo
    examples = [OperationExample(['538903663', '539916351', '531458383'])]

    def prepare_params(self, *a, **kw):
        # NOTE(review): the API appears to reject rvlimit when explicit
        # revids are given, hence the pop -- confirm against the API docs
        ret = super(GetRevisionInfos, self).prepare_params()
        ret.pop(self.field_prefix + 'limit', None)
        return ret
class GetRevisionContent(GetCurrentContent):
    """Fetch full content for a single revision, selected by rev id."""
    input_field = SingleParam('revids', key_prefix=False, attr='rev_id')
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS + '|content'),
              SingleParam('parse', False)]
    examples = [OperationExample('539916351')]
def is_iterable(obj):
    """Return True if *obj* can produce an iterator."""
    try:
        iter(obj)
        return True
    except TypeError:
        return False


def is_scalar(obj):
    """Treat non-iterables, plus strings, as single values."""
    if not is_iterable(obj):
        return True
    return isinstance(obj, basestring)
def get_page_templates(source, raise_exc=True):
    """Tokenize and parse *source*, returning only TemplateReferences."""
    tokens = tokenize(source)
    parsed = parse(tokens, raise_exc=raise_exc)
    return [t for t in parsed if isinstance(t, TemplateReference)]


class Token(object):
    """One lexed span of template source text."""
    def __init__(self, start_index, text):
        self.start_index = start_index
        self.text = text

    @classmethod
    def from_match(cls, match):
        """Alternate constructor from a regex match object."""
        return cls(start_index=match.start(), text=match.group())

    def __repr__(self):
        cn = self.__class__.__name__
        return '%s(%r)' % (cn, self.text)


class BufferToken(Token):
    # raw text that gets buffered into the current key/value
    pass


class CommentToken(BufferToken):
    # an HTML comment; ignored entirely by the parser
    pass


class LinkToken(BufferToken):
    # a [[wiki link]]
    pass


class TableToken(BufferToken):
    # a {| wiki table |}
    pass


class TemplateLogicToken(BufferToken):
    # {{{param}}} and {{#parserfunction|}} constructs
    pass


class SepToken(Token):
    # '=' and '|' separators inside a template
    pass


class StartTemplateToken(Token):
    # '{{'
    pass


class EndTemplateToken(SepToken):
    # '}}' -- also acts as a separator (flushes the pending value)
    pass


LEXICON = \
    [(r'(\[\[.+?\]\])', lambda m, t: LinkToken.from_match(m)),
     (r'(\{\|.+?\|\})', lambda m, t: TableToken.from_match(m)),
     (r'(\{\{\{.+?\}\}\})', lambda m, t: TemplateLogicToken.from_match(m)),
     (r'(\{\{#.+?\|\}\})', lambda m, t: TemplateLogicToken.from_match(m)),
     # BUG FIX: this pattern had degraded to the empty regex r'()',
     # which matches zero-width at every position and starves all the
     # later branches (so '{{' would never tokenize); per the module
     # notes, "everything inside html comments is ignored", which is
     # what CommentToken exists for
     (r'(<!--.*?-->)', lambda m, t: CommentToken.from_match(m)),
     (r'\{\{', lambda m, t: StartTemplateToken.from_match(m)),
     (r'\}\}', lambda m, t: EndTemplateToken.from_match(m)),
     (r'=', lambda m, t: SepToken.from_match(m)),
     (r'\|', lambda m, t: SepToken.from_match(m))]


def build_scanner(lexicon, flags=0):
    """Compile *lexicon* into a single alternation-based scanner regex.

    Each lexicon entry becomes a numbered group so the winning branch
    can be recovered from match.lastindex.
    """
    import sre_parse
    import sre_compile
    from sre_constants import BRANCH, SUBPATTERN
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p)
    return scanner


def tokenize(source, lexicon=None):
    """Lex *source* into a flat Token list, preserving all text.

    Text between lexicon matches is emitted as plain BufferTokens, so
    the concatenation of all token texts reproduces the input.
    """
    lexicon = lexicon or LEXICON
    lex = build_scanner(lexicon, re.DOTALL)
    all_tokens = []
    start, end, prev_end = 0, 0, 0
    for match in lex.finditer(source):
        start, end = match.start(), match.end()
        if prev_end < start:
            # unmatched text between tokens becomes a plain buffer
            all_tokens.append(BufferToken(start, source[prev_end:start]))
        action = lexicon[match.lastindex - 1][1]
        if callable(action):
            # TODO: what should the callbacks want?
            cur_token = action(match, match.group())
            all_tokens.append(cur_token)
        else:
            raise TypeError('expected callable callback, not %r' % (action,))
        prev_end = end
    if prev_end < len(source):
        all_tokens.append(BufferToken(prev_end, source[prev_end:]))
    return all_tokens


def cond_join(items, sep='', cond=None):
    """Join adjacent items satisfying *cond* (strings by default),
    leaving other items (e.g. nested TemplateReferences) in place.
    """
    # TODO: messsss
    if cond is None:
        cond = lambda s: isinstance(s, basestring)
    ret, tmp_buffer = [], []
    for item in items:
        if cond(item):
            tmp_buffer.append(item.strip())  # TODO: remove strip()
        else:
            if tmp_buffer:
                ret.append(sep.join(tmp_buffer))
                tmp_buffer = []
            ret.append(item)
    if tmp_buffer:
        ret.append(sep.join(tmp_buffer))
    return ret


def process_korv(korv):
    """Normalize a raw key-or-value buffer: join string runs, strip,
    and attempt int/float conversion before falling back to unicode.
    """
    if not korv:
        return ''
    # TODO: need fancy split() (for <!-- --> comments)
    korv = [_kv for _kv in cond_join(korv) if _kv]
    if not korv:
        return ''
    elif len(korv) == 1:
        korv = korv[0]
    if isinstance(korv, basestring):
        korv = korv.strip()
        converters = [int, float, unicode]
        for convert in converters:
            try:
                return convert(korv)
            except ValueError:
                pass
        else:
            raise ValueError('unknown key or value {0}'.format(korv))
    return korv
def parse(tokens, raise_exc=True):
    """Parse a token stream into a list of TemplateReferences
    interleaved with plain interstitial text strings.

    A stack of ProtoTemplateRefs tracks in-progress templates, so a
    nested template becomes a value inside its parent. With
    raise_exc=False, malformed templates are silently dropped.
    """
    ret = []
    pts = []  # ProtoTemplate stack
    interstish = []  # text accumulated outside of any template
    for token in tokens:
        if isinstance(token, CommentToken):
            continue  # TODO: save comments?
        if isinstance(token, StartTemplateToken):
            # flush pending interstitial text, open a new proto-template
            if interstish:
                ret.append(''.join(interstish))
                interstish = []
            pts.append(ProtoTemplateRef(token))
            continue
        elif not pts:
            # not inside any template; just buffer the raw text
            interstish.append(token.text)
            continue
        else:
            cpt = pts[-1]

        if isinstance(token, SepToken):
            tmp_key, cur_val = cpt.tmp_key, cpt.cur_val
            #''.join(cpt.cur_buff).strip()
            # NOTE: EndTemplateToken is a SepToken, so '}}' also
            # flushes the pending key/value pair
            if token.text == '|' or token.text == '}}':
                if tmp_key is None:
                    # cur_val is a value for a positional arg
                    cpt.args.append(cur_val)
                else:
                    # cur_val is a value for a keyword arg
                    cpt.kwargs.append((tmp_key, cur_val))
                cpt.tmp_key = None
                cpt.cur_val = []
            elif token.text == '=' and tmp_key is None:
                # cur_val is a key ('=' past the first is literal text)
                try:
                    cpt.tmp_key = ''.join(cur_val).strip()  # TODO: int()s?
                except Exception as e:
                    if raise_exc:
                        raise

                cpt.cur_val = []
            else:
                cpt.cur_val.append(token.text)
        else:
            # links and tables
            cpt.cur_val.append(token.text)

        if isinstance(token, EndTemplateToken):
            # create real Template
            pts.pop()
            cpt.end_token = token
            try:
                comp_tmpl = cpt.to_template_ref()
            except Exception as e:
                if raise_exc:
                    raise
                continue
            if pts:
                # nested: completed template becomes part of the
                # parent's current value
                pts[-1].cur_val.append(comp_tmpl)
            else:
                ret.append(comp_tmpl)
    # end loop

    return ret
335 | | accessdate = January 5, 2013 336 | }}''' 337 | 338 | _SF_CLIMATE_TEST = '''{{climate chart 339 | | San Francisco 340 | |46.2|56.9|4.5 341 | |48.1|60.2|4.61 342 | |49.1|62.9|3.26 343 | |49.9|64.3|1.46 344 | |51.6|65.6|0.7 345 | |53.3|67.9|0.16 346 | |54.6|68.2|0 347 | |55.6|69.4|0.06 348 | |55.7|71.3|0.21 349 | |54.3|70.4|1.13 350 | |50.7|63.2|3.16 351 | |46.7|57.3|4.56 352 | |float=right 353 | |clear=none 354 | |units=imperial}}''' 355 | 356 | _SF_INFOBOX = '''{{Infobox settlement 357 | |name = San Francisco 358 | |official_name = City and County of San Francisco 359 | |nickname = ''The City by the Bay''; ''Fog City''; ''S.F.''; ''Frisco''; ''The City that Knows How'' (''antiquated''); ''Baghdad by the Bay'' (''antiquated''); ''The Paris of the West'' 360 | | settlement_type = [[Consolidated city-county|City and county]] 361 | | motto = ''Oro en Paz, Fierro en Guerra''
(English: "Gold in Peace, Iron in War") 362 | | image_skyline = SF From Marin Highlands3.jpg 363 | | imagesize = 280px 364 | | image_caption = San Francisco from the Marin Headlands, with the Golden Gate Bridge in the foreground 365 | | image_flag = Flag of San Francisco.svg 366 | | flag_size = 100px 367 | | image_seal = Sfseal.png 368 | | seal_size = 100px 369 | | image_map = California county map (San Francisco County enlarged).svg 370 | | mapsize = 200px 371 | | map_caption = Location of San Francisco in California 372 | | pushpin_map = USA2 373 | | pushpin_map_caption = Location in the United States 374 | 375 | | coordinates_region = US-CA 376 | | subdivision_type = [[List of countries|Country]] 377 | | subdivision_name = {{USA}} 378 | | subdivision_type1 = [[Political divisions of the United States|State]] 379 | | subdivision_name1 = {{flag|California}} 380 | 381 | 382 | | government_type = [[Mayor-council government|Mayor-council]] 383 | | governing_body = [[San Francisco Board of Supervisors|Board of Supervisors]] 384 | | leader_title = [[Mayor of San Francisco]] 385 | | leader_name = [[Ed Lee (politician)|Ed Lee]] 386 | | leader_title1 = [[San Francisco Board of Supervisors|Board of Supervisors]] 387 | | leader_name1 = {{Collapsible list 388 | | title = Supervisors 389 | | frame_style = border:none; padding: 0; 390 | | list_style = text-align:left; 391 | | 1 = [[Eric Mar]] 392 | | 2 = [[Mark Farrell (politician)|Mark Farrell]] 393 | | 3 = [[David Chiu (politician)|David Chiu]] 394 | | 4 = [[Katy Tang]] 395 | | 5 = [[London Breed]] 396 | | 6 = [[Jane Kim]] 397 | | 7 = [[Norman Yee]] 398 | | 8 = [[Scott Wiener]] 399 | | 9 = [[David Campos]] 400 | | 10 = [[Malia Cohen]] 401 | | 11 = [[John Avalos]]}} 402 | | leader_title2 = [[California State Assembly]] 403 | | leader_name2 = [[Tom Ammiano]] ([[California Democratic Party|D]])
[[Phil Ting]] ([[California Democratic Party|D]]) 404 | | leader_title3 = [[California State Senate]] 405 | | leader_name3 = [[Leland Yee]] ([[California Democratic Party|D]])
[[Mark Leno]] ([[California Democratic Party|D]]) 406 | | leader_title4 = [[United States House of Representatives]] 407 | | leader_name4 = [[Nancy Pelosi]] ([[Democratic Party (United States)|D]])
[[Jackie Speier]] ([[Democratic Party (United States)|D]]) 408 | | established_title = Founded 409 | | established_date = June 29, 1776 410 | | established_title1 = [[Municipal incorporation|Incorporated]] 411 | | established_date1 = April 15, 1850{{cite web 412 | | url = http://www6.sfgov.org/index.aspx?page=4 413 | | title = San Francisco: Government 414 | | publisher = SFGov.org 415 | | accessdate =March 8, 2012 416 | | quote = San Francisco was incorporated as a City on April 15th, 1850 by act of the Legislature.}} 417 | | founder = Lieutenant [[José Joaquin Moraga]] and [[Francisco Palóu]] 418 | | named_for = [[St. Francis of Assisi]] 419 | 420 | 421 | |area_magnitude = 422 | | unit_pref = US 423 | | area_footnotes = 424 | | area_total_sq_mi = 231.89 425 | | area_land_sq_mi = 46.87 426 | | area_water_sq_mi = 185.02 427 | | area_water_percent = 79.79 428 | | area_note = 429 | | area_metro_sq_mi = 3524.4 430 | 431 | 432 | | elevation_ft = 52 433 | | elevation_max_ft = 925 434 | | elevation_min_ft = 0 435 | 436 | 437 | | population_as_of = 2012 438 | | population_footnotes = 439 | | population_total = 815358 http://voices.yahoo.com/largest-us-cities-population-size-2012-6453656.html?cat=16 440 | | population_density_sq_mi = 17179.2 441 | | population = [[Combined statistical area|CSA]]: 8371000 442 | | population_metro = 4335391 443 | | population_urban = 3273190 444 | | population_demonym = San Franciscan 445 | 446 | 447 | | timezone = [[Pacific Time Zone|Pacific Standard Time]] 448 | | utc_offset = -8 449 | | timezone_DST = [[Pacific Time Zone|Pacific Daylight Time]] 450 | | utc_offset_DST = -7 451 | | latd = 37 452 | | latm = 47 453 | | latNS = N 454 | | longd = 122 455 | | longm = 25 456 | | longEW = W 457 | | coordinates_display = 8 458 | 459 | 460 | | postal_code_type = [[ZIP Code]] 461 | | postal_code = 94101–94112, 94114–94147, 94150–94170, 94172, 94175, 94177 462 | | area_code = [[Area code 415|415]] 463 | | blank_name = [[Federal Information Processing 
_ALL_TEST_STRS = [_BASIC_CITE_TEST,
                  _BIGGER_CITE_TEST,
                  _SF_CLIMATE_TEST,
                  _SF_INFOBOX]


def _main():
    """Ad-hoc smoke-test harness: parse every canned wikitext sample,
    exercise the template accessor, then parse a regression file.

    Returns the list of parsed TemplateReferences; drops into a
    post-mortem debugger (and re-raises) on any failure.
    """
    import os
    import pprint
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    ret = []
    try:
        for test in _ALL_TEST_STRS:
            ret.append(TemplateReference.from_string(test))
            pprint.pprint(ret[-1])
        sf_infobox_tmpl = TemplateReference.from_string(_SF_INFOBOX)
        # print() call form works under both py2 and py3
        print('Testing accessor: %s' % sf_infobox_tmpl['leader_name1']['title'])

        test_path = os.path.join(cur_dir, '_test_tmpls',
                                 'regr_moctezuma_parser_funcs.txt')
        # close the regression file promptly instead of leaking the handle
        with open(test_path) as f:
            tmpl_test = f.read().decode('utf-8')
        # parsed for side effects only: a crash here is the test failure
        # (removed a leftover `import pdb;pdb.set_trace()` breakpoint)
        get_page_templates(tmpl_test)
    except Exception as e:
        # deliberate debug harness behavior: report, post-mortem, re-raise
        print(e)
        import pdb
        pdb.post_mortem()
        raise

    return ret


if __name__ == '__main__':
    _main()
class GetTemplates(QueryOperation):
    """Generator query: PageInfos for every template used on the given
    page(s)."""
    field_prefix = 'gtl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'templates'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        # .get() + .items() for parity with GetTranscludes below: a
        # response without a 'pages' key yields [] instead of raising
        # KeyError, and .items() (vs the old .iteritems()) keeps this
        # py3-portable.
        ret = []
        for k, pid_dict in query_resp.get('pages', {}).items():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetTranscludes(QueryOperation):
    """Generator query: PageInfos for every page that embeds
    (transcludes) the given template."""
    input_field = SingleParam('title', val_prefix='Template:')
    field_prefix = 'gei'
    fields = [StaticParam('generator', 'embeddedin'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Template:ArticleHistory')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp.get('pages', {}).items():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetParsedTemplates(Operation):
    """Parse TemplateReferences out of a blob of wikitext (no network)."""
    input_field = PassthroughParam('content')
    output_type = [TemplateReference]
    examples = [OperationExample(_BASIC_CITE_TEST, limit=1)]

    @property
    def remaining(self):
        # single-shot operation: done as soon as any result is recorded
        if self.results:
            return 0
        return 1  # TODO: fix

    def process(self):
        # the lone result set is keyed under None; its presence means done
        if None in self.results:
            raise NoMoreResults()
        content = getattr(self.input_param, 'content', self.input_param)
        res = get_page_templates(content, raise_exc=False)
        self.results[None] = res
        return list(res)


class GetParsedTemplatesPage(Operation):
    """Fetch a page's current wikitext, then parse its templates."""
    subop_chain = [GetCurrentContent,
                   GetParsedTemplates]

    examples = [OperationExample('Coffee', limit=1)]


class GetParsedTranscludes(Operation):
    '''
    Template names may redirect, but this operation doesn't handle that yet
    '''
    subop_chain = [GetTranscludes,
                   GetCurrentContent,
                   GetParsedTemplates]
    examples = [OperationExample('ArticleHistory', 10)]

    def _update_results(self, results):
        # keep only references to the requested template; the input may
        # carry a 'Template:' prefix, so compare bare names without case
        _, _, tmpl_name = self.input_param.rpartition(':')
        filt_res = [res for res in results
                    if res.name.lower() == tmpl_name.lower()]
        return super(GetParsedTranscludes, self)._update_results(filt_res)
def tmpl_text_to_odict(text):
    """Parse the pipe-delimited body of a template reference into an
    OrderedDict of stripped key/value strings.

    Blank keys and duplicate keys are reported and skipped rather than
    raising, since wikitext in the wild is full of both.
    """
    ret = OrderedDict()
    for pair in text.split('|'):
        pair = pair.strip()
        if not pair:
            continue
        key, _, value = pair.partition('=')
        key = key.strip()
        value = value.strip()
        if not key:
            # print() call form works under both py2 and py3
            print('blank key error %r' % key)
            continue
        if key in ret:
            # first occurrence wins
            print('duplicate key error %r' % key)
            continue
        ret[key] = value
    return ret


def extract_template(tmpl_name, text):
    """Return a list of OrderedDicts, one per occurrence of the named
    template in *text* (case-insensitive, non-greedy, spans newlines).

    Note: a naive regex scan -- nested templates will confuse it.
    """
    # raw string: '\{' in a plain literal is an invalid escape under py3
    tmpl_re = re.compile(r'\{\{(\s*' + tmpl_name + r'.*?)\}\}',
                         flags=(re.DOTALL | re.IGNORECASE | re.M))
    return [tmpl_text_to_odict(txt) for txt in tmpl_re.findall(text)]


#class GetAllTranscludes(GetTranscludes):
#    field_prefix = 'at'
#    input_field = None
#    fields = [StaticParam('list', 'alltransclusions'),
#              #StaticParam('prop', 'info'),
#              StaticParam('atprop', 'ids|title')]  # 'subjectid|talkid|protection')]
def test_unicode_title():
    # non-ASCII titles must round-trip through the API layer intact
    content = GetCurrentContent("Beyoncé Knowles")()
    assert content


def test_coercion_basic():
    # a PageIdentifier input is coerced to its namespaced title string
    pid = PageIdentifier(title='Africa', page_id=123, ns=4, source='enwp')
    assert GetSubcategoryInfos(pid).input_param == 'Category:Africa'


def test_web_request():
    # raw (non-API) web fetch of a known image
    url = 'http://upload.wikimedia.org/wikipedia/commons/d/d2/Mcgregor.jpg'
    res = base.WebRequestOperation(url)()
    assert len(res[0]) == 16408


def test_get_html():
    # rendered-HTML fetch; the Africa article is comfortably large
    res = base.GetPageHTML('Africa')()
    assert len(res[0]) > 350000


def test_missing_revisions():
    # a nonexistent page reports 'missing' and a negative pageid
    # upstream, which surfaces here as an empty revision list
    assert len(GetPageRevisionInfos('Coffee_lololololol')()) == 0


def test_get_meta():
    assert GetSourceInfo()()


def test_client_passed_to_subops():
    # The client given to the initial operation must propagate to its
    # sub-operations. Use just enough titles to force multiplexing so
    # that sub-ops exist to inspect.
    titles = ['a'] * (base.DEFAULT_QUERY_LIMIT.get_limit() + 1)
    client = base.MockClient()
    op = GetPageInfo(titles, client=client)
    assert id(op.subop_queues[0].peek().client) == id(client)
61 | titles = ['a'] * (base.DEFAULT_QUERY_LIMIT.get_limit() + 1) 62 | 63 | client = base.MockClient() 64 | op = GetPageInfo(titles, client=client) 65 | assert id(op.subop_queues[0].peek().client) == id(client) 66 | -------------------------------------------------------------------------------- /wapiti/operations/test_operations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import base 5 | import category 6 | import feedback 7 | import files 8 | import links 9 | import meta 10 | import misc 11 | import protection 12 | import rand 13 | import revisions 14 | import templates 15 | import user 16 | 17 | from revisions import GetRevisionInfos 18 | 19 | MAGNITUDE = 1 20 | 21 | 22 | def limit_equal_or_depleted(op): 23 | if getattr(op, '_notices', None): 24 | return False 25 | elif getattr(op, 'is_depleted', None): 26 | return True 27 | elif len(op.results) == op.limit: 28 | return True 29 | return False 30 | 31 | 32 | def get_op_examples(): 33 | ops = list(base.OperationMeta._all_ops) 34 | ret = [] 35 | for op in ops: 36 | examples = getattr(op, 'examples', None) 37 | if not examples: 38 | continue 39 | ret.extend(op.examples) 40 | return ret 41 | 42 | 43 | def pytest_generate_tests(metafunc): 44 | #if 'limit' in metafunc.fixturenames: # TODO 45 | # keyword = metafunc.config.option.keyword 46 | # it's also too hard to override matching behavior 47 | if metafunc.function is test_op_example: 48 | mag = metafunc.config.getoption('--mag') 49 | op_examples = get_op_examples() 50 | #op_examples = [ex for ex in op_examples 51 | # if keyword.lower() in ex.op_name.lower()] 52 | ops = [op_ex.make_op(mag=mag) for op_ex in op_examples] 53 | _test_tuples = [(repr(op), op) for op in ops] 54 | metafunc.parametrize(('op_repr', 'op'), _test_tuples) 55 | pass 56 | 57 | 58 | #def pytest_funcarg__mag(request): 59 | # # TODO: switch to command line argument 60 | # return 
#    return MAGNITUDE


#def pytest_funcarg__limit(request):
#    wish there was a good way to compose this with mag and the current
#    value of the function's "limit" keyword argument to make the final
#    limit return 1


def test_multiplexing(mag):
    # Requests `mag * 100` consecutive revision IDs ending just below
    # 543184935; GetRevisionInfos splits them across API calls internally.
    # NOTE(review): hits the live wiki, hence the 90% tolerance below.
    limit = mag * 100
    rev_ids = [str(x) for x in range(543184935 - limit, 543184935)]
    get_rev_infos = GetRevisionInfos(rev_ids)
    rev_infos = get_rev_infos()
    assert len(rev_infos) > (0.9 * limit)  # a couple might be missing


def test_op_example(op_repr, op):
    # One parametrized case per OperationExample; see
    # pytest_generate_tests for how op_repr/op are produced.
    op.process_all()
    assert limit_equal_or_depleted(op)


# wapiti/operations/user.py -- user-scoped query operations

from base import QueryOperation
from params import SingleParam, StaticParam
from models import RevisionInfo
from utils import OperationExample


# revision properties requested for each contribution record
DEFAULT_PROPS = 'ids|flags|timestamp|size|comment|tags|title'


class GetUserContribs(QueryOperation):
    """list=usercontribs: RevisionInfos for a single user's edits."""
    field_prefix = 'uc'
    input_field = SingleParam('user')
    fields = [StaticParam('list', 'usercontribs'),
              StaticParam('ucprop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Jimbo Wales')]

    def extract_results(self, query_resp):
        # one RevisionInfo per contribution record in the response;
        # missing 'usercontribs' key simply yields no results
        ret = []
        for rev_dict in query_resp.get('usercontribs', []):
            user_contrib = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
            ret.append(user_contrib)
        return ret


#class GetUserContribRevisions(Operation):
#    subop_chain = (GetUserContribs, GetRevision)
import sys
from heapq import heappush, heappop
import itertools
from functools import total_ordering


def is_scalar(obj):
    # "scalar" = not iterable, except strings, which are treated as atoms
    return not hasattr(obj, '__iter__') or isinstance(obj, basestring)


def prefixed(arg, prefix=None):
    """Return *arg* with *prefix* prepended, unless it already starts
    with it (or no prefix was given)."""
    if prefix and not arg.startswith(prefix):
        arg = prefix + arg
    return arg


@total_ordering
class MaxInt(long):
    """
    A quite-large integer type that tries to be like float('inf')
    (Infinity), but can be used for slicing and other integer
    operations. float('inf') is generally more correct, except that
    mixing a float and integer in arithmetic operations will result in
    a float, which will raise an error on slicing.
    """
    def __new__(cls, *a, **kw):
        # always one past sys.maxint, regardless of constructor args
        return super(MaxInt, cls).__new__(cls, sys.maxint + 1)

    def __init__(self, name='MAX'):
        self._name = str(name)

    def __repr__(self):
        return self._name

    def __str__(self):
        return repr(self)

    # TODO: better math
    # absorb arithmetic: any operation on a MaxInt yields the same MaxInt
    for func in ('__add__', '__sub__', '__mul__', '__floordiv__', '__div__',
                 '__mod__', '__divmod__', '__pow__', '__lshift__',
                 '__rshift__'):
        locals()[func] = lambda self, other: self

    def __gt__(self, other):
        # greater than everything except another MaxInt
        return not self == other

    def __eq__(self, other):
        return isinstance(other, MaxInt)

    def __int__(self):
        return self


class OperationExample(object):
    """
    Sort of like a partial, but specialer.

    Bundles an Operation type with a sample parameter and limit so that
    tests can instantiate representative operations generically.

    # other types of tests?
    """
    def __init__(self,
                 param=None,
                 limit=None,
                 op_type=None,
                 **kw):
        self.op_type = op_type
        self.param = param
        self.limit = limit

        self.doc = kw.pop('doc', '')
        self.test = kw.pop('test', None)
        # test defaults to limit_equal_or_depleted in test_ops.py
        if kw:
            raise TypeError('got unexpected keyword arguments: %r' % kw)

    @property
    def op_name(self):
        if self.op_type is None:
            return None
        return self.op_type.__name__

    @property
    def disp_name(self):
        """Human-readable call-style description of the example."""
        if not self.op_type:
            return '(unbound OperationExample)'
        tmpl = '%(type)s(%(param)r, limit=%(limit)s)'
        if self.op_type.input_field is None:
            tmpl = '%(type)s(limit=%(limit)s)'

        return tmpl % {'type': self.op_type.__name__,
                       'param': self.param,
                       'limit': self.limit}

    def bind_op_type(self, op_type):
        """Late-bind the Operation type (and derive a default limit from
        its per-query limit) if neither was set at construction."""
        if self.op_type is None:
            self.op_type = op_type
        if self.limit is None:
            try:
                pql = op_type.per_query_limit
            except AttributeError:
                # chained operations: take the first sub-op's limit
                pql = op_type.subop_chain[0].per_query_limit
            self.limit = pql.get_limit()
        return

    def make_op(self, mag=None):
        """Instantiate the bound Operation, scaling the limit by *mag*."""
        if not self.op_type:
            raise TypeError('no Operation type assigned')
        mag = int(mag or 1)
        limit = self.limit * mag
        if self.op_type.input_field is None:
            return self.op_type(limit=limit)
        return self.op_type(self.param, limit=limit)

    def __repr__(self):
        cn = self.__class__.__name__
        kw_parts = ['op_type=%s' % self.op_name]
        # pair each attribute with its own value; the previous version
        # filtered falsy values *before* zipping names to values, which
        # mislabeled the survivors (e.g. param=None, limit=5 rendered as
        # "param=5")
        for attr in ('param', 'limit', 'test', 'doc'):
            val = getattr(self, attr)
            if val:
                kw_parts.append('%s=%r' % (attr, val))
        return '%s(%s)' % (cn, ', '.join(kw_parts))

    __str__ = __repr__


"""
TypeWrapper and MetaTypeWrapper are a pair of what are technically
metaclasses, but really just a very overwrought way of enabling
customized versions of types floating around in some
locations. Because Wapiti is a DSL, but also just a bunch of Python,
we have to deal with the fact that if you modify a type/class, it will
be modified everywhere that references it.

TL;DR: This overblown thing lets Operations use something like
Prioritized(GetCategory, key='total_count'), which sets a priority for
better queueing, without modifying the GetCategory Operation
itself. (Different operations will want to prioritiez different
things.)

(There is almost certainly a better way, but this was a bit of
fun. Ever made an object that is an instance and a subclass of
itself?)
"""


def make_type_wrapper(name, init_args=None):
    """Build a Wrapper subclass named *name* whose constructor accepts
    *init_args* (strings, or (string, default) pairs)."""
    init_args = init_args or []
    args, defaults = [], {}
    for ia in init_args:
        try:
            arg, _default = ia
            defaults[arg] = _default
        except ValueError:
            arg = ia
        if not isinstance(arg, basestring):
            raise TypeError('expected string arg name, not %r' % arg)
        args.append(arg)

    attrs = {'_args': args, '_defaults': defaults}
    return WrapperType(str(name), (Wrapper,), attrs)


class WrapperType(type):
    @property
    def _repr_args(self):
        # arg names, with (name, default) pairs for defaulted ones
        ret = []
        for a in self._args:
            try:
                ret.append((a, self._defaults[a]))
            except KeyError:
                ret.append(a)
        return ret

    def __repr__(cls):
        name, cname = cls.__name__, cls.__class__.__name__
        if cls._repr_args:
            return '%s(%r, %r)' % (cname, name, cls._repr_args)
        else:
            return '%s(%r)' % (cname, name)


class Wrapper(object):
    """Transparent proxy around a wrapped type: attribute reads fall
    through to the wrapped object, writes are recorded locally so the
    wrapped type itself is never mutated."""
    __metaclass__ = WrapperType
    _args, _defaults = [], {}

    def __init__(self, to_wrap, *args, **kwargs):
        wrapped_dict = {}
        if isinstance(to_wrap, Wrapper):
            # re-wrapping: inherit the inner wrapper's overrides
            wrapped_dict = dict(to_wrap._wrapped_dict)
            to_wrap = to_wrap._wrapped
        # write via __dict__ to dodge our own __setattr__ bookkeeping
        self.__dict__['_wrapped'] = to_wrap
        self.__dict__['_wrapped_dict'] = wrapped_dict

        cn = self.__name__
        for arg_i, arg_name in enumerate(self._args):
            try:
                val = args[arg_i]
                if arg_name in kwargs:
                    raise TypeError('%s got multiple values for arg %r'
                                    % (cn, arg_name))
            except IndexError:
                try:
                    val = kwargs.pop(arg_name)
                except KeyError:
                    try:
                        val = self._defaults[arg_name]
                    except KeyError:
                        raise TypeError('%s expected required arg %r'
                                        % (cn, arg_name))
            setattr(self, arg_name, val)
        return

    def __repr__(self):
        kv = ', '.join(['%s=%r' % (k, v) for k, v
                        in self._wrapped_dict.items()])
        # NOTE(review): the original template here was an empty string,
        # making '"" % (a, b)' raise TypeError; format reconstructed
        tmpl = "<%r wrapped: %s>"
        return tmpl % (self._wrapped, kv)

    def __getattr__(self, name):
        return getattr(self._wrapped, name)

    def __setattr__(self, name, val):
        super(Wrapper, self).__setattr__(name, val)
        self._wrapped_dict[name] = val

    def __delattr__(self, name):
        # the delattr protocol passes only the attribute name; the
        # previous (self, name, val) signature made `del obj.attr` a
        # TypeError
        super(Wrapper, self).__delattr__(name)
        self._wrapped_dict.pop(name, None)

    def __call__(self, *a, **kw):
        return self._wrapped(*a, **kw)
# sentinel marking removed tasks still sitting in the heap; a real object
# (per the heapq docs recipe) so identity checks can't collide with a
# legitimate empty-string task the way the old '' sentinel could
REMOVED = '<removed-task>'


class PriorityQueue(object):
    """
    Real quick type based on the heapq docs.

    Entries are [neg_priority, insertion_count, task]; removal marks the
    entry with REMOVED and it is lazily discarded on the next access.
    """
    def __init__(self):
        self._pq = []
        self._entry_map = {}
        self.counter = itertools.count()

    def add(self, task, priority=None):
        # larger numbers = higher priority (heapq is a min-heap, so store
        # the negated priority)
        priority = -int(priority or 0)
        if task in self._entry_map:
            # re-adding replaces the old entry; this previously called the
            # nonexistent self.remove_task() and raised AttributeError
            self.remove(task)
        count = next(self.counter)
        entry = [priority, count, task]
        self._entry_map[task] = entry
        heappush(self._pq, entry)

    def remove(self, task):
        entry = self._entry_map.pop(task)
        entry[-1] = REMOVED

    def _cull(self):
        # discard stale (removed) entries at the front of the heap
        while self._pq:
            priority, count, task = self._pq[0]
            if task is REMOVED:
                heappop(self._pq)
                continue
            return
        raise IndexError('empty priority queue')

    def peek(self, default=REMOVED):
        """Return the highest-priority task without removing it; raise
        IndexError when empty unless *default* is given."""
        try:
            self._cull()
            _, _, task = self._pq[0]
        except IndexError:
            if default is not REMOVED:
                return default
            raise IndexError('peek on empty queue')
        return task

    def pop(self, default=REMOVED):
        """Remove and return the highest-priority task; raise IndexError
        when empty unless *default* is given."""
        try:
            self._cull()
            _, _, task = heappop(self._pq)
            del self._entry_map[task]
        except IndexError:
            if default is not REMOVED:
                return default
            raise IndexError('pop on empty queue')
        return task

    def __len__(self):
        # _entry_map, not _pq: the heap may hold stale REMOVED entries
        return len(self._entry_map)


def chunked_iter(src, size, **kw):
    """
    Generates 'size'-sized chunks from 'src' iterable. Unless
    the optional 'fill' keyword argument is provided, iterables
    not even divisible by 'size' will have a final chunk that is
    smaller than 'size'.

    Note that fill=None will in fact use None as the fill value.

    >>> list(chunked_iter(range(10), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> list(chunked_iter(range(10), 3, fill=None))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
    """
    size = int(size)
    if size <= 0:
        raise ValueError('expected a positive integer chunk size')
    # 'fill' is pulled from **kw so that fill=None is distinguishable
    # from "no fill requested"
    do_fill = True
    try:
        fill_val = kw.pop('fill')
    except KeyError:
        do_fill = False
        fill_val = None
    if kw:
        raise ValueError('got unexpected keyword arguments: %r' % kw.keys())
    if not src:
        return
    cur_chunk = []
    i = 0
    for item in src:
        cur_chunk.append(item)
        i += 1
        if i % size == 0:
            yield cur_chunk
            cur_chunk = []
    if cur_chunk:
        if do_fill:
            lc = len(cur_chunk)
            cur_chunk[lc:] = [fill_val] * (size - lc)
        yield cur_chunk
    return


# From http://en.wikipedia.org/wiki/Wikipedia:Namespace
NAMESPACES = {
    'Main': 0,
    'Talk': 1,
    'User': 2,
    'User talk': 3,
    'Wikipedia': 4,
    'Wikipedia talk': 5,
    'File': 6,
    'File talk': 7,
    'MediaWiki': 8,
    'MediaWiki talk': 9,
    'Template': 10,
    'Template talk': 11,
    'Help': 12,
    'Help talk': 13,
    'Category': 14,
    'Category talk': 15,
    'Portal': 100,
    'Portal talk': 101,
    'Book': 108,
    'Book talk': 109,
    'Special': -1,
    'Media': -2}


def bucketize(src, keyfunc=None):
    """
    Group values in 'src' iterable by value returned by 'keyfunc'.
    keyfunc defaults to bool, which will group the values by
    truthiness; at most there will be two keys, True and False, and
    each key will have a list with at least one item.

    >>> bucketize(range(5))
    {False: [0], True: [1, 2, 3, 4]}
    >>> is_odd = lambda x: x % 2 == 1
    >>> bucketize(range(5), is_odd)
    {False: [0, 2, 4], True: [1, 3]}

    Value lists are not deduplicated:

    >>> bucketize([None, None, None, 'hello'])
    {False: [None, None, None], True: ['hello']}
    """
    # guard via iter(): the original called is_iterable(), which is not
    # defined anywhere in this module and raised NameError at call time
    try:
        iter(src)
    except TypeError:
        raise TypeError('expected an iterable')
    if keyfunc is None:
        keyfunc = bool
    if not callable(keyfunc):
        raise TypeError('expected callable key function')

    ret = {}
    for val in src:
        key = keyfunc(val)
        ret.setdefault(key, []).append(val)
    return ret


def bucketize_bool(src, keyfunc=None):
    """
    Like bucketize, but for added convenience returns a tuple of
    (truthy_values, falsy_values).

    >>> nonempty, empty = bucketize_bool(['', '', 'hi', '', 'bye'])
    >>> nonempty
    ['hi', 'bye']

    keyfunc defaults to bool, but can be carefully overridden to
    use any function that returns either True or False.

    >>> import string
    >>> is_digit = lambda x: x in string.digits
    >>> decimal_digits, hexletters = bucketize_bool(string.hexdigits, is_digit)
    >>> ''.join(decimal_digits), ''.join(hexletters)
    ('0123456789', 'abcdefABCDEF')
    """
    bucketized = bucketize(src, keyfunc)
    return bucketized.get(True, []), bucketized.get(False, [])


def coerce_namespace(ns_arg):
    """Map a namespace name (any case) to its numeric ID; unknown names
    are returned capitalized, unmapped."""
    ns_str = str(ns_arg).capitalize()
    return NAMESPACES.get(ns_str, ns_str)
24 | """ 25 | def __init__(self, status_code=None, text=None, headers=None, error=None): 26 | self.status_code = status_code 27 | self.text = text 28 | self.headers = headers 29 | self.error = error 30 | 31 | 32 | def get_items(iterable): 33 | if not iterable: 34 | return [] 35 | return OrderedDict(iterable).items() 36 | 37 | 38 | def get_keys(iterable): 39 | if not iterable: 40 | return [] 41 | return OrderedDict(iterable).keys() 42 | 43 | 44 | def is_scalar(obj): 45 | return not hasattr(obj, '__iter__') or isinstance(obj, basestring) 46 | 47 | 48 | def get_encoded(val): 49 | if not isinstance(val, (unicode, bytes)): 50 | val = unicode(val) 51 | return val.encode('utf-8') 52 | 53 | 54 | def ordered_yield(mapping, keys): 55 | for k in keys: 56 | try: 57 | yield (k, mapping.pop(k)) 58 | except KeyError: 59 | pass 60 | for k in mapping: 61 | yield (k, mapping.pop(k)) 62 | 63 | 64 | def parse_url(url): 65 | try: 66 | url = unicode(url) 67 | except UnicodeDecodeError: 68 | pass 69 | 70 | parsed = urlparse(url) 71 | if not (parsed.scheme and parsed.netloc): 72 | raise ValueError("invalid URL, no schema supplied: %r" % url) 73 | 74 | try: 75 | dec_netloc = parsed.netloc.encode('idna').decode('utf-8') 76 | parsed = parsed._replace(netloc=dec_netloc) 77 | except UnicodeError: 78 | raise ValueError('invalid characters in url: %r' % parsed.netloc) 79 | 80 | if not parsed.path: 81 | parsed = parsed._replace(path=u'/') 82 | 83 | for k, v in parsed._asdict().items(): 84 | parsed = parsed._replace(**{k: get_encoded(v)}) 85 | 86 | return parsed 87 | 88 | 89 | def encode_url_params(params, keep_blank=False): 90 | # TODO: handle case where params is just a string 91 | res = [] 92 | for k, vs in get_items(params): 93 | if is_scalar(vs): 94 | vs = [vs] 95 | for v in vs: 96 | if not v: 97 | if keep_blank: 98 | v = '' 99 | else: 100 | continue 101 | res.append((get_encoded(k), get_encoded(v))) 102 | return urlencode(res, doseq=True) 103 | 104 | 105 | # TODO: merging url params 106 | 
""" 107 | , keep_order=True): 108 | if keep_order: 109 | existing_params = parse_qsl(parsed_url.query, 110 | keep_blank_values=True) 111 | params = list(ordered_yield(params, get_keys(existing_params))) 112 | query = encode_url_params(params) 113 | else: 114 | """ 115 | 116 | 117 | def construct_url(url, params): 118 | parsed_url = parse_url(url) 119 | 120 | query = parsed_url.query 121 | encoded_params = encode_url_params(params) 122 | if encoded_params: 123 | if query: 124 | query = query + '&' + encoded_params 125 | else: 126 | query = encoded_params 127 | new_parsed_url = parsed_url._replace(query=query) 128 | new_url = requote(urlunparse(new_parsed_url)) 129 | return new_url 130 | 131 | 132 | def gunzip(text): 133 | buf = StringIO(text) 134 | f = gzip.GzipFile(fileobj=buf) 135 | return f.read() 136 | 137 | 138 | class Client(object): 139 | def __init__(self, config=None): # among other things 140 | self.config = dict(DEFAULT_CONFIG) 141 | if config: 142 | self.config.update(config) 143 | 144 | def req(self, method, url, params=None, headers=None, use_gzip=True): 145 | _headers = dict(self.config.get('headers', {})) 146 | if headers: 147 | _headers.update(headers) 148 | headers = _headers 149 | if use_gzip and not headers.get('Accept-encoding'): 150 | headers['Accept-encoding'] = 'gzip' 151 | 152 | full_url = construct_url(url, params) 153 | ret = Response() 154 | ret.url = full_url 155 | resp_text = None 156 | resp_status = None 157 | resp_headers = {} 158 | try: 159 | req = urllib2.Request(full_url, headers=headers) 160 | resp = urllib2.urlopen(req) 161 | resp_text = resp.read() 162 | resp.close() 163 | if 'gzip' in resp.info().get('Content-Encoding', ''): # TODO 164 | resp_text = gunzip(resp_text) 165 | resp_status = resp.getcode() 166 | resp_headers = resp.headers 167 | except Exception as e: 168 | raise 169 | ret.text = resp_text 170 | ret.status_code = resp_status 171 | ret.headers = resp_headers 172 | return ret 173 | 174 | def get(self, url, 
DEFAULT_MAGNITUDE = 'norm'


def magnitude(norm, big=None, huge=None):
    """Decorator factory for size-parameterized tests.

    The wrapped test accepts either a concrete integer limit or one of
    the magnitude names ('norm'/'big'/'huge'), which resolve to the
    values configured here; each unset tier defaults to the one below
    it. With no argument, 'norm' is used.
    """
    big = norm if big is None else big
    huge = big if huge is None else huge

    def mag_dec(func):

        @wraps(func)
        def wrapped(limit_or_mag=None):
            selector = wrapped.norm if limit_or_mag is None else limit_or_mag
            try:
                limit = int(selector)
            except ValueError:
                # a magnitude name: look up the tier stored on the wrapper
                limit = int(wrapped.__dict__[selector])
            return func(limit)

        wrapped.norm = norm
        wrapped.big = big
        wrapped.huge = huge
        return wrapped

    return mag_dec
        print repr(disp)[:74] + '...'
    return ret


# NOTE(review): the tests below hit the live wiki through WapitiClient
# and return True/False rather than asserting -- presumably a custom
# runner checks return values; confirm before converting to pytest.

def test_client_basic(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    # a populated namespace_map implies source_info loaded successfully
    return len(client.source_info.namespace_map) > 10


@magnitude(norm=20, big=550, huge=2000)
def test_cat(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_africa = partial(client.get_category_recursive, 'Africa', limit)
    cat_pages = call_and_ret(get_africa)
    return len(cat_pages) == limit


def test_unicode_title(limit):
    # non-ASCII title must survive the round trip
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_beyonce = partial(client.get_current_content, "Beyoncé Knowles")
    beyonce = call_and_ret(get_beyonce)
    return bool(beyonce)


@magnitude(norm=20, big=550, huge=2000)
def test_category_basic(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_2k_featured = partial(client.get_category, 'Featured_articles', limit)
    pages = call_and_ret(get_2k_featured)
    return len(pages) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_subcategory_infos(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_subcats = partial(client.get_subcategory_infos, 'FA-Class_articles', limit)
    subcats = call_and_ret(get_subcats)
    return len(subcats) == limit


def test_all_category_infos(limit):
    # note: fixed count of 501; the limit argument is unused here
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_all_cats = partial(client.get_all_category_infos, 501)
    all_cats = call_and_ret(get_all_cats)
    return len(all_cats) == 501


@magnitude(norm=10, big=1000, huge=10000)
def test_category_recursive(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_limit_recursive = partial(client.get_category_recursive, 'Africa', limit)
    pages = call_and_ret(get_limit_recursive)
    return len(pages) == limit


def test_single_prot(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_prot = partial(client.get_protections, 'Coffee')
    prots = call_and_ret(get_coffee_prot)
    return len(prots) == 1


def test_multi_prots_list(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_prots = partial(client.get_protections, ['Coffee', 'House'])
    prots = call_and_ret(get_prots)
    return len(prots) == 2


def test_multi_prots_str(limit):
    # pipe-delimited multi-title string form of the same query
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_prots = partial(client.get_protections, 'Coffee|House')
    prots = call_and_ret(get_prots)
    return len(prots) == 2


@magnitude(norm=20, big=550, huge=2000)
def test_backlinks(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_bls = partial(client.get_backlinks, 'Coffee', limit)
    bls = call_and_ret(get_bls)
    '''
    Nonexistent title returns []
    '''
    return len(bls) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_random(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_fifty_random = partial(client.get_random, limit)
    pages = call_and_ret(get_fifty_random)
    return len(pages) == limit


@magnitude(norm=5, big=550, huge=2000)
def test_lang_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_langs = partial(client.get_language_links, 'Coffee', limit)
    lang_list = call_and_ret(get_coffee_langs)
    return len(lang_list) == limit


@magnitude(norm=5, big=550, huge=2000)
def test_interwiki_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_iwlinks = partial(client.get_interwiki_links, 'Coffee', limit)
    iw_list = call_and_ret(get_coffee_iwlinks)
    return len(iw_list) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_external_links(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_coffee_elinks = partial(client.get_external_links, 'Croatian War of Independence', limit)
    el_list = call_and_ret(get_coffee_elinks)
    # external links are expected to be unique
    assert len(set(el_list)) == len(el_list)
    return len(el_list) == limit


#def test_feedback_v4(limit):  # no longer available, see feedback.py for info
#    get_v4 = GetFeedbackV4('604727')
#    v4_list = call_and_ret(get_v4)
#    return len(v4_list) > 1


def test_feedback_v5(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_v5 = partial(client.get_feedback_v5, '604727')  # TODO: support ints
    v5_list = call_and_ret(get_v5)
    return isinstance(v5_list, list)


@magnitude(norm=10, big=550, huge=2000)
def test_revisions(limit):
    # note: hardcoded to 10 revisions; the decorator's limit is unused
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revs = partial(client.get_page_revision_infos, 'Coffee', 10)
    rev_list = call_and_ret(get_revs)
    return len(rev_list) == 10


def test_missing_revisions(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revs = partial(client.get_page_revision_infos, 'Coffee_lololololol')
    rev_list = call_and_ret(get_revs)
    '''
    Should return 'missing' and negative pageid
    '''
    return len(rev_list) == 0


@magnitude(norm=20, big=550, huge=2000)
def test_transclusions(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_transcludes = partial(client.get_transcludes, 'Template:ArticleHistory', limit)
    tr_list = call_and_ret(get_transcludes)
    '''
    Nonexistent title returns []
    '''
    return len(tr_list) == limit


@magnitude(norm=20, big=550, huge=2000)
def test_resolve_subjects(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_res_transcludes = partial(client.get_transcludes, 'Template:ArticleHistory',
                                  limit,
resolve_to_subject=True) 220 | tr_list = call_and_ret(get_res_transcludes) 221 | tr_list = [t.get_subject_info() for t in tr_list] 222 | return len(tr_list) == limit and all([t.is_subject_page for t in tr_list]) 223 | 224 | 225 | def test_current_content(limit): 226 | client = WapitiClient('mahmoudrhashemi@gmail.com') 227 | get_page = partial(client.get_current_content, 'Coffee') 228 | page = call_and_ret(get_page) 229 | return page[0].title == 'Coffee' 230 | 231 | 232 | def test_current_content_redirect(limit): 233 | client = WapitiClient('mahmoudrhashemi@gmail.com') 234 | get_page = partial(client.get_current_content, 'Obama') 235 | page = call_and_ret(get_page) 236 | return page[0].title == 'Barack Obama' 237 | 238 | 239 | def test_current_talk_content(limit): 240 | client = WapitiClient('mahmoudrhashemi@gmail.com') 241 | get_talk_page = partial(client.get_current_talk_content, 'Obama') 242 | page = call_and_ret(get_talk_page) 243 | return page[0].title == 'Talk:Barack Obama' 244 | 245 | 246 | @magnitude(norm=20, big=550, huge=2000) 247 | def test_flatten_category(limit): 248 | client = WapitiClient('mahmoudrhashemi@gmail.com') 249 | get_flat_cat = partial(client.get_flattened_category, 'History', limit) 250 | cat_infos = call_and_ret(get_flat_cat) 251 | assert len(set([ci.title for ci in cat_infos])) == len(cat_infos) 252 | return len(cat_infos) == limit 253 | 254 | 255 | @magnitude(norm=10, big=550, huge=2000) 256 | def test_cat_mem_namespace(limit): 257 | client = WapitiClient('mahmoudrhashemi@gmail.com') 258 | get_star_portals = partial(client.get_category, 259 | 'Astronomy_portals', 260 | limit, 261 | namespace=['100']) 262 | portals = call_and_ret(get_star_portals) 263 | return len(portals) == limit 264 | 265 | 266 | @magnitude(norm=20, big=550, huge=2000) 267 | def test_cat_pages_recursive(limit): 268 | client = WapitiClient('mahmoudrhashemi@gmail.com') 269 | get_cat_pages_rec = partial(client.get_category_articles_recursive, 270 | 'Africa', 271 | limit, 
272 | resolve_to_subject=True) 273 | pages = call_and_ret(get_cat_pages_rec) 274 | return len(pages) == limit 275 | 276 | 277 | @magnitude(norm=11, big=550, huge=2000) 278 | def test_cat_list(limit): 279 | client = WapitiClient('mahmoudrhashemi@gmail.com') 280 | get_cat_list = partial(client.get_category_list, 'Physics', limit) 281 | pages = call_and_ret(get_cat_list) 282 | return len(pages) == limit 283 | 284 | 285 | @magnitude(norm=4, big=550, huge=2000) 286 | def test_get_images(limit): 287 | client = WapitiClient('mahmoudrhashemi@gmail.com') 288 | get_imgs = partial(client.get_images, 'Coffee', limit) 289 | imgs = call_and_ret(get_imgs) 290 | return len(imgs) == limit 291 | 292 | 293 | @magnitude(norm=5, big=550, huge=2000) 294 | def test_get_links(limit): 295 | client = WapitiClient('mahmoudrhashemi@gmail.com') 296 | get_links = partial(client.get_links, 'Coffee', limit) 297 | links = call_and_ret(get_links) 298 | return len(links) == limit 299 | 300 | 301 | def test_coordinates(limit): 302 | client = WapitiClient('mahmoudrhashemi@gmail.com') 303 | get_coordinates = partial(client.get_coordinates, ['White House', 'Golden Gate Bridge', 'Mount Everest']) 304 | coords = call_and_ret(get_coordinates) 305 | return len(coords) == 3 306 | 307 | 308 | def test_geosearch(limit): 309 | client = WapitiClient('mahmoudrhashemi@gmail.com') 310 | geosearch = partial(client.geo_search, ('37.8197', '-122.479')) 311 | geo = call_and_ret(geosearch) 312 | return len(geo) > 1 313 | 314 | 315 | @magnitude(norm=20, big=550, huge=2000) 316 | def test_get_user_contribs(limit): 317 | client = WapitiClient('mahmoudrhashemi@gmail.com') 318 | get_contribs = partial(client.get_user_contribs, 'Jimbo Wales', limit) 319 | contribs = call_and_ret(get_contribs) 320 | return len(contribs) == limit 321 | 322 | 323 | def test_get_meta(limit): 324 | client = WapitiClient('mahmoudrhashemi@gmail.com') 325 | get_source_info = client.get_source_info 326 | meta = call_and_ret(get_source_info) 327 | 
    return len(meta[0].interwiki_map) > 1


def test_get_revision_infos(limit):
    # Batch revision lookup by revision id; expects one info per id.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_revisions = partial(client.get_revision_infos, ['538903663', '539916351', '531458383'])
    rev_infos = call_and_ret(get_revisions)
    return len(rev_infos) == 3


def test_get_image_info(limit):
    # Single image-info lookup; pins the exact upload.wikimedia.org URL.
    # NOTE(review): depends on live site data -- the URL may change.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_image_info = partial(client.get_image_infos, 'File:Logo.gif')
    image_info = call_and_ret(get_image_info)
    return image_info[0].url == 'http://upload.wikimedia.org/wikipedia/en/e/ea/Logo.gif'


# Disabled test kept for reference (commented out as a string literal).
"""
@magnitude(norm=20, big=550, huge=2000)
def test_get_all_image_infos(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_all_img = partial(client.get_all_image_infos, limit)
    all_imgs = call_and_ret(get_all_img)
    return len(all_imgs) == limit
"""


@magnitude(norm=20, big=550, huge=2000)
def test_get_templates(limit):
    # Templates transcluded on 'Coffee', capped at *limit*.
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    get_templates = partial(client.get_templates, 'Coffee', limit)
    tmpl = call_and_ret(get_templates)
    return len(tmpl) == limit


# Disabled test kept for reference (commented out as a string literal).
"""
@magnitude(norm=1, big=5, huge=600)
def test_query_pages(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    from operations.misc import GetQueryPage as gqp
    qp_types = gqp.known_qps
    ret = []
    for qpt in qp_types:
        get_pages = partial(client.get_query_page, qpt, limit)
        ret.extend(call_and_ret(get_pages))
    return len(ret) == limit * len(qp_types)
"""

# Disabled test kept for reference (commented out as a string literal).
"""
def test_nonexistent_query_page(limit):
    client = WapitiClient('mahmoudrhashemi@gmail.com')
    try:
        non_existent_qp = partial(client.get_query_page, 'FakeQueryPage')
        call_and_ret(non_existent_qp)
    except ValueError:
        return True
"""
385 | 386 | @magnitude(norm=20, big=550, huge=2000) 387 | def test_recent_changes(limit): 388 | client = WapitiClient('mahmoudrhashemi@gmail.com') 389 | get_recent_changes = partial(client.get_recent_changes, limit) 390 | recent_changes = call_and_ret(get_recent_changes) 391 | return len(recent_changes) == limit 392 | 393 | 394 | def create_parser(): 395 | parser = ArgumentParser(description='Test operations') 396 | parser.add_argument('functions', nargs='*') 397 | parser.add_argument('--pdb_all', '-a', action='store_true') 398 | parser.add_argument('--pdb_error', '-e', action='store_true') 399 | parser.add_argument('--do_print', '-p', action='store_true') 400 | parser.add_argument('--magnitude', '-m', 401 | default=DEFAULT_MAGNITUDE) 402 | return parser 403 | 404 | 405 | def main(): 406 | global PDB_ALL, PDB_ERROR, DO_PRINT 407 | parser = create_parser() 408 | args = parser.parse_args() 409 | PDB_ALL = args.pdb_all 410 | PDB_ERROR = args.pdb_error 411 | DO_PRINT = args.do_print 412 | if args.functions: 413 | tests = {} 414 | for func in args.functions: 415 | try: 416 | tests[func] = globals()[func] 417 | except KeyError: 418 | print func, 'is not a valid test function' 419 | continue 420 | else: 421 | tests = dict([(k, v) for k, v in globals().items() 422 | if callable(v) and k.startswith('test_')]) 423 | results = {} 424 | for k, v in tests.items(): 425 | results[k] = v(args.magnitude) 426 | print k, results[k] 427 | return results 428 | 429 | if __name__ == '__main__': 430 | from pprint import pprint 431 | pprint(main()) 432 | --------------------------------------------------------------------------------