├── mediawiki
│   ├── py.typed
│   ├── __init__.py
│   ├── utilities.py
│   ├── exceptions.py
│   ├── configuraton.py
│   ├── mediawikipage.py
│   └── mediawiki.py
├── docs
│   ├── source
│   │   ├── _static
│   │   │   ├── .gitkeep
│   │   │   └── custom.css
│   │   ├── index.rst
│   │   ├── code.rst
│   │   ├── quickstart.rst
│   │   └── conf.py
│   ├── requirements.txt
│   └── Makefile
├── tests
│   ├── __init__.py
│   └── utilities.py
├── setup.py
├── pyproject.toml
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       └── python-package.yml
├── .readthedocs.yaml
├── codecov.yml
├── LICENSE
├── setup.cfg
├── .gitignore
├── README.rst
├── CONTRIBUTING.md
├── CHANGELOG.md
├── scripts
│   └── generate_test_data.py
└── .pylintrc

/mediawiki/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/_static/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Testing Module
3 | """
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup  # type: ignore
2 |
3 | setup()
4 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=3.0
2 | sphinx-rtd-theme
3 | requests>=2.0.0,<3.0.0
4 | beautifulsoup4
5 |
--------------------------------------------------------------------------------
/docs/source/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* Set the properties to be full width */
2 | dl.py.property {
3 |     display: block !important;
4 | }
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. _home:
 2 | .. include:: ../../README.rst
 3 |
 4 | .. toctree::
 5 |
 6 |    code
 7 |    quickstart
 8 |
 9 |
10 | Read More
11 | ==================
12 |
13 | * :ref:`api`
14 | * :ref:`quickstart`
15 | * :ref:`genindex`
16 | * :ref:`modindex`
17 | * :ref:`search`
18 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = [
 3 |     "setuptools>=42",
 4 |     "wheel",
 5 |     "setuptools_scm>=6.2",
 6 |     "requests>=2.0.0,<3.0.0",
 7 |     "beautifulsoup4",
 8 | ]
 9 | build-backend = "setuptools.build_meta"
10 |
11 | [tool.isort]
12 | profile = "black"
13 |
14 | [tool.black]
15 | line-length = 120
16 | target-version = ['py38']
17 | include = '\.pyi?$'
18 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 5 |
 6 | version: 2
 7 | updates:
 8 |
 9 |   - package-ecosystem: "github-actions"
10 |     directory: "/"
11 |     schedule:
12 |       # Check for updates to GitHub Actions every week
13 |       interval: "weekly"
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 |
 5 | # Required
 6 | version: 2
 7 |
 8 | # Set the version of Python and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.11"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 |   configuration: docs/source/conf.py
17 |
18 | # We recommend specifying your dependencies to enable reproducible builds:
19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
20 | python:
21 |   install:
22 |     - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | codecov:
 2 |   require_ci_to_pass: yes
 3 |
 4 | coverage:
 5 |   precision: 2
 6 |   round: down
 7 |   range: "70...100"
 8 |
 9 |   status:
10 |     project:
11 |       default:
12 |         # basic settings
13 |         target: "85%"
14 |         base: auto
15 |         threshold: 15
16 |     patch:
17 |       default:
18 |         target: "50%"
19 |         changes: no
20 |
21 | parsers:
22 |   gcov:
23 |     branch_detection:
24 |       conditional: yes
25 |       loop: yes
26 |       method: no
27 |       macro: no
28 |
29 | comment:
30 |   layout: "reach,diff,flags,tree"
31 |   behavior: default
32 |   require_changes: no
33 |
34 | ignore:
35 |   - "./tests/"
36 |   - "setup.py"
37 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 |
 4 | name: Upload Python Package
 5 |
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 |
10 | jobs:
11 |   deploy:
12 |
13 |     runs-on: ubuntu-latest
14 |
15 |     steps:
16 |       - uses: actions/checkout@v6
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v6
19 |         with:
20 |           python-version: '3.x'
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           pip install --upgrade twine build
25 |       - name: Build and publish
26 |         env:
27 |           TWINE_USERNAME: __token__
28 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |         run: |
30 |           python -m build
31 |           twine check dist/*
32 |           twine upload dist/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 |
 3 | Copyright (c) 2016 Tyler Barrus
 4 |
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mediawiki/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | mediawiki module initialization 3 | """ 4 | 5 | from mediawiki.configuraton import URL, VERSION 6 | from mediawiki.exceptions import ( 7 | DisambiguationError, 8 | HTTPTimeoutError, 9 | MediaWikiAPIURLError, 10 | MediaWikiCategoryTreeError, 11 | MediaWikiException, 12 | MediaWikiForbidden, 13 | MediaWikiGeoCoordError, 14 | MediaWikiLoginError, 15 | PageError, 16 | RedirectError, 17 | ) 18 | from mediawiki.mediawiki import MediaWiki 19 | from mediawiki.mediawikipage import MediaWikiPage 20 | 21 | __author__ = "Tyler Barrus" 22 | __maintainer__ = "Tyler Barrus" 23 | __email__ = "barrust@gmail.com" 24 | __license__ = "MIT" 25 | __version__ = VERSION 26 | __credits__ = ["Jonathan Goldsmith"] 27 | __url__ = URL 28 | __bugtrack_url__ = f"{__url__}/issues" 29 | __download_url__ = f"{__url__}/tarball/v{__version__}" 30 | 31 | __all__ = [ 32 | "MediaWiki", 33 | "MediaWikiPage", 34 | "PageError", 35 | "RedirectError", 36 | "MediaWikiException", 37 | "DisambiguationError", 38 | "MediaWikiAPIURLError", 39 | "HTTPTimeoutError", 40 | "MediaWikiGeoCoordError", 41 | "MediaWikiCategoryTreeError", 42 | "MediaWikiLoginError", 43 | "MediaWikiForbidden", 44 | ] 45 | -------------------------------------------------------------------------------- /tests/utilities.py: -------------------------------------------------------------------------------- 1 | """ random functions that will be needed for the tests """ 2 | 3 | 4 | class FunctionUseCounter(object): 5 | """decorator to keep a running count of how many 6 | times function has been called; stop at 50""" 7 | 8 | def __init__(self, func): 9 | """init decorator""" 10 | self.func = func 11 | self.count = 0 12 | 13 | def __call__(self, *args, **kwargs): 14 | """what to do when called""" 15 | self.count += 1 16 | if self.count > 50: # arbitrary large 17 | return dict() 18 | return self.func(*args, **kwargs) 19 | 20 | 21 | def find_depth(node): 22 | """find depth of tree""" 23 | 24 | def walk(next_node, depth): 25 | """walk down tree finding depth""" 26 | if next_node is None: 27 | return depth 28 | if "sub-categories" not in next_node: 29 | return depth 30 | if next_node["sub-categories"] is None: 31 | return depth 32 | 33 | if len(next_node["sub-categories"].keys()) == 0: 34 | return next_node["depth"] 35 | 36 | for key in next_node["sub-categories"].keys(): 37 | path_depth = walk(next_node["sub-categories"][key], depth) 38 | if path_depth and path_depth > depth: 39 | depth = path_depth 40 | return depth 41 | 42 | return walk(node, 0) 43 
| -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | MediaWiki Documentation 4 | *********************** 5 | 6 | Here you can find the full developer API for the mediawiki project. 7 | 8 | 9 | Functions and Classes 10 | =============================== 11 | 12 | MediaWiki 13 | +++++++++++++++++++++++++++++++ 14 | 15 | .. autoclass:: mediawiki.MediaWiki 16 | :members: version, api_version, extensions, rate_limit, 17 | rate_limit_min_wait, timeout, language, user_agent, api_url, 18 | memoized, clear_memoized, refresh_interval, set_api_url, 19 | supported_languages, random, categorytree, page, wiki_request 20 | 21 | .. automethod:: mediawiki.MediaWiki.login(username, password) 22 | .. automethod:: mediawiki.MediaWiki.suggest(query) 23 | .. automethod:: mediawiki.MediaWiki.search(query, results=10, suggestion=False) 24 | .. automethod:: mediawiki.MediaWiki.allpages(query='', results=10) 25 | .. automethod:: mediawiki.MediaWiki.summary(title, sentences=0, chars=0, auto_suggest=True, redirect=True) 26 | .. automethod:: mediawiki.MediaWiki.geosearch(latitude=None, longitude=None, radius=1000, title=None, auto_suggest=True, results=10) 27 | .. automethod:: mediawiki.MediaWiki.prefixsearch(prefix, results=10) 28 | .. automethod:: mediawiki.MediaWiki.opensearch(query, results=10, redirect=True) 29 | .. automethod:: mediawiki.MediaWiki.categorymembers(category, results=10, subcategories=True) 30 | 31 | 32 | MediaWikiPage 33 | +++++++++++++++++++++++++++++++ 34 | 35 | .. autoclass:: mediawiki.MediaWikiPage 36 | :members: 37 | 38 | Exceptions 39 | =============================== 40 | 41 | .. automodule:: mediawiki.exceptions 42 | :members: 43 | 44 | Indices and tables 45 | ================== 46 | 47 | * :ref:`home` 48 | * :ref:`quickstart` 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pymediawiki 3 | version = attr: mediawiki.__version__ 4 | author = Tyler Barrus 5 | author_email = barrust@gmail.com 6 | url = https://github.com/barrust/mediawiki 7 | description = Wikipedia and MediaWiki API wrapper for Python 8 | long_description = file: README.rst 9 | long_description_content_type = text/x-rst 10 | keywords = python, mediawiki, wikipedia, API, wiki, parser, natural language processing, nlp 11 | license = MIT 12 | license_files = LICENSE 13 | bugtrack_url = https://github.com/barrust/mediawiki/issues 14 | classifiers = 15 | Development Status :: 5 - Production/Stable 16 | Intended Audience :: Developers 17 | Intended Audience :: Information Technology 18 | Intended Audience :: Science/Research 19 | Topic :: Software Development :: Libraries 20 | Topic :: Utilities 21 | Topic :: Internet 22 | Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Wiki 23 | License :: OSI Approved 24 | License :: OSI Approved :: MIT License 25 | Operating System :: OS Independent 26 | Programming Language :: Python 27 | Programming Language :: Python :: 3 28 | Programming Language :: Python :: 3.7 29 | Programming Language :: Python :: 3.8 30 | Programming Language :: Python :: 3.9 31 | Programming Language :: Python :: 3.10 32 | Programming Language :: Python :: 3.11 33 | Programming Language :: Python :: 3.12 34 | 35 | [options] 36 | zip_safe = 
False 37 | include_package_data = True 38 | packages = find: 39 | install_requires = 40 | beautifulsoup4 41 | requests>=2.0.0,<3.0.0 42 | python_requires = >=3.6 43 | 44 | [options.packages.find] 45 | exclude = tests 46 | 47 | [tool.setuptools_scm] 48 | 49 | [bdist_wheel] 50 | universal=0 51 | 52 | [pep8] 53 | max-line-length=120 54 | 55 | [pycodestyle] 56 | max-line-length = 120 57 | ignore = E203,W503 58 | 59 | [flake8] 60 | max-line-length = 120 61 | ignore = E203,W503 -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 15 | 16 | steps: 17 | - uses: actions/checkout@v6 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v6 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install flake8 pytest pytest-cov 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | python -m pip install -e . 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 mediawiki/ --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 mediawiki/ --count --exit-zero --max-complexity=11 --max-line-length=127 --statistics 34 | - name: Test with pytest 35 | run: | 36 | # Run tests while also generating coverage statistics 37 | pytest --cov . 
--cov-report xml:/home/runner/coverage.xml 38 | - name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v5 40 | with: 41 | files: /home/runner/coverage.xml 42 | fail_ci_if_error: true 43 | verbose: true 44 | token: ${{ secrets.CODECOV_TOKEN }} # required 45 | 46 | build-verification: 47 | 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v6 51 | - uses: actions/setup-python@v6 52 | with: 53 | python-version: '3.x' 54 | - name: Build and check twine 55 | run: | 56 | python -m pip install --upgrade pip 57 | python -m pip install build twine 58 | python -m build 59 | twine check dist/* 60 | 61 | Lint-black: 62 | runs-on: ubuntu-latest 63 | steps: 64 | - uses: actions/checkout@v6 65 | - uses: psf/black@stable 66 | with: 67 | # src: "./mediawiki/*" 68 | version: "22.8.0" 69 | -------------------------------------------------------------------------------- /mediawiki/utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions 3 | """ 4 | import functools 5 | import inspect 6 | import sys 7 | import time 8 | from typing import Any, Callable, Dict, Optional 9 | 10 | 11 | def parse_all_arguments(func: Callable) -> Dict[str, Any]: 12 | """determine all positional and named arguments as a dict""" 13 | args = {} 14 | 15 | func_args = inspect.signature(func) 16 | for itm in list(func_args.parameters)[1:]: 17 | param = func_args.parameters[itm] 18 | if param.default is not param.empty: 19 | args[param.name] = param.default 20 | return args 21 | 22 | 23 | def memoize(func: Callable) -> Callable: 24 | """quick memoize decorator for class instance methods 25 | NOTE: this assumes that the class that the functions to be 26 | memoized already has a memoized and refresh_interval 27 | property""" 28 | 29 | @functools.wraps(func) 30 | def wrapper(*args, **kwargs): 31 | """wrap it up and store info in a cache""" 32 | cache = args[0].memoized 33 | refresh = args[0]._config.refresh_interval 34 | use_cache = args[0]._config.use_cache 35 | 36 | # short circuit if not using cache 37 | if use_cache is False: 38 | return func(*args, **kwargs) 39 | 40 | if func.__name__ not in cache: 41 | cache[func.__name__] = {} 42 | if "defaults" not in cache: 43 | cache["defaults"] = {} 44 | cache["defaults"][func.__name__] = parse_all_arguments(func) 45 | # build a key; should also consist of the default values 46 | defaults = cache["defaults"][func.__name__].copy() 47 | for key, val in kwargs.items(): 48 | defaults[key] = val 49 | tmp = [] 50 | tmp.extend(args[1:]) 51 | for k in sorted(defaults.keys()): 52 | tmp.append(f"({k}: {defaults[k]})") 53 | 54 | tmp = [str(x) for x in tmp] 55 | key = " - ".join(tmp) 56 | 57 | # set the value in the cache if missing or needs to be refreshed 58 | if key not in cache[func.__name__]: 59 | cache[func.__name__][key] = (time.time(), func(*args, **kwargs)) 60 | else: 61 | tmp = cache[func.__name__][key] 62 | # determine if we need to refresh the data... 
63 | if refresh is not None and time.time() - tmp[0] > refresh: 64 | cache[func.__name__][key] = (time.time(), func(*args, **kwargs)) 65 | return cache[func.__name__][key][1] 66 | 67 | return wrapper 68 | 69 | 70 | def str_or_unicode(text: str) -> str: 71 | """handle python 3 unicode""" 72 | encoding = sys.stdout.encoding 73 | return text.encode(encoding).decode(encoding) 74 | 75 | 76 | def is_relative_url(url: str) -> Optional[bool]: 77 | """simple method to determine if a url is relative or absolute""" 78 | return url.find("://") <= 0 and not url.startswith("//") if not url.startswith("#") else None 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Project Specific 3 | ############################################### 4 | 5 | 6 | ############################################### 7 | # Python 8 | ############################################### 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # dotenv 93 | .env 94 | 95 | # virtualenv 96 | .venv 97 | venv/ 98 | ENV/ 99 | Pipfile* 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | 115 | ############################################### 116 | # Operating Systems 117 | ############################################### 118 | # Windows thumbnail cache files 119 | Thumbs.db 120 | Thumbs.db:encryptable 121 | ehthumbs.db 122 | ehthumbs_vista.db 123 | 124 | # Dump file 125 | *.stackdump 126 | 127 | # Folder config file 128 | [Dd]esktop.ini 129 | 130 | # Recycle Bin used on file shares 131 | $RECYCLE.BIN/ 132 | 133 | # Windows Installer files 134 | *.cab 135 | *.msi 136 | *.msix 137 | *.msm 138 | *.msp 139 | 140 | # Windows shortcuts 141 | *.lnk 142 | 143 | # 144 | # MacOS 145 | # 146 | 147 | # General 148 | .DS_Store 149 | .AppleDouble 150 | .LSOverride 151 | 152 | # Icon must end with two \r 153 | Icon 154 | 155 | 156 | # Thumbnails 157 | ._* 158 | 159 | # Files that might appear in the 
root of a volume
160 | .DocumentRevisions-V100
161 | .fseventsd
162 | .Spotlight-V100
163 | .TemporaryItems
164 | .Trashes
165 | .VolumeIcon.icns
166 | .com.apple.timemachine.donotpresent
167 |
168 | # Directories potentially created on remote AFP share
169 | .AppleDB
170 | .AppleDesktop
171 | Network Trash Folder
172 | Temporary Items
173 | .apdisk
174 |
175 | #
176 | # Linux
177 | #
178 | *~
179 |
180 | # temporary files which can be created if a process still has a handle open of a deleted file
181 | .fuse_hidden*
182 |
183 | # KDE directory preferences
184 | .directory
185 |
186 | # Linux trash folder which might appear on any partition or disk
187 | .Trash-*
188 |
189 | # .nfs files are created when an open file is removed but is still being accessed
190 | .nfs*
191 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | MediaWiki
 2 | =========
 3 |
 4 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg
 5 |     :target: https://opensource.org/licenses/MIT/
 6 |     :alt: License
 7 | .. image:: https://img.shields.io/github/release/barrust/mediawiki.svg
 8 |     :target: https://github.com/barrust/mediawiki/releases
 9 |     :alt: GitHub release
10 | .. image:: https://github.com/barrust/mediawiki/workflows/Python%20package/badge.svg
11 |     :target: https://github.com/barrust/mediawiki/actions?query=workflow%3A%22Python+package%22
12 |     :alt: Build Status
13 | .. image:: https://codecov.io/gh/barrust/mediawiki/branch/master/graph/badge.svg?token=OdETiNgz9k
14 |     :target: https://codecov.io/gh/barrust/mediawiki
15 |     :alt: Test Coverage
16 | .. image:: https://app.codacy.com/project/badge/Grade/34162198611f4aa0bde630d1dab72ce8
17 |     :target: https://www.codacy.com/gh/barrust/mediawiki/dashboard?utm_source=github.com&utm_medium=referral&utm_content=barrust/mediawiki&utm_campaign=Badge_Grade
18 |     :alt: Codacy Review
19 | .. image:: https://badge.fury.io/py/pymediawiki.svg
20 |     :target: https://badge.fury.io/py/pymediawiki
21 |     :alt: PyPi Release
22 | .. image:: http://pepy.tech/badge/pymediawiki
23 |     :target: https://pepy.tech/project/pymediawiki
24 |     :alt: Downloads
25 |
26 | **mediawiki** is a python wrapper and parser for the MediaWiki API. The goal
27 | is to allow users to quickly and efficiently pull data from the MediaWiki site
28 | of their choice instead of worrying about dealing directly with the API. As
29 | such, it does not force the use of a particular MediaWiki site. It defaults to
30 | `Wikipedia <https://www.wikipedia.org>`__ but other MediaWiki sites can
31 | also be used.
32 |
33 | MediaWiki wraps the `MediaWiki API <https://www.mediawiki.org/wiki/API>`_
34 | so you can focus on *leveraging* your favorite MediaWiki site's data,
35 | not getting it. Please check out the code on
36 | `github <https://github.com/barrust/mediawiki>`_!
37 |
38 | **Note:** this library was designed for ease of use and simplicity. If you plan
39 | on doing serious scraping, automated requests, or editing, please look into
40 | `Pywikibot <https://www.mediawiki.org/wiki/Manual:Pywikibot>`__
41 | which has a larger API, advanced rate limiting, and other features so we may
42 | be considerate of the MediaWiki infrastructure. Pywikibot also has other
43 | features, such as support for Wikibase (which runs Wikidata). 
44 |
45 |
46 | Installation
47 | ------------------
48 |
49 | Pip Installation:
50 |
51 | ::
52 |
53 |     $ pip install pymediawiki
54 |
55 | To install from source:
56 |
57 | Clone the `repository on GitHub
58 | <https://github.com/barrust/mediawiki>`__, then run the following from the folder:
59 |
60 | ::
61 |
62 |     $ python setup.py install
63 |
64 | ``mediawiki`` supports python versions 3.7 - 3.13
65 |
66 | For *python 2.7* support, install `release 0.6.7 <https://github.com/barrust/mediawiki/releases/tag/v0.6.7>`__
67 |
68 | ::
69 |
70 |     $ pip install pymediawiki==0.6.7
71 |
72 | Documentation
73 | -------------
74 |
75 | Documentation of the latest release is hosted on
76 | `readthedocs.io <https://pymediawiki.readthedocs.io/en/latest/>`__
77 |
78 | To build the documentation yourself run:
79 |
80 | ::
81 |
82 |     $ pip install sphinx
83 |     $ cd docs/
84 |     $ make html
85 |
86 | Automated Tests
87 | ------------------
88 |
89 | To run automated tests, simply run the following command from the
90 | downloaded folder:
91 |
92 | ::
93 |
94 |     $ python setup.py test
95 |
96 |
97 | Quickstart
98 | ------------------
99 |
100 | Import mediawiki and run a standard search against Wikipedia:
101 |
102 | .. code:: python
103 |
104 |     >>> from mediawiki import MediaWiki
105 |     >>> wikipedia = MediaWiki()
106 |     >>> wikipedia.search('washington')
107 |
108 | Run more advanced searches:
109 |
110 | .. code:: python
111 |
112 |     >>> wikipedia.opensearch('washington')
113 |     >>> wikipedia.allpages('a')
114 |     >>> wikipedia.geosearch(title='washington, d.c.')
115 |     >>> wikipedia.geosearch(latitude='0.0', longitude='0.0')
116 |     >>> wikipedia.prefixsearch('arm')
117 |     >>> wikipedia.random(pages=10)
118 |
119 | Pull a MediaWiki page and some of the page properties:
120 |
121 | .. code:: python
122 |
123 |     >>> p = wikipedia.page('Chess')
124 |     >>> p.title
125 |     >>> p.summary
126 |     >>> p.categories
127 |     >>> p.images
128 |     >>> p.links
129 |     >>> p.langlinks
130 |
131 | See the `documentation for more examples!
132 | <https://pymediawiki.readthedocs.io/en/latest/>`_
133 |
134 |
135 |
136 | Changelog
137 | ------------------
138 |
139 | Please see the `changelog
140 | <https://github.com/barrust/mediawiki/blob/master/CHANGELOG.md>`__ for a list
141 | of all changes.
142 |
143 |
144 | License
145 | -------
146 |
147 | MIT licensed. See the `LICENSE file
148 | <https://github.com/barrust/mediawiki/blob/master/LICENSE>`__
149 | for full details.
150 |
--------------------------------------------------------------------------------
/docs/source/quickstart.rst:
--------------------------------------------------------------------------------
 1 | .. _quickstart:
 2 |
 3 | MediaWiki Quickstart
 4 | ====================
 5 |
 6 | Quickly get started using the `mediawiki` python library. This page is designed
 7 | to help users understand the basics of using the `mediawiki` library.
 8 |
 9 | To understand all possible parameters for each function and property,
10 | please see :ref:`api`.
11 |
12 |
13 | Install
14 | ^^^^^^^
15 |
16 | Using pip
17 | """""""""
18 |
19 | ::
20 |
21 |     $ pip install pymediawiki
22 |
23 | From source
24 | """""""""""
25 |
26 | Begin by installing pymediawiki: simply clone the
27 | `repository on GitHub <https://github.com/barrust/mediawiki>`__,
28 | then run the following command from the extracted folder:
29 |
30 | ::
31 |
32 |     $ python setup.py install
33 |
34 | Setup
35 | ^^^^^
36 |
37 | Setting up the library is as easy as:
38 |
39 | .. code:: python
40 |
41 |     >>> from mediawiki import MediaWiki
42 |     >>> wikipedia = MediaWiki()
43 |
44 |
45 | Change API URL
46 | ^^^^^^^^^^^^^^
47 |
48 | To change the API URL, one can either set the url parameter:
49 |
50 | .. code:: python
51 |
52 |     >>> from mediawiki import MediaWiki
53 |     >>> asoiaf = MediaWiki(url='http://awoiaf.westeros.org/api.php')
54 |
55 | Or one can update an already setup MediaWiki object:
56 |
57 | .. code:: python
58 |
59 |     >>> wikipedia.set_api_url('http://awoiaf.westeros.org/api.php')
60 |
61 | Set the User-Agent String
62 | ^^^^^^^^^^^^^^^^^^^^^^^^^
63 | Per the `MediaWiki API Etiquette <https://www.mediawiki.org/wiki/API:Etiquette>`__
64 | it is recommended to not use a library's default user-agent string. Therefore,
65 | it is easy to change the `user-agent` string either during initialization or by
66 | setting the user_agent property:
67 |
68 | .. code:: python
69 |
70 |     >>> from mediawiki import MediaWiki
71 |     >>> wikipedia = MediaWiki(user_agent='pyMediaWiki-User-Agent-String')
72 |     >>>
73 |     >>> # Or reset it!
74 |     >>> wikipedia.user_agent = 'my-new-user-agent-string'
75 |
76 | Searching
77 | ^^^^^^^^^
78 |
79 | To search the MediaWiki site, it is as easy as calling one of the search
80 | functions: `random`, `search`, `geosearch`, `opensearch`, or `prefixsearch`
81 |
82 | random
83 | """"""
84 |
85 | Get a random page:
86 |
87 | .. code:: python
88 |
89 |     >>> wikipedia.random(pages=3)
90 |     # ['Sutton House, London', 'Iolaus violacea', 'Epigenetics & Chromatin']
91 |
92 |
93 | search
94 | """"""
95 |
96 | Search for the provided title:
97 |
98 | .. code:: python
99 |
100 |     >>> wikipedia.search('washington', results=3)
101 |     # ['Washington', 'Washington, D.C.', 'List of Governors of Washington']
102 |
103 | allpages
104 | """"""""
105 |
106 | List all pages, starting from the provided query:
107 |
108 | .. code:: python
109 |
110 |     >>> wikipedia.allpages('a', results=3)
111 |     # ['A', 'A!', 'A! (Alexa Feser album)']
112 |
113 | geosearch
114 | """""""""
115 |
116 | Search based on geocoords (latitude/longitude):
117 |
118 | .. code:: python
119 |
120 |     >>> wikipedia.geosearch(latitude=0.0, longitude=0.0)
121 |     # ['Null Island', 'Mirdif 35']
122 |
123 | opensearch
124 | """"""""""
125 |
126 | Search using the OpenSearch specification:
127 |
128 | .. code:: python
129 |
130 |     >>> wikipedia.opensearch('new york', results=1)
131 |     # [('New York', 'New York is a state in the Northeastern United States
132 |     and is the 27th-most extensive, fourth-most populous, and seventh-most
133 |     densely populated U.S.', 'https://en.wikipedia.org/wiki/New_York')]
134 |
135 | prefixsearch
136 | """"""""""""
137 |
138 | Search for pages whose title has the defined prefix:
139 |
140 | .. code:: python
141 |
142 |     >>> wikipedia.prefixsearch('ba', results=5)
143 |     # ['Ba', 'Barack Obama', 'Baseball', "Bahá'í Faith", 'Basketball']
144 |
145 |
146 | Page
147 | ^^^^
148 |
149 | Load and access information from full MediaWiki pages. Load the page using
150 | a title or page id and then access individual properties:
151 |
152 | Initialize Page
153 | """""""""""""""
154 |
155 | Initializing a page is easily accomplished in one line of code:
156 |
157 | .. code:: python
158 |
159 |     >>> p = wikipedia.page('grid compass')
160 |
161 | title
162 | """""""""""
163 |
164 | The page title
165 |
166 | .. code:: python
167 |
168 |     >>> p.title
169 |     # 'Grid Compass'
170 |
171 |
172 | pageid
173 | """""""""""
174 |
175 | The page id of the page
176 |
177 | .. code:: python
178 |
179 |     >>> p.pageid
180 |     # 3498511
181 |
182 |
183 | revision_id
184 | """""""""""
185 |
186 | The revision id of the page
187 |
188 | .. code:: python
189 |
190 |     >>> p.revision_id
191 |     # 740685101
192 |
193 | parent_id
194 | """""""""""
195 |
196 | The parent id of the page
197 |
198 | .. code:: python
199 |
200 |     >>> p.parent_id
201 |     # 740682666
202 |
203 | links
204 | """""
205 |
206 | Links to other MediaWiki pages
207 |
208 | .. code:: python
209 |
210 |     >>> p.links
211 |     # ['Astronaut', 'Bill Moggridge', 'CP/M', 'Central processing unit',
212 |     'Dynabook', 'Electroluminescent display', 'FTP', 'Flip (form)',
213 |     'GRiD Systems Corporation', 'GRiD-OS', 'Gavilan SC', 'Grid compass',
214 |     'Hard drive', 'IEEE-488', 'Industrial design', 'Intel 8086',
215 |     'John Oliver Creighton', 'Kilobyte', 'Laptop computer',
216 |     'Magnetic bubble memory', 'Modem', 'NASA', 'Operating system',
217 |     'Osborne 1', 'Paratrooper', 'Patent rights', 'Perfect (film)',
218 |     'Portable computer', 'RadioShack', 'Riptide (American TV series)',
219 |     'STS-51-G', 'Sharp PC-5000', 'Space Shuttle Discovery',
220 |     'Tandy Corporation', 'U.S. government', 'United Kingdom',
221 |     'United States Army Special Forces', 'Xerox PARC']
222 |
223 | Other Properties
224 | """"""""""""""""
225 |
226 | Other properties for a page include:
227 |
228 | - `content`
229 | - `html`
230 | - `images`
231 | - `references`
232 | - `categories`
233 | - `coordinates`
234 | - `redirects`
235 | - `backlinks`
236 | - `langlinks`
237 | - `summary`
238 | - `sections`
239 | - `logos`
240 | - `hatnotes`
241 |
242 | Summarize
243 | """"""""""""""""
244 |
245 | Summarize a page using additional parameters:
246 |
247 | .. code:: python
248 |
249 |     >>> p.summarize(chars=50)
250 |     # The Grid Compass (written GRiD by its manufacturer...
251 |
252 |
253 |
254 | Indices and tables
255 | ==================
256 |
257 | * :ref:`home`
258 | * :ref:`api`
259 | * :ref:`genindex`
260 | * :ref:`modindex`
261 | * :ref:`search`
262 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 |
 2 | ## Welcome
 3 |
 4 | Welcome to pymediawiki: a python MediaWiki API wrapper project. I hope that
 5 | you have found the project to be useful. If you are here, you must want to help
 6 | out in some way! I am very grateful for any help and support.
 7 |
 8 | ### Table Of Contents
 9 | * [Contributing](#contributing)
10 | * [Issues and Bug Reports](#issues-and-bug-reports)
11 | * [Enhancement Requests](#enhancements)
12 | * [Submitting Pull Requests](#pull-requests)
13 | * [Testing](#testing)
14 | * [Coding Style](#coding-style)
15 | * [Code Contributors](#code-contributors)
16 |
17 | ### Contributing
18 |
19 | Contributing to open-source software comes in many forms: adding additional
20 | functionality, reporting and/or fixing bugs and defects, and helping maintain
21 | documentation. Any and all forms are welcome!
22 |
23 | Below you will find ways to help the project along with notes on how to report
24 | bugs and issues, request enhancements, and issue pull requests.
25 |
26 | #### Issues and Bug Reports
27 |
28 | If you have found an issue with `pymediawiki`, please do not hesitate to let us
29 | know! Before submitting an issue or bug report, we ask that you complete a few
30 | cursory items:
31 |
32 | * **Review** current bugs to see if your issue has already been reported. If it
33 | has been previously reported, please comment on the original report with any
34 | additional details. This will help the maintainers triage the issue more
35 | quickly.
36 |
37 | * **Ensure** that the issue is **not** related to the MediaWiki site to which
38 | you are trying to connect. There are times when the MediaWiki
39 | site may refuse connections or throw an error. There are times when trying
40 | again is all that is needed! If the error is with the MediaWiki site, please do not
41 | report an issue, as there is nothing we can do to help. If, however, it is
42 | something within the library, please do not hesitate to report the issue!
43 |
44 | * **Determine** that the issue is reproducible - a code sample of the issue
45 | will help narrow down the search for the cause of the issue and may lead to a
46 | quicker fix!
47 |
48 | A **great bug report** will consist of the following:
49 |
50 | * A descriptive title
51 |
52 | * A brief description of the issue
53 |
54 | * Description of the expected results
55 |
56 | * A code example to reproduce the error. Please use
57 | [Markdown code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks/)
58 | with syntax highlighting
59 |
60 | * The link to the API URL if not the default:
61 | [Wikipedia API](http://en.wikipedia.org/w/api.php)
62 |
63 | * The affected version(s) of `pymediawiki`
64 |
65 | #### Enhancements
66 |
67 | Enhancements are additional functionality not currently supported by the
68 | `pymediawiki` library. Unfortunately, not all enhancements make sense for the
69 | goal of the project. If you have a desired feature, there are a few things you
70 | can do to possibly help get the feature into the `pymediawiki` library:
71 |
72 | * **Review** to see if the feature has been requested in the past.
73 |
74 |     * If it is requested and still open, add your comment as to why you would
75 |     like it.
76 |
77 |     * If it was previously requested but closed, you may be interested in why
78 |     it was closed and not implemented. I will try to explain my reasoning for
79 |     not supporting actions as much as possible.
80 |
81 | * Add an issue to the
82 | [issue tracker](https://github.com/barrust/mediawiki/issues) and mark it as an
83 | enhancement. A ***great enhancement*** request will have the following
84 | information:
85 |
86 |     * A descriptive title
87 |
88 |     * A description of the desired functionality: use cases, added benefit to
89 |     the library, etc.
90 |
91 |     * A code example, if necessary, to explain how the code would be used
92 |
93 |     * A description of the desired results
94 |
95 | #### Pull Requests
96 |
97 | Pull requests are how you will be able to add new features, fix bugs, or update
98 | documentation in the pymediawiki library. To create a pull request, you will
99 | first need to fork the repository, make all necessary changes and then create
100 | a pull request. There are a few guidelines for creating pull requests:
101 |
102 | * All pull requests must be based off of the latest development branch and not
103 | master (unless there is not a development branch!)
104 |
105 | * If the PR only changes documentation, please add `[ci skip]` to the commit
106 | message. To learn more, you can [read about skipping integration testing](https://docs.travis-ci.com/user/customizing-the-build#Skipping-a-build)
107 |
108 | * Reference ***any and all*** [issues](https://github.com/barrust/mediawiki/issues)
109 | related to the pull request
110 |
111 | #### Testing
112 |
113 | Each pull request should add or modify the appropriate tests. pymediawiki uses
114 | the unittest module to support tests and most are currently found in the
115 | `./tests/mediawiki_test.py` file.
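As a rough, illustrative sketch only (the class name, page title, and expected
value below are made up for this example and are not taken from the existing
suite), a new test might look like:

```python
import unittest

from mediawiki import MediaWiki


class TestExampleRegression(unittest.TestCase):
    """illustrative sketch of a pymediawiki regression test"""

    def test_page_title(self):
        """a page pulled by title should report that same title"""
        site = MediaWiki()  # defaults to Wikipedia
        page = site.page("Chess")
        self.assertEqual(page.title, "Chess")
```

Note that this sketch hits the live API; the actual test suite instead relies
on the captured request/response data described next, so tests can run without
internet access.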
116 |
117 | The `./scripts/generate_test_data.py` file is used to help capture request and
118 | response data in different json files for running tests without internet
119 | access.
120 |
121 | * ###### New Feature:
122 |     * Add tests for each variation of the new feature
123 |
124 | * ###### Bug Fix
125 |     * Add at least one regression test of an instance that is working to help
126 |     ensure that the bug fix does not cause a new bug
127 |
128 |     * Add at least one test to show the corrected outcome from the updated code
129 |     to help ensure that the code works as intended
130 |
131 | #### Coding Style
132 |
133 | The MediaWiki API wrapper project follows the
134 | [PEP8](https://www.python.org/dev/peps/pep-0008/) coding style for consistency
135 | and readability. Code that does not comply with PEP8 will not be accepted into
136 | the project as-is. All code should adhere to the PEP8 coding style standard
137 | where possible.
138 |
139 | The MediaWiki API wrapper project also uses [pylint](https://www.pylint.org/)
140 | to help identify potential errors, code duplication, and non-pythonic syntax.
141 | Adhering to pylint's results is not strictly required.
142 |
143 | To install the [PEP8 compliance checker](https://pypi.org/project/pycodestyle/),
144 | you can simply run the following:
145 |
146 | ```
147 | pip install pycodestyle
148 | ```
149 |
150 | To test for PEP8 compliance, run the following from the root directory:
151 |
152 | ```
153 | pycodestyle mediawiki
154 | ```
155 |
156 | ### Code Contributors:
157 |
158 | A special thanks to all the code contributors to `pymediawiki`!
159 |
160 | * [@barrust](https://github.com/barrust) (Maintainer)
161 | * [@dan-blanchard](https://github.com/dan-blanchard) - Default URL conforms to passed in language [#26](https://github.com/barrust/mediawiki/pull/26)
162 | * [@nagash91](https://github.com/nagash91) - Pull section titles without additional markup [#42](https://github.com/barrust/mediawiki/issues/42)
163 | * [@flamableconcrete](https://github.com/flamableconcrete) - Added `allpages` functionality [#75](https://github.com/barrust/mediawiki/pull/75)
164 | * [@shnela](https://github.com/shnela) - Added `langlinks` property [#65](https://github.com/barrust/mediawiki/issues/65)
165 | * [@rubabredwan](https://github.com/rubabredwan) - Fix for `suggest` [#85](https://github.com/barrust/mediawiki/pull/85)
166 | * [@ldorigo](https://github.com/ldorigo) - Pulling links for header sections [#90](https://github.com/barrust/mediawiki/pull/90)
167 | * [@tbm](https://github.com/tbm) - `categorymember` improvements [PR #100](https://github.com/barrust/mediawiki/pull/100)
168 | * [@dnanto](https://github.com/dnanto) - Determining `available_languages` [PR #116](https://github.com/barrust/mediawiki/pull/116)
169 | * [gbenson](https://github.com/gbenson) - HTTPAuthenticator [PR #141](https://github.com/barrust/mediawiki/pull/141)
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # MediaWiki Changelog
 2 |
 3 | ## Version 0.7.5
 4 |
 5 | * Move configuration items to a configuration data class
 6 |     * Will allow for the deprecation of some top-level properties in favor of changing them on the `Configuration` class
 7 | * Added HTTPAuthenticator for web server authentication; [PR #141](https://github.com/barrust/mediawiki/pull/141) Thanks [gbenson](https://github.com/gbenson)
 8 |
 9 | ## Version 0.7.4
10 |
11 | * Add typing support
12 |
13
| ## Version 0.7.3 14 | 15 | * Add `unordered_options` to the `DisambiguationError` to attempt to get options in the order presented on the page; [issue #124](https://github.com/barrust/mediawiki/issues/124) 16 | * Add [verify SSL support](https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification) by passing info directly to the requests library. 17 | 18 | ## Version 0.7.2 19 | 20 | * Add `page_preview` property to simulate the page preview hover [PR #114](https://github.com/barrust/mediawiki/pull/114) 21 | * Add `available_languages` property [PR #116](https://github.com/barrust/mediawiki/pull/116) 22 | 23 | ## Version 0.7.1 24 | 25 | * Add `page.wikitext` support for pulling the page contents as [wikitext](https://en.wikipedia.org/wiki/Help:Wikitext) 26 | * Add [proxy support](https://requests.readthedocs.io/en/master/user/advanced/#proxies) by passing info directly to the requests library. 27 | 28 | ## Version 0.7.0 29 | 30 | * Remove support for ***python 2.7***! 31 | * Add files to `categorymembers()` pull [PR #100](https://github.com/barrust/mediawiki/pull/100) Thanks [tbm](https://github.com/tbm) 32 | * Better support calls to limit results to the maximum 500 results per query (per API documentation) [PR #99](https://github.com/barrust/mediawiki/pull/99) Thanks [tbm](https://github.com/tbm) 33 | 34 | ## Version 0.6.7 35 | 36 | * ***NOTE:*** Last planned support for **Python 2.7** 37 | * Cache results of `BeautifulSoup` parsing of `page.html` [PR #90](https://github.com/barrust/mediawiki/pull/90) Thanks [ldorigo](https://github.com/ldorigo) 38 | * Add ability to pull links from the header section; Thanks to [ldorigo](https://github.com/ldorigo) for example code! 39 | * Add ability to pull the header section text 40 | * Move to GitHub Actions and CodeCov for testing 41 | 42 | ## Version 0.6.6 43 | 44 | * Fix a bug using `find_all()` on newer versions of BeautifulSoup4 45 | 46 | ## Version 0.6.5 47 | 48 | * Fix for `suggest` [PR #85](https://github.com/barrust/mediawiki/pull/85) Thanks [rubabredwan](https://github.com/rubabredwan) 49 | * `__slots__` usage 50 | 51 | ## Version 0.6.4 52 | 53 | * Add ability to login during initialization [issue #79](https://github.com/barrust/mediawiki/issues/79) 54 | 55 | ## Version 0.6.3 56 | 57 | * Capture timeout exception 58 | * bs4 does not support `hasattr` but uses `*.has_attr()` 59 | 60 | ## Version 0.6.2 61 | 62 | * Add `allpages` functionality [PR #75](https://github.com/barrust/mediawiki/pull/75) 63 | * Add `langlinks` page property [PR #76](https://github.com/barrust/mediawiki/pull/76) 64 | 65 | ## Version 0.6.1 66 | 67 | * Fix DisambiguationError title property [issue #72](https://github.com/barrust/mediawiki/issues/72) 68 | * Change to using [black](https://github.com/ambv/black) formatting 69 | 70 | ## Version 0.6.0 71 | 72 | * Fix for the table of contents for all subsections [issue #64](https://github.com/barrust/mediawiki/issues/64) 73 | * Combined properties into a single set of pulling to reduce the load on the MediaWiki infrastructure [issue #55](https://github.com/barrust/mediawiki/issues/55) 74 | 75 | ## Version 0.5.1 76 | 77 | * Added Table of Contents parsing based on sections: result is an OrderedDict 78 | * Fix issue where some sections are not pulled correctly 79 | 80 | ## Version 0.5.0 81 | 82 | * Add support for logging into the MediaWiki site [issue #59](https://github.com/barrust/mediawiki/issues/59) 83 | 84 | ## Version 0.4.1 85 | 86 | * Default to `https` 87 | * Add `category_prefix` property to 
properly support categories in non-English 88 | MediaWiki sites [issue #48](https://github.com/barrust/mediawiki/issues/48) 89 | * Add `user_agent` as an initialization parameter and added information to the 90 | documentation about why one should set the user-agent string [issue #50](https://github.com/barrust/mediawiki/issues/50) 91 | 92 | ### Version 0.4.0 93 | 94 | * Add fix to use the `query-continue` parameter to continue to pull category 95 | members [issue #39](https://github.com/barrust/mediawiki/issues/39) 96 | * Better handle large categorymember selections 97 | * Add better handling of exception attributes including adding them to the 98 | documentation 99 | * Correct the pulling of the section titles without additional markup [issue #42](https://github.com/barrust/mediawiki/issues/42) 100 | * Handle memoization of unicode parameters in python 2.7 101 | * ***Change default timeout*** for HTTP requests to 15 seconds 102 | 103 | ### Version 0.3.16 104 | 105 | * Add ability to turn off caching completely 106 | * Fix bug when disambiguation link does not have a title [issue #35](https://github.com/barrust/mediawiki/issues/35) 107 | 108 | ### Version 0.3.15 109 | 110 | * Add parse all links within a section [issue #33](https://github.com/barrust/mediawiki/issues/33) 111 | * Add base url property to mediawiki site 112 | 113 | ### Version 0.3.14 114 | 115 | * Add refresh interval to cached responses (Defaults to not refresh) 116 | [issue #30](https://github.com/barrust/mediawiki/issues/30) 117 | * Fix minor documentation issues 118 | 119 | ### Version 0.3.13 120 | 121 | * Add pulling hatnotes [issue #6](https://github.com/barrust/mediawiki/issues/6) 122 | * Add pulling list of main images or logos [issue #28](https://github.com/barrust/mediawiki/issues/28) 123 | 124 | ### Version 0.3.12 125 | 126 | * Default API URL is now language specific: [PR #26](https://github.com/barrust/mediawiki/pull/26) 127 | 128 | ### Version 0.3.11 129 | 130 | * Re-factor MediaWikiPage into its own file 131 | * Remove setting properties outside of **init**() 132 | * Better Unicode support 133 | * Add CONTRIBUTING.md file 134 | 135 | ### Version 0.3.10 136 | 137 | * Add categorytree support 138 | * Remove adding 'http:' to references if missing 139 | 140 | ### Version 0.3.9 141 | 142 | * Fix infinite loop on continued queries: [issue #15](https://github.com/barrust/mediawiki/issues/15) 143 | * Check by looking at the continue variable over time; if it is the same, exit 144 | * Fix image with no url: [issue #14](https://github.com/barrust/mediawiki/issues/14) 145 | 146 | ### Version 0.3.8 147 | 148 | * Fix empty disambiguation list items 149 | 150 | ### Version 0.3.7 151 | 152 | * Memoize support default parameters 153 | * Add support test for Python 3.6 154 | 155 | ### Version 0.3.6 156 | 157 | * Updated Exception documentation 158 | * Fix badges in Readme file 159 | * Additional test coverage 160 | 161 | ### Version 0.3.5 162 | 163 | * Add documentation to README 164 | * Quickstart information 165 | * pip install instructions [pypi - pymediawiki](https://pypi.python.org/pypi/pymediawiki/) 166 | * Additional testing 167 | 168 | ### Version 0.3.4 169 | 170 | * Update documentation 171 | * Better continuous integration 172 | * Better test data: [issue #4](https://github.com/barrust/mediawiki/issues/4) 173 | * First version on PyPi: [issue #8](https://github.com/barrust/mediawiki/issues/8) 174 | 175 | ### Version 0.3.3 176 | 177 | * Improve testing strategy 178 | * Move tests to json from pickle 179 | * Improve 
parameter checking for geosearch 180 | * Code standardization 181 | * Pep8 182 | * Pylint 183 | * Single quote strings 184 | 185 | ### Version 0.3.2 186 | 187 | * OpenSearch functionality 188 | * PrefixSearch functionality 189 | 190 | ### Version 0.3.1 191 | 192 | * Page Summary 193 | * Page Sections 194 | * Enforce sorting of page properties 195 | 196 | ### Pre-Version 0.3.1 197 | 198 | * Add MediaWiki class 199 | * Add MediaWikiPage class 200 | * Stubbed out functionality 201 | * Add page properties 202 | -------------------------------------------------------------------------------- /mediawiki/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | MediaWiki Exceptions 3 | """ 4 | 5 | from typing import Dict, List, Optional 6 | 7 | from mediawiki.utilities import str_or_unicode 8 | 9 | ODD_ERROR_MESSAGE = ( 10 | "This should not happen. If the MediaWiki site you are " 11 | "querying is available, then please report this issue on " 12 | "GitHub: github.com/barrust/mediawiki" 13 | ) 14 | 15 | 16 | class MediaWikiBaseException(Exception): 17 | """Base MediaWikiException 18 | 19 | Args: 20 | message: The message of the exception""" 21 | 22 | def __init__(self, message: str): 23 | self._message = message 24 | super().__init__(self.message) 25 | 26 | def __unicode__(self): 27 | return self.message 28 | 29 | def __str__(self): 30 | return str_or_unicode(self.__unicode__()) 31 | 32 | @property 33 | def message(self) -> str: 34 | """str: The MediaWiki exception message""" 35 | return self._message 36 | 37 | 38 | class MediaWikiException(MediaWikiBaseException): 39 | """MediaWiki Exception Class 40 | 41 | Args: 42 | error (str): The error message that the MediaWiki site returned""" 43 | 44 | def __init__(self, error: str): 45 | self._error = error 46 | msg = f'An unknown error occurred: "{self.error}". Please report it on GitHub!' 47 | super().__init__(msg) 48 | 49 | @property 50 | def error(self) -> str: 51 | """str: The error message that the MediaWiki site returned""" 52 | return self._error 53 | 54 | 55 | class PageError(MediaWikiBaseException): 56 | """Exception raised when no MediaWiki page matched a query 57 | 58 | Args: 59 | title (str): Title of the page 60 | pageid (int): MediaWiki page id of the page""" 61 | 62 | def __init__(self, title: Optional[str] = None, pageid: Optional[int] = None): 63 | if title: 64 | self._title = title 65 | msg = f'"{self.title}" does not match any pages. Try another query!' 66 | elif pageid: 67 | self._pageid = pageid 68 | msg = f'Page id "{self.pageid}" does not match any pages. Try another id!' 69 | else: 70 | self._title = "" 71 | msg = f'"{self.title}" does not match any pages. Try another query!' 72 | super().__init__(msg) 73 | 74 | @property 75 | def title(self) -> str: 76 | """str: The title that caused the page error""" 77 | return self._title 78 | 79 | @property 80 | def pageid(self) -> int: 81 | """int: The page id that caused the page error""" 82 | return self._pageid 83 | 84 | 85 | class RedirectError(MediaWikiBaseException): 86 | """ Exception raised when a page title unexpectedly resolves to 87 | a redirect 88 | 89 | Args: 90 | title (str): Title of the page that redirected 91 | Note: 92 | This should only occur if both auto_suggest and redirect \ 93 | are set to **False** """ 94 | 95 | def __init__(self, title: str): 96 | self._title = title 97 | msg = ( 98 | f'"{self.title}" resulted in a redirect. Set the redirect property to True ' "to allow automatic redirects." 
 99 |         )
100 |
101 |         super().__init__(msg)
102 |
103 |     @property
104 |     def title(self) -> str:
105 |         """str: The title that was redirected"""
106 |         return self._title
107 |
108 |
109 | class DisambiguationError(MediaWikiBaseException):
110 |     """ Exception raised when a page resolves to a Disambiguation page
111 |
112 |     Args:
113 |         title (str): Title that resulted in a disambiguation page
114 |         may_refer_to (list): List of possible titles
115 |         url (str): Full URL to the disambiguation page
116 |         details (dict): A list of dictionaries with more information of \
117 |             possible results
118 |     Note:
119 |         `options` only includes titles that link to valid \
120 |         MediaWiki pages """
121 |
122 |     def __init__(self, title: str, may_refer_to: List[str], url: str, details: Optional[List[Dict]] = None):
123 |         self._title = title
124 |         self._unordered_options = may_refer_to
125 |         self._options = sorted(may_refer_to)
126 |         self._details = details
127 |         self._url = url
128 |         options_str = "\n ".join(self.options)
129 |         msg = f'\n"{self.title}" may refer to: \n {options_str}'
130 |         super().__init__(msg)
131 |
132 |     @property
133 |     def url(self) -> str:
134 |         """str: The url, if possible, of the disambiguation page"""
135 |         return self._url
136 |
137 |     @property
138 |     def title(self) -> str:
139 |         """str: The title of the page"""
140 |         return self._title
141 |
142 |     @property
143 |     def options(self) -> List[str]:
144 |         """list: The list of possible page titles"""
145 |         return self._options
146 |
147 |     @property
148 |     def unordered_options(self) -> List[str]:
149 |         """list: The list of possible page titles, un-sorted in an attempt to get them as they show up on the page"""
150 |         return self._unordered_options
151 |
152 |     @property
153 |     def details(self) -> Optional[List[Dict]]:
154 |         """list: The details of the proposed non-disambiguous pages"""
155 |         return self._details
156 |
157 |
158 | class HTTPTimeoutError(MediaWikiBaseException):
159 |     """Exception raised when a request to the MediaWiki site times out.
160 |
161 |     Args:
162 |         query (str): The query that timed out"""
163 |
164 |     def __init__(self, query: str):
165 |         self._query = query
166 |         msg = (
167 |             f'Searching for "{self.query}" resulted in a timeout. '
168 |             "Try again in a few seconds, and ensure you have rate limiting "
169 |             "set to True."
170 |         )
171 |         super().__init__(msg)
172 |
173 |     @property
174 |     def query(self) -> str:
175 |         """str: The query that timed out"""
176 |         return self._query
177 |
178 |
179 | class MediaWikiAPIURLError(MediaWikiBaseException):
180 |     """Exception raised when the MediaWiki server does not support the API
181 |
182 |     Args:
183 |         api_url (str): The API URL that was not recognized"""
184 |
185 |     def __init__(self, api_url: str):
186 |         self._api_url = api_url
187 |         msg = f"{self.api_url} is not a valid MediaWiki API URL"
188 |         super().__init__(msg)
189 |
190 |     @property
191 |     def api_url(self) -> str:
192 |         """str: The api url that raised the exception"""
193 |         return self._api_url
194 |
195 |
196 | class MediaWikiGeoCoordError(MediaWikiBaseException):
197 |     """ Exception raised to handle GeoData errors
198 |
199 |     Args:
200 |         error (str): Error message from the MediaWiki site related to \
201 |             GeoCoordinates """
202 |
203 |     def __init__(self, error: str):
204 |         self._error = error
205 |         msg = (
206 |             f"GeoData search resulted in the following error: {self.error}"
207 |             " - Please use valid coordinates or a proper page title." 
217 | class MediaWikiCategoryTreeError(MediaWikiBaseException):
218 | """Exception raised when the category tree fails to complete for an unknown
219 | reason
220 |
221 | Args:
222 | category (str): The category that threw an exception"""
223 |
224 | def __init__(self, category: str):
225 | self._category = category
226 | msg = (
227 | f"Categorytree threw an exception after requesting the same category '{self._category}' "
228 | "too many times. Please try again later, and consider enabling the rate limiting option."
229 | )
230 | super().__init__(msg)
231 |
232 | @property
233 | def category(self) -> str:
234 | """ str: The category that threw an exception during category tree \
235 | generation """
236 | return self._category
237 |
238 |
239 | class MediaWikiLoginError(MediaWikiBaseException):
240 | """Exception raised when unable to login to the MediaWiki site
241 |
242 | Args:
243 | error (str): The error message that the MediaWiki site returned"""
244 |
245 | def __init__(self, error: str):
246 | self._error = error
247 | super().__init__(error)
248 |
249 | @property
250 | def error(self) -> str:
251 | """str: The error message that the MediaWiki site returned"""
252 | return self._error
253 |
254 |
255 | class MediaWikiForbidden(MediaWikiBaseException):
256 | """Exception raised when a forbidden (HTTP 403) status code is returned"""
257 |
258 | def __init__(self, error: str):
259 | self._error = error
260 | super().__init__(self._error)
261 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
21 |
22 | .PHONY: help
23 | help:
24 | @echo "Please use \`make <target>' where <target> is one of"
25 | @echo " html to make standalone HTML files"
26 | @echo " dirhtml to make HTML files named index.html in directories"
27 | @echo " singlehtml to make a single large HTML file"
28 | @echo " pickle to make pickle files"
29 | @echo " json to make JSON files"
30 | @echo " htmlhelp to make HTML files and a HTML help project"
31 | @echo " qthelp to make HTML files and a qthelp project"
32 | @echo " applehelp to make an Apple Help Book"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " epub3 to make an epub3"
36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
37 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
39 | @echo " text to make text files"
40 | @echo " man to make manual pages"
41 | @echo " texinfo to make Texinfo files"
42 | @echo " info to make Texinfo files and run them through makeinfo"
43 | @echo " gettext to make PO message catalogs"
44 | @echo " changes to make an overview of all changed/added/deprecated items"
45 | @echo " xml to make Docutils-native XML files"
46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
47 | @echo " linkcheck to check all external links for integrity"
48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
49 | @echo " coverage to run coverage check of the documentation (if enabled)"
50 | @echo " dummy to check syntax errors of document sources"
51 |
52 | .PHONY: clean
53 | clean:
54 | rm -rf $(BUILDDIR)/*
55 |
56 | .PHONY: html
57 | html:
58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
61 |
62 | .PHONY: dirhtml
63 | dirhtml:
64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
65 | @echo
66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
67 |
68 | .PHONY: singlehtml
69 | singlehtml:
70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
71 | @echo
72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
73 |
74 | .PHONY: pickle
75 | pickle:
76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
77 | @echo
78 | @echo "Build finished; now you can process the pickle files."
79 |
80 | .PHONY: json
81 | json:
82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
83 | @echo
84 | @echo "Build finished; now you can process the JSON files."
85 |
86 | .PHONY: htmlhelp
87 | htmlhelp:
88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
89 | @echo
90 | @echo "Build finished; now you can run HTML Help Workshop with the" \
91 | ".hhp project file in $(BUILDDIR)/htmlhelp."
92 | 93 | .PHONY: qthelp 94 | qthelp: 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mediawiki.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mediawiki.qhc" 102 | 103 | .PHONY: applehelp 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | .PHONY: devhelp 113 | devhelp: 114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 115 | @echo 116 | @echo "Build finished." 117 | @echo "To view the help file:" 118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/mediawiki" 119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mediawiki" 120 | @echo "# devhelp" 121 | 122 | .PHONY: epub 123 | epub: 124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 125 | @echo 126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 127 | 128 | .PHONY: epub3 129 | epub3: 130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 131 | @echo 132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 133 | 134 | .PHONY: latex 135 | latex: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo 138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 140 | "(use \`make latexpdf' here to do that automatically)." 141 | 142 | .PHONY: latexpdf 143 | latexpdf: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through pdflatex..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: latexpdfja 150 | latexpdfja: 151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 152 | @echo "Running LaTeX files through platex and dvipdfmx..." 153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 155 | 156 | .PHONY: text 157 | text: 158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 159 | @echo 160 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 161 | 162 | .PHONY: man 163 | man: 164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 165 | @echo 166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 167 | 168 | .PHONY: texinfo 169 | texinfo: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo 172 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 173 | @echo "Run \`make' in that directory to run these through makeinfo" \ 174 | "(use \`make info' here to do that automatically)." 175 | 176 | .PHONY: info 177 | info: 178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 179 | @echo "Running Texinfo files through makeinfo..." 180 | make -C $(BUILDDIR)/texinfo info 181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
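# Usage note (editorial): run these targets from the docs/ directory, e.g.
# `make html`; the variables declared at the top can be overridden on the
# command line, as in `make html SPHINXOPTS="-W"` or `make latex PAPER=a4`.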
182 | 183 | .PHONY: gettext 184 | gettext: 185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 186 | @echo 187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 188 | 189 | .PHONY: changes 190 | changes: 191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 192 | @echo 193 | @echo "The overview file is in $(BUILDDIR)/changes." 194 | 195 | .PHONY: linkcheck 196 | linkcheck: 197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 198 | @echo 199 | @echo "Link check complete; look for any errors in the above output " \ 200 | "or in $(BUILDDIR)/linkcheck/output.txt." 201 | 202 | .PHONY: doctest 203 | doctest: 204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 205 | @echo "Testing of doctests in the sources finished, look at the " \ 206 | "results in $(BUILDDIR)/doctest/output.txt." 207 | 208 | .PHONY: coverage 209 | coverage: 210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 211 | @echo "Testing of coverage in the sources finished, look at the " \ 212 | "results in $(BUILDDIR)/coverage/python.txt." 213 | 214 | .PHONY: xml 215 | xml: 216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 217 | @echo 218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 219 | 220 | .PHONY: pseudoxml 221 | pseudoxml: 222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 223 | @echo 224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 225 | 226 | .PHONY: dummy 227 | dummy: 228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 229 | @echo 230 | @echo "Build finished. Dummy builder generates no files." 231 | -------------------------------------------------------------------------------- /mediawiki/configuraton.py: -------------------------------------------------------------------------------- 1 | """Configuration module""" 2 | 3 | from dataclasses import asdict, dataclass, field 4 | from datetime import datetime, timedelta 5 | from typing import Any, Callable, Dict, Optional, Tuple, Union 6 | 7 | URL: str = "https://github.com/barrust/mediawiki" 8 | VERSION: str = "0.7.5" 9 | 10 | HTTPAuthenticator = Union[Tuple[str, str], Callable[[Any], Any]] 11 | 12 | 13 | @dataclass 14 | class Configuration: 15 | """Configuration class""" 16 | 17 | _lang: str = field(default="en", init=False, repr=False) 18 | _api_url: str = field(default="https://en.wikipedia.org/w/api.php", init=False, repr=False) 19 | _category_prefix: str = field(default="Category", init=False, repr=False) 20 | _timeout: Optional[float] = field(default=15.0, init=False, repr=False) 21 | _user_agent: str = field(default=f"python-mediawiki/VERSION-{VERSION}/({URL})/BOT", init=False, repr=False) 22 | _proxies: Optional[Dict] = field(default=None, init=False, repr=False) 23 | _verify_ssl: Union[bool, str] = field(default=True, init=False, repr=False) 24 | _rate_limit: bool = field(default=False, init=False, repr=False) 25 | _rate_limit_min_wait: timedelta = field(default=timedelta(milliseconds=50), init=False, repr=False) 26 | _username: Optional[str] = field(default=None, init=False, repr=False) 27 | _password: Optional[str] = field(default=None, init=False, repr=False) 28 | _refresh_interval: Optional[int] = field(default=None, init=False, repr=False) 29 | _use_cache: bool = field(default=True, init=False, repr=False) 30 | _http_auth: Optional[HTTPAuthenticator] = field(default=None, init=False, repr=False) 31 | 32 | # not in repr 33 | _reset_session: bool = 
field(default=True, init=False, repr=False)
34 | _clear_memoized: bool = field(default=False, init=False, repr=False)
35 | _rate_limit_last_call: Optional[datetime] = field(default=None, init=False, repr=False)
36 |
37 | def __init__(
38 | self,
39 | lang: Optional[str] = None,
40 | api_url: Optional[str] = None,
41 | category_prefix: Optional[str] = None,
42 | timeout: Optional[float] = None,
43 | user_agent: Optional[str] = None,
44 | proxies: Optional[Dict] = None,
45 | verify_ssl: Union[bool, str, None] = None,
46 | rate_limit: bool = False,
47 | rate_limit_wait: Optional[timedelta] = None,
48 | username: Optional[str] = None,
49 | password: Optional[str] = None,
50 | refresh_interval: Optional[int] = None,
51 | use_cache: bool = True,
52 | http_auth: Optional[HTTPAuthenticator] = None,
53 | ):
54 | if api_url:
55 | self._api_url = api_url
56 |
57 | if lang:
58 | self.lang = lang
59 |
60 | if category_prefix:
61 | self.category_prefix = category_prefix
62 |
63 | if user_agent:
64 | self._user_agent = user_agent
65 |
66 | if proxies:
67 | self.proxies = proxies
68 |
69 | if verify_ssl is not None:
70 | self.verify_ssl = verify_ssl
71 |
72 | if rate_limit:
73 | self.rate_limit = rate_limit
74 |
75 | if rate_limit_wait:
76 | self._rate_limit_min_wait = rate_limit_wait
77 |
78 | if username:
79 | self.username = username
80 |
81 | if password:
82 | self.password = password
83 |
84 | if refresh_interval:
85 | self.refresh_interval = refresh_interval
86 |
87 | # assign unconditionally; "if use_cache" would make it impossible to disable the cache
88 | self.use_cache = use_cache
89 |
90 | if timeout is not None:
91 | self.timeout = timeout
92 |
93 | if http_auth:
94 | self.http_auth = http_auth
95 |
96 | def __repr__(self):
97 | """repr"""
98 | keys = [
99 | x.replace("_", "", 1)
100 | for x in sorted(asdict(self).keys())
101 | if x not in ["_rate_limit_last_call", "_clear_memoized", "_reset_session"]
102 | ]
103 | full = [f"{x}={self.__getattribute__(x)}" for x in keys]
104 | return f"Configuration({', '.join(full)})"
105 |
106 | @property
107 | def lang(self) -> str:
108 | """str: The API URL language; if possible, setting this will update the API URL
109 |
110 | Note:
111 | Use correct language titles with the updated API URL
112 | Note:
113 | Some API URLs do not encode the language; the URL cannot be updated in that case"""
114 | return self._lang
115 |
116 | @lang.setter
117 | def lang(self, language: str):
118 | """Set the language to use; attempts to change the API URL"""
119 | if self._lang == language.lower():
120 | return
121 | url = self._api_url
122 | tmp = url.replace(f"/{self._lang}.", f"/{language.lower()}.")
123 |
124 | self.api_url = tmp
125 | self._lang = language.lower()
126 | self._clear_memoized = True
127 |
128 | @property
129 | def api_url(self) -> str:
130 | """str: API URL of the MediaWiki site
131 |
132 | Note:
133 | Prefer :py:func:`mediawiki.MediaWiki.set_api_url` to change the URL on a live MediaWiki instance"""
134 | return self._api_url
135 |
136 | @api_url.setter
137 | def api_url(self, api_url: str):
138 | self._lang = self.lang.lower()
139 | self._api_url = api_url.format(lang=self._lang)
140 |
141 | # reset session
142 | self._reset_session = True
143 |
144 | @property
145 | def category_prefix(self) -> str:
146 | """str: The category prefix to use when using category based functions
147 |
148 | Note:
149 | Use the correct category name for the language selected"""
150 | return self._category_prefix
151 |
152 | @category_prefix.setter
153 | def category_prefix(self, category_prefix: str):
154 | """Set the category prefix correctly"""
155 | self._category_prefix = category_prefix[:-1] if category_prefix[-1:] == ":" else category_prefix
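# Editor's note: the setter above strips a single trailing colon so that both
# spellings configure the same prefix, e.g. (hypothetical values):
#
#     config = Configuration(category_prefix="Catégorie:")
#     config.category_prefix  # -> "Catégorie"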
156 |
157 | @property
158 | def user_agent(self) -> str:
159 | """str: User agent string
160 |
161 | Note:
162 | If used as part of another project, this should be changed"""
163 | return self._user_agent
164 |
165 | @user_agent.setter
166 | def user_agent(self, user_agent: str):
167 | """Set the new user agent string
168 |
169 | Note:
170 | Will need to re-log into the MediaWiki site if the user agent string is changed"""
171 | self._user_agent = user_agent
172 |
173 | @property
174 | def proxies(self) -> Optional[Dict]:
175 | """dict: Turn on, off, or set proxy use with the Requests library"""
176 | return self._proxies
177 |
178 | @proxies.setter
179 | def proxies(self, proxies: Optional[Dict]):
180 | """Turn on, off, or set proxy use through the Requests library"""
181 | self._proxies = proxies if isinstance(proxies, dict) else None
182 |
183 | # reset session
184 | self._reset_session = True
185 |
186 | @property
187 | def verify_ssl(self) -> Union[bool, str]:
188 | """bool | str: Verify SSL when using requests, or path to cert file"""
189 | return self._verify_ssl
190 |
191 | @verify_ssl.setter
192 | def verify_ssl(self, verify_ssl: Union[bool, str, None]):
193 | """Set request verify SSL parameter; defaults to True on invalid input"""
194 | self._verify_ssl = verify_ssl if isinstance(verify_ssl, (bool, str)) else True
195 |
196 | # reset session
197 | self._reset_session = True
198 |
199 | @property
200 | def rate_limit(self) -> bool:
201 | """bool: Turn on or off Rate Limiting"""
202 | return self._rate_limit
203 |
204 | @rate_limit.setter
205 | def rate_limit(self, rate_limit: bool):
206 | """Turn on or off rate limiting"""
207 | self._rate_limit = bool(rate_limit)
208 | self._rate_limit_last_call = None
209 | self._clear_memoized = True
210 |
211 | @property
212 | def rate_limit_min_wait(self) -> timedelta:
213 | """timedelta: Time to wait between calls
214 |
215 | Note:
216 | Only used if rate_limit is **True**"""
217 | return self._rate_limit_min_wait
218 |
219 | @rate_limit_min_wait.setter
220 | def rate_limit_min_wait(self, min_wait: timedelta):
221 | """Set minimum wait to use for rate limiting"""
222 | self._rate_limit_min_wait = min_wait
223 | self._rate_limit_last_call = None
224 |
225 | @property
226 | def username(self) -> Optional[str]:
227 | """str | None: Username to use to log into the MediaWiki site"""
228 | return self._username
229 |
230 | @username.setter
231 | def username(self, username: Optional[str]):
232 | """Set the username, if needed, to log into the MediaWiki site"""
233 | self._username = username
234 |
235 | @property
236 | def password(self) -> Optional[str]:
237 | """str | None: Password to use to log into the MediaWiki site"""
238 | return self._password
239 |
240 | @password.setter
241 | def password(self, password: Optional[str]):
242 | """Set the password, if needed, to log into the MediaWiki site"""
243 | self._password = password
244 |
245 | @property
246 | def refresh_interval(self) -> Optional[int]:
247 | """int | None: The interval at which the memoized cache is to be refreshed"""
248 | return self._refresh_interval
249 |
250 | @refresh_interval.setter
251 | def refresh_interval(self, refresh_interval: Optional[int]):
252 | """Set the new cache refresh interval"""
253 | self._refresh_interval = (
254 | refresh_interval if isinstance(refresh_interval, int) and refresh_interval > 0 else None
255 | )
256 |
257 | @property
258 | def use_cache(self) -> bool:
259 | """bool: Whether caching should be used; on
(**True**) or off (**False**)""" 260 | return self._use_cache 261 | 262 | @use_cache.setter 263 | def use_cache(self, use_cache: bool): 264 | """toggle using the cache or not""" 265 | self._use_cache = bool(use_cache) 266 | 267 | @property 268 | def timeout(self) -> Optional[float]: 269 | """float: Response timeout for API requests 270 | 271 | Note: 272 | Use **None** for no response timeout""" 273 | return self._timeout 274 | 275 | @timeout.setter 276 | def timeout(self, timeout: Optional[float]): 277 | """Set request timeout in seconds (or fractions of a second)""" 278 | self._timeout = None if timeout is None else float(timeout) 279 | 280 | @property 281 | def http_auth(self) -> Optional[HTTPAuthenticator]: 282 | """tuple|callable: HTTP authenticator to use to access the mediawiki site""" 283 | return self._http_auth 284 | 285 | @http_auth.setter 286 | def http_auth(self, http_auth: Optional[HTTPAuthenticator]): 287 | """Set the HTTP authenticator, if needed, to use to access the mediawiki site""" 288 | self._http_auth = http_auth 289 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # mediawiki documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Sep 24 19:03:06 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import os 17 | import sys 18 | from typing import Dict, List 19 | 20 | # If extensions (or modules to document with autodoc) are in another directory, 21 | # add these directories to sys.path here. If the directory is relative to the 22 | # documentation root, use os.path.abspath to make it absolute, like shown here. 23 | # sys.path.insert(0, os.path.abspath('.')) 24 | sys.path.insert(0, os.path.abspath("../../")) 25 | # sys.path.append(os.path.abspath("_themes")) 26 | import mediawiki 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | "sphinx.ext.autodoc", 38 | "sphinx.ext.napoleon", 39 | "sphinx.ext.doctest", 40 | "sphinx.ext.coverage", 41 | "sphinx.ext.viewcode", 42 | "sphinx.ext.githubpages", 43 | "sphinx.ext.todo", 44 | ] 45 | 46 | napoleon_use_admonition_for_notes = True 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = ".rst" 55 | 56 | # The encoding of source files. 57 | # source_encoding = 'utf-8-sig' 58 | 59 | # The master toctree document. 60 | master_doc = "index" 61 | 62 | # General information about the project. 
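# (Editorial example for the Configuration dataclass defined in
# mediawiki/configuraton.py above; not part of conf.py. The keyword arguments
# mirror Configuration.__init__, and the user agent value is a made-up
# placeholder.)
#
#     from datetime import timedelta
#     from mediawiki.configuraton import Configuration
#
#     config = Configuration(
#         user_agent="my-wiki-bot/0.1 (example@example.com)",
#         rate_limit=True,
#         rate_limit_wait=timedelta(milliseconds=100),
#         timeout=30.0,
#     )
#     print(config.api_url)  # -> https://en.wikipedia.org/w/api.php (default)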
63 | project = "mediawiki"
64 | copyright = "2016, Tyler Barrus"
65 | author = mediawiki.__author__
66 |
67 | # The version info for the project you're documenting, acts as replacement for
68 | # |version| and |release|, also used in various other places throughout the
69 | # built documents.
70 | #
71 | # The short X.Y version.
72 | version = mediawiki.__version__
73 | # The full version, including alpha/beta/rc tags.
74 | release = mediawiki.__version__
75 |
76 | # The language for content autogenerated by Sphinx. Refer to documentation
77 | # for a list of supported languages.
78 | #
79 | # This is also used if you do content translation via gettext catalogs.
80 | # Usually you set "language" from the command line for these cases.
81 | language = "en"
82 |
83 | # There are two options for replacing |today|: either, you set today to some
84 | # non-false value, then it is used:
85 | # today = ''
86 | # Else, today_fmt is used as the format for a strftime call.
87 | # today_fmt = '%B %d, %Y'
88 |
89 | # List of patterns, relative to source directory, that match files and
90 | # directories to ignore when looking for source files.
91 | # These patterns also affect html_static_path and html_extra_path
92 | exclude_patterns: List[str] = []
93 |
94 | # The reST default role (used for this markup: `text`) to use for all
95 | # documents.
96 | # default_role = None
97 |
98 | # If true, '()' will be appended to :func: etc. cross-reference text.
99 | # add_function_parentheses = True
100 |
101 | # If true, the current module name will be prepended to all description
102 | # unit titles (such as .. function::).
103 | # add_module_names = True
104 |
105 | # If true, sectionauthor and moduleauthor directives will be shown in the
106 | # output. They are ignored by default.
107 | # show_authors = False
108 |
109 | # The name of the Pygments (syntax highlighting) style to use.
110 | pygments_style = "sphinx"
111 |
112 | # A list of ignored prefixes for module index sorting.
113 | # modindex_common_prefix = []
114 |
115 | # If true, keep warnings as "system message" paragraphs in the built documents.
116 | # keep_warnings = False
117 |
118 | # If true, `todo` and `todoList` produce output, else they produce nothing.
119 | todo_include_todos = True
120 |
121 |
122 | # -- Options for HTML output ----------------------------------------------
123 |
124 | # The theme to use for HTML and HTML Help pages. See the documentation for
125 | # a list of builtin themes.
126 | html_theme = "sphinx_rtd_theme"
127 | # html_theme = 'alabaster'
128 | # html_theme = "custom_theme"
129 |
130 |
131 | # Theme options are theme-specific and customize the look and feel of a theme
132 | # further. For a list of options available for each theme, see the
133 | # documentation.
134 | # html_theme_options = {}
135 |
136 | # Add any paths that contain custom themes here, relative to this directory.
137 | # html_theme_path = ["_themes"]
138 |
139 | # The name for this set of Sphinx documents.
140 | # "<project> v<release> documentation" by default.
141 | # html_title = 'mediawiki v0.3.4'
142 |
143 | # A shorter title for the navigation bar. Default is the same as html_title.
144 | # html_short_title = None
145 |
146 | # The name of an image file (relative to this directory) to place at the top
147 | # of the sidebar.
148 | # html_logo = None
149 |
150 | # The name of an image file (relative to this directory) to use as a favicon of
151 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
152 | # pixels large.
153 | # html_favicon = None
154 |
155 | # Add any paths that contain custom static files (such as style sheets) here,
156 | # relative to this directory. They are copied after the builtin static files,
157 | # so a file named "default.css" will overwrite the builtin "default.css".
158 | html_static_path = ["_static"]
159 | html_css_files = ["custom.css"]
160 |
161 | # Add any extra paths that contain custom files (such as robots.txt or
162 | # .htaccess) here, relative to this directory. These files are copied
163 | # directly to the root of the documentation.
164 | # html_extra_path = []
165 |
166 | # If not None, a 'Last updated on:' timestamp is inserted at every page
167 | # bottom, using the given strftime format.
168 | # The empty string is equivalent to '%b %d, %Y'.
169 | # html_last_updated_fmt = None
170 |
171 | # If true, SmartyPants will be used to convert quotes and dashes to
172 | # typographically correct entities.
173 | # html_use_smartypants = True
174 |
175 | # Custom sidebar templates, maps document names to template names.
176 | # html_sidebars = {}
177 |
178 | # Additional templates that should be rendered to pages, maps page names to
179 | # template names.
180 | # html_additional_pages = {}
181 |
182 | # If false, no module index is generated.
183 | # html_domain_indices = True
184 |
185 | # If false, no index is generated.
186 | # html_use_index = True
187 |
188 | # If true, the index is split into individual pages for each letter.
189 | # html_split_index = False
190 |
191 | # If true, links to the reST sources are added to the pages.
192 | # html_show_sourcelink = True
193 |
194 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
195 | # html_show_sphinx = True
196 |
197 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
198 | # html_show_copyright = True
199 |
200 | # If true, an OpenSearch description file will be output, and all pages will
201 | # contain a <link> tag referring to it. The value of this option must be the
202 | # base URL from which the finished HTML is served.
203 | # html_use_opensearch = ''
204 |
205 | # This is the file name suffix for HTML files (e.g. ".xhtml").
206 | # html_file_suffix = None
207 |
208 | # Language to be used for generating the HTML full-text search index.
209 | # Sphinx supports the following languages:
210 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
211 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
212 | # html_search_language = 'en'
213 |
214 | # A dictionary with options for the search language support, empty by default.
215 | # 'ja' uses this config value.
216 | # 'zh' users can customize the `jieba` dictionary path.
217 | # html_search_options = {'type': 'default'}
218 |
219 | # The name of a javascript file (relative to the configuration directory) that
220 | # implements a search results scorer. If empty, the default will be used.
221 | # html_search_scorer = 'scorer.js'
222 |
223 | # Output file base name for HTML help builder.
224 | htmlhelp_basename = "mediawikidoc"
225 |
226 | # -- Options for LaTeX output ---------------------------------------------
227 |
228 | latex_elements: Dict[str, str] = {
229 | # The paper size ('letterpaper' or 'a4paper').
230 | #'papersize': 'letterpaper',
231 | # The font size ('10pt', '11pt' or '12pt').
232 | #'pointsize': '10pt',
233 | # Additional stuff for the LaTeX preamble.
234 | #'preamble': '', 235 | # Latex figure (float) alignment 236 | #'figure_align': 'htbp', 237 | } 238 | 239 | # Grouping the document tree into LaTeX files. List of tuples 240 | # (source start file, target name, title, 241 | # author, documentclass [howto, manual, or own class]). 242 | latex_documents = [ 243 | (master_doc, "mediawiki.tex", "mediawiki Documentation", "Tyler Barrus", "manual"), 244 | ] 245 | 246 | # The name of an image file (relative to this directory) to place at the top of 247 | # the title page. 248 | # latex_logo = None 249 | 250 | # For "manual" documents, if this is true, then toplevel headings are parts, 251 | # not chapters. 252 | # latex_use_parts = False 253 | 254 | # If true, show page references after internal links. 255 | # latex_show_pagerefs = False 256 | 257 | # If true, show URL addresses after external links. 258 | # latex_show_urls = False 259 | 260 | # Documents to append as an appendix to all manuals. 261 | # latex_appendices = [] 262 | 263 | # If false, no module index is generated. 264 | # latex_domain_indices = True 265 | 266 | 267 | # -- Options for manual page output --------------------------------------- 268 | 269 | # One entry per manual page. List of tuples 270 | # (source start file, name, description, authors, manual section). 271 | man_pages = [(master_doc, "mediawiki", "mediawiki Documentation", [author], 1)] 272 | 273 | # If true, show URL addresses after external links. 274 | # man_show_urls = False 275 | 276 | 277 | # -- Options for Texinfo output ------------------------------------------- 278 | 279 | # Grouping the document tree into Texinfo files. List of tuples 280 | # (source start file, target name, title, author, 281 | # dir menu entry, description, category) 282 | texinfo_documents = [ 283 | ( 284 | master_doc, 285 | "mediawiki", 286 | "mediawiki Documentation", 287 | author, 288 | "mediawiki", 289 | "One line description of project.", 290 | "Miscellaneous", 291 | ), 292 | ] 293 | 294 | # Documents to append as an appendix to all manuals. 295 | # texinfo_appendices = [] 296 | 297 | # If false, no module index is generated. 298 | # texinfo_domain_indices = True 299 | 300 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 301 | # texinfo_show_urls = 'footnote' 302 | 303 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
304 | # texinfo_no_detailmenu = False 305 | 306 | # Determine which way to group auto documented members 307 | autodoc_member_order = "groupwise" 308 | -------------------------------------------------------------------------------- /scripts/generate_test_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate data for tests 3 | """ 4 | import json 5 | import os 6 | import sys 7 | from datetime import timedelta 8 | from decimal import Decimal 9 | 10 | sys.path.insert(0, "../mediawiki") 11 | from mediawiki import ( 12 | DisambiguationError, 13 | MediaWiki, 14 | MediaWikiAPIURLError, 15 | MediaWikiGeoCoordError, 16 | PageError, 17 | RedirectError, 18 | ) 19 | 20 | # set up the json objects 21 | REQUESTS_FILE = "./tests/mock_requests.json" 22 | RESPONSES_FILE = "./tests/mock_responses.json" 23 | CATTREE_FILE = "./tests/mock_categorytree.json" 24 | 25 | 26 | def capture_response(func): 27 | """capture_response decorator to be used for tests""" 28 | 29 | def wrapper(*args, **kwargs): 30 | """define the actions""" 31 | file_path = os.path.abspath(REQUESTS_FILE) 32 | if os.path.isfile(file_path): 33 | with open(file_path, "r") as mock: 34 | mock_data = json.load(mock) 35 | else: 36 | mock_data = dict() 37 | 38 | new_params = json.dumps(tuple(sorted(args[1].items()))) 39 | # build out parts of the dictionary 40 | if args[0].api_url not in mock_data: 41 | mock_data[args[0].api_url] = dict() 42 | try: 43 | res = func(*args, **kwargs) 44 | except Exception: 45 | res = dict() 46 | mock_data[args[0].api_url][new_params] = res 47 | with open(file_path, "w") as mock: 48 | json.dump(mock_data, mock, ensure_ascii=False, indent=1, sort_keys=True) 49 | return res 50 | 51 | return wrapper 52 | 53 | 54 | class MediaWikiOverloaded(MediaWiki): 55 | """overloaded mediawiki class""" 56 | 57 | def __init__( 58 | self, 59 | url="https://{lang}.wikipedia.org/w/api.php", 60 | lang="en", 61 | timeout=None, 62 | rate_limit=False, 63 | rate_limit_wait=timedelta(milliseconds=50), 64 | ): 65 | """overloaded init""" 66 | MediaWiki.__init__( 67 | self, url=url, lang=lang, timeout=timeout, rate_limit=rate_limit, rate_limit_wait=rate_limit_wait 68 | ) 69 | 70 | @capture_response 71 | def _get_response(self, params): 72 | """overloaded response""" 73 | return MediaWiki._get_response(self, params) 74 | 75 | @capture_response 76 | def _post_response(self, params): 77 | """overloaded response""" 78 | return MediaWiki._post_response(self, params) 79 | 80 | 81 | PULL_ALL = False 82 | 83 | # Parameters to determine which tests to pull 84 | PULL_SEARCHES = False 85 | PULL_ALLPAGES = False 86 | PULL_RANDOM = False 87 | PULL_SUGGEST = False 88 | PULL_OPENSEARCH = False 89 | PULL_PREFIXSEARCH = False 90 | PULL_GEOSEARCH = False 91 | PULL_CATEGORYMEMBERS = False 92 | PULL_CATEGORYTREE = False 93 | PULL_SUMMARY = False 94 | PULL_PAGE_ERRORS = False 95 | PULL_DISAMBIGUATION_ERRORS = False 96 | PULL_API_URL_ERROR = False 97 | PULL_REDIRECT_ERROR = False 98 | PULL_PAGES = False 99 | PULL_LOGOS = False 100 | PULL_PREVIEWS = True 101 | PULL_HATNOTES = False 102 | PULL_SECTION_LINKS = False 103 | PULL_TABLE_OF_CONTENTS = False 104 | PULL_LOGIN = False 105 | 106 | # regression tests 107 | PULL_ISSUE_15 = False 108 | PULL_ISSUE_14 = False 109 | PULL_ISSUE_35 = False 110 | PULL_ISSUE_39 = False 111 | 112 | # make files if they don't exist 113 | if not os.path.isfile(REQUESTS_FILE): 114 | with open(REQUESTS_FILE, "w") as file_handle: 115 | json.dump(dict(), file_handle, ensure_ascii=False) 116 | 
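# Editor's sketch (not part of this script): replaying the captured JSON in a
# test. The {api_url: {sorted-params-json: response}} layout and the
# _get_response target mirror capture_response above; the unittest.mock wiring
# here is an assumption about how the test suite consumes these files.
#
#     import json
#     from unittest import mock
#     from mediawiki import MediaWiki
#
#     with open("./tests/mock_requests.json") as fp:
#         MOCK = json.load(fp)
#
#     def fake_get_response(self, params):
#         key = json.dumps(tuple(sorted(params.items())))
#         return MOCK[self.api_url][key]
#
#     with mock.patch.object(MediaWiki, "_get_response", fake_get_response):
#         ...  # exercise the library against canned responses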
117 | if os.path.isfile(RESPONSES_FILE): 118 | with open(RESPONSES_FILE, "r") as file_handle: 119 | responses = json.load(file_handle) 120 | else: 121 | responses = dict() 122 | 123 | 124 | # Begin building out new data objects 125 | site = MediaWikiOverloaded() 126 | french_site = MediaWikiOverloaded(url="https://fr.wikipedia.org/w/api.php", lang="fr") 127 | asoiaf = MediaWikiOverloaded(url="https://awoiaf.westeros.org/api.php", lang="fr") 128 | plants = MediaWikiOverloaded(url="https://practicalplants.org/w/api.php") 129 | wikipedia = MediaWikiOverloaded() 130 | 131 | 132 | # ensure these pieces of information do not throw errors 133 | if site.api_url not in responses: 134 | responses[site.api_url] = dict() 135 | if french_site.api_url not in responses: 136 | responses[french_site.api_url] = dict() 137 | if asoiaf.api_url not in responses: 138 | responses[asoiaf.api_url] = dict() 139 | 140 | # pull in standard information for all sites (every time) 141 | if site.api_url not in responses: 142 | responses[site.api_url] = dict() 143 | responses[site.api_url]["api"] = site.api_url 144 | responses[site.api_url]["lang"] = site.language 145 | responses[site.api_url]["languages"] = site.supported_languages 146 | responses[site.api_url]["api_version"] = site.api_version 147 | responses[site.api_url]["extensions"] = site.extensions 148 | 149 | if french_site.api_url not in responses: 150 | responses[french_site.api_url] = dict() 151 | responses[french_site.api_url]["api"] = french_site.api_url 152 | responses[french_site.api_url]["lang"] = french_site.language 153 | responses[french_site.api_url]["languages"] = french_site.supported_languages 154 | responses[french_site.api_url]["api_version"] = french_site.api_version 155 | responses[french_site.api_url]["extensions"] = french_site.extensions 156 | 157 | if asoiaf.api_url not in responses: 158 | responses[asoiaf.api_url] = dict() 159 | responses[asoiaf.api_url]["api"] = asoiaf.api_url 160 | responses[asoiaf.api_url]["lang"] = asoiaf.language 161 | responses[asoiaf.api_url]["languages"] = asoiaf.supported_languages 162 | responses[asoiaf.api_url]["api_version"] = asoiaf.api_version 163 | responses[asoiaf.api_url]["extensions"] = asoiaf.extensions 164 | 165 | # if plants.api_url not in responses: 166 | # responses[plants.api_url] = dict() 167 | 168 | print("Completed basic mediawiki information") 169 | 170 | if PULL_ALL is True or PULL_SEARCHES is True: 171 | res = site.search("chest set", suggestion=False) 172 | responses[site.api_url]["search_without_suggestion"] = res 173 | res = site.search("chest set", suggestion=True) 174 | responses[site.api_url]["search_with_suggestion_found"] = res 175 | res = site.search("chess set", suggestion=True) 176 | responses[site.api_url]["search_with_suggestion_not_found"] = res 177 | res = site.search("chess set", results=505, suggestion=False) 178 | responses[site.api_url]["search_with_suggestion_not_found_large"] = res 179 | res = site.search("chess set", results=3, suggestion=False) 180 | responses[site.api_url]["search_with_suggestion_not_found_small"] = res 181 | 182 | print("Completed pulling searches") 183 | 184 | if PULL_ALL is True or PULL_ALLPAGES is True: 185 | res = site.allpages("a") 186 | responses[site.api_url]["all_pages_query_a"] = res 187 | 188 | res = site.allpages("a", results=1) 189 | responses[site.api_url]["all_pages_query_a_1"] = res 190 | 191 | print("Completed pulling allpages") 192 | 193 | if PULL_ALL is True or PULL_RANDOM is True: 194 | responses[site.api_url]["random_1"] = 
site.random(pages=1) 195 | responses[site.api_url]["random_2"] = site.random(pages=2) 196 | responses[site.api_url]["random_10"] = site.random(pages=10) 197 | responses[site.api_url]["random_202"] = site.random(pages=202) 198 | 199 | print("Completed pulling random pages") 200 | 201 | if PULL_ALL is True or PULL_SUGGEST is True: 202 | responses[site.api_url]["suggest_chest_set"] = site.suggest("chest set") 203 | responses[site.api_url]["suggest_chess_set"] = site.suggest("chess set") 204 | responses[site.api_url]["suggest_new_york"] = site.suggest("new york") 205 | responses[site.api_url]["suggest_yonkers"] = site.suggest("yonkers") 206 | responses[site.api_url]["suggest_no_results"] = site.suggest("gobbilygook") 207 | 208 | print("Completed pulling suggestions") 209 | 210 | if PULL_ALL is True or PULL_OPENSEARCH is True: 211 | res = site.opensearch("new york") 212 | responses[site.api_url]["opensearch_new_york"] = res 213 | res = site.opensearch("new york", results=5) 214 | responses[site.api_url]["opensearch_new_york_result"] = res 215 | res = site.opensearch("new york", redirect=False) 216 | responses[site.api_url]["opensearch_new_york_redirect"] = res 217 | res = site.opensearch("new york", results=5, redirect=False) 218 | responses[site.api_url]["opensearch_new_york_result_redirect"] = res 219 | 220 | print("Completed pulling open searches") 221 | 222 | if PULL_ALL is True or PULL_PREFIXSEARCH is True: 223 | responses[site.api_url]["prefixsearch_ar"] = site.prefixsearch("ar") 224 | responses[site.api_url]["prefixsearch_ba"] = site.prefixsearch("ba") 225 | res = site.prefixsearch("ba", results=5) 226 | responses[site.api_url]["prefixsearch_ba_5"] = res 227 | res = site.prefixsearch("ba", results=30) 228 | responses[site.api_url]["prefixsearch_ba_30"] = res 229 | 230 | print("Completed pulling prefix searches") 231 | 232 | if PULL_ALL is True or PULL_GEOSEARCH is True: 233 | res = site.geosearch(latitude=Decimal("0.0"), longitude=Decimal("0.0")) 234 | responses[site.api_url]["geosearch_decimals"] = res 235 | res = site.geosearch(latitude=Decimal("0.0"), longitude=0.0) 236 | responses[site.api_url]["geosearch_mix_types"] = res 237 | res = site.geosearch( 238 | title="new york city", latitude=Decimal("-9999999999.999"), longitude=Decimal("0.0"), results=22, radius=10000 239 | ) 240 | responses[site.api_url]["geosearch_page_invalid_lat_long"] = res 241 | res = site.geosearch(title="new york city", results=22, radius=10000) 242 | responses[site.api_url]["geosearch_page_radius_results_set"] = res 243 | res = site.geosearch(title="new york city", radius=10000) 244 | responses[site.api_url]["geosearch_page_radius_results"] = res 245 | res = site.geosearch(title="new york city") 246 | responses[site.api_url]["geosearch_page"] = res 247 | try: 248 | site.geosearch(latitude=None, longitude=Decimal("0.0"), results=22, radius=10000) 249 | except ValueError as ex: 250 | responses[site.api_url]["invalid_lat_long_value_msg"] = str(ex) 251 | try: 252 | site.geosearch(latitude=Decimal("-9999999999.999"), longitude=Decimal("0.0"), results=22, radius=10000) 253 | except MediaWikiGeoCoordError as ex: 254 | responses[site.api_url]["invalid_lat_long_geo_msg"] = ex.message 255 | 256 | print("Completed pulling geo search") 257 | 258 | if PULL_ALL is True or PULL_CATEGORYMEMBERS is True: 259 | res = site.categorymembers("Chess", results=15, subcategories=True) 260 | responses[site.api_url]["category_members_with_subcategories"] = res 261 | res = site.categorymembers("Chess", results=15, subcategories=False) 262 
| responses[site.api_url]["category_members_without_subcategories"] = res 263 | res = site.categorymembers("Chess", results=5, subcategories=False) 264 | responses[site.api_url]["category_members_without_subcategories_5"] = res 265 | res = site.categorymembers("Disambiguation categories", results=None) 266 | responses[site.api_url]["category_members_very_large"] = res 267 | 268 | print("Completed pulling category members") 269 | 270 | if PULL_ALL is True or PULL_CATEGORYTREE is True: 271 | site.rate_limit = True 272 | ct = site.categorytree(["Chess", "Ebola"], depth=None) # type: ignore 273 | with open(CATTREE_FILE, "w") as fp: 274 | json.dump(ct, fp, ensure_ascii=False, sort_keys=True) 275 | 276 | try: 277 | site.categorytree("Chess Ebola", depth=None) # type: ignore 278 | except Exception as ex: 279 | responses[site.api_url]["missing_categorytree"] = str(ex) 280 | site.rate_limit = False 281 | 282 | print("Completed pulling category tree") 283 | 284 | if PULL_ALL is True or PULL_SUMMARY is True: 285 | res = site.summary("chess", chars=50) 286 | responses[site.api_url]["summarize_chars_50"] = res 287 | res = site.summary("chess", sentences=5) 288 | responses[site.api_url]["summarize_sent_5"] = res 289 | res = site.summary("chess") 290 | responses[site.api_url]["summarize_first_paragraph"] = res 291 | 292 | print("Completed pulling summaries") 293 | 294 | if PULL_ALL is True or PULL_PAGE_ERRORS is True: 295 | try: 296 | site.page("gobbilygook") 297 | except PageError as ex: 298 | responses[site.api_url]["page_error_msg"] = ex.message 299 | 300 | try: 301 | site.page("gobbilygook", auto_suggest=False) 302 | except PageError as ex: 303 | responses[site.api_url]["page_error_msg_title"] = ex.message 304 | 305 | try: 306 | site.page(pageid=-1) 307 | except PageError as ex: 308 | responses[site.api_url]["page_error_msg_pageid"] = ex.message 309 | 310 | print("Completed pulling page errors") 311 | 312 | if PULL_ALL is True or PULL_DISAMBIGUATION_ERRORS is True: 313 | try: 314 | site.page("bush") 315 | except DisambiguationError as ex: 316 | responses[site.api_url]["disambiguation_error_msg"] = ex.message 317 | 318 | try: 319 | site.page("Oasis") 320 | except DisambiguationError as ex: 321 | msg = ex.message 322 | responses[site.api_url]["disambiguation_error_msg_with_empty"] = msg 323 | 324 | print("Completed pulling disambiguation errors") 325 | 326 | if PULL_ALL is True or PULL_API_URL_ERROR is True: 327 | url = "https://french.wikipedia.org/w/api.php" 328 | try: 329 | site.set_api_url(api_url=url, lang="fr") 330 | except MediaWikiAPIURLError as ex: 331 | responses[site.api_url]["api_url_error_msg"] = ex.message 332 | 333 | # this shouldn't be necessary since it should go back to the original 334 | # values 335 | site.set_api_url(api_url="https://en.wikipedia.org/w/api.php", lang="en") 336 | print("Completed pulling api url errors") 337 | 338 | if PULL_ALL is True or PULL_REDIRECT_ERROR is True: 339 | # print('Start redirect error') 340 | try: 341 | asoiaf.page("arya", auto_suggest=False, redirect=False) 342 | except RedirectError as ex: 343 | responses[asoiaf.api_url]["redirect_error_msg"] = ex.message 344 | 345 | print("Completed pulling redirect errors") 346 | 347 | 348 | if PULL_ALL is True or PULL_PAGES is True: 349 | # unicode 350 | site.page("Jacques Léonard Muller") 351 | # page id and wikitext 352 | p = site.page(pageid=24337758, auto_suggest=False) 353 | responses["bpp-complexity_wikitext"] = p.wikitext 354 | 355 | # coordinates 356 | p = site.page("Washington Monument") 357 | coords 
= p.coordinates 358 | responses[site.api_url]["wash_mon"] = [str(coords[0]), str(coords[1])] 359 | 360 | # page properties 361 | 362 | # arya 363 | pg = asoiaf.page("arya") 364 | responses[asoiaf.api_url]["arya"] = dict() 365 | responses[asoiaf.api_url]["arya"]["title"] = pg.title 366 | responses[asoiaf.api_url]["arya"]["pageid"] = pg.pageid 367 | responses[asoiaf.api_url]["arya"]["revision_id"] = pg.revision_id 368 | responses[asoiaf.api_url]["arya"]["parent_id"] = pg.parent_id 369 | responses[asoiaf.api_url]["arya"]["content"] = pg.content 370 | responses[asoiaf.api_url]["arya"]["url"] = pg.url 371 | # other properties 372 | responses[asoiaf.api_url]["arya"]["backlinks"] = pg.backlinks 373 | responses[asoiaf.api_url]["arya"]["images"] = pg.images 374 | responses[asoiaf.api_url]["arya"]["redirects"] = pg.redirects 375 | responses[asoiaf.api_url]["arya"]["links"] = pg.links 376 | responses[asoiaf.api_url]["arya"]["categories"] = pg.categories 377 | responses[asoiaf.api_url]["arya"]["references"] = pg.references 378 | responses[asoiaf.api_url]["arya"]["content"] = pg.content 379 | responses[asoiaf.api_url]["arya"]["parent_id"] = pg.parent_id 380 | responses[asoiaf.api_url]["arya"]["revision_id"] = pg.revision_id 381 | responses[asoiaf.api_url]["arya"]["coordinates"] = pg.coordinates 382 | responses[asoiaf.api_url]["arya"]["summary"] = pg.summary 383 | responses[asoiaf.api_url]["arya"]["sections"] = pg.sections 384 | res = pg.section("A Game of Thrones") 385 | responses[asoiaf.api_url]["arya"]["section_a_game_of_thrones"] = res 386 | res = pg.section("External links") 387 | responses[asoiaf.api_url]["arya"]["last_section"] = res 388 | responses[asoiaf.api_url]["arya"]["html"] = pg.html 389 | 390 | # jon snow 391 | pg = asoiaf.page("jon snow") 392 | responses[asoiaf.api_url]["jon-snow"] = dict() 393 | responses[asoiaf.api_url]["jon-snow"]["title"] = pg.title 394 | responses[asoiaf.api_url]["jon-snow"]["pageid"] = pg.pageid 395 | responses[asoiaf.api_url]["jon-snow"]["revision_id"] = pg.revision_id 396 | responses[asoiaf.api_url]["jon-snow"]["parent_id"] = pg.parent_id 397 | responses[asoiaf.api_url]["jon-snow"]["content"] = pg.content 398 | responses[asoiaf.api_url]["jon-snow"]["url"] = pg.url 399 | 400 | # castos 401 | pg = asoiaf.page("Castos") 402 | responses[asoiaf.api_url]["castos"] = dict() 403 | res = pg.section("References and Notes") 404 | responses[asoiaf.api_url]["castos"]["section"] = res 405 | 406 | # other pages as they will be in the response object 407 | asoiaf.page("arya", auto_suggest=False) 408 | 409 | # lang links property (standard wikipedia) 410 | pg = site.page("Nobel Prize in Chemistry") 411 | responses[site.api_url]["nobel_chemistry"] = dict() 412 | responses[site.api_url]["nobel_chemistry"]["langlinks"] = pg.langlinks 413 | 414 | print("Completed pulling pages and properties") 415 | 416 | 417 | if PULL_ALL is True or PULL_LOGOS is True: 418 | # single logo 419 | res = wikipedia.page("Chess").logos 420 | responses[wikipedia.api_url]["chess_logos"] = res 421 | # multiple logos 422 | res = wikipedia.page("Sony Music").logos 423 | responses[wikipedia.api_url]["sony_music_logos"] = res 424 | # no infobox 425 | res = wikipedia.page("Antivirus Software").logos 426 | responses[wikipedia.api_url]["antivirus_software_logos"] = res 427 | 428 | print("Completed pulling logos") 429 | 430 | 431 | if PULL_ALL is True or PULL_PREVIEWS is True: 432 | res = wikipedia.page("Chess").preview 433 | responses[wikipedia.api_url]["chess_preview"] = res 434 | 435 | print("Completed pulling 
previews") 436 | 437 | 438 | if PULL_ALL is True or PULL_HATNOTES is True: 439 | # contains hatnotes 440 | res = wikipedia.page("Chess").hatnotes 441 | responses[wikipedia.api_url]["chess_hatnotes"] = res 442 | # no hatnotes 443 | page_name = "List of Battlestar Galactica (1978 TV series) and " "Galactica 1980 episodes" 444 | res = wikipedia.page(page_name).hatnotes 445 | responses[wikipedia.api_url]["page_no_hatnotes"] = res 446 | 447 | print("Completed pulling hat notes") 448 | 449 | if PULL_ALL is True or PULL_SECTION_LINKS is True: 450 | # contains external links 451 | pg = wikipedia.page("""McDonald's""") 452 | res = pg.parse_section_links("EXTERNAL LINKS") 453 | responses[wikipedia.api_url]["mcy_ds_external_links"] = res 454 | 455 | res = pg.parse_section_links(None) 456 | responses[wikipedia.api_url]["mcy_ds_external_links_none"] = res 457 | 458 | # doesn't contain external links 459 | pg = wikipedia.page("Tropical rainforest conservation") 460 | res = pg.parse_section_links("EXTERNAL LINKS") 461 | responses[wikipedia.api_url]["page_no_sec_links"] = res 462 | 463 | pg = asoiaf.page("arya") 464 | for section in pg.sections: 465 | links = pg.parse_section_links(section) 466 | responses[asoiaf.api_url]["arya_{}_links".format(section)] = links 467 | 468 | print("Completed pulling the section links") 469 | 470 | if PULL_ALL is True or PULL_TABLE_OF_CONTENTS is True: 471 | pg = wikipedia.page("New York City") 472 | res = pg.sections 473 | responses[wikipedia.api_url]["new_york_city_sections"] = res 474 | res = pg.table_of_contents 475 | responses[wikipedia.api_url]["new_york_city_toc"] = res 476 | responses[wikipedia.api_url]["new_york_city_air_quality"] = pg.section("Air quality") 477 | responses[wikipedia.api_url]["new_york_city_none"] = pg.section(None) 478 | responses[wikipedia.api_url]["new_york_city_last_sec"] = pg.section("External links") 479 | print("Completed pulling Table of Content data") 480 | 481 | if PULL_ALL is True or PULL_LOGIN is True: 482 | pg = wikipedia.login(username="badusername", password="fakepassword") 483 | print("Completed pulling login") 484 | 485 | 486 | if PULL_ALL is True or PULL_ISSUE_14 is True: 487 | res = site.page("One Two Three... Infinity").images 488 | responses[wikipedia.api_url]["hidden_images"] = res 489 | 490 | # missing http got lumped into this issue... 
491 | page = site.page("Minneapolis") 492 | responses[site.api_url]["references_without_http"] = page.references 493 | 494 | print("Completed pulling issue 14") 495 | 496 | if PULL_ALL is True or PULL_ISSUE_15 is True: 497 | res = site.page("Rober Eryol").images 498 | responses[wikipedia.api_url]["infinite_loop_images"] = res 499 | res = site.page("List of named minor planets (numerical)").links 500 | responses[wikipedia.api_url]["large_continued_query"] = res 501 | res = wikipedia.page("B8 polytope").images 502 | responses[wikipedia.api_url]["large_continued_query_images"] = res 503 | 504 | print("Completed pulling issue 15") 505 | 506 | if PULL_ALL is True or PULL_ISSUE_35 is True: 507 | try: 508 | site.page("Leaching") 509 | except DisambiguationError as ex: 510 | responses[wikipedia.api_url]["missing_title_disamb_dets"] = ex.details 511 | responses[wikipedia.api_url]["missing_title_disamb_msg"] = str(ex) 512 | 513 | print("Completed pulling issue 35") 514 | 515 | if PULL_ALL is True or PULL_ISSUE_39 is True: 516 | res = plants.categorymembers("Plant", results=None, subcategories=False) 517 | responses[plants.api_url]["query-continue-find"] = res 518 | 519 | print("Completed pulling issue 39") 520 | 521 | # dump data to file 522 | with open(RESPONSES_FILE, "w") as mock: 523 | json.dump(responses, mock, ensure_ascii=False, indent=1, sort_keys=True) 524 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | 3 | # Analyse import fallback blocks. This can be used to support both Python 2 and 4 | # 3 compatible code, which means that the block might have code that exists 5 | # only in one or another interpreter, leading to false positives when analysed. 6 | analyse-fallback-blocks=no 7 | 8 | # Load and enable all available extensions. Use --list-extensions to see a list 9 | # all available extensions. 10 | #enable-all-extensions= 11 | 12 | # In error mode, messages with a category besides ERROR or FATAL are 13 | # suppressed, and no reports are done by default. Error mode is compatible with 14 | # disabling specific errors. 15 | #errors-only= 16 | 17 | # Always return a 0 (non-error) status code, even if lint errors are found. 18 | # This is primarily useful in continuous integration scripts. 19 | #exit-zero= 20 | 21 | # A comma-separated list of package or module names from where C extensions may 22 | # be loaded. Extensions are loading into the active Python interpreter and may 23 | # run arbitrary code. 24 | extension-pkg-allow-list= 25 | 26 | # A comma-separated list of package or module names from where C extensions may 27 | # be loaded. Extensions are loading into the active Python interpreter and may 28 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 29 | # for backward compatibility.) 30 | extension-pkg-whitelist= 31 | 32 | # Return non-zero exit code if any of these messages/categories are detected, 33 | # even if score is above --fail-under value. Syntax same as enable. Messages 34 | # specified are enabled, while categories only check already-enabled messages. 35 | fail-on= 36 | 37 | # Specify a score threshold to be exceeded before program exits with error. 38 | fail-under=10 39 | 40 | # Interpret the stdin as a python script, whose filename needs to be passed as 41 | # the module_or_package argument. 42 | #from-stdin= 43 | 44 | # Files or directories to be skipped. They should be base names, not paths. 
45 | ignore=CVS 46 | 47 | # Add files or directories matching the regex patterns to the ignore-list. The 48 | # regex matches against paths and can be in Posix or Windows format. 49 | ignore-paths= 50 | 51 | # Files or directories matching the regex patterns are skipped. The regex 52 | # matches against base names, not paths. The default value ignores Emacs file 53 | # locks 54 | ignore-patterns=^\.# 55 | 56 | # List of module names for which member attributes should not be checked 57 | # (useful for modules/projects where namespaces are manipulated during runtime 58 | # and thus existing member attributes cannot be deduced by static analysis). It 59 | # supports qualified module names, as well as Unix pattern matching. 60 | ignored-modules= 61 | 62 | # Python code to execute, usually for sys.path manipulation such as 63 | # pygtk.require(). 64 | #init-hook= 65 | 66 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 67 | # number of processors available to use, and will cap the count on Windows to 68 | # avoid hangs. 69 | jobs=1 70 | 71 | # Control the amount of potential inferred values when inferring a single 72 | # object. This can help the performance when dealing with large functions or 73 | # complex, nested conditions. 74 | limit-inference-results=100 75 | 76 | # List of plugins (as comma separated values of python module names) to load, 77 | # usually to register additional checkers. 78 | load-plugins= 79 | 80 | # Pickle collected data for later comparisons. 81 | persistent=yes 82 | 83 | # Minimum Python version to use for version dependent checks. Will default to 84 | # the version used to run pylint. 85 | py-version=3.10 86 | 87 | # Discover python modules and packages in the file system subtree. 88 | recursive=no 89 | 90 | # When enabled, pylint would attempt to guess common misconfiguration and emit 91 | # user-friendly hints instead of false-positive error messages. 92 | suggestion-mode=yes 93 | 94 | # Allow loading of arbitrary C extensions. Extensions are imported into the 95 | # active Python interpreter and may run arbitrary code. 96 | unsafe-load-any-extension=no 97 | 98 | # In verbose mode, extra non-checker-related info will be displayed. 99 | #verbose= 100 | 101 | 102 | [REPORTS] 103 | 104 | # Python expression which should return a score less than or equal to 10. You 105 | # have access to the variables 'fatal', 'error', 'warning', 'refactor', 106 | # 'convention', and 'info' which contain the number of messages in each 107 | # category, as well as 'statement' which is the total number of statements 108 | # analyzed. This score is used by the global evaluation report (RP0004). 109 | evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) 110 | 111 | # Template used to display messages. This is a python new-style format string 112 | # used to format the message information. See doc for all details. 113 | msg-template= 114 | 115 | # Set the output format. Available formats are text, parseable, colorized, json 116 | # and msvs (visual studio). You can also give a reporter class, e.g. 117 | # mypackage.mymodule.MyReporterClass. 118 | # output-format=text 119 | 120 | # Tells whether to display a full report or only the messages. 121 | reports=yes 122 | 123 | # Activate the evaluation score. 124 | score=yes 125 | 126 | 127 | [MESSAGES CONTROL] 128 | 129 | # Only show warnings with the listed confidence levels. Leave empty to show 130 | # all. 
130 | # all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
131 | # UNDEFINED.
132 | confidence=HIGH,
133 |            CONTROL_FLOW,
134 |            INFERENCE,
135 |            INFERENCE_FAILURE,
136 |            UNDEFINED
137 | 
138 | # Disable the message, report, category or checker with the given id(s). You
139 | # can either give multiple identifiers separated by comma (,) or put this
140 | # option multiple times (only on the command line, not in the configuration
141 | # file where it should appear only once). You can also use "--disable=all" to
142 | # disable everything first and then re-enable specific checks. For example, if
143 | # you want to run only the similarities checker, you can use "--disable=all
144 | # --enable=similarities". If you want to run only the classes checker, but have
145 | # no Warning level messages displayed, use "--disable=all --enable=classes
146 | # --disable=W".
147 | disable=raw-checker-failed,
148 |         bad-inline-option,
149 |         locally-disabled,
150 |         file-ignored,
151 |         suppressed-message,
152 |         useless-suppression,
153 |         deprecated-pragma,
154 |         use-symbolic-message-instead,
155 |         too-many-arguments,
156 |         protected-access
157 | 
158 | # Enable the message, report, category or checker with the given id(s). You can
159 | # either give multiple identifiers separated by comma (,) or put this option
160 | # multiple times (only on the command line, not in the configuration file where
161 | # it should appear only once). See also the "--disable" option for examples.
162 | enable=c-extension-no-member
163 | 
164 | 
165 | [DESIGN]
166 | 
167 | # List of regular expressions of class ancestor names to ignore when counting
168 | # public methods (see R0903)
169 | exclude-too-few-public-methods=
170 | 
171 | # List of qualified class names to ignore when counting class parents (see
172 | # R0901)
173 | ignored-parents=
174 | 
175 | # Maximum number of arguments for function / method.
176 | # Default = 5
177 | max-args=12
178 | 
179 | # Maximum number of attributes for a class (see R0902).
180 | max-attributes=35
181 | 
182 | # Maximum number of boolean expressions in an if statement (see R0916).
183 | max-bool-expr=5
184 | 
185 | # Maximum number of branches for function / method body (see R0912)
186 | max-branches=15
187 | 
188 | # Maximum number of locals for function / method body.
189 | max-locals=20
190 | 
191 | # Maximum number of parents for a class (see R0901).
192 | max-parents=7
193 | 
194 | # Maximum number of public methods for a class (see R0904).
195 | max-public-methods=40
196 | 
197 | # Maximum number of return / yield for function / method body.
198 | max-returns=6
199 | 
200 | # Maximum number of statements in function / method body.
201 | max-statements=50
202 | 
203 | # Minimum number of public methods for a class (see R0903).
204 | min-public-methods=2
205 | 
206 | 
207 | [MISCELLANEOUS]
208 | 
209 | # List of note tags to take in consideration, separated by a comma.
210 | notes=FIXME,
211 |       XXX,
212 |       TODO
213 | 
214 | # Regular expression of note tags to take in consideration.
215 | notes-rgx=
216 | 
217 | 
218 | [SPELLING]
219 | 
220 | # Limits count of emitted suggestions for spelling mistakes.
221 | max-spelling-suggestions=4
222 | 
223 | # Spelling dictionary name. Available dictionaries: none. To make it work,
224 | # install the 'python-enchant' package.
225 | spelling-dict=
226 | 
227 | # List of comma separated words that should be considered directives if they
228 | # appear at the beginning of a comment and should not be checked.
229 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
230 | 
231 | # List of comma separated words that should not be checked.
232 | spelling-ignore-words=
233 | 
234 | # A path to a file that contains the private dictionary; one word per line.
235 | spelling-private-dict-file=
236 | 
237 | # Tells whether to store unknown words to the private dictionary (see the
238 | # --spelling-private-dict-file option) instead of raising a message.
239 | spelling-store-unknown-words=no
240 | 
241 | 
242 | [FORMAT]
243 | 
244 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
245 | expected-line-ending-format=
246 | 
247 | # Regexp for a line that is allowed to be longer than the limit.
248 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
249 | 
250 | # Number of spaces of indent required inside a hanging or continued line.
251 | indent-after-paren=4
252 | 
253 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
254 | # tab).
255 | indent-string='    '
256 | 
257 | # Maximum number of characters on a single line.
258 | # Default = 100
259 | max-line-length=120
260 | 
261 | # Maximum number of lines in a module.
262 | # Default = 1000
263 | max-module-lines=1250
264 | 
265 | # Allow the body of a class to be on the same line as the declaration if body
266 | # contains single statement.
267 | single-line-class-stmt=no
268 | 
269 | # Allow the body of an if to be on the same line as the test if there is no
270 | # else.
271 | single-line-if-stmt=no
272 | 
273 | 
274 | [REFACTORING]
275 | 
276 | # Maximum number of nested blocks for function / method body
277 | max-nested-blocks=5
278 | 
279 | # Complete name of functions that never return. When checking for
280 | # inconsistent-return-statements, if a never returning function is called then
281 | # it will be considered as an explicit return statement and no message will be
282 | # printed.
283 | never-returning-functions=sys.exit,argparse.parse_error
284 | 
285 | 
286 | [STRING]
287 | 
288 | # This flag controls whether inconsistent-quotes generates a warning when the
289 | # character used as a quote delimiter is used inconsistently within a module.
290 | check-quote-consistency=no
291 | 
292 | # This flag controls whether the implicit-str-concat should generate a warning
293 | # on implicit string concatenation in sequences defined over several lines.
294 | check-str-concat-over-line-jumps=no
295 | 
296 | 
297 | [VARIABLES]
298 | 
299 | # List of additional names supposed to be defined in builtins. Remember that
300 | # you should avoid defining new builtins when possible.
301 | additional-builtins=
302 | 
303 | # Tells whether unused global variables should be treated as a violation.
304 | allow-global-unused-variables=yes
305 | 
306 | # List of names allowed to shadow builtins
307 | allowed-redefined-builtins=
308 | 
309 | # List of strings which can identify a callback function by name. A callback
310 | # name must start or end with one of those strings.
311 | callbacks=cb_,
312 |           _cb
313 | 
314 | # A regular expression matching the name of dummy variables (i.e. expected to
315 | # not be used).
316 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
317 | 
318 | # Argument names that match this expression will be ignored. Defaults to names
319 | # with a leading underscore.
320 | ignored-argument-names=_.*|^ignored_|^unused_
321 | 
322 | # Tells whether we should check for unused import in __init__ files.
323 | init-import=no
324 | 
325 | # List of qualified module names which can have objects that can redefine
326 | # builtins.
327 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
328 | 
329 | 
330 | [TYPECHECK]
331 | 
332 | # List of decorators that produce context managers, such as
333 | # contextlib.contextmanager. Add to this list to register other decorators that
334 | # produce valid context managers.
335 | contextmanager-decorators=contextlib.contextmanager
336 | 
337 | # List of members which are set dynamically and missed by pylint inference
338 | # system, and so shouldn't trigger E1101 when accessed. Python regular
339 | # expressions are accepted.
340 | generated-members=
341 | 
342 | # Tells whether to warn about missing members when the owner of the attribute
343 | # is inferred to be None.
344 | ignore-none=yes
345 | 
346 | # This flag controls whether pylint should warn about no-member and similar
347 | # checks whenever an opaque object is returned when inferring. The inference
348 | # can return multiple potential results while evaluating a Python object, but
349 | # some branches might not be evaluated, which results in partial inference. In
350 | # that case, it might be useful to still emit no-member and other checks for
351 | # the rest of the inferred objects.
352 | ignore-on-opaque-inference=yes
353 | 
354 | # List of symbolic message names to ignore for Mixin members.
355 | ignored-checks-for-mixins=no-member,
356 |                           not-async-context-manager,
357 |                           not-context-manager,
358 |                           attribute-defined-outside-init
359 | 
360 | # List of class names for which member attributes should not be checked (useful
361 | # for classes with dynamically set attributes). This supports the use of
362 | # qualified names.
363 | ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
364 | 
365 | # Show a hint with possible names when a member name was not found. The aspect
366 | # of finding the hint is based on edit distance.
367 | missing-member-hint=yes
368 | 
369 | # The minimum edit distance a name should have in order to be considered a
370 | # similar match for a missing member name.
371 | missing-member-hint-distance=1
372 | 
373 | # The total number of similar names that should be taken in consideration when
374 | # showing a hint for a missing member.
375 | missing-member-max-choices=1
376 | 
377 | # Regex pattern to define which classes are considered mixins.
378 | mixin-class-rgx=.*[Mm]ixin
379 | 
380 | # List of decorators that change the signature of a decorated function.
381 | signature-mutators=
382 | 
383 | 
384 | [CLASSES]
385 | 
386 | # Warn about protected attribute access inside special methods
387 | check-protected-access-in-special-methods=no
388 | 
389 | # List of method names used to declare (i.e. assign) instance attributes.
390 | defining-attr-methods=__init__,
391 |                       __new__,
392 |                       setUp,
393 |                       __post_init__
394 | 
395 | # List of member names, which should be excluded from the protected access
396 | # warning.
397 | exclude-protected=_asdict,
398 |                   _fields,
399 |                   _replace,
400 |                   _source,
401 |                   _make
402 | 
403 | # List of valid names for the first argument in a class method.
404 | valid-classmethod-first-arg=cls
405 | 
406 | # List of valid names for the first argument in a metaclass class method.
407 | valid-metaclass-classmethod-first-arg=cls
408 | 
409 | 
410 | [IMPORTS]
411 | 
412 | # List of modules that can be imported at any level, not just the top level
413 | # one.
414 | allow-any-import-level=
415 | 
416 | # Allow wildcard imports from modules that define __all__.
417 | allow-wildcard-with-all=no
418 | 
419 | # Deprecated modules which should not be used, separated by a comma.
420 | deprecated-modules=
421 | 
422 | # Output a graph (.gv or any supported image format) of external dependencies
423 | # to the given file (report RP0402 must not be disabled).
424 | ext-import-graph=
425 | 
426 | # Output a graph (.gv or any supported image format) of all (i.e. internal and
427 | # external) dependencies to the given file (report RP0402 must not be
428 | # disabled).
429 | import-graph=
430 | 
431 | # Output a graph (.gv or any supported image format) of internal dependencies
432 | # to the given file (report RP0402 must not be disabled).
433 | int-import-graph=
434 | 
435 | # Force import order to recognize a module as part of the standard
436 | # compatibility libraries.
437 | known-standard-library=
438 | 
439 | # Force import order to recognize a module as part of a third party library.
440 | known-third-party=enchant
441 | 
442 | # Couples of modules and preferred modules, separated by a comma.
443 | preferred-modules=
444 | 
445 | 
446 | [SIMILARITIES]
447 | 
448 | # Comments are removed from the similarity computation
449 | ignore-comments=yes
450 | 
451 | # Docstrings are removed from the similarity computation
452 | ignore-docstrings=yes
453 | 
454 | # Imports are removed from the similarity computation
455 | ignore-imports=yes
456 | 
457 | # Signatures are removed from the similarity computation
458 | ignore-signatures=yes
459 | 
460 | # Minimum lines number of a similarity.
461 | min-similarity-lines=4
462 | 
463 | 
464 | [BASIC]
465 | 
466 | # Naming style matching correct argument names.
467 | argument-naming-style=snake_case
468 | 
469 | # Regular expression matching correct argument names. Overrides argument-
470 | # naming-style. If left empty, argument names will be checked with the set
471 | # naming style.
472 | #argument-rgx=
473 | 
474 | # Naming style matching correct attribute names.
475 | attr-naming-style=snake_case
476 | 
477 | # Regular expression matching correct attribute names. Overrides attr-naming-
478 | # style. If left empty, attribute names will be checked with the set naming
479 | # style.
480 | #attr-rgx=
481 | 
482 | # Bad variable names which should always be refused, separated by a comma.
483 | bad-names=foo,
484 |           bar,
485 |           baz,
486 |           toto,
487 |           tutu,
488 |           tata
489 | 
490 | # Bad variable names regexes, separated by a comma. If names match any regex,
491 | # they will always be refused
492 | bad-names-rgxs=
493 | 
494 | # Naming style matching correct class attribute names.
495 | class-attribute-naming-style=any
496 | 
497 | # Regular expression matching correct class attribute names. Overrides class-
498 | # attribute-naming-style. If left empty, class attribute names will be checked
499 | # with the set naming style.
500 | #class-attribute-rgx=
501 | 
502 | # Naming style matching correct class constant names.
503 | class-const-naming-style=UPPER_CASE
504 | 
505 | # Regular expression matching correct class constant names. Overrides class-
506 | # const-naming-style. If left empty, class constant names will be checked with
507 | # the set naming style.
508 | #class-const-rgx=
509 | 
510 | # Naming style matching correct class names.
511 | class-naming-style=PascalCase
512 | 
513 | # Regular expression matching correct class names. Overrides class-naming-
514 | # style. If left empty, class names will be checked with the set naming style.
515 | #class-rgx=
516 | 
517 | # Naming style matching correct constant names.
518 | const-naming-style=UPPER_CASE
519 | 
520 | # Regular expression matching correct constant names. Overrides const-naming-
521 | # style. If left empty, constant names will be checked with the set naming
522 | # style.
523 | #const-rgx=
524 | 
525 | # Minimum line length for functions/classes that require docstrings, shorter
526 | # ones are exempt.
527 | docstring-min-length=-1
528 | 
529 | # Naming style matching correct function names.
530 | function-naming-style=snake_case
531 | 
532 | # Regular expression matching correct function names. Overrides function-
533 | # naming-style. If left empty, function names will be checked with the set
534 | # naming style.
535 | #function-rgx=
536 | 
537 | # Good variable names which should always be accepted, separated by a comma.
538 | good-names=i,
539 |            j,
540 |            k,
541 |            ex,
542 |            Run,
543 |            _
544 | 
545 | # Good variable names regexes, separated by a comma. If names match any regex,
546 | # they will always be accepted
547 | good-names-rgxs=
548 | 
549 | # Include a hint for the correct naming format with invalid-name.
550 | include-naming-hint=no
551 | 
552 | # Naming style matching correct inline iteration names.
553 | inlinevar-naming-style=any
554 | 
555 | # Regular expression matching correct inline iteration names. Overrides
556 | # inlinevar-naming-style. If left empty, inline iteration names will be checked
557 | # with the set naming style.
558 | #inlinevar-rgx=
559 | 
560 | # Naming style matching correct method names.
561 | method-naming-style=snake_case
562 | 
563 | # Regular expression matching correct method names. Overrides method-naming-
564 | # style. If left empty, method names will be checked with the set naming style.
565 | #method-rgx=
566 | 
567 | # Naming style matching correct module names.
568 | module-naming-style=snake_case
569 | 
570 | # Regular expression matching correct module names. Overrides module-naming-
571 | # style. If left empty, module names will be checked with the set naming style.
572 | #module-rgx=
573 | 
574 | # Colon-delimited sets of names that determine each other's naming style when
575 | # the name regexes allow several styles.
576 | name-group=
577 | 
578 | # Regular expression which should only match function or class names that do
579 | # not require a docstring.
580 | no-docstring-rgx=^_
581 | 
582 | # List of decorators that produce properties, such as abc.abstractproperty. Add
583 | # to this list to register other decorators that produce valid properties.
584 | # These decorators are taken in consideration only for invalid-name.
585 | property-classes=abc.abstractproperty
586 | 
587 | # Regular expression matching correct type variable names. If left empty, type
588 | # variable names will be checked with the set naming style.
589 | #typevar-rgx=
590 | 
591 | # Naming style matching correct variable names.
592 | variable-naming-style=snake_case
593 | 
594 | # Regular expression matching correct variable names. Overrides variable-
595 | # naming-style. If left empty, variable names will be checked with the set
596 | # naming style.
597 | #variable-rgx=
598 | 
599 | 
600 | [EXCEPTIONS]
601 | 
602 | # Exceptions that will emit a warning when caught.
603 | overgeneral-exceptions=mediawiki.exceptions.BaseException,builtins.Exception
604 | 
605 | 
606 | [LOGGING]
607 | 
608 | # The type of string formatting that logging methods do. `old` means using %
609 | # formatting, `new` is for `{}` formatting.
610 | logging-format-style=old
611 | 
612 | # Logging modules to check that the string format arguments are in logging
613 | # function parameter format.
614 | logging-modules=logging
615 | 
--------------------------------------------------------------------------------
/mediawiki/mediawikipage.py:
--------------------------------------------------------------------------------
1 | """
2 | MediaWikiPage class module
3 | """
4 | 
5 | # MIT License
6 | # Author: Tyler Barrus (barrust@gmail.com)
7 | 
8 | import re
9 | from collections import OrderedDict
10 | from decimal import Decimal
11 | from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
12 | 
13 | from bs4 import BeautifulSoup, NavigableString, Tag
14 | 
15 | from mediawiki.exceptions import (
16 |     ODD_ERROR_MESSAGE,
17 |     DisambiguationError,
18 |     MediaWikiBaseException,
19 |     MediaWikiException,
20 |     PageError,
21 |     RedirectError,
22 | )
23 | from mediawiki.utilities import is_relative_url, str_or_unicode
24 | 
25 | 
26 | class MediaWikiPage:
27 |     """MediaWiki Page Instance
28 | 
29 |     Args:
30 |         mediawiki (MediaWiki): MediaWiki class object from which to pull
31 |         title (str): Title of page to retrieve
32 |         pageid (int): MediaWiki site pageid to retrieve
33 |         redirect (bool): **True:** Follow redirects
34 |         preload (bool): **True:** Load most properties after getting page
35 |         original_title (str): Not to be used from the caller; used to help follow redirects
36 |     Raises:
37 |         :py:func:`mediawiki.exceptions.PageError`: if page provided does not exist
38 |     Raises:
39 |         :py:func:`mediawiki.exceptions.DisambiguationError`: if page provided is a disambiguation page
40 |     Raises:
41 |         :py:func:`mediawiki.exceptions.RedirectError`: if redirect is **False** and the pageid or title \
42 |             provided redirects to another page
43 |     Warning:
44 |         This should never need to be used directly! Please use :func:`mediawiki.MediaWiki.page`"""
45 | 
46 |     __slots__ = [
47 |         "mediawiki",
48 |         "url",
49 |         "title",
50 |         "original_title",
51 |         "pageid",
52 |         "_content",
53 |         "_revision_id",
54 |         "_parent_id",
55 |         "_html",
56 |         "_soup",
57 |         "_images",
58 |         "_references",
59 |         "_categories",
60 |         "_coordinates",
61 |         "_links",
62 |         "_redirects",
63 |         "_backlinks",
64 |         "_langlinks",
65 |         "_summary",
66 |         "_sections",
67 |         "_table_of_contents",
68 |         "_logos",
69 |         "_hatnotes",
70 |         "_wikitext",
71 |         "_preview",
72 |     ]
73 | 
74 |     def __init__(
75 |         self,
76 |         mediawiki,
77 |         title: Optional[str] = None,
78 |         pageid: Optional[int] = None,
79 |         redirect: bool = True,
80 |         preload: bool = False,
81 |         original_title: str = "",
82 |     ):
83 |         self.mediawiki = mediawiki
84 |         self.url: Optional[str] = None
85 |         if title is not None:
86 |             self.title = title
87 |             self.original_title = original_title or title
88 |         elif pageid is not None:
89 |             self.pageid = pageid
90 |         else:
91 |             raise ValueError("Either a title or a pageid must be specified")
92 | 
93 |         self._content: Optional[str] = None
94 |         self._revision_id: Optional[int] = None
95 |         self._parent_id: Optional[int] = None
96 |         self._html: Union[bool, str] = False  # None signifies nothing returned...
97 |         self._images: Optional[List[str]] = None
98 |         self._references: Optional[List[str]] = None
99 |         self._categories: Optional[List[str]] = None
100 |         self._coordinates: Union[bool, None, Tuple[Decimal, Decimal]] = False  # None signifies nothing returned...
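        # The lazily-loaded attributes here use two different "not yet fetched"
        # sentinels: None for attributes where any API response is meaningful, and
        # False for _html and _coordinates, where None is itself a valid fetched
        # result (no HTML returned / page has no geocoordinates).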
101 |         self._links: Optional[List[str]] = None
102 |         self._redirects: Optional[List[str]] = None
103 |         self._backlinks: Optional[List[str]] = None
104 |         self._langlinks: Optional[Dict[str, str]] = None
105 |         self._summary: Optional[str] = None
106 |         self._sections: Optional[List[str]] = None
107 |         self._table_of_contents: Optional[Dict[str, Any]] = None
108 |         self._logos: Optional[List[str]] = None
109 |         self._hatnotes: Optional[List[str]] = None
110 |         self._soup: Optional[BeautifulSoup] = None
111 |         self._wikitext: Optional[str] = None
112 |         self._preview: Optional[Dict[str, str]] = None
113 | 
114 |         self.__load(redirect=redirect, preload=preload)
115 | 
116 |         preload_props = [
117 |             "content",
118 |             "summary",
119 |             "images",
120 |             "references",
121 |             "links",
122 |             "sections",
123 |             "redirects",
124 |             "coordinates",
125 |             "backlinks",
126 |             "categories",
127 |         ]
128 |         if preload:
129 |             for prop in preload_props:
130 |                 getattr(self, prop)
131 | 
132 |     # end __init__
133 | 
134 |     def __repr__(self):
135 |         """repr"""
136 |         return self.__str__()
137 | 
138 |     def __unicode__(self):
139 |         """python 2.7 unicode"""
140 |         return f"""<MediaWikiPage '{self.title}'>"""
141 | 
142 |     def __str__(self):
143 |         """python > 3 unicode python 2.7 byte str"""
144 |         return str_or_unicode(self.__unicode__())
145 | 
146 |     def __eq__(self, other):
147 |         """base eq function"""
148 |         try:
149 |             return self.pageid == other.pageid and self.title == other.title and self.url == other.url
150 |         except AttributeError:
151 |             return False
152 | 
153 |     # Properties
154 |     def _pull_content_revision_parent(self) -> Tuple[Optional[str], Optional[int], Optional[int]]:
155 |         """combine the pulling of these three properties"""
156 | 
157 |         if self._revision_id is None:
158 |             query_params = {
159 |                 "prop": "extracts|revisions",
160 |                 "explaintext": "",
161 |                 "rvprop": "ids",
162 |             }
163 |             query_params.update(self.__title_query_param())
164 |             request = self.mediawiki.wiki_request(query_params)
165 |             page_info = request["query"]["pages"][self.pageid]
166 |             self._content = page_info.get("extract", None)
167 |             self._revision_id = page_info["revisions"][0]["revid"]
168 |             self._parent_id = page_info["revisions"][0]["parentid"]
169 | 
170 |         if self._content is None and "TextExtracts" not in self.mediawiki.extensions:
171 |             msg = "Unable to extract page content; the TextExtracts extension must be installed!"
172 | raise MediaWikiBaseException(msg) 173 | return self._content, self._revision_id, self._parent_id 174 | 175 | @property 176 | def content(self) -> str: 177 | """str: The page content in text format 178 | 179 | Note: 180 | Not settable 181 | Note: 182 | Side effect is to also get revision_id and parent_id""" 183 | if self._content is None: 184 | self._pull_content_revision_parent() 185 | return self._content # type: ignore 186 | 187 | @property 188 | def revision_id(self) -> int: 189 | """int: The current revision id of the page 190 | 191 | Note: 192 | Not settable 193 | Note: 194 | Side effect is to also get content and parent_id""" 195 | if self._revision_id is None: 196 | self._pull_content_revision_parent() 197 | return self._revision_id # type: ignore 198 | 199 | @property 200 | def parent_id(self) -> int: 201 | """int: The parent id of the page 202 | 203 | Note: 204 | Not settable 205 | Note: 206 | Side effect is to also get content and revision_id""" 207 | if self._parent_id is None: 208 | self._pull_content_revision_parent() 209 | return self._parent_id # type: ignore 210 | 211 | @property 212 | def html(self) -> str: 213 | """str: HTML representation of the page 214 | 215 | Note: 216 | Not settable 217 | Warning: 218 | This can be slow for very large pages""" 219 | if self._html is False: 220 | self._html = "" 221 | query_params = { 222 | "prop": "revisions", 223 | "rvprop": "content", 224 | "rvlimit": 1, 225 | "rvparse": "", 226 | "titles": self.title, 227 | } 228 | request = self.mediawiki.wiki_request(query_params) 229 | page = request["query"]["pages"][self.pageid] 230 | self._html = page["revisions"][0]["*"] 231 | return self._html # type: ignore 232 | 233 | @property 234 | def wikitext(self) -> str: 235 | """str: Wikitext representation of the page 236 | 237 | Note: 238 | Not settable""" 239 | if self._wikitext is None: 240 | query_params = { 241 | "action": "parse", 242 | "pageid": self.pageid, 243 | "prop": "wikitext", 244 | "formatversion": "latest", 245 | } 246 | request = self.mediawiki.wiki_request(query_params) 247 | self._wikitext = request["parse"]["wikitext"] 248 | return self._wikitext 249 | 250 | @property 251 | def images(self) -> List[str]: 252 | """list: Images on the page 253 | 254 | Note: 255 | Not settable""" 256 | if self._images is None: 257 | params = { 258 | "generator": "images", 259 | "gimlimit": "max", 260 | "prop": "imageinfo", # this will be replaced by fileinfo 261 | "iiprop": "url", 262 | } 263 | self._images = [ 264 | page["imageinfo"][0]["url"] 265 | for page in self._continued_query(params) 266 | if "imageinfo" in page and "url" in page["imageinfo"][0] 267 | ] 268 | self._images = sorted(self._images) 269 | return self._images 270 | 271 | @property 272 | def logos(self) -> List[str]: 273 | """list: Parse images within the infobox signifying either the main image or logo 274 | 275 | Note: 276 | Not settable 277 | Note: 278 | Side effect is to also pull the html which can be slow 279 | Note: 280 | This is a parsing operation and not part of the standard API""" 281 | if self._logos is None: 282 | self._logos = [] 283 | # Cache the results of parsing the html, so that multiple calls happen much faster 284 | if not self._soup: 285 | self._soup = BeautifulSoup(self.html, "html.parser") 286 | info = self._soup.find("table", {"class": "infobox"}) 287 | if info is not None and isinstance(info, Tag): 288 | children = info.find_all("a", class_="image") 289 | self._logos.extend("https:" + child.img["src"] for child in children) 290 | return 
self._logos
291 | 
292 |     @property
293 |     def hatnotes(self) -> List[str]:
294 |         """list: Parse hatnotes from the HTML
295 | 
296 |         Note:
297 |             Not settable
298 |         Note:
299 |             Side effect is to also pull the html which can be slow
300 |         Note:
301 |             This is a parsing operation and not part of the standard API"""
302 |         if self._hatnotes is None:
303 |             self._hatnotes = []
304 |             # Cache the results of parsing the html, so that multiple calls happen much faster
305 |             if not self._soup:
306 |                 self._soup = BeautifulSoup(self.html, "html.parser")
307 |             notes = self._soup.find_all("div", class_="hatnote")
308 |             if notes is not None:
309 |                 for note in notes:
310 |                     tmp = []
311 |                     for child in note.children:
312 |                         if hasattr(child, "text"):
313 |                             tmp.append(child.text)
314 |                         else:
315 |                             tmp.append(child)
316 |                     self._hatnotes.append("".join(tmp))
317 |         return self._hatnotes
318 | 
319 |     @property
320 |     def references(self) -> List[str]:
321 |         """list: External links, or references, listed anywhere on the MediaWiki page
322 |         Note:
323 |             Not settable
324 |         Note:
325 |             May include external links within page that are not technically cited anywhere"""
326 |         if self._references is None:
327 |             self._references = []
328 |             self.__pull_combined_properties()
329 |         return self._references
330 | 
331 |     @property
332 |     def categories(self) -> List[str]:
333 |         """list: Non-hidden categories on the page
334 | 
335 |         Note:
336 |             Not settable"""
337 |         if self._categories is None:
338 |             self._categories = []
339 |             self.__pull_combined_properties()
340 |         return self._categories
341 | 
342 |     @property
343 |     def coordinates(self) -> Optional[Tuple[Decimal, Decimal]]:
344 |         """Tuple: GeoCoordinates of the place referenced; results in lat/long tuple or None if no geocoordinates present
345 | 
346 |         Note:
347 |             Not settable
348 |         Note:
349 |             Requires the GeoData extension to be installed"""
350 |         if self._coordinates is False:
351 |             self._coordinates = None
352 |             self.__pull_combined_properties()
353 |         return self._coordinates  # type: ignore
354 | 
355 |     @property
356 |     def links(self) -> List[str]:
357 |         """list: List of all MediaWiki page links on the page
358 | 
359 |         Note:
360 |             Not settable"""
361 |         if self._links is None:
362 |             self._links = []
363 |             self.__pull_combined_properties()
364 |         return self._links
365 | 
366 |     @property
367 |     def redirects(self) -> List[str]:
368 |         """list: List of all redirects to this page; **i.e.,** the titles listed here will redirect to this page title
369 | 
370 |         Note:
371 |             Not settable"""
372 |         if self._redirects is None:
373 |             self._redirects = []
374 |             self.__pull_combined_properties()
375 |         return self._redirects
376 | 
377 |     @property
378 |     def backlinks(self) -> List[str]:
379 |         """list: Pages that link to this page
380 | 
381 |         Note:
382 |             Not settable"""
383 |         if self._backlinks is None:
384 |             self._backlinks = []
385 |             params = {
386 |                 "action": "query",
387 |                 "list": "backlinks",
388 |                 "bltitle": self.title,
389 |                 "bllimit": "max",
390 |                 "blfilterredir": "nonredirects",
391 |                 "blnamespace": 0,
392 |             }
393 |             tmp = [link["title"] for link in self._continued_query(params, "backlinks")]
394 |             self._backlinks = sorted(tmp)
395 |         return self._backlinks
396 | 
397 |     @property
398 |     def langlinks(self) -> Dict[str, str]:
399 |         """dict: Names of the page in other languages, where each key is the language code
400 |         and each value is the name of the page in that language.
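        For example, a single (hypothetical) entry would be {"fr": "Titre de la page"}.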
401 | 
402 |         Note:
403 |             Not settable
404 |         Note:
405 |             list of all language links from the provided pages to other
406 |             languages according to: https://www.mediawiki.org/wiki/API:Langlinks"""
407 | 
408 |         if self._langlinks is None:
409 |             params = {"prop": "langlinks", "lllimit": "max"}  # lllimit is the langlinks limit parameter
410 |             query_result = self._continued_query(params)
411 | 
412 |             langlinks = {}
413 |             for lang_info in query_result:
414 |                 langlinks[lang_info["lang"]] = lang_info["*"]
415 |             self._langlinks = langlinks
416 |         return self._langlinks
417 | 
418 |     @property
419 |     def preview(self) -> Dict[str, str]:
420 |         """dict: Page preview information that builds the preview hover"""
421 |         if self._preview is None:
422 |             params = {
423 |                 "action": "query",
424 |                 "formatversion": "2",
425 |                 "prop": "info|extracts|pageimages|revisions|pageterms|coordinates|pageviews",
426 |                 "exsentences": "5",
427 |                 "explaintext": "true",
428 |                 "piprop": "thumbnail|original",
429 |                 "pithumbsize": "320",
430 |                 "pilicense": "any",
431 |                 "rvprop": "timestamp|ids",
432 |                 "wbptterms": "description",
433 |                 "titles": self.title,
434 |             }
435 |             raw = self.mediawiki.wiki_request(params)
436 |             self._preview = raw.get("query", {}).get("pages", [])[0]
437 |         return self._preview
438 | 
439 |     @property
440 |     def summary(self) -> Optional[str]:
441 |         """str: Default page summary
442 | 
443 |         Note:
444 |             Not settable"""
445 |         if self._summary is None:
446 |             self.__pull_combined_properties()
447 |         if self._summary is None:
448 |             self._summary = ""
449 |         return self._summary
450 | 
451 |     def summarize(self, sentences: int = 0, chars: int = 0) -> str:
452 |         """Summarize page either by number of sentences, chars, or first
453 |         section (**default**)
454 | 
455 |         Args:
456 |             sentences (int): Number of sentences to use in summary (first `x` sentences)
457 |             chars (int): Number of characters to use in summary (first `x` characters)
458 |         Returns:
459 |             str: The summary of the MediaWiki page
460 |         Note:
461 |             Precedence for parameters: sentences then chars; if both are 0 then the entire first section is returned"""
462 |         query_params: Dict[str, Any] = {"prop": "extracts", "explaintext": "", "titles": self.title}
463 |         if sentences:
464 |             query_params["exsentences"] = min(sentences, 10)
465 |         elif chars:
466 |             query_params["exchars"] = max(chars, 1)
467 |         else:
468 |             query_params["exintro"] = ""
469 | 
470 |         request = self.mediawiki.wiki_request(query_params)
471 |         return request["query"]["pages"][self.pageid].get("extract")
472 | 
473 |     @property
474 |     def sections(self) -> List[str]:
475 |         """list: Table of contents sections
476 | 
477 |         Note:
478 |             Not settable"""
479 |         # NOTE: Due to MediaWiki sites adding superscripts or italics or bold
480 |         #       information in the sections, moving to regex to get the
481 |         #       `non-decorated` name instead of using the query api!
482 | if self._sections is None: 483 | self._parse_sections() 484 | if self._sections is None: 485 | self._sections = [] 486 | return self._sections 487 | 488 | @property 489 | def table_of_contents(self) -> Dict[str, Any]: 490 | """OrderedDict: Dictionary of sections and sub-sections 491 | 492 | Note: 493 | Leaf nodes are empty OrderedDict objects 494 | Note: 495 | Not Settable""" 496 | 497 | if self._table_of_contents is None: 498 | self._parse_sections() 499 | if self._table_of_contents is None: 500 | self._table_of_contents = {} 501 | return self._table_of_contents 502 | 503 | def section(self, section_title: Optional[str]) -> Optional[str]: 504 | """Plain text section content 505 | 506 | Args: 507 | section_title (str): Name of the section to pull or None for the header section 508 | Returns: 509 | str: The content of the section 510 | Note: 511 | Use **None** if the header section is desired 512 | Note: 513 | Returns **None** if section title is not found; only text between title and next \ 514 | section or sub-section title is returned 515 | Note: 516 | Side effect is to also pull the content which can be slow 517 | Note: 518 | This is a parsing operation and not part of the standard API""" 519 | if not section_title: 520 | try: 521 | content = self.content 522 | index = 0 523 | except ValueError: 524 | return None 525 | except IndexError: 526 | pass 527 | else: 528 | section = f"== {section_title} ==" 529 | try: 530 | # TODO, move index to find to remove exceptions 531 | content = self.content 532 | index = content.index(section) + len(section) 533 | 534 | # ensure we have the full section header... 535 | while True: 536 | if content[index + 1] == "=": 537 | index += 1 538 | else: 539 | break 540 | except ValueError: 541 | return None 542 | except IndexError: 543 | pass 544 | 545 | try: 546 | next_index = self.content.index("==", index) 547 | except ValueError: 548 | next_index = len(self.content) 549 | 550 | val = self.content[index:next_index].lstrip("=").strip() 551 | if val == "": 552 | return None 553 | return val 554 | 555 | def parse_section_links(self, section_title: str) -> Optional[List[Tuple[str, str]]]: 556 | """Parse all links within a section 557 | 558 | Args: 559 | section_title (str): Name of the section to pull or, if None is provided, \ 560 | the links between the main heading and the first section 561 | Returns: 562 | list: List of (title, url) tuples 563 | Note: 564 | Use **None** to pull the links from the header section 565 | Note: 566 | Returns **None** if section title is not found 567 | Note: 568 | Side effect is to also pull the html which can be slow 569 | Note: 570 | This is a parsing operation and not part of the standard API""" 571 | # Cache the results of parsing the html, so that multiple calls happen much faster 572 | if not self.html: 573 | return None 574 | if not self._soup: 575 | self._soup = BeautifulSoup(self.html, "html.parser") 576 | 577 | if not section_title: 578 | return self._parse_section_links(None) 579 | 580 | headlines = self._soup.find_all("span", class_="mw-headline") 581 | tmp_soup = BeautifulSoup(section_title, "html.parser") 582 | tmp_sec_title = tmp_soup.get_text().lower() 583 | id_tag = None 584 | for headline in headlines: 585 | tmp_id = headline.text 586 | if tmp_id.lower() == tmp_sec_title: 587 | id_tag = headline.get("id") 588 | break 589 | 590 | return self._parse_section_links(id_tag) if id_tag is not None else None 591 | 592 | # Protected Methods 593 | def __load(self, redirect: bool = True, preload: bool = False): 594 
| """load the basic page information""" 595 | query_params = { 596 | "prop": "info|pageprops", 597 | "inprop": "url", 598 | "ppprop": "disambiguation", 599 | "redirects": "", 600 | } 601 | query_params.update(self.__title_query_param()) 602 | 603 | request = self.mediawiki.wiki_request(query_params) 604 | 605 | query = request["query"] 606 | pageid = list(query["pages"].keys())[0] 607 | page = query["pages"][pageid] 608 | 609 | # determine result of the request 610 | # missing is present if the page is missing 611 | if "missing" in page: 612 | self._raise_page_error() 613 | # redirects is present in query if page is a redirect 614 | elif "redirects" in query: 615 | self._handle_redirect(redirect, preload, query, page) 616 | # if pageprops is returned, it must be a disambiguation error 617 | elif "pageprops" in page: 618 | self._raise_disambiguation_error(page, pageid) 619 | else: 620 | self.pageid = pageid 621 | self.title = page["title"] 622 | self.url = page["fullurl"] 623 | 624 | def _raise_page_error(self): 625 | """raise the correct type of page error""" 626 | if hasattr(self, "title"): 627 | raise PageError(title=self.title) 628 | raise PageError(pageid=self.pageid) 629 | 630 | def _raise_disambiguation_error(self, page: Dict, pageid: int): 631 | """parse and throw a disambiguation error""" 632 | query_params = { 633 | "prop": "revisions", 634 | "rvprop": "content", 635 | "rvparse": "", 636 | "rvlimit": 1, 637 | } 638 | query_params.update(self.__title_query_param()) 639 | request = self.mediawiki.wiki_request(query_params) 640 | html = request["query"]["pages"][pageid]["revisions"][0]["*"] 641 | 642 | lis = BeautifulSoup(html, "html.parser").find_all("li") 643 | filtered_lis = [li for li in lis if "tocsection" not in "".join(li.get("class", []))] 644 | may_refer_to = [li.a.get_text() for li in filtered_lis if li.a] 645 | 646 | disambiguation = [] 647 | for lis_item in filtered_lis: 648 | item = lis_item.find_all("a") 649 | one_disambiguation = {} 650 | one_disambiguation["description"] = lis_item.text 651 | if item and item[0].has_attr("title"): 652 | one_disambiguation["title"] = item[0]["title"] 653 | else: 654 | # these are non-linked records so double up the text 655 | one_disambiguation["title"] = lis_item.text 656 | disambiguation.append(one_disambiguation) 657 | raise DisambiguationError( 658 | getattr(self, "title", page["title"]), 659 | may_refer_to, 660 | page["fullurl"], 661 | disambiguation, 662 | ) 663 | 664 | def _handle_redirect(self, redirect: bool, preload: bool, query: Dict, page: Dict[str, Any]): 665 | """handle redirect""" 666 | if not redirect: 667 | raise RedirectError(getattr(self, "title", page["title"])) 668 | 669 | redirects = query["redirects"][0] 670 | 671 | if "normalized" in query: 672 | normalized = query["normalized"][0] 673 | if normalized["from"] != self.title: 674 | raise MediaWikiException(ODD_ERROR_MESSAGE) 675 | from_title = normalized["to"] 676 | else: 677 | if not getattr(self, "title", None): 678 | self.title = redirects["from"] 679 | delattr(self, "pageid") 680 | from_title = self.title 681 | if redirects["from"] != from_title: 682 | raise MediaWikiException(ODD_ERROR_MESSAGE) 683 | 684 | # change the title and reload the whole object 685 | self.__init__( # type: ignore 686 | self.mediawiki, 687 | title=redirects["to"], 688 | redirect=redirect, 689 | preload=preload, 690 | ) 691 | 692 | def _continued_query(self, query_params: Dict[str, Any], key: str = "pages") -> Iterator[Dict[Any, Any]]: 693 | """Based on 694 | 
https://www.mediawiki.org/wiki/API:Query#Continuing_queries""" 695 | query_params.update(self.__title_query_param()) 696 | 697 | last_cont: Dict = {} 698 | prop = query_params.get("prop") 699 | 700 | while True: 701 | params = query_params.copy() 702 | params.update(last_cont) 703 | 704 | request = self.mediawiki.wiki_request(params) 705 | 706 | if "query" not in request: 707 | break 708 | 709 | pages = request["query"][key] 710 | if "generator" in query_params: 711 | yield from pages.values() 712 | elif isinstance(pages, list): 713 | yield from [v for x, v in enumerate(pages)] 714 | else: 715 | yield from pages[self.pageid].get(prop, []) 716 | 717 | if "continue" not in request or request["continue"] == last_cont: 718 | break 719 | 720 | last_cont = request["continue"] 721 | 722 | def _parse_section_links(self, id_tag: Optional[str]) -> List[Tuple[str, str]]: 723 | """given a section id, parse the links in the unordered list""" 724 | all_links: List[Tuple[str, str]] = [] 725 | 726 | if not self._soup: 727 | self._soup = BeautifulSoup(self.html, "html.parser") 728 | 729 | if id_tag is None: 730 | root = self._soup.find("div", {"class": "mw-parser-output"}) 731 | if root is None or isinstance(root, NavigableString): 732 | return all_links 733 | candidates = root.children 734 | else: 735 | root = self._soup.find("span", {"id": id_tag}) 736 | if root is None: 737 | return all_links 738 | candidates = self._soup.find(id=id_tag).parent.next_siblings # type: ignore 739 | 740 | for node in candidates: 741 | if not isinstance(node, Tag) or node.get("role", "") == "navigation": 742 | continue 743 | classes = node.get("class", []) 744 | if not isinstance(classes, list): 745 | classes = [classes if classes else ""] 746 | if "infobox" in classes: 747 | continue 748 | 749 | # If the classname contains "toc", the element is a table of contents. 750 | # The comprehension is necessary because there are several possible 751 | # types of tocs: "toclevel", "toc", ... 752 | toc_classnames = [cname for cname in classes if "toc" in cname] 753 | if toc_classnames: 754 | continue 755 | 756 | # this is actually the child node's class... 757 | is_headline = node.find("span", {"class": "mw-headline"}) 758 | if is_headline is not None: 759 | break 760 | if node.name == "a": 761 | all_links.append(self.__parse_link_info(node)) 762 | else: 763 | all_links.extend(self.__parse_link_info(link) for link in node.find_all("a")) 764 | return all_links 765 | 766 | def __parse_link_info(self, link: Tag) -> Tuple[str, str]: 767 | """parse the tag for the link""" 768 | href = link.get("href", "") 769 | if isinstance(href, list): 770 | href = href[0] 771 | href = "" if href is None else href 772 | txt = link.string or href 773 | is_rel = is_relative_url(href) 774 | if is_rel is True: 775 | tmp = f"{self.mediawiki.base_url}{href}" 776 | elif is_rel is None: 777 | tmp = f"{self.url}{href}" 778 | else: 779 | tmp = href 780 | return txt, tmp 781 | 782 | def _parse_sections(self): 783 | """parse sections and TOC""" 784 | 785 | def _list_to_dict(_dict, path, sec): 786 | tmp = _dict 787 | for elm in path[:-1]: 788 | tmp = tmp[elm] 789 | tmp[sec] = OrderedDict() 790 | 791 | self._sections = [] 792 | section_regexp = r"\n==* .* ==*\n" # '== {STUFF_NOT_\n} ==' 793 | found_obj = re.findall(section_regexp, self.content) 794 | 795 | res = OrderedDict() 796 | path = [] 797 | last_depth = 0 798 | for obj in found_obj: 799 | depth = obj.count("=") / 2 # this gets us to the single side... 
800 |             depth -= 2  # now, we can calculate depth
801 | 
802 |             sec = obj.lstrip("\n= ").rstrip(" =\n")
803 |             if depth == 0:
804 |                 last_depth = 0
805 |                 path = [sec]
806 |                 res[sec] = OrderedDict()
807 |             elif depth > last_depth:
808 |                 last_depth = depth
809 |                 path.append(sec)
810 |                 _list_to_dict(res, path, sec)
811 |             elif depth < last_depth:
812 |                 while last_depth > depth:
813 |                     path.pop()
814 |                     last_depth -= 1
815 |                 if path:
816 |                     path.pop()
817 |                 path.append(sec)
818 |                 _list_to_dict(res, path, sec)
819 |                 last_depth = depth
820 |             else:
821 |                 if path:
822 |                     path.pop()
823 |                 path.append(sec)
824 |                 _list_to_dict(res, path, sec)
825 |                 last_depth = depth
826 |             self._sections.append(sec)
827 | 
828 |         self._table_of_contents = res
829 | 
830 |     def __title_query_param(self) -> Dict[str, Any]:
831 |         """util function to determine which parameter method to use"""
832 |         if getattr(self, "title", None) is not None:
833 |             return {"titles": self.title}
834 |         return {"pageids": self.pageid}
835 | 
836 |     def __pull_combined_properties(self):
837 |         """pull summary, redirects, links, categories, coordinates, and references in a single combined query"""
838 | 
839 |         query_params = {
840 |             "titles": self.title,
841 |             "prop": "extracts|redirects|links|coordinates|categories|extlinks",
842 |             "continue": {},
843 |             # summary
844 |             "explaintext": "",
845 |             "exintro": "",  # full first section for the summary!
846 |             # redirects
847 |             "rdprop": "title",
848 |             "rdlimit": "max",
849 |             # links
850 |             "plnamespace": 0,
851 |             "pllimit": "max",
852 |             # coordinates
853 |             "colimit": "max",
854 |             # categories
855 |             "cllimit": "max",
856 |             "clshow": "!hidden",
857 |             # references
858 |             "ellimit": "max",
859 |         }
860 | 
861 |         last_cont = {}
862 |         results = {}
863 |         idx = 0
864 |         while True:
865 |             params = query_params.copy()
866 |             params.update(last_cont)
867 | 
868 |             request = self.mediawiki.wiki_request(params)
869 |             idx += 1
870 | 
871 |             if "query" not in request:
872 |                 break
873 | 
874 |             keys = [
875 |                 "extracts",
876 |                 "redirects",
877 |                 "links",
878 |                 "coordinates",
879 |                 "categories",
880 |                 "extlinks",
881 |             ]
882 |             new_cont = request.get("continue")
883 |             request = request["query"]["pages"][self.pageid]
884 |             if not results:
885 |                 results = request
886 |             else:
887 |                 for key in keys:
888 |                     if key in request and request.get(key) is not None:
889 |                         val = request.get(key)
890 |                         tmp = results.get(key)
891 |                         if isinstance(tmp, (list, tuple)):
892 |                             results[key] = results.get(key, []) + val
893 |             if new_cont is None or new_cont == last_cont:
894 |                 break
895 | 
896 |             last_cont = new_cont
897 | 
898 |         # redirects
899 |         tmp = [link["title"] for link in results.get("redirects", [])]
900 |         self._redirects = sorted(tmp)
901 | 
902 |         # summary
903 |         self._summary = results.get("extract")
904 | 
905 |         # links
906 |         tmp = [link["title"] for link in results.get("links", [])]
907 |         self._links = sorted(tmp)
908 | 
909 |         # categories
910 |         def _get_cat(val):
911 |             """parse the category correctly"""
912 |             tmp = val["title"]
913 |             if tmp.startswith(self.mediawiki.category_prefix):
914 |                 return tmp[len(self.mediawiki.category_prefix) + 1 :]
915 |             return tmp
916 | 
917 |         tmp = [_get_cat(link) for link in results.get("categories", [])]
918 |         self._categories = sorted(tmp)
919 | 
920 |         # coordinates
921 |         if "coordinates" in results:
922 |             self._coordinates = (
923 |                 Decimal(results["coordinates"][0]["lat"]),
924 |                 Decimal(results["coordinates"][0]["lon"]),
925 |             )
926 | 
927 |         # references
928 |         tmp = [link["*"] for link in results.get("extlinks", [])]
929 |         self._references = sorted(tmp)
930 | 
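# A minimal usage sketch for MediaWikiPage (illustrative only): pages should be
# obtained through MediaWiki.page() rather than constructed directly, and the
# title below is an arbitrary example. This assumes the package's top-level
# __init__ re-exports MediaWiki and DisambiguationError.
#
#   from mediawiki import MediaWiki, DisambiguationError
#
#   wikipedia = MediaWiki()  # defaults to the English Wikipedia API endpoint
#   try:
#       page = wikipedia.page("Python (programming language)")
#       print(page.title, page.url)
#       print(page.summarize(sentences=2))  # extract built server-side by TextExtracts
#       print(page.sections[:5])            # parsed lazily from the page content
#   except DisambiguationError as ex:
#       print("Ambiguous title; options include:", ex.options[:5])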
--------------------------------------------------------------------------------
/mediawiki/mediawiki.py:
--------------------------------------------------------------------------------
1 | """
2 | MediaWiki class module
3 | """
4 | 
5 | # MIT License
6 | # Author: Tyler Barrus (barrust@gmail.com)
7 | 
8 | import time
9 | from datetime import datetime, timedelta
10 | from decimal import Decimal, DecimalException
11 | from json import JSONDecodeError
12 | from typing import Any, Dict, List, Optional, Tuple, Union
13 | 
14 | import requests
15 | import requests.exceptions as rex
16 | 
17 | from mediawiki.configuraton import VERSION, Configuration, HTTPAuthenticator
18 | from mediawiki.exceptions import (
19 |     HTTPTimeoutError,
20 |     MediaWikiAPIURLError,
21 |     MediaWikiCategoryTreeError,
22 |     MediaWikiException,
23 |     MediaWikiForbidden,
24 |     MediaWikiGeoCoordError,
25 |     MediaWikiLoginError,
26 |     PageError,
27 | )
28 | from mediawiki.mediawikipage import MediaWikiPage
29 | from mediawiki.utilities import memoize
30 | 
31 | 
32 | class MediaWiki:
33 |     """MediaWiki API Wrapper Instance
34 | 
35 |     Args:
36 |         url (str): API URL of the MediaWiki site; defaults to Wikipedia
37 |         lang (str): Language of the MediaWiki site; used to help change API URL
38 |         timeout (float): HTTP timeout setting; None means no timeout
39 |         rate_limit (bool): Use rate limiting to limit calls to the site
40 |         rate_limit_wait (timedelta): Amount of time to wait between requests
41 |         cat_prefix (str): The prefix for categories used by the mediawiki site; defaults to Category (en)
42 |         user_agent (str): The user agent string to use when making requests; defaults to a library version \
43 |             string, but the MediaWiki API documentation recommends setting a unique user agent rather than \
44 |             using the library's default
45 |         username (str): The username to use to log into the MediaWiki
46 |         password (str): The password to use to log into the MediaWiki
47 |         proxies (dict): A dictionary of specific proxies to use in the Requests library
48 |         verify_ssl (bool|str): Verify SSL Certificates to be passed directly into the Requests library
49 |         http_auth (tuple|callable): HTTP authenticator to be passed directly into the Requests library"""
50 | 
51 |     __slots__ = [
52 |         "_version",
53 |         "_config",
54 |         "_session",
55 |         "_extensions",
56 |         "_api_version",
57 |         "_api_version_str",
58 |         "_base_url",
59 |         "__supported_languages",
60 |         "__available_languages",
61 |         "_is_logged_in",
62 |         "_cache",
63 |     ]
64 | 
65 |     def __init__(
66 |         self,
67 |         url: str = "https://{lang}.wikipedia.org/w/api.php",
68 |         lang: str = "en",
69 |         timeout: float = 15.0,
70 |         rate_limit: bool = False,
71 |         rate_limit_wait: timedelta = timedelta(milliseconds=50),
72 |         cat_prefix: str = "Category",
73 |         user_agent: Optional[str] = None,
74 |         username: Optional[str] = None,
75 |         password: Optional[str] = None,
76 |         proxies: Optional[Dict] = None,
77 |         verify_ssl: Union[bool, str] = True,
78 |         http_auth: Optional[HTTPAuthenticator] = None,
79 |     ):
80 |         """Init Function"""
81 |         self._version = VERSION
82 |         # the API URL template is formatted with the requested language below
83 |         self._config = Configuration(
84 |             lang=lang,
85 |             api_url=url.format(lang=lang.lower()),
86 |             category_prefix=cat_prefix,
87 |             timeout=timeout,
88 |             user_agent=user_agent,
89 |             proxies=proxies,
90 |             verify_ssl=verify_ssl,
91 |             rate_limit=rate_limit,
92 |             rate_limit_wait=rate_limit_wait,
93 |             username=username,
94 |             password=password,
95 |             refresh_interval=None,
96 |             use_cache=True,
97 |             http_auth=http_auth,
98 |         )
99 | 
100 |         # requests 
library parameters
101 |         self._session: requests.Session = requests.Session()
102 | 
103 |         # reset library parameters
104 |         self._extensions = None
105 |         self._api_version = None
106 |         self._api_version_str = None
107 |         self._base_url = None
108 |         self.__supported_languages: Optional[Dict[str, str]] = None
109 |         self.__available_languages: Optional[Dict[str, bool]] = None
110 | 
111 |         # for memoized results
112 |         self._cache: Dict = {}
113 | 
114 |         self._reset_session()
115 | 
116 |         # for login information
117 |         self._is_logged_in = False
118 |         if self._config.username is not None and self._config.password is not None:
119 |             self.login(self._config.username, self._config.password)
120 | 
121 |         try:
122 |             self._get_site_info()
123 |         except MediaWikiException as exc:
124 |             raise MediaWikiAPIURLError(self._config.api_url) from exc
125 | 
126 |     # non-settable properties
127 |     @property
128 |     def version(self) -> str:
129 |         """str: The version of the pymediawiki library
130 | 
131 |         Note:
132 |             Not settable"""
133 |         return self._version
134 | 
135 |     @property
136 |     def api_version(self) -> Optional[str]:
137 |         """str: API Version of the MediaWiki site
138 | 
139 |         Note:
140 |             Not settable"""
141 |         return self._api_version_str
142 | 
143 |     @property
144 |     def base_url(self) -> str:
145 |         """str: Base URL for the MediaWiki site
146 | 
147 |         Note:
148 |             Not settable"""
149 |         return self._base_url if self._base_url else ""
150 | 
151 |     @property
152 |     def extensions(self) -> List[str]:
153 |         """list: Extensions installed on the MediaWiki site
154 | 
155 |         Note:
156 |             Not settable"""
157 |         return self._extensions if self._extensions else []
158 | 
159 |     # settable properties
160 |     @property
161 |     def rate_limit(self) -> bool:
162 |         """bool: Turn on or off Rate Limiting"""
163 |         return self._config.rate_limit
164 | 
165 |     @rate_limit.setter
166 |     def rate_limit(self, rate_limit: bool):
167 |         """Turn on or off rate limiting"""
168 |         self._config.rate_limit = rate_limit
169 |         if self._config._clear_memoized:
170 |             self.clear_memoized()
171 | 
172 |     @property
173 |     def proxies(self) -> Optional[Dict]:
174 |         """dict: Turn on, off, or set proxy use with the Requests library"""
175 |         return self._config.proxies
176 | 
177 |     @proxies.setter
178 |     def proxies(self, proxies: Optional[Dict]):
179 |         """Turn on, off, or set proxy use through the Requests library"""
180 |         self._config.proxies = proxies
181 |         if self._config._reset_session:
182 |             self._reset_session()
183 | 
184 |     @property
185 |     def use_cache(self) -> bool:
186 |         """bool: Whether caching should be used; on (**True**) or off (**False**)"""
187 |         return self._config.use_cache
188 | 
189 |     @use_cache.setter
190 |     def use_cache(self, use_cache: bool):
191 |         """toggle using the cache or not"""
192 |         self._config.use_cache = use_cache
193 | 
194 |     @property
195 |     def rate_limit_min_wait(self) -> timedelta:
196 |         """timedelta: Time to wait between calls
197 | 
198 |         Note:
199 |             Only used if rate_limit is **True**"""
200 |         return self._config.rate_limit_min_wait
201 | 
202 |     @rate_limit_min_wait.setter
203 |     def rate_limit_min_wait(self, min_wait: timedelta):
204 |         """Set minimum wait to use for rate limiting"""
205 |         self._config.rate_limit_min_wait = min_wait
206 | 
207 |     @property
208 |     def timeout(self) -> Optional[float]:
209 |         """float: Response timeout for API requests
210 | 
211 |         Note:
212 |             Use **None** for no response timeout"""
213 |         return self._config.timeout
214 | 
215 |     @timeout.setter
216 |     def timeout(self, timeout: Optional[float]):
217 |         """Set request timeout 
in seconds (or fractions of a second)"""
218 |         self._config.timeout = timeout
219 | 
220 |     @property
221 |     def verify_ssl(self) -> Union[bool, str]:
222 |         """bool | str: Verify SSL when using requests or path to cert file"""
223 |         return self._config.verify_ssl
224 | 
225 |     @verify_ssl.setter
226 |     def verify_ssl(self, verify_ssl: Union[bool, str]):
227 |         """Set request verify SSL parameter; defaults to True if issue"""
228 |         self._config.verify_ssl = verify_ssl
229 |         if self._config._reset_session:
230 |             self._reset_session()
231 | 
232 |     @property
233 |     def language(self) -> str:
234 |         """str: The API URL language; if possible this will update the API URL
235 | 
236 |         Note:
237 |             Use correct language titles with the updated API URL
238 |         Note:
239 |             Some API URLs do not encode language; unable to update if this is the case"""
240 |         return self._config.lang
241 | 
242 |     @language.setter
243 |     def language(self, lang: str):
244 |         """Set the language to use; attempts to change the API URL"""
245 |         self._config.lang = lang
246 |         if self._config._clear_memoized:
247 |             self.clear_memoized()
248 | 
249 |     @property
250 |     def category_prefix(self) -> str:
251 |         """str: The category prefix to use when using category based functions
252 | 
253 |         Note:
254 |             Use the correct category name for the language selected"""
255 |         return self._config.category_prefix
256 | 
257 |     @category_prefix.setter
258 |     def category_prefix(self, prefix: str):
259 |         """Set the category prefix correctly"""
260 |         self._config.category_prefix = prefix
261 | 
262 |     @property
263 |     def user_agent(self) -> str:
264 |         """str: User agent string
265 | 
266 |         Note: If used as part of another project, this should be changed"""
267 |         return self._config.user_agent
268 | 
269 |     @user_agent.setter
270 |     def user_agent(self, user_agent: str):
271 |         """Set the new user agent string
272 | 
273 |         Note: Will need to re-log into the MediaWiki if user agent string is changed"""
274 |         self._config.user_agent = user_agent
275 |         if self._config._reset_session:
276 |             self._reset_session()
277 | 
278 |     @property
279 |     def api_url(self) -> str:
280 |         """str: API URL of the MediaWiki site
281 | 
282 |         Note:
283 |             Not settable; See :py:func:`mediawiki.MediaWiki.set_api_url`"""
284 |         return self._config.api_url
285 | 
286 |     @property
287 |     def memoized(self) -> Dict[Any, Any]:
288 |         """dict: Return the memoize cache
289 | 
290 |         Note:
291 |             Not settable; see
292 |             :py:func:`mediawiki.MediaWiki.clear_memoized`"""
293 |         return self._cache
294 | 
295 |     @property
296 |     def refresh_interval(self) -> Optional[int]:
297 |         """int: The interval at which the memoize cache is to be refreshed"""
298 |         return self._config.refresh_interval
299 | 
300 |     @refresh_interval.setter
301 |     def refresh_interval(self, refresh_interval: int):
302 |         """Set the new cache refresh interval"""
303 |         self._config.refresh_interval = refresh_interval
304 | 
305 |     @property
306 |     def http_auth(self) -> Optional[HTTPAuthenticator]:
307 |         """tuple|callable: HTTP authenticator to use to access the mediawiki site"""
308 |         return self._config.http_auth
309 | 
310 |     @http_auth.setter
311 |     def http_auth(self, http_auth: Optional[HTTPAuthenticator]):
312 |         """Set the HTTP authenticator, if needed, to use to access the mediawiki site"""
313 |         self._config.http_auth = http_auth
314 |         self._session.auth = http_auth
315 | 
316 |     def login(self, username: str, password: str, strict: bool = True) -> bool:
317 |         """Login as specified user
318 | 
319 |         Args:
320 |             username (str): The username to log in with
321 |             
password (str): The password for the user 322 | strict (bool): `True` to throw an error on failure 323 | Returns: 324 | bool: `True` if successfully logged in; `False` otherwise 325 | Raises: 326 | :py:func:`mediawiki.exceptions.MediaWikiLoginError`: if unable to login 327 | 328 | Note: 329 | Per the MediaWiki API, one should use the `bot password`; \ 330 | see https://www.mediawiki.org/wiki/API:Login for more information 331 | """ 332 | # get login token 333 | params = { 334 | "action": "query", 335 | "meta": "tokens", 336 | "type": "login", 337 | "format": "json", 338 | } 339 | token_res = self._get_response(params) 340 | if "query" in token_res and "tokens" in token_res["query"]: 341 | token = token_res["query"]["tokens"]["logintoken"] 342 | else: 343 | return False 344 | 345 | params = { 346 | "action": "login", 347 | "lgname": username, 348 | "lgpassword": password, 349 | "lgtoken": token, 350 | "format": "json", 351 | } 352 | 353 | res = self._post_response(params) 354 | if res["login"]["result"] == "Success": 355 | self._is_logged_in = True 356 | return True 357 | self._is_logged_in = False 358 | reason = res["login"]["reason"] 359 | if strict: 360 | raise MediaWikiLoginError(f"MediaWiki login failure: {reason}") 361 | return False 362 | 363 | # non-properties 364 | def set_api_url( 365 | self, 366 | api_url: str = "https://{lang}.wikipedia.org/w/api.php", 367 | lang: str = "en", 368 | username: Optional[str] = None, 369 | password: Optional[str] = None, 370 | ): 371 | """Set the API URL and language 372 | 373 | Args: 374 | api_url (str): API URL to use 375 | lang (str): Language of the API URL 376 | username (str): The username, if needed, to log into the MediaWiki site 377 | password (str): The password, if needed, to log into the MediaWiki site 378 | Raises: 379 | :py:func:`mediawiki.exceptions.MediaWikiAPIURLError`: if the \ 380 | url is not a valid MediaWiki site or login fails 381 | """ 382 | old_api_url = self._config.api_url 383 | old_lang = self._config.lang 384 | self._config.lang = lang.lower() 385 | self._config.api_url = api_url.format(lang=self._config.lang) 386 | self._config.username = username 387 | self._config.password = password 388 | self._is_logged_in = False 389 | try: 390 | if self._config.username is not None and self._config.password is not None: 391 | self.login(self._config.username, self._config.password) 392 | self._get_site_info() 393 | self.__supported_languages = None # reset this 394 | self.__available_languages = None # reset this 395 | except (rex.ConnectTimeout, MediaWikiException) as exc: 396 | # reset api url and lang in the event that the exception was caught 397 | self._config.api_url = old_api_url 398 | self._config.lang = old_lang 399 | raise MediaWikiAPIURLError(api_url) from exc 400 | self.clear_memoized() 401 | 402 | def _reset_session(self): 403 | """Set session information""" 404 | if self._session: 405 | self._session.close() 406 | 407 | headers = {"User-Agent": self._config.user_agent} 408 | self._session = requests.Session() 409 | self._session.auth = self._config.http_auth 410 | self._session.headers.update(headers) 411 | if self._config.proxies is not None: 412 | self._session.proxies.update(self._config.proxies) 413 | self._session.verify = self._config.verify_ssl 414 | 415 | self._is_logged_in = False 416 | self._config._reset_session = False 417 | 418 | def clear_memoized(self): 419 | """Clear memoized (cached) values""" 420 | if hasattr(self, "_cache"): 421 | self._cache.clear() 422 | self._config._clear_memoized = False 423 | 
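    # A short usage sketch for the setup methods above; the URL and the bot
    # credentials are placeholders:
    #
    #   >>> wiki = MediaWiki()
    #   >>> wiki.set_api_url("https://www.mediawiki.org/w/api.php")
    #   >>> wiki.login("MyUser@MyBot", "bot_password", strict=False)  # hypothetical bot credentials
    #
    # set_api_url clears memoized results on success; if validation fails, it
    # restores the previous URL and language before raising MediaWikiAPIURLError.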
424 |     # non-setup functions
425 |     @property
426 |     def supported_languages(self) -> Dict[str, str]:
427 |         """dict: All supported language prefixes on the MediaWiki site
428 | 
429 |         Note:
430 |             Not settable"""
431 |         if self.__supported_languages is None:
432 |             res = self.wiki_request({"meta": "siteinfo", "siprop": "languages"})
433 |             tmp = res["query"]["languages"]
434 |             supported = {lang["code"]: lang["*"] for lang in tmp}
435 |             self.__supported_languages = supported
436 |         return self.__supported_languages
437 | 
438 |     @property
439 |     def available_languages(self) -> Dict[str, bool]:
440 |         """dict: All available language prefixes on the MediaWiki site
441 | 
442 |         Note:
443 |             Not settable"""
444 |         if self.__available_languages is None:
445 |             available = {}
446 |             for lang in self.supported_languages:
447 |                 try:
448 |                     MediaWiki(lang=lang)
449 |                     available[lang] = True
450 |                 except (rex.ConnectionError, rex.ConnectTimeout, MediaWikiException, MediaWikiAPIURLError):
451 |                     available[lang] = False
452 |             self.__available_languages = available
453 |         return self.__available_languages
454 | 
455 |     @property
456 |     def logged_in(self) -> bool:
457 |         """bool: Whether the instance is logged into the MediaWiki site"""
458 |         return self._is_logged_in
459 | 
460 |     def random(self, pages: int = 1) -> Union[str, List[str]]:
461 |         """Request a random page title or list of random titles
462 | 
463 |         Args:
464 |             pages (int): Number of random pages to return
465 |         Returns:
466 |             list or str: A list of random page titles, or a single random page title if pages = 1"""
467 |         if pages is None or pages < 1:
468 |             raise ValueError("Number of pages must be greater than 0")
469 | 
470 |         query_params = {"list": "random", "rnnamespace": 0, "rnlimit": pages}
471 | 
472 |         request = self.wiki_request(query_params)
473 |         titles = [page["title"] for page in request["query"]["random"]]
474 | 
475 |         return titles[0] if len(titles) == 1 else titles
476 | 
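    # Sketch of the call above: `random` unwraps single results, so the
    # return type depends on `pages` (the titles shown are hypothetical):
    #
    #   >>> wiki.random()         # 'A Single Title'
    #   >>> wiki.random(pages=3)  # ['Title 1', 'Title 2', 'Title 3']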
477 |     @memoize
478 |     def allpages(self, query: str = "", results: int = 10) -> List[str]:
479 |         """Request all pages from the MediaWiki instance
480 | 
481 |         Args:
482 |             query (str): Search string to use for pulling pages
483 |             results (int): The number of pages to return
484 |         Returns:
485 |             list: The pages that meet the search query
486 |         Note:
487 |             Could add the ability to continue past the limit of 500
488 |         """
489 |         max_pull = 500
490 |         limit = min(results, max_pull) if results is not None else max_pull
491 |         query_params = {"list": "allpages", "aplimit": limit, "apfrom": query}
492 | 
493 |         request = self.wiki_request(query_params)
494 | 
495 |         self._check_error_response(request, query)
496 | 
497 |         return [page["title"] for page in request["query"]["allpages"]]
498 | 
499 |     @memoize
500 |     def search(
501 |         self, query: str, results: int = 10, suggestion: bool = False
502 |     ) -> Union[List[str], Tuple[List[str], Optional[str]]]:
503 |         """Search for similar titles
504 | 
505 |         Args:
506 |             query (str): Page title
507 |             results (int): Number of pages to return
508 |             suggestion (bool): Use suggestion
509 |         Returns:
510 |             tuple or list: tuple (list of results, suggestion) if suggestion is **True**; list of results otherwise
511 |         Note:
512 |             Could add the ability to continue past the limit of 500
513 |         """
514 | 
515 |         self._check_query(query, "Query must be specified")
516 | 
517 |         max_pull = 500
518 | 
519 |         search_params = {
520 |             "list": "search",
521 |             "srprop": "",
522 |             "srlimit": min(results, max_pull) if results is not None else max_pull,
523 |             "srsearch": query,
524 |             "sroffset": 0,  # this is what will be used to pull more than the max
525 |         }
526 |         if suggestion:
527 |             search_params["srinfo"] = "suggestion"
528 | 
529 |         raw_results = self.wiki_request(search_params)
530 | 
531 |         self._check_error_response(raw_results, query)
532 | 
533 |         search_results = [d["title"] for d in raw_results["query"]["search"]]
534 | 
535 |         if suggestion:
536 |             sug = raw_results["query"]["searchinfo"]["suggestion"] if raw_results["query"].get("searchinfo") else None
537 |             return search_results, sug
538 |         return search_results
539 | 
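    # Sketch: with suggestion=True, `search` returns a (results, suggestion)
    # tuple where the suggestion may be None; without it, just the list
    # (the query shown is hypothetical):
    #
    #   >>> titles, suggestion = wiki.search("chess", suggestion=True)
    #   >>> titles_only = wiki.search("chess")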
540 |     @memoize
541 |     def suggest(self, query: str) -> Optional[str]:
542 |         """Gather a suggestion based on the provided title, or **None** if
543 |         no suggestion is found
544 | 
545 |         Args:
546 |             query (str): Page title
547 |         Returns:
548 |             str or None: Suggested page title or **None** if no suggestion found
549 |         """
550 |         res, suggest = self.search(query, results=1, suggestion=True)
551 |         try:
552 |             title = res[0] or suggest
553 |         except IndexError:  # page doesn't exist
554 |             title = None
555 |         return title
556 | 
557 |     @memoize
558 |     def geosearch(
559 |         self,
560 |         latitude: Union[Decimal, float, None] = None,
561 |         longitude: Union[Decimal, float, None] = None,
562 |         radius: int = 1000,
563 |         title: Optional[str] = None,
564 |         auto_suggest: bool = True,
565 |         results: int = 10,
566 |     ) -> List[str]:
567 |         """Search for pages that relate to the provided geocoordinates or are
568 |         near the provided page
569 | 
570 |         Args:
571 |             latitude (Decimal or None): Latitude geocoord; must be coercible to Decimal
572 |             longitude (Decimal or None): Longitude geocoord; must be coercible to Decimal
573 |             radius (int): Radius, in meters, around the page or geocoords to pull back
574 |             title (str): Page title to use as a geocoordinate; this takes precedence over lat/long
575 |             auto_suggest (bool): Auto-suggest the page title
576 |             results (int): Number of pages within the radius to return
577 |         Returns:
578 |             list: A listing of page titles
579 |         Note:
580 |             The Geosearch API does not support pulling more than the maximum of 500
581 |         Note:
582 |             If the page doesn't match the provided title, try setting auto_suggest to `False`
583 |         Raises:
584 |             ValueError: If either the passed latitude or longitude is not coercible to a Decimal
585 |         """
586 | 
587 |         def test_lat_long(val):
588 |             """handle testing lat and long"""
589 |             if not isinstance(val, Decimal):
590 |                 error = (
591 |                     "Latitude and Longitude must be specified either as "
592 |                     "a Decimal or in formats that can be coerced into "
593 |                     "a Decimal."
594 |                 )
595 |                 try:
596 |                     return Decimal(val)
597 |                 except (DecimalException, TypeError) as exc:
598 |                     raise ValueError(error) from exc
599 |             return val
600 | 
601 |         # end local function
602 |         max_pull = 500
603 | 
604 |         limit = min(results, max_pull) if results is not None else max_pull
605 |         params = {"list": "geosearch", "gsradius": radius, "gslimit": limit}
606 |         if title is not None:
607 |             if auto_suggest:
608 |                 title = self.suggest(title)
609 |             params["gspage"] = title
610 |         else:
611 |             lat = test_lat_long(latitude)
612 |             lon = test_lat_long(longitude)
613 |             params["gscoord"] = f"{lat}|{lon}"
614 | 
615 |         raw_results = self.wiki_request(params)
616 | 
617 |         self._check_error_response(raw_results, title if title else "Page Title Not Provided")
618 | 
619 |         return [d["title"] for d in raw_results["query"]["geosearch"]]
620 | 
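    # Sketch: `geosearch` accepts either a coordinate pair or a page title,
    # and the title takes precedence; coordinates may be floats, Decimals, or
    # coercible strings (the values shown are hypothetical):
    #
    #   >>> from decimal import Decimal
    #   >>> wiki.geosearch(latitude=Decimal("38.8977"), longitude=Decimal("-77.0365"), radius=500)
    #   >>> wiki.geosearch(title="Eiffel Tower")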
621 |     @memoize
622 |     def opensearch(self, query: str, results: int = 10, redirect: bool = True) -> List[Tuple[str, str, str]]:
623 |         """Execute a MediaWiki opensearch request, similar to search box
624 |         suggestions and conforming to the OpenSearch specification
625 | 
626 |         Args:
627 |             query (str): Title to search for
628 |             results (int): Number of results to return
629 |             redirect (bool): If **False**, return the redirect itself; otherwise, resolve redirects
630 |         Returns:
631 |             list: List of results, each stored as a tuple (Title, Summary, URL)
632 |         Note:
633 |             The Opensearch API does not support pulling more than the maximum of 500
634 |         Raises:
635 |             ValueError: If the query is None or an empty string"""
636 | 
637 |         self._check_query(query, "Query must be specified")
638 |         max_pull = 500
639 | 
640 |         query_params = {
641 |             "action": "opensearch",
642 |             "search": query,
643 |             "limit": (min(results, max_pull) if results is not None else max_pull),
644 |             "redirects": ("resolve" if redirect else "return"),
645 |             "warningsaserror": True,
646 |             "namespace": "",
647 |         }
648 | 
649 |         out = self.wiki_request(query_params)
650 | 
651 |         self._check_error_response(out, query)
652 | 
653 |         return [(item, out[2][i], out[3][i]) for i, item in enumerate(out[1])]
654 | 
655 |     @memoize
656 |     def prefixsearch(self, prefix: str, results: int = 10) -> List[str]:
657 |         """Perform a prefix search using the provided prefix string
658 | 
659 |         Args:
660 |             prefix (str): Prefix string to use for the search
661 |             results (int): Number of pages with the prefix to return
662 |         Returns:
663 |             list: List of page titles
664 |         Note:
665 |             **Per the documentation:** "The purpose of this module is \
666 |             similar to action=opensearch: to take user input and provide \
667 |             the best-matching titles. Depending on the search engine \
668 |             backend, this might include typo correction, redirect \
669 |             avoidance, or other heuristics."
670 |         Note:
671 |             Could add the ability to continue past the limit of 500
672 |         """
673 | 
674 |         self._check_query(prefix, "Prefix must be specified")
675 | 
676 |         query_params = {
677 |             "list": "prefixsearch",
678 |             "pssearch": prefix,
679 |             "pslimit": ("max" if (results is None or results > 500) else results),
680 |             "psnamespace": 0,
681 |             "psoffset": 0,  # parameterize to skip to later in the list?
682 |         }
683 | 
684 |         raw_results = self.wiki_request(query_params)
685 | 
686 |         self._check_error_response(raw_results, prefix)
687 | 
688 |         return [rec["title"] for rec in raw_results["query"]["prefixsearch"]]
689 | 
690 |     @memoize
691 |     def summary(self, title: str, sentences: int = 0, chars: int = 0, auto_suggest: bool = True, redirect: bool = True):
692 |         """Get the summary for the title in question
693 | 
694 |         Args:
695 |             title (str): Page title to summarize
696 |             sentences (int): Number of sentences to return in summary
697 |             chars (int): Number of characters to return in summary
698 |             auto_suggest (bool): Run auto-suggest on title before summarizing
699 |             redirect (bool): Use page redirect on title before summarizing
700 |         Returns:
701 |             str: The summarized results of the page
702 |         Note:
703 |             Precedence for parameters: sentences then chars; if both are \
704 |             0 then the entire first section is returned
705 |         Note:
706 |             If the page doesn't match the provided title, try setting auto_suggest to `False`"""
707 |         page_info = self.page(title, auto_suggest=auto_suggest, redirect=redirect)
708 |         return page_info.summarize(sentences, chars)
709 | 
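    # Sketch of the lookups above (the titles shown are hypothetical):
    # `opensearch` returns (title, summary, URL) tuples, while `summary`
    # trims by sentences first, then by characters:
    #
    #   >>> wiki.opensearch("washington", results=2)
    #   >>> wiki.summary("Chess", sentences=2)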
710 |     @memoize
711 |     def categorymembers(
712 |         self, category: str, results: int = 10, subcategories: bool = True
713 |     ) -> Union[List[str], Tuple[List[str], List[str]]]:
714 |         """Get information about a category: pages and subcategories
715 | 
716 |         Args:
717 |             category (str): Category name
718 |             results (int): Number of results to return
719 |             subcategories (bool): Include subcategories (**True**) or not (**False**)
720 |         Returns:
721 |             tuple or list: Either a tuple ([pages], [subcategories]) or just the list of pages
722 |         Note:
723 |             Set results to **None** to get all results"""
724 |         self._check_query(category, "Category must be specified")
725 | 
726 |         max_pull = 500
727 |         search_params = {
728 |             "list": "categorymembers",
729 |             "cmprop": "ids|title|type",
730 |             "cmtype": ("page|subcat|file" if subcategories else "page|file"),
731 |             "cmlimit": (min(results, max_pull) if results is not None else max_pull),
732 |             "cmtitle": f"{self.category_prefix}:{category}",
733 |         }
734 |         pages = []
735 |         subcats = []
736 |         returned_results = 0
737 |         finished = False
738 |         last_cont: Dict = {}
739 |         while not finished:
740 |             params = search_params.copy()
741 |             params.update(last_cont)
742 |             raw_res = self.wiki_request(params)
743 | 
744 |             self._check_error_response(raw_res, category)
745 | 
746 |             current_pull = len(raw_res["query"]["categorymembers"])
747 |             for rec in raw_res["query"]["categorymembers"]:
748 |                 if rec["type"] in ("page", "file"):
749 |                     pages.append(rec["title"])
750 |                 elif rec["type"] == "subcat":
751 |                     tmp = rec["title"]
752 |                     if tmp.startswith(self.category_prefix):
753 |                         tmp = tmp[len(self.category_prefix) + 1 :]
754 |                     subcats.append(tmp)
755 | 
756 |             cont = raw_res.get("query-continue", False)
757 |             if cont and "categorymembers" in cont:
758 |                 cont = cont["categorymembers"]
759 |             else:
760 |                 cont = raw_res.get("continue", False)
761 | 
762 |             if cont is False or last_cont == cont:
763 |                 break
764 | 
765 |             returned_results += current_pull
766 |             if results is None or (results - returned_results > 0):
767 |                 last_cont = cont
768 |             else:
769 |                 finished = True
770 | 
771 |             if results is not None and results - returned_results < max_pull:
772 |                 search_params["cmlimit"] = results - returned_results
773 |         # end while loop
774 | 
775 |         return (pages, subcats) if subcategories else pages
776 | 
777 |     def categorytree(self, category: str, depth: int = 5) -> Dict[str, Any]:
778 |         """Generate the category tree for the given categories
779 | 
780 |         Args:
781 |             category (str or list of strings): Category name(s)
782 |             depth (int): Depth to traverse the tree
783 |         Returns:
784 |             dict: Category tree structure
785 |         Note:
786 |             Set depth to **None** to get the whole tree
787 |         Note:
788 |             Return data structure: each subcategory contains the same recursive structure
789 | 
790 |         >>> {
791 |                 'category': {
792 |                     'depth': Number,
793 |                     'links': list,
794 |                     'parent-categories': list,
795 |                     'sub-categories': dict
796 |                 }
797 |             }
798 | 
799 |         .. versionadded:: 0.3.10"""
800 | 
801 |         # make it simple to use either a list or a single category term
802 |         cats = [category] if not isinstance(category, list) else category
803 | 
804 |         self.__category_parameter_verification(cats, depth, category)
805 | 
806 |         results: Dict = {}
807 |         categories: Dict = {}
808 |         links: Dict = {}
809 | 
810 |         for cat in [x for x in cats if x]:
811 |             self.__cat_tree_rec(cat, depth, results, 0, categories, links)
812 |         return results
813 | 
814 |     def page(self, title=None, pageid=None, auto_suggest=True, redirect=True, preload=False):
815 |         """Get a MediaWiki page based on the provided title or pageid
816 | 
817 |         Args:
818 |             title (str): Page title
819 |             pageid (int): MediaWiki page identifier
820 |             auto_suggest (bool): **True:** Allow page title auto-suggest
821 |             redirect (bool): **True:** Follow page redirects
822 |             preload (bool): **True:** Load most page properties
823 |         Raises:
824 |             ValueError: when title is blank or None and no pageid is provided
825 |         Raises:
826 |             :py:func:`mediawiki.exceptions.PageError`: if the page does not exist
827 |         Note:
828 |             Title takes precedence over pageid if both are provided
829 |         Note:
830 |             If the page doesn't match the provided title, try setting auto_suggest to `False`"""
831 |         if (title is None or title.strip() == "") and pageid is None:
832 |             raise ValueError("Either a title or a pageid must be specified")
833 |         if title:
834 |             if auto_suggest:
835 |                 temp_title = self.suggest(title)
836 |                 if temp_title is None:  # page doesn't exist
837 |                     raise PageError(title=title)
838 |                 title = temp_title
839 |             return MediaWikiPage(self, title, redirect=redirect, preload=preload)
840 |         return MediaWikiPage(self, pageid=pageid, preload=preload)
841 | 
842 |     def wiki_request(self, params: Dict[str, Any]) -> Dict[Any, Any]:
843 |         """Make a request to the MediaWiki API using the given search
844 |         parameters
845 | 
846 |         Args:
847 |             params (dict): Request parameters
848 |         Returns:
849 |             dict: A parsed dict of the JSON response
850 |         Note:
851 |             Useful when wanting to query the MediaWiki site for some \
852 |             value that is not part of the wrapper API"""
853 | 
854 |         params["format"] = "json"
855 |         if "action" not in params:
856 |             params["action"] = "query"
857 | 
858 |         limit = self._config.rate_limit
859 |         last_call = self._config._rate_limit_last_call
860 |         if limit and last_call and last_call + self._config.rate_limit_min_wait > datetime.now():
861 |             # call came too quickly for rate-limited API requests; wait
862 |             wait_time = (last_call + self._config.rate_limit_min_wait) - datetime.now()
863 |             time.sleep(wait_time.total_seconds())
864 | 
865 |         req = self._get_response(params)
866 | 
867 |         if self._config.rate_limit:
868 |             self._config._rate_limit_last_call = datetime.now()
869 | 
870 |         return req
871 | 
872 |     # Protected functions
873 |     def _get_site_info(self):
874 |         """Parse out the MediaWiki site information, including the API version and extensions"""
875 | 
876 |         response = self.wiki_request({"meta": 
"siteinfo", "siprop": "extensions|general"}) 877 | 878 | # parse what we need out here! 879 | query = response.get("query", None) 880 | if query is None or query.get("general", None) is None: 881 | raise MediaWikiException("Missing query in response") 882 | 883 | gen = query.get("general", None) 884 | 885 | api_version = gen["generator"].split(" ")[1].split("-")[0] 886 | 887 | major_minor = [int(i) for i in api_version.split(".")] 888 | 889 | self._api_version = tuple(major_minor) 890 | self._api_version_str = ".".join([str(x) for x in self._api_version]) 891 | 892 | # parse the base url out 893 | tmp = gen.get("server", "") 894 | if tmp == "": 895 | raise MediaWikiException("Unable to parse base url") 896 | if tmp.startswith("http://") or tmp.startswith("https://"): 897 | self._base_url = tmp 898 | elif gen["base"].startswith("https:"): 899 | self._base_url = f"https:{tmp}" 900 | else: 901 | self._base_url = f"http:{tmp}" 902 | 903 | self._extensions = [ext["name"] for ext in query["extensions"]] 904 | self._extensions = sorted(list(set(self._extensions))) 905 | 906 | # end _get_site_info 907 | 908 | @staticmethod 909 | def _check_error_response(response, query: str): 910 | """check for default error messages and throw correct exception""" 911 | if "error" in response: 912 | http_error = ["HTTP request timed out.", "Pool queue is full"] 913 | geo_error = [ 914 | "Page coordinates unknown.", 915 | "One of the parameters gscoord, gspage, gsbbox is required", 916 | "Invalid coordinate provided", 917 | ] 918 | err = response["error"]["info"] 919 | if err in http_error: 920 | raise HTTPTimeoutError(query) 921 | if err in geo_error: 922 | raise MediaWikiGeoCoordError(err) 923 | raise MediaWikiException(err) 924 | 925 | @staticmethod 926 | def _check_query(value, message: str): 927 | """check if the query is 'valid'""" 928 | if value is None or value.strip() == "": 929 | raise ValueError(message) 930 | 931 | @staticmethod 932 | def __category_parameter_verification(cats, depth, category): 933 | # parameter verification 934 | if len(cats) == 1 and (cats[0] is None or cats[0] == ""): 935 | msg = ( 936 | "CategoryTree: Parameter 'category' must either " 937 | "be a list of one or more categories or a string; " 938 | f"provided: '{category}'" 939 | ) 940 | raise ValueError(msg) 941 | 942 | if depth is not None and depth < 1: 943 | msg = "CategoryTree: Parameter 'depth' must be either None (for the full tree) or be greater than 0" 944 | raise ValueError(msg) 945 | 946 | def __cat_tree_rec( 947 | self, cat: str, depth: int, tree: Dict[str, Any], level: int, categories: Dict[str, Any], links: Dict[str, Any] 948 | ): 949 | """recursive function to build out the tree""" 950 | tree[cat] = {} 951 | tree[cat]["depth"] = level 952 | tree[cat]["sub-categories"] = {} 953 | tree[cat]["links"] = [] 954 | tree[cat]["parent-categories"] = [] 955 | parent_cats = [] 956 | 957 | if cat not in categories: 958 | tries = 0 959 | while True: 960 | if tries > 10: 961 | raise MediaWikiCategoryTreeError(cat) 962 | try: 963 | pag = self.page(f"{self.category_prefix}:{cat}") 964 | categories[cat] = pag 965 | parent_cats = categories[cat].categories 966 | links[cat] = self.categorymembers(cat, results=None, subcategories=True) 967 | break 968 | except PageError as exc: 969 | raise PageError(f"{self.category_prefix}:{cat}") from exc 970 | except KeyboardInterrupt as exc: 971 | raise exc 972 | except Exception: 973 | tries = tries + 1 974 | # TODO: Should this really sleep? 
975 |                         time.sleep(1)
976 |         else:
977 |             parent_cats = categories[cat].categories
978 | 
979 |         tree[cat]["parent-categories"].extend(parent_cats)
980 |         tree[cat]["links"].extend(links[cat][0])
981 | 
982 |         if depth and level >= depth:
983 |             for ctg in links[cat][1]:
984 |                 tree[cat]["sub-categories"][ctg] = None
985 |         else:
986 |             for ctg in links[cat][1]:
987 |                 self.__cat_tree_rec(
988 |                     ctg,
989 |                     depth,
990 |                     tree[cat]["sub-categories"],
991 |                     level + 1,
992 |                     categories,
993 |                     links,
994 |                 )
995 | 
996 |     def _get_response(self, params: Dict[str, Any]) -> Dict[str, Any]:
997 |         """wrap the call to the requests package"""
998 |         try:
999 |             r = self._session.get(self._config.api_url, params=params, timeout=self._config.timeout)
1000 |             if r.status_code == 403:
1001 |                 raise MediaWikiForbidden(f"{self.api_url} returned a 403 Forbidden; you likely need to log in!")
1002 |             return r.json()
1003 |         except JSONDecodeError:
1004 |             return {}
1005 | 
1006 |     def _post_response(self, params: Dict[str, Any]) -> Dict[str, Any]:
1007 |         """wrap a post call to the requests package"""
1008 |         try:
1009 |             r = self._session.post(self._config.api_url, data=params, timeout=self._config.timeout)
1010 |             if r.status_code == 403:
1011 |                 raise MediaWikiForbidden(f"{self.api_url} returned a 403 Forbidden; you likely need to log in!")
1012 |             return r.json()
1013 |         except JSONDecodeError:
1014 |             return {}
1015 | 
1016 | 
1017 | # end MediaWiki class
1018 | 
--------------------------------------------------------------------------------