├── .bumpversion.cfg ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── labels.yml ├── release-drafter.yml ├── renovate.json └── workflows │ ├── ci.yml │ ├── constraints.txt │ ├── labeler.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CONTRIBUTING.rst ├── LICENSE.rst ├── Makefile ├── README.rst ├── docs ├── _static │ ├── artwork.ai │ ├── default.css │ ├── favicon.ico │ ├── logo.png │ └── logo2.png ├── conf.py ├── contributing.rst ├── datasources │ ├── ClubElo.ipynb │ ├── ESPN.ipynb │ ├── FBref.ipynb │ ├── FotMob.ipynb │ ├── MatchHistory.ipynb │ ├── SoFIFA.ipynb │ ├── Sofascore.ipynb │ ├── Understat.ipynb │ ├── WhoScored.ipynb │ └── index.rst ├── examples │ ├── ClubElo - Evolution of current top teams.ipynb │ ├── MatchHistory - Home advantage.ipynb │ └── index.rst ├── faq.rst ├── howto │ ├── custom-leagues.rst │ ├── index.rst │ └── proxy.rst ├── index.rst ├── intro.rst ├── license.rst ├── output.csv ├── reference │ ├── base.rst │ ├── clubelo.rst │ ├── espn.rst │ ├── fbref.rst │ ├── fotmob.rst │ ├── index.rst │ ├── matchhistory.rst │ ├── sofascore.rst │ ├── sofifa.rst │ ├── understat.rst │ ├── utils.rst │ └── whoscored.rst ├── requirements.txt └── topics │ └── index.rst ├── noxfile.py ├── poetry.lock ├── pyproject.toml ├── soccerdata ├── __init__.py ├── _common.py ├── _config.py ├── clubelo.py ├── espn.py ├── fbref.py ├── fotmob.py ├── match_history.py ├── sofascore.py ├── sofifa.py ├── understat.py └── whoscored.py └── tests ├── __init__.py ├── appdata └── config │ ├── league_dict.json │ └── teamname_replacements.json ├── conftest.py ├── test_ClubElo.py ├── test_ESPN.py ├── test_FBref.py ├── test_FotMob.py ├── test_Integration.py ├── test_MatchHistory.py ├── test_SoFIFA.py ├── test_Sofascore.py ├── test_Understat.py ├── test_Whoscored.py ├── test_common.py └── test_config.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | 
current_version = 1.8.7 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:docs/conf.py] 11 | search = release = "{current_version}" 12 | replace = release = "{new_version}" 13 | 14 | [bumpversion:file:soccerdata/__init__.py] 15 | search = __version__ = "{current_version}" 16 | replace = __version__ = "{new_version}" 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is and the expected behavior. 12 | 13 | **Affected scrapers** 14 | This affects the following scrapers: 15 | - [ ] ClubElo 16 | - [ ] ESPN 17 | - [ ] FBref 18 | - [ ] FotMob 19 | - [ ] Match History 20 | - [ ] Sofascore 21 | - [ ] SoFIFA 22 | - [ ] Understat 23 | - [ ] WhoScored 24 | 25 | **Code example** 26 | A minimal code example that fails. Use `no_cache=True` to make sure an invalid cached file does not cause the bug and make sure you have the latest version of soccerdata installed. 27 | 28 | ```python 29 | import soccerdata as sd 30 | fbref = sd.FBref(leagues="ENG-Premier League", seasons="24/25", no_cache=True) 31 | fbref.read_schedule() 32 | ``` 33 | 34 | **Error message** 35 | 36 | ``` 37 | 38 | ``` 39 | 40 | **Additional context** 41 | Add any other context about the problem here. 42 | 43 | **Contributor Action Plan** 44 | 45 | - [ ] I can fix this issue and will submit a pull request. 46 | - [ ] I’m unsure how to fix this, but I'm willing to work on it with guidance. 47 | - [ ] I’m not able to fix this issue. 
48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Label names are important as they are used by Release Drafter to decide 3 | # where to record them in the changelog or whether to skip them. 4 | # 5 | # The repository labels will be automatically configured using this file and 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
7 | - name: breaking 8 | description: Breaking Changes 9 | color: bfd4f2 10 | - name: bug 11 | description: Something isn't working 12 | color: d73a4a 13 | - name: build 14 | description: Build System and Dependencies 15 | color: bfdadc 16 | - name: ci 17 | description: Continuous Integration 18 | color: 4a97d6 19 | - name: dependencies 20 | description: Pull requests that update a dependency file 21 | color: 0366d6 22 | - name: documentation 23 | description: Improvements or additions to documentation 24 | color: 0075ca 25 | - name: duplicate 26 | description: This issue or pull request already exists 27 | color: cfd3d7 28 | - name: enhancement 29 | description: New feature or request 30 | color: a2eeef 31 | - name: github_actions 32 | description: Pull requests that update Github_actions code 33 | color: "000000" 34 | - name: good first issue 35 | description: Good for newcomers 36 | color: 7057ff 37 | - name: help wanted 38 | description: Extra attention is needed 39 | color: 008672 40 | - name: invalid 41 | description: This doesn't seem right 42 | color: e4e669 43 | - name: performance 44 | description: Performance 45 | color: "016175" 46 | - name: question 47 | description: Further information is requested 48 | color: d876e3 49 | - name: refactoring 50 | description: Refactoring 51 | color: ef67c4 52 | - name: removal 53 | description: Removals and Deprecations 54 | color: 9ae7ea 55 | - name: testing 56 | description: Testing 57 | color: b1fc6f 58 | - name: wontfix 59 | description: This will not be worked on 60 | color: ffffff 61 | - name: common 62 | description: Issue or pull request related to all scrapers 63 | color: F1C40F 64 | - name: ClubElo 65 | description: Issue or pull request related to the ClubElo scraper 66 | color: "273746" 67 | - name: ESPN 68 | description: Issue or pull request related to the ESPN scraper 69 | color: "943126" 70 | - name: FBref 71 | description: Issue or pull request related to the FBref scraper 72 | color: 145A32 73 | - 
name: FiveThirtyEight 74 | description: Issue or pull request related to the FiveThirtyEight scraper 75 | color: E67E22 76 | - name: FotMob 77 | description: Issue or pull request related to the FotMob scraper 78 | color: 228B22 79 | - name: MatchHistory 80 | description: Issue or pull request related to the MatchHistory scraper 81 | color: 1A5276 82 | - name: Sofascore 83 | description: Issue or pull request related to the Sofascore scraper 84 | color: 3740F5 85 | - name: SoFIFA 86 | description: Issue or pull request related to the SoFIFA scraper 87 | color: 138D75 88 | - name: WhoScored 89 | description: Issue or pull request related to the WhoScored scraper 90 | color: 76448A 91 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: ":boom: Breaking Changes" 3 | label: "breaking" 4 | - title: ":rocket: Features" 5 | label: "enhancement" 6 | - title: ":fire: Removals and Deprecations" 7 | label: "removal" 8 | - title: ":beetle: Fixes" 9 | label: "bug" 10 | - title: ":racehorse: Performance" 11 | label: "performance" 12 | - title: ":rotating_light: Testing" 13 | label: "testing" 14 | - title: ":construction_worker: Continuous Integration" 15 | label: "ci" 16 | - title: ":books: Documentation" 17 | label: "documentation" 18 | - title: ":hammer: Refactoring" 19 | label: "refactoring" 20 | - title: ":lipstick: Style" 21 | label: "style" 22 | - title: ":package: Dependencies" 23 | labels: 24 | - "dependencies" 25 | - "build" 26 | change-template: "* $TITLE (#$NUMBER) @$AUTHOR" 27 | template: | 28 | ## Changes 29 | 30 | $CHANGES 31 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":automergePatch"], 3 | "stabilityDays": 7, 4 | 
"addLabels": ["dependencies"], 5 | "pip_requirements": { 6 | "fileMatch": ["constraints.txt"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | tests: 7 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - {python: "3.11", os: "ubuntu-latest", session: "pre-commit"} 14 | - {python: "3.11", os: "ubuntu-latest", session: "mypy"} 15 | - {python: "3.11", os: "ubuntu-latest", session: "tests"} 16 | - {python: "3.10", os: "ubuntu-latest", session: "tests"} 17 | - {python: "3.9", os: "ubuntu-latest", session: "tests"} 18 | - {python: "3.11", os: "windows-latest", session: "tests"} 19 | - {python: "3.11", os: "macos-latest", session: "tests"} 20 | - {python: "3.11", os: "ubuntu-latest", session: "docs-build"} 21 | env: 22 | NOXSESSION: ${{ matrix.session }} 23 | FORCE_COLOR: "1" 24 | PRE_COMMIT_COLOR: "always" 25 | steps: 26 | - name: Check out the repository 27 | uses: actions/checkout@v4.2.2 28 | - name: Restore data cache 29 | if: matrix.session == 'tests' 30 | id: cache-data 31 | uses: actions/cache@v4 32 | with: 33 | path: tests/appdata/data 34 | key: cache-data-${{ runner.os }}-${{ matrix.python }} 35 | - name: Set up Python ${{ matrix.python }} 36 | uses: actions/setup-python@v5.6.0 37 | with: 38 | python-version: ${{ matrix.python }} 39 | - name: Upgrade pip 40 | run: | 41 | pip install --constraint=.github/workflows/constraints.txt pip 42 | pip --version 43 | - name: Upgrade pip in virtual environments 44 | shell: python 45 | run: | 46 | import os 47 | import pip 48 | 49 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 50 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 51 | - name: Install Poetry 52 | run: | 53 | pipx 
install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt poetry 54 | poetry --version 55 | - name: Install Nox 56 | run: | 57 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox 58 | pipx inject --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox nox-poetry 59 | nox --version 60 | - name: Compute pre-commit cache key 61 | if: matrix.session == 'pre-commit' 62 | id: pre-commit-cache 63 | shell: python 64 | run: | 65 | import hashlib 66 | import sys 67 | 68 | python = "py{}.{}".format(*sys.version_info[:2]) 69 | payload = sys.version.encode() + sys.executable.encode() 70 | digest = hashlib.sha256(payload).hexdigest() 71 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 72 | 73 | print("::set-output name=result::{}".format(result)) 74 | - name: Restore pre-commit cache 75 | uses: actions/cache@v4.2.3 76 | if: matrix.session == 'pre-commit' 77 | with: 78 | path: ~/.cache/pre-commit 79 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 80 | restore-keys: | 81 | ${{ steps.pre-commit-cache.outputs.result }}- 82 | - name: Install pandoc 83 | if: matrix.session == 'docs-build' 84 | run: sudo apt-get install -y pandoc 85 | - name: Run Nox 86 | run: | 87 | nox --force-color --python=${{ matrix.python }} 88 | - name: Upload coverage data 89 | if: always() && matrix.session == 'tests' 90 | uses: actions/upload-artifact@v4 91 | with: 92 | name: coverage-data-${{ matrix.os }}-${{ matrix.python }} 93 | path: ".coverage.*" 94 | include-hidden-files: true 95 | - name: Upload documentation 96 | if: matrix.session == 'docs-build' 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: docs 100 | path: docs/_build 101 | coverage: 102 | runs-on: ubuntu-latest 103 | needs: tests 104 | steps: 105 | - name: Check out the repository 106 | uses: actions/checkout@v4.2.2 107 | - name: Set up Python 108 | 
uses: actions/setup-python@v5.6.0 109 | with: 110 | python-version: "3.11" 111 | - name: Upgrade pip 112 | run: | 113 | pip install --constraint=.github/workflows/constraints.txt pip 114 | pip --version 115 | - name: Install Poetry 116 | run: | 117 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt poetry 118 | poetry --version 119 | - name: Install Nox 120 | run: | 121 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox 122 | pipx inject --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox nox-poetry 123 | nox --version 124 | - name: Download coverage data 125 | uses: actions/download-artifact@v4 126 | with: 127 | pattern: coverage-data-* 128 | merge-multiple: true 129 | - name: Combine coverage data and display human readable report 130 | run: | 131 | nox --force-color --session=coverage 132 | - name: Create coverage report 133 | run: | 134 | nox --force-color --session=coverage -- xml 135 | - name: Upload coverage report 136 | uses: codecov/codecov-action@v5.4.2 137 | -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | pip==25.1.1 2 | nox==2025.5.1 3 | nox-poetry==1.1.0 4 | poetry==1.8.5 5 | virtualenv==20.31.2 6 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: Labeler 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | labeler: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out the repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Run Labeler 17 | uses: crazy-max/ghaction-github-labeler@v5.3.0 18 | with: 19 | skip-delete: true 20 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v4.2.2 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5.6.0 21 | with: 22 | python-version: "3.8" 23 | 24 | - name: Upgrade pip 25 | run: | 26 | pip install --constraint=.github/workflows/constraints.txt pip 27 | pip --version 28 | 29 | - name: Install Poetry 30 | run: | 31 | pip install --constraint=.github/workflows/constraints.txt poetry 32 | poetry --version 33 | 34 | - name: Check if there is a parent commit 35 | id: check-parent-commit 36 | run: | 37 | echo "::set-output name=sha::$(git rev-parse --verify --quiet HEAD^)" 38 | 39 | - name: Detect and tag new version 40 | id: check-version 41 | if: steps.check-parent-commit.outputs.sha 42 | uses: salsify/action-detect-and-tag-new-version@v2.0.3 43 | with: 44 | version-command: | 45 | bash -o pipefail -c "poetry version | awk '{ print \$2 }'" 46 | 47 | - name: Bump version for developmental release 48 | if: "! steps.check-version.outputs.tag" 49 | run: | 50 | poetry version patch && 51 | version=$(poetry version | awk '{ print $2 }') && 52 | poetry version $version.dev.$(date +%s) 53 | 54 | - name: Build package 55 | run: | 56 | poetry build --ansi 57 | 58 | - name: Publish package on PyPI 59 | if: steps.check-version.outputs.tag 60 | uses: pypa/gh-action-pypi-publish@v1.12.4 61 | with: 62 | user: __token__ 63 | password: ${{ secrets.PYPI_TOKEN }} 64 | 65 | - name: Publish package on TestPyPI 66 | if: "! 
steps.check-version.outputs.tag" 67 | uses: pypa/gh-action-pypi-publish@v1.12.4 68 | with: 69 | user: __token__ 70 | password: ${{ secrets.TEST_PYPI_TOKEN }} 71 | repository_url: https://test.pypi.org/legacy/ 72 | 73 | - name: Publish the release notes 74 | uses: release-drafter/release-drafter@v6.1.0 75 | with: 76 | publish: ${{ steps.check-version.outputs.tag != '' }} 77 | tag: ${{ steps.check-version.outputs.tag }} 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | logs 3 | config 4 | notebooks/data 5 | notebooks_priv 6 | 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | 35 | # Translations 36 | *.mo 37 | 38 | # Data 39 | .ipynb_checkpoints 40 | 41 | # Sphinx documentation 42 | docs/_build/ 43 | docs/modules/generated/ 44 | 45 | # Hidden files 46 | .* 47 | 48 | # ...except these 49 | !.gitignore 50 | !.travis.yml 51 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: check-added-large-files 5 | name: Check for added large files 6 | entry: check-added-large-files 7 | language: system 8 | - id: check-toml 9 | name: Check Toml 10 | entry: check-toml 11 | language: system 12 | types: [toml] 13 | - id: check-yaml 14 | name: Check Yaml 15 | entry: check-yaml 16 | language: system 17 | types: [yaml] 18 | - id: darglint 19 | name: darglint 20 | entry: darglint 21 | language: system 22 | 
types: [python] 23 | stages: [manual] 24 | - id: end-of-file-fixer 25 | name: Fix End of Files 26 | entry: end-of-file-fixer 27 | language: system 28 | types: [text] 29 | stages: [commit, push, manual] 30 | - id: pyupgrade 31 | name: pyupgrade 32 | description: Automatically upgrade syntax for newer versions. 33 | entry: pyupgrade 34 | language: system 35 | types: [python] 36 | args: [--py39-plus] 37 | - id: trailing-whitespace 38 | name: Trim Trailing Whitespace 39 | entry: trailing-whitespace-fixer 40 | language: system 41 | types: [text] 42 | stages: [commit, push, manual] 43 | - repo: https://github.com/astral-sh/ruff-pre-commit 44 | # Ruff version. 45 | rev: v0.4.9 46 | hooks: 47 | # Run the linter. 48 | - id: ruff 49 | args: [--fix] 50 | # Run the formatter. 51 | - id: ruff-format 52 | - repo: https://github.com/pre-commit/mirrors-prettier 53 | rev: v3.1.0 54 | hooks: 55 | - id: prettier 56 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Configuration for the documentation build process. 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF 19 | formats: all 20 | 21 | # Optionally set the version of Python and requirements required to build your docs 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Contributor Guide 3 | ================= 4 | 5 | This document lays out guidelines and advice for contributing to this project. 6 | If you're thinking of contributing, please start by reading this document and 7 | getting a feel for how contributing to this project works. If you have any 8 | questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer. 9 | 10 | .. _Pieter Robberechts: https://people.cs.kuleuven.be/~pieter.robberechts/ 11 | 12 | The guide is split into sections based on the type of contribution you're 13 | thinking of making. 14 | 15 | 16 | .. _bug-reports: 17 | 18 | Bug Reports 19 | ----------- 20 | 21 | Bug reports are hugely important! Before you raise one, though, please check 22 | through the `GitHub issues`_, **both open and closed**, to confirm that the bug 23 | hasn't been reported before. 24 | 25 | When filing an issue, make sure to answer these questions: 26 | 27 | - Which Python version are you using? 28 | - Which version of soccerdata are you using? 29 | - What did you do? 30 | - What did you expect to see? 31 | - What did you see instead? 32 | 33 | The best way to get your bug fixed is to provide a test case, 34 | and/or steps to reproduce the issue. 35 | 36 | .. _GitHub issues: https://github.com/probberechts/soccerdata/issues 37 | 38 | 39 | Feature Requests 40 | ---------------- 41 | 42 | If you believe there is a feature missing, feel free to raise a feature 43 | request on the `Issue Tracker`_. 44 | 45 | .. _Issue tracker: https://github.com/probberechts/soccerdata/issues 46 | 47 | 48 | Documentation Contributions 49 | --------------------------- 50 | 51 | Documentation improvements are always welcome! The documentation files live in 52 | the ``docs/`` directory of the codebase. 
They're written in 53 | `reStructuredText`_, and use `Sphinx`_ to generate the full suite of 54 | documentation. 55 | 56 | You do not have to set up a development environment to make small changes to 57 | the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes. 58 | 59 | When contributing documentation, please do your best to follow the style of the 60 | documentation files. This means a soft-limit of 79 characters wide in your text 61 | files and a semi-formal, yet friendly and approachable, prose style. 62 | 63 | When presenting Python code, use single-quoted strings (``'hello'`` instead of 64 | ``"hello"``). 65 | 66 | .. _reStructuredText: http://docutils.sourceforge.net/rst.html 67 | .. _Sphinx: http://sphinx-doc.org/index.html 68 | .. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files 69 | 70 | 71 | Code Contributions 72 | ------------------ 73 | 74 | If you intend to contribute code, do not feel the need to sit on your 75 | contribution until it is perfectly polished and complete. It helps everyone 76 | involved for you to seek feedback as early as you possibly can. Submitting an 77 | early, unfinished version of your contribution for feedback can save you from 78 | putting a lot of work into a contribution that is not suitable for the 79 | project. 80 | 81 | Setting up your development environment 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | 84 | You need Python 3.9+ and the following tools: 85 | 86 | - Poetry_ 87 | - Nox_ 88 | - nox-poetry_ 89 | 90 | Install the package with development requirements: 91 | 92 | .. code:: console 93 | 94 | $ poetry install 95 | $ poetry self add poetry-plugin-export 96 | 97 | You can now run an interactive Python session. 98 | 99 | .. code:: console 100 | 101 | $ poetry run python 102 | 103 | .. _Poetry: https://python-poetry.org/ 104 | .. _Nox: https://nox.thea.codes/ 105 | ..
_nox-poetry: https://nox-poetry.readthedocs.io/ 106 | 107 | Steps for submitting Code 108 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 109 | 110 | When contributing code, you'll want to follow this checklist: 111 | 112 | 1. Fork the repository on GitHub. 113 | 2. Run the tests to confirm they all pass on your system. If they don't, you'll 114 | need to investigate why they fail. If you're unable to diagnose this 115 | yourself, raise it as a bug report. 116 | 3. Write tests that demonstrate your bug or feature. Ensure that they fail. 117 | 4. Make your change. 118 | 5. Run the entire test suite again, confirming that all tests pass *including 119 | the ones you just added*. 120 | 6. Make sure your code follows the code style discussed below. 121 | 7. Send a GitHub Pull Request to the main repository's ``master`` branch. 122 | GitHub Pull Requests are the expected method of code collaboration on this 123 | project. 124 | 125 | Testing the project 126 | ~~~~~~~~~~~~~~~~~~~ 127 | 128 | Run the full test suite: 129 | 130 | .. code:: console 131 | 132 | $ nox 133 | 134 | List the available Nox sessions: 135 | 136 | .. code:: console 137 | 138 | $ nox --list-sessions 139 | 140 | You can also run a specific Nox session. 141 | For example, invoke the unit test suite like this: 142 | 143 | .. code:: console 144 | 145 | $ nox --session=tests 146 | 147 | Unit tests are located in the ``tests`` directory, 148 | and are written using the pytest_ testing framework. 149 | 150 | .. _pytest: https://pytest.readthedocs.io/ 151 | 152 | Code style 153 | ~~~~~~~~~~~ 154 | 155 | The soccerdata codebase uses the `PEP 8`_ code style. In addition, we have 156 | a few guidelines: 157 | 158 | - Line-length can exceed 79 characters, to 100, when convenient. 159 | - Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient. 160 | - Always use double-quoted strings (e.g. ``"#soccer"``), unless a double-quote occurs within the string. 
161 | 162 | To ensure all code conforms to this format. You can format the code using the 163 | pre-commit hooks. 164 | 165 | .. code:: console 166 | 167 | $ nox --session=pre-commit 168 | 169 | Docstrings are to follow the `numpydoc guidelines`_. 170 | 171 | .. _PEP 8: https://pep8.org/ 172 | .. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html 173 | 174 | Submitting changes 175 | ~~~~~~~~~~~~~~~~~~ 176 | 177 | Open a `pull request`_ to submit changes to this project. 178 | 179 | Your pull request needs to meet the following guidelines for acceptance: 180 | 181 | - The Nox test suite must pass without errors and warnings. 182 | - Include unit tests. 183 | - If your changes add functionality, update the documentation accordingly. 184 | 185 | Feel free to submit early, though. We can always iterate on this. 186 | 187 | To run linting and code formatting checks before committing your change, you 188 | can install pre-commit as a Git hook by running the following command: 189 | 190 | .. code:: console 191 | 192 | $ nox --session=pre-commit -- install 193 | 194 | It is recommended to open an issue before starting work on anything. 195 | 196 | .. _pull request: https://github.com/probberechts/soccerdata/pulls 197 | .. github-only 198 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Apache License 2 | ============== 3 | 4 | Copyright (c) 2021 Pieter Robberechts 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | This file incorporates code of the `footballdata`_ software package covered 19 | by the following copyright and permission notice: 20 | 21 | Copyright (c) 2017 skagr 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | .. 
_footballdata: https://github.com/skagr/footballdata 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init test lint pretty 2 | 3 | BIN = .venv/bin/ 4 | CODE = soccerdata 5 | PY = 3.9 6 | 7 | init: 8 | python3 -m venv .venv 9 | poetry install 10 | 11 | test: 12 | nox -rs tests-$(PY) -- $(args) 13 | 14 | mypy: 15 | nox -rs mypy-$(PY) -- $(args) 16 | 17 | lint: 18 | nox -rs pre-commit -- $(args) 19 | 20 | precommit_install: 21 | nox -rs pre-commit -- install 22 | 23 | bump_major: 24 | $(BIN)bumpversion major 25 | 26 | bump_minor: 27 | $(BIN)bumpversion minor 28 | 29 | bump_patch: 30 | $(BIN)bumpversion patch 31 | 32 | clean: 33 | find . -type f -name "*.py[co]" -delete 34 | find . -type d -name "__pycache__" -delete 35 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/probberechts/soccerdata/master/docs/_static/logo2.png 2 | :align: center 3 | :alt: SoccerData 4 | :width: 600px 5 | 6 | .. badges-begin 7 | 8 | |Downloads| |PyPI| |Python Version| |License| |Read the Docs| |Tests| |Codecov| |pre-commit| |Black| 9 | 10 | .. |Downloads| image:: https://static.pepy.tech/badge/soccerdata/month 11 | :target: https://pepy.tech/project/soccerdata 12 | :alt: Downloads Per Month 13 | .. |PyPI| image:: https://img.shields.io/pypi/v/soccerdata.svg 14 | :target: https://pypi.org/project/soccerdata/ 15 | :alt: PyPI 16 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/soccerdata 17 | :target: https://pypi.org/project/soccerdata 18 | :alt: Python Version 19 | .. |License| image:: https://img.shields.io/pypi/l/soccerdata.svg 20 | :target: https://opensource.org/licenses/Apache-2.0 21 | :alt: License 22 | .. 
|Read the Docs| image:: https://img.shields.io/readthedocs/soccerdata/latest.svg?label=Read%20the%20Docs 23 | :target: https://soccerdata.readthedocs.io/ 24 | :alt: Read the documentation at https://soccerdata.readthedocs.io/ 25 | .. |Tests| image:: https://github.com/probberechts/soccerdata/workflows/CI/badge.svg 26 | :target: https://github.com/probberechts/soccerdata/actions?workflow=CI 27 | :alt: Tests 28 | .. |Codecov| image:: https://codecov.io/gh/probberechts/soccerdata/branch/master/graph/badge.svg 29 | :target: https://app.codecov.io/gh/probberechts/soccerdata 30 | :alt: Codecov 31 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 32 | :target: https://github.com/pre-commit/pre-commit 33 | :alt: pre-commit 34 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 35 | :target: https://github.com/psf/black 36 | :alt: Black 37 | 38 | .. badges-end 39 | 40 | SoccerData is a collection of scrapers to gather soccer data from popular 41 | websites, including `Club Elo`_, `ESPN`_, `FBref`_, 42 | `Football-Data.co.uk`_, `FotMob`_, `Sofascore`_, `SoFIFA`_, `Understat`_ and `WhoScored`_. 43 | You get Pandas DataFrames with sensible, matching column names and identifiers 44 | across datasets. Data is downloaded when needed and cached locally. 45 | 46 | .. code:: python 47 | 48 | import soccerdata as sd 49 | 50 | # Create a scraper class instance for the 2020/21 Premier League 51 | fbref = sd.FBref('ENG-Premier League', '2021') 52 | 53 | # Fetch data 54 | games = fbref.read_schedule() 55 | team_season_stats = fbref.read_team_season_stats(stat_type="passing") 56 | player_season_stats = fbref.read_player_season_stats(stat_type="standard") 57 | 58 | To learn how to install, configure and use SoccerData, see the 59 | `Quickstart guide `__. For documentation on each of the 60 | supported data sources, see the `example notebooks `__ 61 | and `API reference `__. 62 | 63 | .. 
_Club Elo: https://www.clubelo.com/ 64 | .. _ESPN: https://www.espn.com/soccer/ 65 | .. _FBref: https://www.fbref.com/en/ 66 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 67 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 68 | .. _FotMob: https://fotmob.com/ 69 | .. _Sofascore: https://www.sofascore.com/ 70 | .. _SoFIFA: https://sofifa.com/ 71 | .. _Understat: https://understat.com/ 72 | .. _WhoScored: https://www.whoscored.com/ 73 | 74 | **Usage Notice:** Please use this web scraping tool responsibly and in compliance with the terms of service of the 75 | websites you intend to scrape. The software is provided as-is, without any warranty or guarantees of any kind. The 76 | developers disclaim any responsibility for misuse, legal consequences, or damages resulting from its use. It is 77 | your responsibility to use the software in accordance with the laws and regulations of your jurisdiction. 78 | 79 | **Contribution and Issues:** As SoccerData relies on web scraping, any changes to the 80 | scraped websites will break the package. Hence, do not expect that all code 81 | will work all the time. If you spot any bugs, then please `fork it and start 82 | a pull request `__. 
83 | -------------------------------------------------------------------------------- /docs/_static/artwork.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/artwork.ai -------------------------------------------------------------------------------- /docs/_static/default.css: -------------------------------------------------------------------------------- 1 | .toctree-l1 a:active, 2 | .toctree-l1 a:hover { 3 | background-color: #676767; 4 | } 5 | 6 | .sidebar-logo { 7 | max-width: 100%; 8 | } 9 | 10 | .sidebar-drawer { 11 | width: calc(50% - 25em); 12 | min-width: 22em; 13 | } 14 | 15 | .sidebar-drawer .sidebar-container { 16 | width: 23em; 17 | } 18 | 19 | li.toctree-l2 { 20 | font-size: 80%; 21 | } 22 | 23 | @media (max-width: 67em) { 24 | .sidebar-drawer { 25 | width: 22em; 26 | left: -22em; 27 | } 28 | .sidebar-drawer .sidebar-container { 29 | width: 22em; 30 | } 31 | li.toctree-l2 { 32 | font-size: 75%; 33 | } 34 | } 35 | 36 | /* autosummary table text */ 37 | article .align-center, 38 | article .align-default { 39 | text-align: left; 40 | } 41 | 42 | dt { 43 | font-weight: bold !important; 44 | } 45 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/logo2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/logo2.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | 3 | # -- Project information ----------------------------------------------------- 4 | 5 | project = "soccerdata" 6 | author = "Pieter Robberechts" 7 | copyright = f"2021, {author}" # noqa: A001 8 | 9 | # The full version, including alpha/beta/rc tags 10 | release = "1.8.7" 11 | 12 | # -- General configuration ------------------------------------------------ 13 | 14 | extensions = [ 15 | "sphinx.ext.autodoc", 16 | "sphinx.ext.napoleon", 17 | "nbsphinx", 18 | # 'sphinx_gallery.load_style', 19 | ] 20 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 21 | autodoc_typehints = "description" 22 | autodoc_member_order = "bysource" 23 | 24 | # -- Options for HTML output ------------------------------------------------- 25 | 26 | html_theme = "furo" 27 | html_logo = "_static/logo2.png" 28 | html_favicon = "_static/favicon.ico" 29 | html_theme_options = { 30 | "sidebar_hide_name": True, 31 | "light_css_variables": { 32 | "color-brand-primary": "#2F3C7E", 33 | "color-brand-content": "#2F3C7E", 34 | "color-sidebar-background": "#fdf3f4", 35 | # "color-api-name": "#7bb5b2", 36 | # "color-api-pre-name": "#7bb5b2", 37 | }, 38 | "dark_css_variables": { 39 | "color-brand-primary": "#7C4DFF", 40 | "color-brand-content": "#7C4DFF", 41 | }, 42 | } 43 | 44 | html_static_path = ["_static"] 45 | html_css_files = ["default.css"] 46 | 47 | # -- Options for nbsphinx --------------------------------------------------- 48 | 49 | nbsphinx_thumbnails = { 50 | "examples/datasources/ClubElo": "_static/ClubElo-logo.png", 51 | "examples/datasources/ESPN": "_static/ESPN-logo.png", 52 | 
"examples/datasources/WhoScored": "_static/WhoScored-logo.png", 53 | "examples/datasources/FBref": "_static/FBref-logo.png", 54 | "examples/datasources/FiveThirtyEight": "_static/FiveThirtyEight-logo.png", 55 | "examples/datasources/MatchHistory": "_static/FootballData-logo.jpg", 56 | "examples/datasources/SoFIFA": "_static/SoFIFA-logo.png", 57 | "examples/datasources/Understat": "_static/Understat-logo.png", 58 | } 59 | 60 | # This is processed by Jinja2 and inserted before each notebook 61 | nbsphinx_prolog = r""" 62 | {% set docname = 'doc/' + env.doc2path(env.docname, base=None) %} 63 | 64 | .. raw:: html 65 | 66 |
67 | This page was generated from 68 | {{ docname|e }}.
69 | You can download the notebook, 70 | 85 |
86 | 87 | .. raw:: latex 88 | 89 | \nbsphinxstartnotebook{\scriptsize\noindent\strut 90 | \textcolor{gray}{The following section was generated from 91 | \sphinxcode{\sphinxupquote{\strut {{ docname | escape_latex }}}} \dotfill}} 92 | """ # noqa 93 | 94 | # This is processed by Jinja2 and inserted after each notebook 95 | nbsphinx_epilog = r""" 96 | {% set docname = 'doc/' + env.doc2path(env.docname, base=None) %} 97 | .. raw:: latex 98 | 99 | \nbsphinxstopnotebook{\scriptsize\noindent\strut 100 | \textcolor{gray}{\dotfill\ \sphinxcode{\sphinxupquote{\strut 101 | {{ docname | escape_latex }}}} ends here.}} 102 | """ 103 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | .. include:: ../CONTRIBUTING.rst 3 | -------------------------------------------------------------------------------- /docs/datasources/ClubElo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 16 | "env: SOCCERDATA_NOCACHE=True\n", 17 | "env: SOCCERDATA_NOSTORE=True\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "%env SOCCERDATA_NOCACHE=True\n", 24 | "%env SOCCERDATA_NOSTORE=True" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "2454afe6", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import soccerdata as sd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "b5784f2d", 40 | "metadata": {}, 41 | "source": [ 42 | "# ClubElo" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": 
"8dab5be9", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Provides pd.DataFrames from CSV API at http://api.clubelo.com.\n", 56 | "\n", 57 | " Data will be downloaded as necessary and cached locally in\n", 58 | " ``~/soccerdata/data/ClubElo``.\n", 59 | "\n", 60 | " Since the source does not provide league names, this class will not filter\n", 61 | " by league. League names will be inserted from the other sources where\n", 62 | " available. Leagues that are only covered by clubelo.com will have NaN\n", 63 | " values.\n", 64 | "\n", 65 | " Parameters\n", 66 | " ----------\n", 67 | " proxy : 'tor' or or dict or list(dict) or callable, optional\n", 68 | " Use a proxy to hide your IP address. Valid options are:\n", 69 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 70 | " the background on port 9050.\n", 71 | " - dict: A dictionary with the proxy to use. The dict should be\n", 72 | " a mapping of supported protocols to proxy addresses. For example::\n", 73 | "\n", 74 | " {\n", 75 | " 'http': 'http://10.10.1.10:3128',\n", 76 | " 'https': 'http://10.10.1.10:1080',\n", 77 | " }\n", 78 | "\n", 79 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 80 | " be selected from this list after failed requests, allowing rotating\n", 81 | " proxies.\n", 82 | " - callable: A function that returns a valid proxy. 
This function will\n", 83 | " be called after failed requests, allowing rotating proxies.\n", 84 | " no_cache : bool\n", 85 | " If True, will not use cached data.\n", 86 | " no_store : bool\n", 87 | " If True, will not store downloaded data.\n", 88 | " data_dir : Path\n", 89 | " Path to directory where data will be cached.\n", 90 | " \n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "elo = sd.ClubElo()\n", 96 | "print(elo.__doc__)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "3a4c2916", 102 | "metadata": {}, 103 | "source": [ 104 | "## ELO scores for all teams at specified date" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "745be31a", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [ 116 | "
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
rankcountrylevelelofromtoleague
team
Liverpool1.0ENG12047.0838622022-04-202022-04-24ENG-Premier League
Man City2.0ENG12037.0599372022-04-212022-04-23ENG-Premier League
Bayern3.0GER11984.7753912022-04-182022-04-23GER-Bundesliga
Real Madrid4.0ESP11969.5843512022-04-212022-04-26ESP-La Liga
Chelsea5.0ENG11921.1014402022-04-212022-04-24ENG-Premier League
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " rank country level elo from to \\\n", 210 | "team \n", 211 | "Liverpool 1.0 ENG 1 2047.083862 2022-04-20 2022-04-24 \n", 212 | "Man City 2.0 ENG 1 2037.059937 2022-04-21 2022-04-23 \n", 213 | "Bayern 3.0 GER 1 1984.775391 2022-04-18 2022-04-23 \n", 214 | "Real Madrid 4.0 ESP 1 1969.584351 2022-04-21 2022-04-26 \n", 215 | "Chelsea 5.0 ENG 1 1921.101440 2022-04-21 2022-04-24 \n", 216 | "\n", 217 | " league \n", 218 | "team \n", 219 | "Liverpool ENG-Premier League \n", 220 | "Man City ENG-Premier League \n", 221 | "Bayern GER-Bundesliga \n", 222 | "Real Madrid ESP-La Liga \n", 223 | "Chelsea ENG-Premier League " 224 | ] 225 | }, 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "current_elo = elo.read_by_date()\n", 233 | "current_elo.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "246ca661", 239 | "metadata": {}, 240 | "source": [ 241 | "## Full ELO history for one club" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "1c87e14a", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
rankteamcountryleveleloto
from
1939-10-22NaNBarcelonaESP11636.7045901939-12-03
1939-12-04NaNBarcelonaESP11626.1021731939-12-10
1939-12-11NaNBarcelonaESP11636.7282711939-12-17
1939-12-18NaNBarcelonaESP11646.9516601939-12-24
1939-12-25NaNBarcelonaESP11637.4243161939-12-31
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " rank team country level elo to\n", 340 | "from \n", 341 | "1939-10-22 NaN Barcelona ESP 1 1636.704590 1939-12-03\n", 342 | "1939-12-04 NaN Barcelona ESP 1 1626.102173 1939-12-10\n", 343 | "1939-12-11 NaN Barcelona ESP 1 1636.728271 1939-12-17\n", 344 | "1939-12-18 NaN Barcelona ESP 1 1646.951660 1939-12-24\n", 345 | "1939-12-25 NaN Barcelona ESP 1 1637.424316 1939-12-31" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "barca_elo = elo.read_team_history(\"Barcelona\")\n", 355 | "barca_elo.head()" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "soccerdata", 362 | "language": "python", 363 | "name": "soccerdata" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.9.6" 376 | }, 377 | "toc": { 378 | "base_numbering": 1, 379 | "nav_menu": {}, 380 | "number_sections": true, 381 | "sideBar": true, 382 | "skip_h1_title": false, 383 | "title_cell": "Table of Contents", 384 | "title_sidebar": "Contents", 385 | "toc_cell": false, 386 | "toc_position": {}, 387 | "toc_section_display": true, 388 | "toc_window_display": true 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 5 393 | } 394 | -------------------------------------------------------------------------------- /docs/datasources/Sofascore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.set_option('display.max_columns', None)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | 
"execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "env: SOCCERDATA_NOCACHE=True\n", 24 | "env: SOCCERDATA_NOSTORE=True\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 30 | "%env SOCCERDATA_NOCACHE=True\n", 31 | "%env SOCCERDATA_NOSTORE=True" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import soccerdata as sd" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "# Sofascore" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Provides pd.DataFrames from data available at http://www.sofascore.com.\n", 60 | "\n", 61 | " Data will be downloaded as necessary and cached locally in\n", 62 | " ``~/soccerdata/data/Sofascore``.\n", 63 | "\n", 64 | " Parameters\n", 65 | " ----------\n", 66 | " leagues : string or iterable, optional\n", 67 | " IDs of Leagues to include.\n", 68 | " seasons : string, int or list, optional\n", 69 | " Seasons to include. Supports multiple formats.\n", 70 | " Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]\n", 71 | " proxy : 'tor' or dict or list(dict) or callable, optional\n", 72 | " Use a proxy to hide your IP address. Valid options are:\n", 73 | " - 'tor': Uses the Tor network. Tor should be running in\n", 74 | " the background on port 9050.\n", 75 | " - dict: A dictionary with the proxy to use. The dict should be\n", 76 | " a mapping of supported protocols to proxy addresses. For example::\n", 77 | "\n", 78 | " {\n", 79 | " 'http': 'http://10.10.1.10:3128',\n", 80 | " 'https': 'http://10.10.1.10:1080',\n", 81 | " }\n", 82 | "\n", 83 | " - list(dict): A list of proxies to choose from. 
A different proxy will\n", 84 | " be selected from this list after failed requests, allowing rotating\n", 85 | " proxies.\n", 86 | " - callable: A function that returns a valid proxy. This function will\n", 87 | " be called after failed requests, allowing rotating proxies.\n", 88 | " no_cache : bool\n", 89 | " If True, will not use cached data.\n", 90 | " no_store : bool\n", 91 | " If True, will not store downloaded data.\n", 92 | " data_dir : Path\n", 93 | " Path to directory where data will be cached.\n", 94 | " \n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "sofascore = sd.Sofascore(leagues='ESP-La Liga', seasons='2022/2023')\n", 100 | "print(sofascore.__doc__)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Read league table" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
teamMPWDLGFGAGDPts
leagueseason
ESP-La Liga2223Barcelona38284670205088
2223Real Madrid38246875363978
2223Atlético Madrid38238770333777
2223Real Sociedad38218951351671
2223Villarreal381971259401964
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " team MP W D L GF GA GD Pts\n", 229 | "league season \n", 230 | "ESP-La Liga 2223 Barcelona 38 28 4 6 70 20 50 88\n", 231 | " 2223 Real Madrid 38 24 6 8 75 36 39 78\n", 232 | " 2223 Atlético Madrid 38 23 8 7 70 33 37 77\n", 233 | " 2223 Real Sociedad 38 21 8 9 51 35 16 71\n", 234 | " 2223 Villarreal 38 19 7 12 59 40 19 64" 235 | ] 236 | }, 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "league_table = sofascore.read_league_table()\n", 244 | "league_table.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## Read schedule" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 8, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | "
roundweekdatehome_teamaway_teamhome_scoreaway_scoregame_id
leagueseasongame
ESP-La Liga22232022-08-12 Osasuna-SevillaNone12022-08-12 15:00:00OsasunaSevilla2110408559
2022-08-13 Barcelona-Rayo VallecanoNone12022-08-13 15:00:00BarcelonaRayo Vallecano0010408557
2022-08-13 Celta Vigo-EspanyolNone12022-08-13 11:00:00Celta VigoEspanyol2210408645
2022-08-13 Real Valladolid-VillarrealNone12022-08-13 13:00:00Real ValladolidVillarreal0310408563
2022-08-14 Almería-Real MadridNone12022-08-14 16:00:00AlmeríaReal Madrid1210408712
\n", 365 | "
" 366 | ], 367 | "text/plain": [ 368 | " round week \\\n", 369 | "league season game \n", 370 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla None 1 \n", 371 | " 2022-08-13 Barcelona-Rayo Vallecano None 1 \n", 372 | " 2022-08-13 Celta Vigo-Espanyol None 1 \n", 373 | " 2022-08-13 Real Valladolid-Villarreal None 1 \n", 374 | " 2022-08-14 Almería-Real Madrid None 1 \n", 375 | "\n", 376 | " date \\\n", 377 | "league season game \n", 378 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 2022-08-12 15:00:00 \n", 379 | " 2022-08-13 Barcelona-Rayo Vallecano 2022-08-13 15:00:00 \n", 380 | " 2022-08-13 Celta Vigo-Espanyol 2022-08-13 11:00:00 \n", 381 | " 2022-08-13 Real Valladolid-Villarreal 2022-08-13 13:00:00 \n", 382 | " 2022-08-14 Almería-Real Madrid 2022-08-14 16:00:00 \n", 383 | "\n", 384 | " home_team \\\n", 385 | "league season game \n", 386 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla Osasuna \n", 387 | " 2022-08-13 Barcelona-Rayo Vallecano Barcelona \n", 388 | " 2022-08-13 Celta Vigo-Espanyol Celta Vigo \n", 389 | " 2022-08-13 Real Valladolid-Villarreal Real Valladolid \n", 390 | " 2022-08-14 Almería-Real Madrid Almería \n", 391 | "\n", 392 | " away_team \\\n", 393 | "league season game \n", 394 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla Sevilla \n", 395 | " 2022-08-13 Barcelona-Rayo Vallecano Rayo Vallecano \n", 396 | " 2022-08-13 Celta Vigo-Espanyol Espanyol \n", 397 | " 2022-08-13 Real Valladolid-Villarreal Villarreal \n", 398 | " 2022-08-14 Almería-Real Madrid Real Madrid \n", 399 | "\n", 400 | " home_score \\\n", 401 | "league season game \n", 402 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 2 \n", 403 | " 2022-08-13 Barcelona-Rayo Vallecano 0 \n", 404 | " 2022-08-13 Celta Vigo-Espanyol 2 \n", 405 | " 2022-08-13 Real Valladolid-Villarreal 0 \n", 406 | " 2022-08-14 Almería-Real Madrid 1 \n", 407 | "\n", 408 | " away_score game_id \n", 409 | "league season game \n", 410 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 1 10408559 \n", 411 | " 2022-08-13 
Barcelona-Rayo Vallecano 0 10408557 \n", 412 | " 2022-08-13 Celta Vigo-Espanyol 2 10408645 \n", 413 | " 2022-08-13 Real Valladolid-Villarreal 3 10408563 \n", 414 | " 2022-08-14 Almería-Real Madrid 2 10408712 " 415 | ] 416 | }, 417 | "execution_count": 8, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "schedule = sofascore.read_schedule()\n", 424 | "schedule.head()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [] 433 | } 434 | ], 435 | "metadata": { 436 | "kernelspec": { 437 | "display_name": "srcftbl", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.12.1" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 2 456 | } 457 | -------------------------------------------------------------------------------- /docs/datasources/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _datasources: 3 | 4 | .. currentmodule:: soccerdata 5 | .. highlight:: python 6 | 7 | ======================== 8 | Overview of Data Sources 9 | ======================== 10 | 11 | Currently, the following data sources are supported. 12 | 13 | .. rst-class:: datasources 14 | 15 | ----- 16 | 17 | ClubElo 18 | `URL `__ | 19 | :doc:`Example usage ` | 20 | :doc:`API reference ` 21 | 22 | .. code:: 23 | 24 | from soccerdata import ClubElo 25 | 26 | Team's relative strengths as Elo ratings, for most European leagues. Recalculated after every round, includes history. 
27 | 28 | ----- 29 | 30 | ESPN 31 | `URL `__ | 32 | :doc:`Example usage ` | 33 | :doc:`API reference ` 34 | 35 | .. code:: 36 | 37 | from soccerdata import ESPN 38 | 39 | Historical results, statistics and lineups. 40 | 41 | ----- 42 | 43 | FBref 44 | `URL `__ | 45 | :doc:`Example usage ` | 46 | :doc:`API reference ` 47 | 48 | .. code:: 49 | 50 | from soccerdata import FBref 51 | 52 | Historical results, lineups, and detailed aggregated statistics for teams and individual players based on Opta data. 53 | 54 | ----- 55 | 56 | FotMob 57 | `URL `__ | 58 | :doc:`Example usage ` | 59 | :doc:`API reference ` 60 | 61 | .. code:: 62 | 63 | from soccerdata import FotMob 64 | 65 | Historical results, lineups, and detailed aggregated statistics for teams and individual players based on Opta data. 66 | 67 | ----- 68 | 69 | Football-Data.co.uk 70 | `URL `__ | 71 | :doc:`Example usage ` | 72 | :doc:`API reference ` 73 | 74 | .. code:: 75 | 76 | from soccerdata import MatchHistory 77 | 78 | Historical results, betting odds and match statistics. Level of detail depends on league. 79 | 80 | ----- 81 | 82 | Sofascore 83 | `URL `__ | 84 | :doc:`Example usage ` | 85 | :doc:`API reference ` 86 | 87 | .. code:: 88 | 89 | from soccerdata import Sofascore 90 | 91 | Results, schedules, lineups, and detailed statistics for teams and individual players. 92 | 93 | ----- 94 | 95 | SoFIFA 96 | `URL `__ | 97 | :doc:`Example usage ` | 98 | :doc:`API reference ` 99 | 100 | .. code:: 101 | 102 | from soccerdata import SoFIFA 103 | 104 | Detailed scores on all player's abilities from EA Sports FC. 105 | 106 | ----- 107 | 108 | Understat 109 | `URL `__ | 110 | :doc:`Example usage ` | 111 | :doc:`API reference ` 112 | 113 | .. code:: 114 | 115 | from soccerdata import Understat 116 | 117 | Advanced statistics such as xG, xGBuildup and xGChain, and shot events with associated xG values for the top European leagues. 
118 | 119 | ----- 120 | 121 | WhoScored 122 | `URL `__ | 123 | :doc:`Example usage ` | 124 | :doc:`API reference ` 125 | 126 | .. code:: 127 | 128 | from soccerdata import WhoScored 129 | 130 | Historical results, match preview data and detailed Opta event stream data for major leagues. 131 | 132 | .. toctree:: 133 | :hidden: 134 | 135 | ClubElo 136 | ESPN 137 | FBref 138 | Sofascore 139 | FotMob 140 | MatchHistory 141 | SoFIFA 142 | Understat 143 | WhoScored 144 | -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Data Analysis Examples 3 | ====================== 4 | 5 | Below is a gallery of examples of data analysis using the ``soccerdata`` package. 6 | If you have an example you would like to share, please submit a pull request to the 7 | SoccerData GitHub repository. 8 | 9 | .. nbgallery:: 10 | :glob: 11 | :hidden: 12 | 13 | * 14 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | .. _faq: 2 | 3 | FAQ 4 | ===== 5 | 6 | **Is web scraping legal?** 7 | 8 | Even though web scraping is ubiquitous, its legal status remains unclear. That 9 | is because whether web scraping is legal will depend on many aspects. 10 | It is always best to consult with a lawyer or legal expert to ensure that your 11 | web scraping activities are legal and comply with all applicable laws and 12 | regulations. 13 | 14 | .. Currently, web scraping is not per se prohibited in the European Union but the 15 | .. use of data mining tools is legally risky. 16 | .. 17 | .. The sui generis database right protects the content of a database. What does 18 | .. it mean for web scrapers? That you can scrape such data (and, therefore, copy 19 | ..
and collect contents of the protected database – which falls under the 20 | .. definition of “extraction” under the analyzed Directive) as long as (a) you 21 | .. don’t scrape a ‘substantial part, evaluated qualitatively and/or 22 | .. quantitatively, of the contents of that database’ and you don’t re-use it 23 | .. (meaning basically selling or publishing it); or (b) scraping falls under TDM 24 | .. exception described below; or (c) you’ve received an appropriate licence. 25 | .. 26 | .. However, the TDM exception is limited: the database owners are granted the 27 | .. possibility to restrict the reproduction and extraction of the databases and 28 | .. their content. That restriction must be made in a manner that will allow bots 29 | .. and crawlers etc. to see that restriction (therefore, on a website there 30 | .. should be installed for example a special program communicating visiting 31 | .. scraping programs that scraping is prohibited). Any such restriction should, 32 | .. in any case, permit scraping made for scientific research purposes (see art. 33 | .. 3 (1) and 7(1) of the DSM Directive). 34 | .. 35 | .. But there are more traps on your way. One of them is the possibility of 36 | .. breaching the website’s Terms of Use if they prohibit web scraping. 37 | .. As the situation is highly uncertain, it is advisable to be careful and, if 38 | .. possible, rather avoid breaching terms of use made available in any form. 39 | 40 | .. To minimize concerns, scraping should be discreet, respect websites’ terms of 41 | .. service, check whether sites are using the robots.txt protocol to communicate 42 | .. that scraping is prohibited, avoid personal data scraping and, if it is 43 | .. necessary, make sure no GDPR violations are made and avoid scraping private or 44 | .. classified information. If possible, it would be advisable to get a licence 45 | .. for scraping. 46 | 47 | 48 | **Something doesn’t work** 49 | 50 | 1. 
Have you updated to the newest version of soccerdata? 51 | 2. Clear the cache or run your script without caching enabled. 52 | 3. Does the log produce any warnings that sound like they might be related? 53 | Maybe the data you are looking for is not available or can not be processed 54 | correctly. 55 | 4. Open an issue on GitHub. 56 | -------------------------------------------------------------------------------- /docs/howto/custom-leagues.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | How to add custom leagues 3 | =========================== 4 | 5 | SoccerData has built-in support to scrape data from the top-5 European leagues 6 | and the major international tournaments. The leagues available for each source 7 | can be listed with the :meth:`~soccerdata.FBref.available_leagues` class method. 8 | 9 | .. code:: python 10 | 11 | import soccerdata as sd 12 | sd.FBref.available_leagues() 13 | >>> ['ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A'] 14 | 15 | This documentation explains how to add custom leagues. 16 | 17 | 18 | .. warning:: 19 | 20 | Note that you might encounter errors when trying to scrape data for the 21 | leagues you added yourself. This is because the data provided for these 22 | leagues might have a different structure. If you encounter such an error, 23 | please do not open an issue on GitHub, but try to fix it yourself. 24 | 25 | 26 | 27 | Adding a new league 28 | ------------------- 29 | 30 | Additional leagues can configured in ``SOCCERDATA_DIR/config/league_dict.json``. 31 | This file should contain a mapping between a generic name for the league and 32 | the identifier used internally by each data source (see below) that you want 33 | to support. For example, for the Dutch Eredivisie this would be: 34 | 35 | .. 
code-block:: json 36 | 37 | { 38 | "NED-Eredivisie": { 39 | "ClubElo": "NED_1", 40 | "MatchHistory": "N1", 41 | "SoFIFA": "[Netherlands] Eredivisie", 42 | "FBref": "Eredivisie", 43 | "ESPN": "ned.1", 44 | "FiveThirtyEight": "eredivisie", 45 | "WhoScored": "Netherlands - Eredivisie", 46 | "Sofascore": "Eredivisie", 47 | "season_start": "Aug", 48 | "season_end": "May" 49 | } 50 | } 51 | 52 | The ``season_end`` and ``season_start`` fields are optional. This should be 53 | the month in which the last game and first game of a season are played, 54 | respectively. If they are not provided, June is used as the last month of the 55 | season and July as the first one. 56 | 57 | Now, restart your Python session and check whether it is added to available 58 | leagues by running the command below. 59 | 60 | .. code:: python 61 | 62 | >>> import soccerdata as sd 63 | >>> sd.FBref.available_leagues() 64 | [..., 'NED-Eredivisie', ...] 65 | 66 | 67 | 68 | Internal identifiers 69 | -------------------- 70 | 71 | Below are instructions on how to find the internal identifiers for each data 72 | source. 73 | 74 | **ClubElo** 75 | The internal identifier has the format ``{country_code}_{level}``. To get 76 | the country code, go to https://clubelo.com/, click on the league you want 77 | to add and take the three-letter code in the URL. For example, the URL for 78 | the Dutch Eredivisie is http://clubelo.com/NED which means that the country 79 | identifier is ``NED``. The level is the number of the league, starting with 80 | 1 for the top league. The internal identifier for the Dutch Eredivisie is 81 | therefore ``NED_1``. 82 | 83 | **MatchHistory** 84 | The internal identifier has the format ``{country_code}{level}``. Download 85 | the CSV file corresponding to the league you would like to add 86 | from https://www.football-data.co.uk/data.php and take the value in the 87 | ``Div`` column. 
88 | 89 | **SoFIFA** 90 | The internal identifier has the format ``[{region}] {league name}``. Go to 91 | https://sofifa.com/api/league to get the list of available leagues. The 92 | ``{region}`` corresponds to the ``nationName`` field in the JSON response. The 93 | ``{league name}`` corresponds to the ``value`` field. 94 | 95 | **FBref** 96 | Go to https://fbref.com/en/comps/ and take the value in the ``Competition 97 | Name`` column. 98 | 99 | **ESPN** 100 | The internal identifier has the format ``{country_code}.{level}``. Go to 101 | https://www.espn.com/soccer/competitions, click on the league you want 102 | to add and take the value in the URL after ``/league/_/name/``. 103 | 104 | **FiveThirtyEight** 105 | Go to https://projects.fivethirtyeight.com/soccer-predictions/, select the 106 | relevant league and take the value in the URL after 107 | ``/soccer-predictions/``. 108 | 109 | **WhoScored** 110 | Go to https://www.whoscored.com and use the JavaScript console to get the 111 | value of the ``allRegions`` variable. The internal identifier has the format 112 | ``{region name} - {league name}``. 113 | 114 | **FotMob** 115 | The internal identifier is identical in style to the general format: 116 | ``[{region}]-{league name}``. Go to https://www.fotmob.com/api/allLeagues 117 | to get the list of available leagues. The ``{region}`` corresponds to the 118 | ``ccode`` field in the JSON response. The ``{league name}`` corresponds to 119 | the ``name`` field. 120 | 121 | **Sofascore** 122 | Go to https://api.sofascore.com/api/v1/config/unique-tournaments/EN/football 123 | to get the list of major leagues and tournaments. Access ``uniqueTournaments`` 124 | in the JSON response, and the ``{league name}`` corresponds to the ``name`` 125 | field. 126 | 127 | Troubleshooting 128 | --------------- 129 | 130 | If you add a new league and it doesn't show up in the list of available leagues, 131 | there are a few things you can do to debug the problem. 132 | 133 | 1. 
Make sure to reload the soccerdata module after you modify the 134 | ``league_dict.json`` file. The most straightforward way to do this is to 135 | restart your notebook or Python interpreter. 136 | 137 | 2. Check whether your ``league_dict.json`` file is at the correct location. If 138 | so, you should see this appear in the log messages when importing the 139 | soccerdata library. 140 | 141 | .. code:: python 142 | 143 | >>> import soccerdata as sd 144 | [11/25/22 11:49:12] INFO Custom team name replacements loaded from /teamname_replacements.json. _config.py:83 145 | INFO Custom league dict loaded from /league_dict.json. _config.py:153 146 | 147 | 148 | 3. Check whether the content of your ``league_dict.json`` file is valid JSON. 149 | You can check the file's syntax using Python's built-in ``json.tool`` 150 | module. 151 | 152 | .. code:: sh 153 | 154 | $ cat config/league_dict.json | python -m json.tool 155 | Expecting ',' delimiter: line 1 column 10 (char 9) 156 | -------------------------------------------------------------------------------- /docs/howto/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | How-to Guides 3 | =============== 4 | 5 | Here you'll find short answers to "How do I...?" types of questions. These 6 | how-to guides don't cover topics in depth -- you'll find that material in the 7 | :doc:`/reference/index`. However, these guides will help you quickly 8 | accomplish common tasks. 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | custom-leagues 14 | proxy 15 | -------------------------------------------------------------------------------- /docs/howto/proxy.rst: -------------------------------------------------------------------------------- 1 | How to use a proxy server 2 | ------------------------- 3 | 4 | You can setup a SOCKS5 proxy with Tor. 5 | Checkout the `installation guide`_ on the Tor website for installation 6 | instructions. 
After installing Tor, make sure to start it up before scraping. 7 | This can easily be done by running the ``tor`` command from your terminal (in 8 | a separate window), Tor will start up and run on “localhost:9050” by default. 9 | Once Tor is running, you can enable the extension by setting ``proxy='tor'``. 10 | 11 | .. code:: python 12 | 13 | ws = sd.WhoScored(proxy='tor') 14 | 15 | The code snippet above assumes you have a Tor proxy running on 16 | "localhost:9050". Many distributions indeed default to having a SOCKS proxy 17 | listening on port 9050, but some may not. In particular, the Tor Browser 18 | Bundle defaults to listening on port 9150. You can specify a custom host and 19 | port as 20 | 21 | .. code:: python 22 | 23 | ws = sd.WhoScored(proxy={ 24 | "http": "socks5://127.0.0.1:9150", 25 | "https": "socks5://127.0.0.1:9150", 26 | }) 27 | 28 | 29 | .. _installation guide: https://community.torproject.org/onion-services/setup/install/ 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Welcome to SoccerData's docs! 3 | ============================= 4 | 5 | Release v\ |release|. (``pip install soccerdata``) 6 | 7 | 8 | .. image:: https://pepy.tech/badge/soccerdata/month 9 | :target: https://pepy.tech/project/soccerdata 10 | :alt: SoccerData Downloads Per Month Badge 11 | 12 | .. image:: https://img.shields.io/pypi/l/soccerdata.svg 13 | :target: https://pypi.org/project/soccerdata/ 14 | :alt: License Badge 15 | 16 | .. 
image:: https://img.shields.io/pypi/pyversions/soccerdata.svg 17 | :target: https://pypi.org/project/soccerdata/ 18 | :alt: Python Version Support Badge 19 | 20 | 21 | **SoccerData** is a collection of scrapers to gather soccer data from popular 22 | websites, including `Club Elo`_, `ESPN`_, `FBref`_, `FiveThirtyEight`_, 23 | `Football-Data.co.uk`_, `FotMob`_, `Sofascore`_, `SoFIFA`_, `Understat`_ and `WhoScored`_. 24 | 25 | .. code:: python 26 | 27 | import soccerdata as sd 28 | 29 | # Create a scraper class instance for the 2020/21 Premier League 30 | fbref = sd.FBref('ENG-Premier League', '2021') 31 | 32 | # Fetch data 33 | games = fbref.read_schedule() 34 | team_season_stats = fbref.read_team_season_stats(stat_type="passing") 35 | player_season_stats = fbref.read_player_season_stats(stat_type="standard") 36 | 37 | 38 | ------------------- 39 | 40 | **Main features** 41 | 42 | - Access current and historical soccer fixtures, forecasts, detailed match 43 | stats, event stream data and more. 44 | - All data is provided in the form of Pandas DataFrames with sensible, 45 | matching column names and identifiers across datasets to make working with 46 | the data and combining data from multiple sources easy. 47 | - Data is only downloaded when needed and cached locally to speed up your 48 | analyis scripts. 49 | - Integrates with the `socceraction`_ package to allow analysis of event stream 50 | data. 51 | 52 | Do you like it? :doc:`Let's dive in! ` 53 | 54 | .. toctree:: 55 | :hidden: 56 | :maxdepth: 1 57 | 58 | intro 59 | datasources/index 60 | howto/index 61 | examples/index 62 | reference/index 63 | faq 64 | contributing 65 | License 66 | Changelog 67 | 68 | .. _socceraction: https://socceraction.readthedocs.io/en/latest/documentation/data/opta.html#whoscored 69 | .. _Club Elo: https://www.clubelo.com/ 70 | .. _ESPN: https://www.espn.com/soccer/ 71 | .. _FBref: https://www.fbref.com/en/ 72 | .. 
_FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 73 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 74 | .. _FotMob: https://fotmob.com/ 75 | .. _Sofascore: https://www.sofascore.com/ 76 | .. _SoFIFA: https://sofifa.com/ 77 | .. _Understat: https://understat.com/ 78 | .. _WhoScored: https://www.whoscored.com/ 79 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | =============== 4 | Getting Started 5 | =============== 6 | 7 | New to `soccerdata`? Well, you came to the right place: this tutorial will walk 8 | you through installing, configuring, and using the library. By the end of this 9 | tutorial, you will be able to scrape data from the top-5 European leagues and 10 | use it to create your own data-driven analyses. 11 | 12 | 13 | Installation 14 | ------------ 15 | 16 | SoccerData can be easily installed via `pip `__: 17 | 18 | .. code:: bash 19 | 20 | python3 -m pip install soccerdata 21 | 22 | 23 | Scraping data 24 | ------------- 25 | 26 | Each of the :ref:`supported data sources ` has its corresponding 27 | class for fetching data with a uniform API. For example, the 28 | :class:`~soccerdata.FBref` class is used to fetch data from `fbref.com 29 | `__. 30 | 31 | .. code:: python 32 | 33 | import soccerdata as sd 34 | 35 | # Create scraper class instance 36 | fbref = sd.FBref() 37 | 38 | 39 | Once you have a scraper class instance, you can use it to fetch data. See the 40 | the :ref:`examples ` and :ref:`API reference ` for the full 41 | list of options available for each scraper. For example, to fetch aggregated 42 | shooting stats for all teams: 43 | 44 | .. code:: python 45 | 46 | # Create dataframes 47 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 48 | 49 | 50 | The data is always returned as a convenient Pandas DataFrame. 51 | 52 | .. 
csv-table:: 53 | :file: output.csv 54 | :header-rows: 1 55 | 56 | By default, the data for all available leagues and the five most recent 57 | seasons will be retrieved. However, in most cases, you would want to limit the 58 | data to specific leagues and / or seasons. This can be done by passing a list 59 | of leagues and seasons to the constructor of the scraper class. For example: 60 | 61 | .. code:: python 62 | 63 | # Create scraper class instance filtering on specific leagues and seasons 64 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['1718', '1819']) 65 | # Retrieve data for the specified leagues and seasons 66 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 67 | 68 | 69 | Note that only a limited number of leagues are supported out-of-the-box. The 70 | leagues available for each source can be listed with the 71 | :meth:`~soccerdata.FBref.available_leagues` class method. 72 | 73 | .. code:: python 74 | 75 | sd.FBref.available_leagues() 76 | >>> ['Big 5 European Leagues Combined', 'ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A', 'INT-World Cup', "INT-Women's World Cup"] 77 | 78 | 79 | You can :doc:`add more leagues ` but there are no 80 | guarantees that they will be scraped correctly. 81 | 82 | 83 | Data caching 84 | ------------ 85 | 86 | Data caching is used to speed up the runtime and to prevent exceeding the rate 87 | limit of web servers. By default, all downloaded data is cached to 88 | ``~/soccerdata`` on Linux and Mac OS, and to ``C:\Users\yourusername\soccerdata`` 89 | on Windows. A custom location can be set if desired. You can configure this 90 | using environment variables (see below) or on the level of an individual 91 | scraper by setting the ``data_dir`` parameter when creating the scraper class 92 | instance: 93 | 94 | .. 
code:: python 95 | 96 | # Create scraper class instance with custom caching directory 97 | fbref = sd.FBref(data_dir="/tmp/FBref") 98 | 99 | 100 | This directory can be deleted at any time to reclaim disk space. 101 | However, this also means you will have to redownload the same data again if 102 | you need it, which will lead to reduced performance. 103 | 104 | SoccerData has no knowledge of when the data on the server changes, so it is 105 | up to the user to decide when to refresh the cache. This can be done by 106 | deleting the cache directory or by setting the ``no_cache`` option to ``True`` 107 | when creating the scraper class instance: 108 | 109 | .. code:: python 110 | 111 | # Create scraper class instance which always re-downloads the latest data 112 | fbref = sd.FBref(no_cache=True) 113 | 114 | 115 | Some methods will assume the cache is always out-of-date (for example, when 116 | scraping the fixture of the current season). Typically, these methods will 117 | have a ``force_cache`` option that can be set to ``True`` to force the cached 118 | data to be used. For example: 119 | 120 | .. code:: python 121 | 122 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['2324']) 123 | fbref.read_schedule(force_cache=True) 124 | 125 | 126 | Caching can also be disabled entirely by setting the ``no_store`` option to 127 | ``True`` when creating the scraper class instance. However, it should almost 128 | always be left enabled. 129 | 130 | .. code:: python 131 | 132 | # Create scraper class instance with caching disabled 133 | fbref = sd.FBref(no_store=True) 134 | 135 | 136 | Global configuration 137 | --------------------- 138 | 139 | Several settings can be configured globally using the following environment 140 | variables: 141 | 142 | ``SOCCERDATA_DIR`` 143 | The directory where the downloaded data is cached and where logs are 144 | stored. 
By default, all data is stored to ``~/soccerdata`` on Linux / Mac 145 | OS and ``C:\Users\yourusername\soccerdata`` on Windows. 146 | ``SOCCERDATA_NOCACHE`` 147 | If set to "true", no cached data is returned. Note that no-cache does not 148 | mean "don't cache". All downloaded data is still cached and overwrites 149 | existing caches. If the sense of "don't cache" that you want is actually 150 | "don't store", then ``SOCCERDATA_NOSTORE`` is the option to use. By 151 | default, data is retrieved from the cache. 152 | ``SOCCERDATA_NOSTORE`` 153 | If set to "true", no data is stored. By default, data is cached. 154 | ``SOCCERDATA_MAXAGE`` 155 | The maximum age of cached data in seconds. If the cached data is older 156 | than this, it will be re-downloaded. By default, this is set to infinity. 157 | ``SOCCERDATA_LOGLEVEL`` 158 | The level of logging to use. By default, this is set to "INFO". 159 | 160 | Example: 161 | 162 | .. code-block:: bash 163 | 164 | # bash 165 | export SOCCERDATA_DIR="~/soccerdata" 166 | export SOCCERDATA_NOCACHE="False" 167 | export SOCCERDATA_NOSTORE="False" 168 | export SOCCERDATA_LOGLEVEL="INFO" 169 | 170 | 171 | Uniform team names 172 | ------------------ 173 | 174 | Each data source uses a different set of team names, which makes it difficult 175 | to combine data from multiple sources. To mitigate this, SoccerData allows 176 | translating the team names to uniform names. This is done by providing 177 | a ``SOCCERDATA_DIR/config/teamname_replacements.json`` file. This file should contain a 178 | mapping between a generic name for each team and the team name used by each 179 | data source that you want to support. The example below will map "Tottenham 180 | Hotspur", "Tottenham Hotspur FC" and "Spurs" to "Tottenham" in all scraped 181 | data. 182 | 183 | .. 
code-block:: json 184 | 185 | { 186 | "Tottenham": ["Tottenham Hotspur", "Tottenham Hotspur FC", "Spurs"] 187 | } 188 | 189 | Additional setup for scraping WhoScored data 190 | -------------------------------------------- 191 | 192 | WhoScored implements strong protection against scraping using Incapsula. To 193 | circumvent this, this scraper uses Selenium with the ChromeDriver extension to 194 | emulate a real user. Before using this scraper, you will have to `install 195 | Chrome`_. A Selenium driver matching your Chrome version will be downloaded 196 | automatically when you run the scraper. 197 | 198 | Next steps 199 | ---------- 200 | Look at you! You’re now basically an expert at SoccerData! ✨ 201 | 202 | From this point you can: 203 | 204 | - Look at the example notebooks for each :ref:`Data source `. 205 | - Take a deep dive into the :ref:`API `. 206 | - Give us feedback or contribute, see :ref:`Contributing `. 207 | 208 | Have fun! 🎉 209 | 210 | 211 | .. _install Chrome: https://www.google.com/chrome/ 212 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../LICENSE.rst 2 | -------------------------------------------------------------------------------- /docs/output.csv: -------------------------------------------------------------------------------- 1 | league,season,team,#Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG 2 | ENG-Premier League,2021,Arsenal,29,38.0,53,455,141,31.0,11.97,3.71,0.1,0.33,16.9,23,6,6,53.5,49.0,0.11,-0.5,-2.0 3 | ,,Aston Villa,24,38.0,52,518,179,34.6,13.63,4.71,0.09,0.26,16.5,15,5,6,52.9,48.5,0.1,-0.9,-1.5 4 | ,,Brighton,27,38.0,39,476,129,27.1,12.53,3.39,0.07,0.26,16.6,14,6,9,51.6,44.8,0.1,-12.6,-11.8 5 | ,,Burnley,25,38.0,32,383,125,32.6,10.08,3.29,0.08,0.23,16.6,15,3,3,39.9,37.6,0.1,-7.9,-8.6 6 | ,,Chelsea,27,38.0,56,553,194,35.1,14.55,5.11,0.09,0.25,16.3,16,8,10,64.0,56.4,0.1,-8.0,-8.4 7 | -------------------------------------------------------------------------------- /docs/reference/base.rst: -------------------------------------------------------------------------------- 1 | .. _api-base: 2 | 3 | Base Readers 4 | ============ 5 | 6 | The logic for downloading data from the web is implemented in the base classes 7 | that are documented here. The base classes are not intended to be used directly 8 | but rather to be subclassed by the specific readers which implement the logic 9 | to parse the data. 10 | 11 | The :class:`BaseRequestsReader` is a wrapper around the `requests` library 12 | and is used by scrapers that do not require JavaScript to be executed. The 13 | :class:`BaseSeleniumReader` is a wrapper around the `selenium` library and is 14 | used by scrapers that require JavaScript to be executed. 15 | 16 | .. autoclass:: soccerdata._common.BaseRequestsReader 17 | :inherited-members: 18 | :members: 19 | 20 | .. 
autoclass:: soccerdata._common.BaseSeleniumReader 21 | :inherited-members: 22 | :members: 23 | -------------------------------------------------------------------------------- /docs/reference/clubelo.rst: -------------------------------------------------------------------------------- 1 | .. _api-clubelo: 2 | 3 | Club Elo 4 | ======== 5 | 6 | .. autoclass:: soccerdata.ClubElo 7 | :inherited-members: available_leagues 8 | :members: read_by_date, read_team_history 9 | -------------------------------------------------------------------------------- /docs/reference/espn.rst: -------------------------------------------------------------------------------- 1 | .. _api-espn: 2 | 3 | ESPN 4 | ===== 5 | 6 | .. autoclass:: soccerdata.ESPN 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/fbref.rst: -------------------------------------------------------------------------------- 1 | .. _api-fbref: 2 | 3 | FBref 4 | ===== 5 | 6 | .. autoclass:: soccerdata.FBref 7 | :members: available_leagues, read_leagues, read_seasons, 8 | read_team_season_stats, read_team_match_stats, 9 | read_player_season_stats, read_player_match_stats, 10 | read_schedule, read_lineup, read_events, read_shot_events 11 | -------------------------------------------------------------------------------- /docs/reference/fotmob.rst: -------------------------------------------------------------------------------- 1 | .. _api-fotmob: 2 | 3 | FotMob 4 | ====== 5 | 6 | .. autoclass:: soccerdata.FotMob 7 | :members: available_leagues, read_leagues, read_seasons, 8 | read_league_table, read_schedule, read_team_match_stats, 9 | -------------------------------------------------------------------------------- /docs/reference/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _api: 3 | 4 | .. 
currentmodule:: soccerdata 5 | 6 | API Reference 7 | ============= 8 | 9 | This part of the documentation covers all the interfaces of the implemented 10 | data scrapers. 11 | 12 | .. toctree:: 13 | 14 | clubelo 15 | espn 16 | fbref 17 | 18 | fotmob 19 | matchhistory 20 | sofascore 21 | sofifa 22 | understat 23 | whoscored 24 | 25 | If you would like to extend the functionality of soccerdata, you might also be 26 | interested in the following modules: 27 | 28 | .. toctree:: 29 | 30 | base 31 | utils 32 | -------------------------------------------------------------------------------- /docs/reference/matchhistory.rst: -------------------------------------------------------------------------------- 1 | .. _api-matchhistory: 2 | 3 | MatchHistory 4 | ============= 5 | 6 | .. autoclass:: soccerdata.MatchHistory 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/sofascore.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofascore: 2 | 3 | Sofascore 4 | ========= 5 | 6 | .. autoclass:: soccerdata.Sofascore 7 | :members: read_leagues, read_seasons, 8 | read_league_table, read_schedule, 9 | -------------------------------------------------------------------------------- /docs/reference/sofifa.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofifa: 2 | 3 | SoFIFA 4 | ======== 5 | 6 | .. autoclass:: soccerdata.SoFIFA 7 | :members: read_leagues, read_versions, read_teams, read_players, 8 | read_team_ratings, read_player_ratings, available_leagues 9 | -------------------------------------------------------------------------------- /docs/reference/understat.rst: -------------------------------------------------------------------------------- 1 | .. _api-understat: 2 | 3 | Understat 4 | ========= 5 | 6 | .. 
autoclass:: soccerdata.Understat 7 | :inherited-members: available_leagues 8 | :members: read_leagues, read_seasons, read_schedule, 9 | read_team_match_stats, read_player_season_stats, 10 | read_player_match_stats, read_shot_events 11 | -------------------------------------------------------------------------------- /docs/reference/utils.rst: -------------------------------------------------------------------------------- 1 | .. _api-utils: 2 | 3 | Utilities 4 | ============ 5 | 6 | .. automethod:: soccerdata._common.season_code 7 | .. automethod:: soccerdata._common.make_game_id 8 | .. automethod:: soccerdata._common.standardize_colnames 9 | .. automethod:: soccerdata._common.get_proxy 10 | .. automethod:: soccerdata._common.check_proxy 11 | -------------------------------------------------------------------------------- /docs/reference/whoscored.rst: -------------------------------------------------------------------------------- 1 | .. _api-whoscored: 2 | 3 | WhoScored 4 | ========= 5 | 6 | .. autoclass:: soccerdata.WhoScored 7 | :members: available_leagues, read_schedule, read_missing_players, read_events 8 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2024.8.6 2 | sphinx==7.4.7 3 | nbsphinx==0.9.7 4 | -------------------------------------------------------------------------------- /docs/topics/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Advanced Usage 3 | =============== 4 | 5 | Introductions to all the key parts of SoccerData you'll need to know: 6 | 7 | .. 
toctree:: 8 | :maxdepth: 2 9 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | 3 | import os 4 | import shlex 5 | import shutil 6 | import sys 7 | from pathlib import Path 8 | from textwrap import dedent 9 | 10 | import nox 11 | 12 | try: 13 | from nox_poetry import Session, session 14 | except ImportError: 15 | message = f"""\ 16 | Nox failed to import the 'nox-poetry' package. 17 | 18 | Please install it using the following command: 19 | 20 | {sys.executable} -m pip install nox-poetry""" 21 | raise SystemExit(dedent(message)) from None 22 | 23 | 24 | package = "soccerdata" 25 | python_versions = ["3.11", "3.10", "3.9", "3.8"] 26 | nox.needs_version = ">= 2021.6.6" 27 | nox.options.sessions = ( 28 | "pre-commit", 29 | "mypy", 30 | "tests", 31 | "docs-build", 32 | ) 33 | 34 | 35 | def activate_virtualenv_in_precommit_hooks(session: Session) -> None: 36 | """Activate virtualenv in hooks installed by pre-commit. 37 | 38 | This function patches git hooks installed by pre-commit to activate the 39 | session's virtual environment. This allows pre-commit to locate hooks in 40 | that environment when invoked from git. 41 | 42 | Parameters 43 | ---------- 44 | session : Session 45 | The Session object. 46 | """ 47 | assert session.bin is not None 48 | 49 | # Only patch hooks containing a reference to this session's bindir. Support 50 | # quoting rules for Python and bash, but strip the outermost quotes so we 51 | # can detect paths within the bindir, like /python. 
52 | bindirs = [ 53 | bindir[1:-1] if bindir[0] in "'\"" else bindir 54 | for bindir in (repr(session.bin), shlex.quote(session.bin)) 55 | ] 56 | 57 | virtualenv = session.env.get("VIRTUAL_ENV") 58 | if virtualenv is None: 59 | return 60 | 61 | headers = { 62 | # pre-commit < 2.16.0 63 | "python": f"""\ 64 | import os 65 | os.environ["VIRTUAL_ENV"] = {virtualenv!r} 66 | os.environ["PATH"] = os.pathsep.join(( 67 | {session.bin!r}, 68 | os.environ.get("PATH", ""), 69 | )) 70 | """, 71 | # pre-commit >= 2.16.0 72 | "bash": f"""\ 73 | VIRTUAL_ENV={shlex.quote(virtualenv)} 74 | PATH={shlex.quote(session.bin)}"{os.pathsep}$PATH" 75 | """, 76 | } 77 | 78 | hookdir = Path(".git") / "hooks" 79 | if not hookdir.is_dir(): 80 | return 81 | 82 | for hook in hookdir.iterdir(): 83 | if hook.name.endswith(".sample") or not hook.is_file(): 84 | continue 85 | 86 | if not hook.read_bytes().startswith(b"#!"): 87 | continue 88 | 89 | text = hook.read_text() 90 | 91 | if not any( 92 | Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text 93 | for bindir in bindirs 94 | ): 95 | continue 96 | 97 | lines = text.splitlines() 98 | 99 | for executable, header in headers.items(): 100 | if executable in lines[0].lower(): 101 | lines.insert(1, dedent(header)) 102 | hook.write_text("\n".join(lines)) 103 | break 104 | 105 | 106 | @session(name="pre-commit", python=python_versions[0]) 107 | def precommit(session: Session) -> None: 108 | """Lint using pre-commit.""" 109 | args = session.posargs or ["run", "--all-files", "--show-diff-on-failure"] 110 | session.install( 111 | "bandit", 112 | "darglint", 113 | "ruff", 114 | "pep8-naming", 115 | "pre-commit", 116 | "pre-commit-hooks", 117 | "pyupgrade", 118 | ) 119 | session.run("pre-commit", *args) 120 | if args and args[0] == "install": 121 | activate_virtualenv_in_precommit_hooks(session) 122 | 123 | 124 | @session(python=python_versions) 125 | def mypy(session: Session) -> None: 126 | """Type-check using mypy.""" 127 | args = 
session.posargs or ["soccerdata", "tests", "docs/conf.py"] 128 | session.install(".") 129 | session.install("mypy", "pytest") 130 | session.run("mypy", "--install-types", "--non-interactive", *args) 131 | if not session.posargs: 132 | session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py") 133 | 134 | 135 | @session(python=python_versions) 136 | def tests(session: Session) -> None: 137 | """Run the test suite.""" 138 | args = session.posargs or ["-m", "not e2e and not fails_gha"] 139 | session.install(".") 140 | session.install("coverage[toml]", "pytest", "pytest-mock", "time-machine", "pygments") 141 | try: 142 | session.run( 143 | "coverage", 144 | "run", 145 | "--parallel", 146 | "-m", 147 | "pytest", 148 | *args, 149 | env={ 150 | "SOCCERDATA_DIR": str(Path(__file__).parent / "tests" / "appdata"), 151 | "MAXAGE": "604800", 152 | }, 153 | ) 154 | finally: 155 | if session.interactive: 156 | session.notify("coverage", posargs=[]) 157 | 158 | 159 | @session(python=python_versions[0]) 160 | def coverage(session: Session) -> None: 161 | """Produce the coverage report.""" 162 | args = session.posargs or ["report"] 163 | 164 | session.install("coverage[toml]") 165 | 166 | if not session.posargs and any(Path().glob(".coverage.*")): 167 | session.run("coverage", "combine") 168 | 169 | session.run("coverage", *args) 170 | 171 | 172 | @session(name="docs-build", python=python_versions[0]) 173 | def docs_build(session: Session) -> None: 174 | """Build the documentation.""" 175 | args = session.posargs or ["docs", "docs/_build"] 176 | if not session.posargs and "FORCE_COLOR" in os.environ: 177 | args.insert(0, "--color") 178 | 179 | session.install(".") 180 | session.install("sphinx", "sphinx-click", "furo", "nbsphinx", "ipython") 181 | 182 | build_dir = Path("docs", "_build") 183 | if build_dir.exists(): 184 | shutil.rmtree(build_dir) 185 | 186 | session.run("sphinx-build", *args, env={"SOCCERDATA_DIR": str(Path.home() / "soccerdata")}) 187 | 188 | 189 | 
@session(python=python_versions[0])
190 | def docs(session: Session) -> None:
191 | """Build and serve the documentation with live reloading on file changes."""
192 | args = session.posargs or ["--host=0.0.0.0", "docs", "docs/_build"]
193 | session.install(".")
194 | session.install("sphinx", "sphinx-autobuild", "furo", "nbsphinx", "ipython")
195 |
196 | build_dir = Path("docs", "_build")
197 | if build_dir.exists():
198 | shutil.rmtree(build_dir)
199 |
200 | session.run(
201 | "sphinx-autobuild",
202 | *args,
203 | env={"SOCCERDATA_DIR": str(Path.home() / "soccerdata")},
204 | )
205 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry]
2 | name = "soccerdata"
3 | version = "1.8.7"
4 | description = "A collection of wrappers over soccer data from various websites / APIs."
5 | authors = ["Pieter Robberechts "]
6 | license = "Apache-2.0"
7 | readme = 'README.rst'
8 | homepage = "https://github.com/probberechts/soccerdata"
9 | repository = "https://github.com/probberechts/soccerdata"
10 | documentation = "https://soccerdata.readthedocs.io"
11 | keywords = ["soccer", "football", "soccer data", "web scraping", "soccer analytics"]
12 | classifiers = [
13 | "Programming Language :: Python :: 3",
14 | "License :: OSI Approved :: Apache Software License",
15 | "Operating System :: OS Independent"
16 | ]
17 |
18 | [tool.poetry.urls]
19 | Changelog = "https://github.com/probberechts/soccerdata/releases"
20 |
21 | [tool.poetry.dependencies]
22 | python = ">=3.9,<3.13"
23 | html5lib = "^1.1"
24 | pandas = "^2.0.0, !=2.1.0"
25 | lxml = "^4.9.3"
26 | socceraction = {version="^1.5.3", optional=true}
27 | wrapper-tls-requests = "^1.1.4"
28 | tqdm = "^4.67.1"
29 | rich = "^14.0.0"
30 | seleniumbase = "^4.38.2"
31 | unidecode = "^1.4.0"
32 | urllib3 = "<2"
33 |
34 | [tool.poetry.extras]
35 | socceraction = ["socceraction"]
36 |
37 |
[tool.poetry.group.test.dependencies] 38 | pytest = "^8.0.0" 39 | pytest-deadfixtures = "^2.2.1" 40 | pytest-mock = "^3.10.0" 41 | coverage = {version = "^7.0", extras = ["toml"]} 42 | time-machine = "^2.8.2" 43 | 44 | [tool.poetry.group.dev.dependencies] 45 | Pygments = "^2.13.0" 46 | bumpversion = "^0.6.0" 47 | darglint = "^1.8.1" 48 | ruff = "*" 49 | mypy = "*" 50 | pep8-naming = "^0.15.0" 51 | pre-commit = "^4.0.0" 52 | pre-commit-hooks = "^5.0.0" 53 | 54 | [tool.poetry.group.docs.dependencies] 55 | Sphinx = "^7.0.0" 56 | furo = "^2024.0.0" 57 | nbsphinx = "^0.9.0" 58 | sphinx-autobuild = "^2024.0.0" 59 | 60 | [tool.coverage.paths] 61 | source = ["soccerdata", "*/site-packages"] 62 | tests = ["tests", "*/tests"] 63 | 64 | [tool.coverage.run] 65 | branch = true 66 | source = ["soccerdata", "tests"] 67 | 68 | [tool.coverage.report] 69 | show_missing = true 70 | ignore_errors = true 71 | 72 | [tool.mypy] 73 | ignore_missing_imports = true 74 | disallow_untyped_defs = true 75 | disallow_incomplete_defs = true 76 | no_implicit_optional = true 77 | check_untyped_defs = true 78 | show_error_codes = true 79 | warn_unused_ignores = true 80 | 81 | [[tool.mypy.overrides]] 82 | module = ["tests.*"] 83 | disallow_untyped_defs = false 84 | 85 | [tool.ruff] 86 | src = ['soccerdata', 'tests'] 87 | line-length = 99 88 | target-version = 'py39' 89 | 90 | [tool.ruff.lint] 91 | select = [ 92 | 'A', 93 | 'ARG', 94 | 'B', 95 | 'B9', 96 | 'C', 97 | 'C4', 98 | 'D', 99 | 'DTZ', 100 | 'E', 101 | 'F', 102 | 'I', 103 | 'N', 104 | 'PIE', 105 | 'PT', 106 | 'PTH', 107 | 'Q', 108 | 'RET', 109 | 'RUF', 110 | 'SIM', 111 | 'SLF', 112 | 'T10', 113 | 'TCH', 114 | 'UP', 115 | 'W', 116 | ] 117 | ignore = ['B904'] 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "__init__.py" = ['F401'] 121 | "tests/*" = [ 122 | 'S', 123 | 'D212', 124 | 'D415', 125 | 'D205', 126 | 'D103', 127 | 'D104', 128 | 'N999', 129 | 'SLF001', 130 | ] 131 | 132 | [tool.ruff.lint.mccabe] 133 | max-complexity = 10 134 | 135 | 
[tool.ruff.lint.pydocstyle] 136 | convention = 'numpy' 137 | 138 | [tool.ruff.lint.isort] 139 | known-first-party = ["soccerdata", "tests"] 140 | 141 | [build-system] 142 | requires = ["poetry-core>=1.0.0"] 143 | build-backend = "poetry.core.masonry.api" 144 | -------------------------------------------------------------------------------- /soccerdata/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of tools to read and process soccer data from various sources.""" 2 | 3 | __version__ = "1.8.7" 4 | 5 | __all__ = [ 6 | "ClubElo", 7 | "ESPN", 8 | "FBref", 9 | "FotMob", 10 | "MatchHistory", 11 | "Sofascore", 12 | "SoFIFA", 13 | "Understat", 14 | "WhoScored", 15 | ] 16 | 17 | from .clubelo import ClubElo 18 | from .espn import ESPN 19 | from .fbref import FBref 20 | from .fotmob import FotMob 21 | from .match_history import MatchHistory 22 | from .sofascore import Sofascore 23 | from .sofifa import SoFIFA 24 | from .understat import Understat 25 | from .whoscored import WhoScored 26 | -------------------------------------------------------------------------------- /soccerdata/_config.py: -------------------------------------------------------------------------------- 1 | """Configurations.""" 2 | 3 | import json 4 | import logging 5 | import logging.config 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | from rich.logging import RichHandler 11 | 12 | # Configuration 13 | NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", "False").lower() in ("true", "1", "t") 14 | NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", "False").lower() in ("true", "1", "t") 15 | MAXAGE = None 16 | if os.environ.get("SOCCERDATA_MAXAGE") is not None: 17 | MAXAGE = int(os.environ.get("SOCCERDATA_MAXAGE", 0)) 18 | LOGLEVEL = os.environ.get("SOCCERDATA_LOGLEVEL", "INFO").upper() 19 | 20 | # Directories 21 | BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata")) 22 | LOGS_DIR = Path(BASE_DIR, "logs") 
23 | DATA_DIR = Path(BASE_DIR, "data")
24 | CONFIG_DIR = Path(BASE_DIR, "config")
25 |
26 | # Create dirs
27 | LOGS_DIR.mkdir(parents=True, exist_ok=True)
28 | DATA_DIR.mkdir(parents=True, exist_ok=True)
29 | CONFIG_DIR.mkdir(parents=True, exist_ok=True)
30 |
31 | # Logger
32 | logging_config = {
33 | "version": 1,
34 | "disable_existing_loggers": False,
35 | "formatters": {
36 | "minimal": {"format": "%(message)s"},
37 | "detailed": {
38 | "format": "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n" # noqa: E501
39 | },
40 | },
41 | "handlers": {
42 | "console": {
43 | "class": "logging.StreamHandler",
44 | "stream": sys.stdout,
45 | "formatter": "minimal",
46 | "level": logging.DEBUG,
47 | },
48 | "info": {
49 | "class": "logging.handlers.RotatingFileHandler",
50 | "filename": Path(LOGS_DIR, "info.log"),
51 | "maxBytes": 10485760, # 10 MB per file before rollover
52 | "backupCount": 10,
53 | "formatter": "detailed",
54 | "level": logging.INFO,
55 | },
56 | "error": {
57 | "class": "logging.handlers.RotatingFileHandler",
58 | "filename": Path(LOGS_DIR, "error.log"),
59 | "maxBytes": 10485760, # 10 MB per file before rollover
60 | "backupCount": 10,
61 | "formatter": "detailed",
62 | "level": logging.ERROR,
63 | },
64 | },
65 | "loggers": {
66 | "root": {
67 | "handlers": ["console", "info", "error"],
68 | "level": LOGLEVEL,
69 | "propagate": True,
70 | },
71 | },
72 | }
73 | logging.config.dictConfig(logging_config)
74 | logging.captureWarnings(True)
75 | logger = logging.getLogger("root")
76 | logger.handlers[0] = RichHandler(markup=True)
77 |
78 | # Team name replacements
79 | TEAMNAME_REPLACEMENTS = {}
80 | _f_custom_teamnname_replacements = CONFIG_DIR / "teamname_replacements.json"
81 | if _f_custom_teamnname_replacements.is_file():
82 | with _f_custom_teamnname_replacements.open(encoding="utf8") as json_file:
83 | for team, to_replace_list in json.load(json_file).items():
84 | for to_replace in to_replace_list:
85 | TEAMNAME_REPLACEMENTS[to_replace] = team
86 | logger.info(
87 |
"Custom team name replacements loaded from %s.", 88 | _f_custom_teamnname_replacements, 89 | ) 90 | else: 91 | logger.info( 92 | "No custom team name replacements found. You can configure these in %s.", 93 | _f_custom_teamnname_replacements, 94 | ) 95 | 96 | 97 | # League dict 98 | LEAGUE_DICT = { 99 | "ENG-Premier League": { 100 | "ClubElo": "ENG_1", 101 | "MatchHistory": "E0", 102 | "FiveThirtyEight": "premier-league", 103 | "FBref": "Premier League", 104 | "FotMob": "ENG-Premier League", 105 | "ESPN": "eng.1", 106 | "Sofascore": "Premier League", 107 | "SoFIFA": "[England] Premier League", 108 | "Understat": "EPL", 109 | "WhoScored": "England - Premier League", 110 | "season_start": "Aug", 111 | "season_end": "May", 112 | }, 113 | "ESP-La Liga": { 114 | "ClubElo": "ESP_1", 115 | "MatchHistory": "SP1", 116 | "FiveThirtyEight": "la-liga", 117 | "FBref": "La Liga", 118 | "FotMob": "ESP-LaLiga", 119 | "ESPN": "esp.1", 120 | "Sofascore": "LaLiga", 121 | "SoFIFA": "[Spain] La Liga", 122 | "Understat": "La liga", 123 | "WhoScored": "Spain - LaLiga", 124 | "season_start": "Aug", 125 | "season_end": "May", 126 | }, 127 | "ITA-Serie A": { 128 | "ClubElo": "ITA_1", 129 | "MatchHistory": "I1", 130 | "FiveThirtyEight": "serie-a", 131 | "FBref": "Serie A", 132 | "FotMob": "ITA-Serie A", 133 | "ESPN": "ita.1", 134 | "Sofascore": "Serie A", 135 | "SoFIFA": "[Italy] Serie A", 136 | "Understat": "Serie A", 137 | "WhoScored": "Italy - Serie A", 138 | "season_start": "Aug", 139 | "season_end": "May", 140 | }, 141 | "GER-Bundesliga": { 142 | "ClubElo": "GER_1", 143 | "MatchHistory": "D1", 144 | "FiveThirtyEight": "bundesliga", 145 | "FBref": "Fußball-Bundesliga", 146 | "FotMob": "GER-Bundesliga", 147 | "ESPN": "ger.1", 148 | "Sofascore": "Bundesliga", 149 | "SoFIFA": "[Germany] Bundesliga", 150 | "Understat": "Bundesliga", 151 | "WhoScored": "Germany - Bundesliga", 152 | "season_start": "Aug", 153 | "season_end": "May", 154 | }, 155 | "FRA-Ligue 1": { 156 | "ClubElo": "FRA_1", 157 | 
"MatchHistory": "F1", 158 | "FiveThirtyEight": "ligue-1", 159 | "FBref": "Ligue 1", 160 | "FotMob": "FRA-Ligue 1", 161 | "ESPN": "fra.1", 162 | "Sofascore": "Ligue 1", 163 | "SoFIFA": "[France] Ligue 1", 164 | "Understat": "Ligue 1", 165 | "WhoScored": "France - Ligue 1", 166 | "season_start": "Aug", 167 | "season_end": "May", 168 | }, 169 | "INT-World Cup": { 170 | "FBref": "FIFA World Cup", 171 | "FotMob": "INT-World Cup", 172 | "WhoScored": "International - FIFA World Cup", 173 | "season_code": "single-year", 174 | }, 175 | "INT-European Championship": { 176 | "FBref": "UEFA European Football Championship", 177 | "FotMob": "INT-EURO", 178 | "Sofascore": "EURO", 179 | "WhoScored": "International - European Championship", 180 | "season_start": "Jun", 181 | "season_end": "Jul", 182 | "season_code": "single-year", 183 | }, 184 | "INT-Women's World Cup": { 185 | "FBref": "FIFA Women's World Cup", 186 | "FotMob": "INT-Women's World Cup", 187 | "WhoScored": "International - FIFA Women's World Cup", 188 | "season_code": "single-year", 189 | }, 190 | } 191 | _f_custom_league_dict = CONFIG_DIR / "league_dict.json" 192 | if _f_custom_league_dict.is_file(): 193 | with _f_custom_league_dict.open(encoding="utf8") as json_file: 194 | LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)} 195 | logger.info("Custom league dict loaded from %s.", _f_custom_league_dict) 196 | else: 197 | logger.info( 198 | "No custom league dict found. 
You can configure additional leagues in %s.", 199 | _f_custom_league_dict, 200 | ) 201 | -------------------------------------------------------------------------------- /soccerdata/clubelo.py: -------------------------------------------------------------------------------- 1 | """Scraper for api.clubelo.com.""" 2 | 3 | import re 4 | from datetime import datetime, timedelta, timezone 5 | from pathlib import Path 6 | from typing import IO, Callable, Optional, Union 7 | 8 | import pandas as pd 9 | from unidecode import unidecode 10 | 11 | from ._common import BaseRequestsReader, add_alt_team_names, standardize_colnames 12 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 13 | 14 | CLUB_ELO_DATADIR = DATA_DIR / "ClubElo" 15 | CLUB_ELO_API = "http://api.clubelo.com" 16 | 17 | 18 | def _parse_csv(data: IO[bytes]) -> pd.DataFrame: 19 | return pd.read_csv(data, parse_dates=["From", "To"], date_format="%Y-%m-%d") 20 | 21 | 22 | class ClubElo(BaseRequestsReader): 23 | """Provides pd.DataFrames from CSV API at http://api.clubelo.com. 24 | 25 | Data will be downloaded as necessary and cached locally in 26 | ``~/soccerdata/data/ClubElo``. 27 | 28 | Since the source does not provide league names, this class will not filter 29 | by league. League names will be inserted from the other sources where 30 | available. Leagues that are only covered by clubelo.com will have NaN 31 | values. 32 | 33 | Parameters 34 | ---------- 35 | proxy : 'tor' or or dict or list(dict) or callable, optional 36 | Use a proxy to hide your IP address. Valid options are: 37 | - "tor": Uses the Tor network. Tor should be running in 38 | the background on port 9050. 39 | - str: The address of the proxy server to use. 40 | - list(str): A list of proxies to choose from. A different proxy will 41 | be selected from this list after failed requests, allowing rotating 42 | proxies. 43 | - callable: A function that returns a valid proxy. 
This function will 44 | be called after failed requests, allowing rotating proxies. 45 | no_cache : bool 46 | If True, will not use cached data. 47 | no_store : bool 48 | If True, will not store downloaded data. 49 | data_dir : Path 50 | Path to directory where data will be cached. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 56 | no_cache: bool = NOCACHE, 57 | no_store: bool = NOSTORE, 58 | data_dir: Path = CLUB_ELO_DATADIR, 59 | ): 60 | """Initialize a new ClubElo reader.""" 61 | super().__init__(proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir) 62 | 63 | def read_by_date(self, date: Optional[Union[str, datetime]] = None) -> pd.DataFrame: 64 | """Retrieve ELO scores for all teams at specified date. 65 | 66 | Elo scores are available as early as 1939. Values before 1960 should 67 | be considered provisional. 68 | 69 | Parameters 70 | ---------- 71 | date : datetime object or string like 'YYYY-MM-DD' 72 | Date for which to retrieve ELO scores. If no date is specified, 73 | get today's scores. 74 | 75 | Raises 76 | ------ 77 | TypeError 78 | If date is not a date string or datetime object. 79 | ValueError 80 | If data is an invalid date string. 
81 | 82 | Returns 83 | ------- 84 | pd.DataFrame 85 | """ 86 | if not date: 87 | date = datetime.now(tz=timezone.utc) 88 | elif isinstance(date, str): 89 | date = datetime.strptime(date, "%Y-%m-%d").astimezone(timezone.utc) 90 | 91 | if not isinstance(date, datetime): 92 | raise TypeError("'date' must be a datetime object or string like 'YYYY-MM-DD'") 93 | 94 | datestring = date.strftime("%Y-%m-%d") 95 | filepath = self.data_dir / f"{datestring}.csv" 96 | url = f"{CLUB_ELO_API}/{datestring}" 97 | 98 | data = self.get(url, filepath) 99 | 100 | return ( 101 | _parse_csv(data) 102 | .pipe(standardize_colnames) 103 | .rename(columns={"club": "team"}) 104 | .replace({"team": TEAMNAME_REPLACEMENTS}) 105 | .replace("None", float("nan")) 106 | .assign(rank=lambda x: x["rank"].astype("float")) 107 | .assign(league=lambda x: x["country"] + "_" + x["level"].astype(str)) 108 | .pipe(self._translate_league) 109 | .reset_index(drop=True) 110 | .set_index("team") 111 | ) 112 | 113 | def read_team_history( 114 | self, team: str, max_age: Union[int, timedelta] = 1 115 | ) -> Optional[pd.DataFrame]: 116 | """Retrieve full ELO history for one club. 117 | 118 | For the exact spelling of a club's name, check the result of 119 | :func:`~soccerdata.ClubElo.read_by_date` or `clubelo.com 120 | `__. You can also use alternative team 121 | names specified in `teamname_replacements.json`. Values before 1960 122 | should be considered provisional. 123 | 124 | Parameters 125 | ---------- 126 | team : str 127 | The club's name. 128 | max_age : int for age in days, or timedelta object 129 | The max. age of locally cached file before re-download. 130 | 131 | Raises 132 | ------ 133 | TypeError 134 | If max_age is not an integer or timedelta object. 135 | ValueError 136 | If no ratings for the given team are available. 
137 | 138 | Returns 139 | ------- 140 | pd.DataFrame 141 | """ 142 | teams_to_check = add_alt_team_names(team) 143 | teams_to_check = {re.sub(r"[\s']", "", unidecode(team)) for team in teams_to_check} 144 | 145 | for _team in teams_to_check: 146 | filepath = self.data_dir / f"{_team}.csv" 147 | url = f"{CLUB_ELO_API}/{_team}" 148 | data = self.get(url, filepath, max_age) 149 | 150 | df = ( 151 | _parse_csv(data) 152 | .pipe(standardize_colnames) 153 | .rename(columns={"club": "team"}) 154 | .replace("None", float("nan")) 155 | .assign(rank=lambda x: x["rank"].astype("float")) 156 | .set_index("from") 157 | .sort_index() 158 | ) 159 | 160 | if len(df) > 0: 161 | # clubelo.com returns a CSV with just a header for nonexistent club 162 | df.replace({"team": TEAMNAME_REPLACEMENTS}, inplace=True) 163 | return df 164 | 165 | raise ValueError(f"No data found for team {team}") 166 | -------------------------------------------------------------------------------- /soccerdata/espn.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://site.api.espn.com/apis/site/v2/sports/soccer.""" 2 | 3 | import itertools 4 | import json 5 | import re 6 | from datetime import datetime, timezone 7 | from pathlib import Path 8 | from typing import Callable, Optional, Union 9 | 10 | import pandas as pd 11 | 12 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 14 | 15 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/summary?event=513466 16 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/scoreboard?dates=20180901 17 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/news 18 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/teams 19 | 20 | ESPN_DATADIR = DATA_DIR / "ESPN" 21 | ESPN_API = "http://site.api.espn.com/apis/site/v2/sports/soccer" 22 | 23 | 24 | class 
ESPN(BaseRequestsReader): 25 | """Provides pd.DataFrames from JSON api available at http://site.api.espn.com. 26 | 27 | Data will be downloaded as necessary and cached locally in 28 | ``~/soccerdata/data/ESPN``. 29 | 30 | Parameters 31 | ---------- 32 | leagues : string or iterable, optional 33 | IDs of leagues to include. 34 | seasons : string, int or list, optional 35 | Seasons to include. Supports multiple formats. 36 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 37 | proxy : 'tor' or or dict or list(dict) or callable, optional 38 | Use a proxy to hide your IP address. Valid options are: 39 | - "tor": Uses the Tor network. Tor should be running in 40 | the background on port 9050. 41 | - str: The address of the proxy server to use. 42 | - list(str): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | leagues: Optional[Union[str, list[str]]] = None, 58 | seasons: Optional[Union[str, int, list]] = None, 59 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = ESPN_DATADIR, 63 | ): 64 | """Initialize a new ESPN reader.""" 65 | super().__init__( 66 | leagues=leagues, 67 | proxy=proxy, 68 | no_cache=no_cache, 69 | no_store=no_store, 70 | data_dir=data_dir, 71 | ) 72 | self.seasons = seasons # type: ignore 73 | 74 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 75 | """Retrieve the game schedule for the selected leagues and seasons. 
76 | 77 | Parameters 78 | ---------- 79 | force_cache : bool 80 | By default no cached data is used for the current season. 81 | If True, will force the use of cached data anyway. 82 | 83 | Returns 84 | ------- 85 | pd.DataFrame 86 | """ 87 | urlmask = ESPN_API + "/{}/scoreboard?dates={}" 88 | filemask = "Schedule_{}_{}.json" 89 | 90 | df_list = [] 91 | # Get match days 92 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 93 | if int(skey[:2]) > int(str(datetime.now(tz=timezone.utc).year + 1)[-2:]): 94 | start_date = "".join(["19", skey[:2], "07", "01"]) 95 | else: 96 | start_date = "".join(["20", skey[:2], "07", "01"]) 97 | 98 | url = urlmask.format(lkey, start_date) 99 | filepath = self.data_dir / filemask.format(lkey, start_date) 100 | reader = self.get(url, filepath) 101 | data = json.load(reader) 102 | 103 | match_dates = [ 104 | datetime.strptime(d, "%Y-%m-%dT%H:%MZ").strftime("%Y%m%d") # noqa: DTZ007 105 | for d in data["leagues"][0]["calendar"] 106 | ] 107 | for date in match_dates: 108 | url = urlmask.format(lkey, date) 109 | filepath = self.data_dir / filemask.format(lkey, date) 110 | current_season = not self._is_complete(lkey, skey) 111 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 112 | 113 | data = json.load(reader) 114 | df_list.extend( 115 | [ 116 | { 117 | "league": lkey, 118 | "season": skey, 119 | "date": e["date"], 120 | "home_team": e["competitions"][0]["competitors"][0]["team"]["name"], 121 | "away_team": e["competitions"][0]["competitors"][1]["team"]["name"], 122 | "game_id": int(e["id"]), 123 | "league_id": lkey, 124 | } 125 | for e in data["events"] 126 | ] 127 | ) 128 | return ( 129 | pd.DataFrame(df_list) 130 | .pipe(self._translate_league) 131 | .replace({"home_team": TEAMNAME_REPLACEMENTS, "away_team": TEAMNAME_REPLACEMENTS}) 132 | .assign(date=lambda x: pd.to_datetime(x["date"])) 133 | .dropna(subset=["home_team", "away_team", "date"]) 134 | .assign(game=lambda df: 
df.apply(make_game_id, axis=1)) 135 | .set_index(["league", "season", "game"]) 136 | .sort_index() 137 | ) 138 | 139 | def read_matchsheet(self, match_id: Optional[Union[int, list[int]]] = None) -> pd.DataFrame: 140 | """Retrieve match sheets for the selected leagues and seasons. 141 | 142 | Parameters 143 | ---------- 144 | match_id : int or list of int, optional 145 | Retrieve the match sheet for a specific game. 146 | 147 | Raises 148 | ------ 149 | ValueError 150 | If no games with the given IDs were found for the selected seasons and leagues. 151 | 152 | Returns 153 | ------- 154 | pd.DataFrame. 155 | """ 156 | urlmask = ESPN_API + "/{}/summary?event={}" 157 | filemask = "Summary_{}.json" 158 | 159 | df_schedule = self.read_schedule().reset_index() 160 | if match_id is not None: 161 | iterator = df_schedule[ 162 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 163 | ] 164 | if len(iterator) == 0: 165 | raise ValueError( 166 | "No games with the given IDs found for the selected seasons and leagues." 
167 | ) 168 | else: 169 | iterator = df_schedule 170 | 171 | df_list = [] 172 | for i, match in iterator.iterrows(): 173 | url = urlmask.format(match["league_id"], match["game_id"]) 174 | filepath = self.data_dir / filemask.format(match["game_id"]) 175 | reader = self.get(url, filepath) 176 | 177 | data = json.load(reader) 178 | for i in range(2): 179 | match_sheet = { 180 | "game": match["game"], 181 | "league": match["league"], 182 | "season": match["season"], 183 | "team": data["boxscore"]["form"][i]["team"]["displayName"], 184 | "is_home": (i == 0), 185 | "venue": ( 186 | data["gameInfo"]["venue"]["fullName"] 187 | if "venue" in data["gameInfo"] 188 | else None 189 | ), 190 | "attendance": data["gameInfo"].get("attendance"), 191 | "capacity": ( 192 | data["gameInfo"]["venue"].get("capacity") 193 | if "venue" in data["gameInfo"] 194 | else None 195 | ), 196 | "roster": data["rosters"][i].get("roster", None), 197 | } 198 | if "statistics" in data["boxscore"]["teams"][i]: 199 | for stat in data["boxscore"]["teams"][i]["statistics"]: 200 | match_sheet[stat["name"]] = stat["displayValue"] 201 | df_list.append(match_sheet) 202 | return ( 203 | pd.DataFrame(df_list) 204 | .replace({"team": TEAMNAME_REPLACEMENTS}) 205 | .pipe(standardize_colnames) 206 | .set_index(["league", "season", "game", "team"]) 207 | .sort_index() 208 | ) 209 | 210 | def read_lineup( # noqa: C901 211 | self, match_id: Optional[Union[int, list[int]]] = None 212 | ) -> pd.DataFrame: 213 | """Retrieve lineups for the selected leagues and seasons. 214 | 215 | Parameters 216 | ---------- 217 | match_id : int or list of int, optional 218 | Retrieve the lineup for a specific game. 219 | 220 | Raises 221 | ------ 222 | ValueError 223 | If no games with the given IDs were found for the selected seasons and leagues. 224 | 225 | Returns 226 | ------- 227 | pd.DataFrame. 
228 | """ 229 | urlmask = ESPN_API + "/{}/summary?event={}" 230 | filemask = "Summary_{}.json" 231 | 232 | df_schedule = self.read_schedule().reset_index() 233 | if match_id is not None: 234 | iterator = df_schedule[ 235 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 236 | ] 237 | if len(iterator) == 0: 238 | raise ValueError( 239 | "No games with the given IDs found for the selected seasons and leagues." 240 | ) 241 | else: 242 | iterator = df_schedule 243 | 244 | df_list = [] 245 | for i, match in iterator.iterrows(): 246 | url = urlmask.format(match["league_id"], match["game_id"]) 247 | filepath = self.data_dir / filemask.format(match["game_id"]) 248 | reader = self.get(url, filepath) 249 | 250 | data = json.load(reader) 251 | for i in range(2): 252 | if "roster" not in data["rosters"][i]: 253 | logger.info( 254 | "No lineup info found for team %d in game with ID=%s", 255 | i + 1, 256 | match["game_id"], 257 | ) 258 | continue 259 | for p in data["rosters"][i]["roster"]: 260 | match_sheet = { 261 | "game": match["game"], 262 | "league": match["league"], 263 | "season": match["season"], 264 | "team": data["boxscore"]["form"][i]["team"]["displayName"], 265 | "is_home": (i == 0), 266 | "player": p["athlete"]["displayName"], 267 | "position": p["position"]["name"] if "position" in p else None, 268 | "formation_place": p.get("formationPlace", None), 269 | } 270 | subbed_in = ( 271 | p["subbedIn"] 272 | if isinstance(p["subbedIn"], bool) 273 | else p["subbedIn"]["didSub"] 274 | ) 275 | subbed_out = ( 276 | p["subbedOut"] 277 | if isinstance(p["subbedOut"], bool) 278 | else p["subbedOut"]["didSub"] 279 | ) 280 | subbed_events = [] 281 | if isinstance(p["subbedIn"], bool) and (subbed_in or subbed_out): 282 | subbed_events = ( 283 | [e for e in p["plays"] if e["substitution"]] 284 | if isinstance(p["subbedIn"], bool) 285 | else [p["subbedIn"], p["subbedOut"]] 286 | ) 287 | else: 288 | if subbed_in: 289 | 
subbed_events.append(p["subbedIn"]) 290 | if subbed_out: 291 | subbed_events.append(p["subbedOut"]) 292 | 293 | if p["starter"]: 294 | match_sheet["sub_in"] = "start" 295 | elif subbed_in: 296 | match_sheet["sub_in"] = sum( 297 | map( 298 | int, 299 | re.findall( 300 | r"(\d{1,3})", 301 | subbed_events[0]["clock"]["displayValue"], 302 | ), 303 | ) 304 | ) 305 | else: 306 | match_sheet["sub_in"] = None 307 | 308 | if (p["starter"] or subbed_in) and not subbed_out: 309 | match_sheet["sub_out"] = "end" 310 | elif subbed_out: 311 | j = 0 if not subbed_in else 1 312 | match_sheet["sub_out"] = sum( 313 | map( 314 | int, 315 | re.findall( 316 | r"(\d{1,3})", 317 | subbed_events[j]["clock"]["displayValue"], 318 | ), 319 | ) 320 | ) 321 | else: 322 | match_sheet["sub_out"] = None 323 | 324 | if "stats" in p: 325 | for stat in p["stats"]: 326 | match_sheet[stat["name"]] = stat["value"] 327 | 328 | df_list.append(match_sheet) 329 | 330 | if len(df_list) == 0: 331 | return pd.DataFrame() 332 | 333 | return ( 334 | pd.DataFrame(df_list) 335 | .replace({"team": TEAMNAME_REPLACEMENTS}) 336 | .pipe(standardize_colnames) 337 | .set_index(["league", "season", "game", "team", "player"]) 338 | .sort_index() 339 | ) 340 | -------------------------------------------------------------------------------- /soccerdata/fotmob.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://fotmob.com.""" 2 | 3 | import itertools 4 | import json 5 | from collections.abc import Iterable 6 | from pathlib import Path 7 | from typing import Callable, Optional, Union 8 | 9 | import pandas as pd 10 | import tls_requests 11 | 12 | from ._common import BaseRequestsReader, add_standardized_team_name, make_game_id 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 14 | 15 | FOTMOB_DATADIR = DATA_DIR / "FotMob" 16 | FOTMOB_API = "https://www.fotmob.com/api/" 17 | 18 | 19 | class FotMob(BaseRequestsReader): 20 | """Provides 
pd.DataFrames from data available at http://www.fotmob.com. 21 | 22 | Data will be downloaded as necessary and cached locally in 23 | ``~/soccerdata/data/FotMob``. 24 | 25 | Parameters 26 | ---------- 27 | leagues : string or iterable, optional 28 | IDs of Leagues to include. 29 | seasons : string, int or list, optional 30 | Seasons to include. Supports multiple formats. 31 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 32 | proxy : 'tor' or or dict or list(dict) or callable, optional 33 | Use a proxy to hide your IP address. Valid options are: 34 | - "tor": Uses the Tor network. Tor should be running in 35 | the background on port 9050. 36 | - str: The address of the proxy server to use. 37 | - list(str): A list of proxies to choose from. A different proxy will 38 | be selected from this list after failed requests, allowing rotating 39 | proxies. 40 | - callable: A function that returns a valid proxy. This function will 41 | be called after failed requests, allowing rotating proxies. 42 | no_cache : bool 43 | If True, will not use cached data. 44 | no_store : bool 45 | If True, will not store downloaded data. 46 | data_dir : Path 47 | Path to directory where data will be cached. 
48 | """ 49 | 50 | def __init__( 51 | self, 52 | leagues: Optional[Union[str, list[str]]] = None, 53 | seasons: Optional[Union[str, int, Iterable[Union[str, int]]]] = None, 54 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 55 | no_cache: bool = NOCACHE, 56 | no_store: bool = NOSTORE, 57 | data_dir: Path = FOTMOB_DATADIR, 58 | ): 59 | """Initialize the FotMob reader.""" 60 | super().__init__( 61 | leagues=leagues, 62 | proxy=proxy, 63 | no_cache=no_cache, 64 | no_store=no_store, 65 | data_dir=data_dir, 66 | ) 67 | self.seasons = seasons # type: ignore 68 | if not self.no_store: 69 | (self.data_dir / "leagues").mkdir(parents=True, exist_ok=True) 70 | (self.data_dir / "seasons").mkdir(parents=True, exist_ok=True) 71 | (self.data_dir / "matches").mkdir(parents=True, exist_ok=True) 72 | 73 | def _init_session(self) -> tls_requests.Client: 74 | session = super()._init_session() 75 | try: 76 | r = tls_requests.get("http://46.101.91.154:6006/") 77 | r.raise_for_status() 78 | except tls_requests.exceptions.HTTPError: 79 | raise ConnectionError("Unable to connect to the session cookie server.") 80 | result = r.json() 81 | session.headers.update(result) 82 | return session 83 | 84 | @property 85 | def leagues(self) -> list[str]: 86 | """Return a list of selected leagues.""" 87 | return list(self._leagues_dict.keys()) 88 | 89 | def read_leagues(self) -> pd.DataFrame: 90 | """Retrieve the selected leagues from the datasource. 
91 | 92 | Returns 93 | ------- 94 | pd.DataFrame 95 | """ 96 | url = FOTMOB_API + "allLeagues" 97 | filepath = self.data_dir / "allLeagues.json" 98 | reader = self.get(url, filepath) 99 | data = json.load(reader) 100 | leagues = [] 101 | for k, v in data.items(): 102 | if k == "international": 103 | for int_league in v[0]["leagues"]: 104 | leagues.append( 105 | { 106 | "region": v[0]["ccode"], 107 | "league_id": int_league["id"], 108 | "league": int_league["name"], 109 | "url": "https://fotmob.com" + int_league["pageUrl"], 110 | } 111 | ) 112 | elif k not in ("favourite", "popular", "userSettings"): 113 | for country in v: 114 | for dom_league in country["leagues"]: 115 | leagues.append( 116 | { 117 | "region": country["ccode"], 118 | "league": dom_league["name"], 119 | "league_id": dom_league["id"], 120 | "url": "https://fotmob.com" + dom_league["pageUrl"], 121 | } 122 | ) 123 | df = ( 124 | pd.DataFrame(leagues) 125 | .assign(league=lambda x: x.region + "-" + x.league) 126 | .pipe(self._translate_league) 127 | .set_index("league") 128 | .loc[self._selected_leagues.keys()] 129 | .sort_index() 130 | ) 131 | return df[df.index.isin(self.leagues)] 132 | 133 | def read_seasons(self) -> pd.DataFrame: 134 | """Retrieve the selected seasons for the selected leagues. 
135 | 136 | Returns 137 | ------- 138 | pd.DataFrame 139 | """ 140 | filemask = "leagues/{}.json" 141 | urlmask = FOTMOB_API + "leagues?id={}" 142 | df_leagues = self.read_leagues() 143 | seasons = [] 144 | for lkey, league in df_leagues.iterrows(): 145 | url = urlmask.format(league.league_id) 146 | filepath = self.data_dir / filemask.format(lkey) 147 | reader = self.get(url, filepath) 148 | data = json.load(reader) 149 | # extract season IDs 150 | avail_seasons = data["allAvailableSeasons"] 151 | for season in avail_seasons: 152 | seasons.append( 153 | { 154 | "league": lkey, 155 | "season": self._season_code.parse(season), 156 | "league_id": league.league_id, 157 | "season_id": season, 158 | "url": league.url + "?season=" + season, 159 | } 160 | ) 161 | # Change season id for 2122 season manually (gross) 162 | df = pd.DataFrame(seasons).set_index(["league", "season"]).sort_index() 163 | return df.loc[df.index.isin(list(itertools.product(self.leagues, self.seasons)))] 164 | 165 | def read_league_table(self, force_cache: bool = False) -> pd.DataFrame: # noqa: C901 166 | """Retrieve the league table for the selected leagues. 167 | 168 | Parameters 169 | ---------- 170 | force_cache : bool 171 | By default no cached data is used for the current season. 172 | If True, will force the use of cached data anyway. 
173 | 174 | Returns 175 | ------- 176 | pd.DataFrame 177 | """ 178 | filemask = "seasons/{}_{}.html" 179 | urlmask = FOTMOB_API + "leagues?id={}&season={}" 180 | 181 | idx = ["league", "season"] 182 | cols = ["team", "MP", "W", "D", "L", "GF", "GA", "GD", "Pts"] 183 | 184 | # get league and season IDs 185 | seasons = self.read_seasons() 186 | # collect league tables 187 | mult_tables = [] 188 | for (lkey, skey), season in seasons.iterrows(): 189 | # read html page (league overview) 190 | filepath = self.data_dir / filemask.format(lkey, skey) 191 | url = urlmask.format(season.league_id, season.season_id) 192 | current_season = not self._is_complete(lkey, skey) 193 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 194 | season_data = json.load(reader) 195 | table_data = season_data["table"][0]["data"] 196 | if "tables" in table_data: 197 | if "stage" not in idx: 198 | idx.append("stage") 199 | groups_data = table_data["tables"] 200 | all_groups = [] 201 | for i in range(len(groups_data)): 202 | group_table = pd.json_normalize(groups_data[i]["table"]["all"]) 203 | group_table["stage"] = groups_data[i]["leagueName"] 204 | all_groups.append(group_table) 205 | df_table = pd.concat(all_groups, axis=0) 206 | else: 207 | df_table = pd.json_normalize(table_data["table"]["all"]) 208 | df_table[["GF", "GA"]] = df_table["scoresStr"].str.split("-", expand=True) 209 | df_table = df_table.rename( 210 | columns={ 211 | "name": "team", 212 | "played": "MP", 213 | "wins": "W", 214 | "draws": "D", 215 | "losses": "L", 216 | "goalConDiff": "GD", 217 | "pts": "Pts", 218 | } 219 | ) 220 | df_table["league"] = lkey 221 | df_table["season"] = skey 222 | 223 | # If league has a playoff, add final playoff standing as a column 224 | if "playoff" in season_data["tabs"]: 225 | if "playoff" not in cols: 226 | cols.append("playoff") 227 | df_table["playoff"] = None 228 | # Get cup game finalists (for leagues with playoffs) 229 | playoff_rounds = 
season_data["playoff"]["rounds"] 230 | for i in range(len(playoff_rounds)): 231 | stage_teams = [] 232 | for game in playoff_rounds[i]["matchups"]: 233 | if not bool(game): 234 | continue 235 | stage = game["stage"] 236 | stage_teams.append(game["homeTeamId"]) 237 | stage_teams.append(game["awayTeamId"]) 238 | df_table.loc[df_table["id"].isin(stage_teams), "playoff"] = stage 239 | if stage == "final": 240 | winner = game["winner"] 241 | df_table.loc[df_table["id"] == winner, "playoff"] = "cup_winner" 242 | mult_tables.append(df_table) 243 | return ( 244 | pd.concat(mult_tables, axis=0) 245 | .rename(columns={"Squad": "team"}) 246 | .replace({"team": TEAMNAME_REPLACEMENTS}) 247 | .set_index(idx) 248 | .sort_index()[cols] 249 | ) 250 | 251 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 252 | """Retrieve the game schedule for the selected leagues and seasons. 253 | 254 | Parameters 255 | ---------- 256 | force_cache : bool 257 | By default no cached data is used for the current season. 258 | If True, will force the use of cached data anyway. 
259 | 260 | Returns 261 | ------- 262 | pd.DataFrame 263 | """ 264 | filemask = "seasons/{}_{}.html" 265 | urlmask = FOTMOB_API + "leagues?id={}&season={}" 266 | 267 | cols = [ 268 | "round", 269 | "week", 270 | "date", 271 | "home_team", 272 | "away_team", 273 | "home_score", 274 | "away_score", 275 | "status", 276 | "game_id", 277 | "url", 278 | ] 279 | 280 | df_seasons = self.read_seasons() 281 | all_schedules = [] 282 | for (lkey, skey), season in df_seasons.iterrows(): 283 | filepath = self.data_dir / filemask.format(lkey, skey) 284 | url = urlmask.format(season.league_id, season.season_id) 285 | current_season = not self._is_complete(lkey, skey) 286 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 287 | season_data = json.load(reader) 288 | 289 | df = pd.json_normalize(season_data["matches"]["allMatches"]) 290 | df["league"] = lkey 291 | df["season"] = skey 292 | all_schedules.append(df) 293 | 294 | # Construct the output dataframe 295 | df = ( 296 | pd.concat(all_schedules) 297 | .rename( 298 | columns={ 299 | "roundName": "round", 300 | "round": "week", 301 | "home.name": "home_team", 302 | "away.name": "away_team", 303 | "status.reason.short": "status", 304 | "pageUrl": "url", 305 | "id": "game_id", 306 | } 307 | ) 308 | .replace( 309 | { 310 | "home_team": TEAMNAME_REPLACEMENTS, 311 | "away_team": TEAMNAME_REPLACEMENTS, 312 | } 313 | ) 314 | .assign(date=lambda x: pd.to_datetime(x["status.utcTime"], format="mixed")) 315 | ) 316 | df["game"] = df.apply(make_game_id, axis=1) 317 | df["url"] = "https://fotmob.com" + df["url"] 318 | df[["home_score", "away_score"]] = df["status.scoreStr"].str.split("-", expand=True) 319 | return df.set_index(["league", "season", "game"]).sort_index()[cols] 320 | 321 | def read_team_match_stats( 322 | self, 323 | stat_type: str = "Top stats", 324 | opponent_stats: bool = True, 325 | team: Optional[Union[str, list[str]]] = None, 326 | force_cache: bool = False, 327 | ) -> pd.DataFrame: 328 | 
"""Retrieve the match stats for the selected leagues and seasons. 329 | 330 | The following stat types are available: 331 | * 'Top stats' 332 | * 'Shots' 333 | * 'Expected goals (xG)' 334 | * 'Passes' 335 | * 'Defence' 336 | * 'Duels' 337 | * 'Discipline' 338 | 339 | Parameters 340 | ---------- 341 | stat_type : str 342 | Type of stats to retrieve. 343 | opponent_stats: bool 344 | If True, will retrieve opponent stats. 345 | team: str or list of str, optional 346 | Team(s) to retrieve. If None, will retrieve all teams. 347 | force_cache : bool 348 | By default no cached data is used to scrape the list of available 349 | games for the current season. If True, will force the use of 350 | cached data anyway. 351 | 352 | Raises 353 | ------ 354 | TypeError 355 | If ``stat_type`` is not valid. 356 | ValueError 357 | If no games with the given IDs were found for the selected seasons and leagues. 358 | 359 | Returns 360 | ------- 361 | pd.DataFrame 362 | """ 363 | filemask = "matches/{}_{}_{}.html" 364 | urlmask = FOTMOB_API + "matchDetails?matchId={}" 365 | 366 | # Retrieve games for which a match report is available 367 | df_matches = self.read_schedule(force_cache) 368 | df_complete = df_matches.loc[df_matches["status"].isin(["FT", "AET", "Pen"])] 369 | 370 | if team is not None: 371 | # get alternative names of the specified team(s) 372 | teams_to_check = add_standardized_team_name(team) 373 | 374 | # select requested teams 375 | iterator = df_complete.loc[ 376 | ( 377 | df_complete.home_team.isin(teams_to_check) 378 | | df_complete.away_team.isin(teams_to_check) 379 | ) 380 | ] 381 | if len(iterator) == 0: 382 | raise ValueError("No data found for the given teams in the selected seasons.") 383 | else: 384 | iterator = df_complete 385 | teams_to_check = iterator.home_team.tolist() + iterator.away_team.tolist() 386 | 387 | stats = [] 388 | for i, game in iterator.reset_index().iterrows(): 389 | lkey, skey, gkey = game["league"], game["season"], game["game"] 390 | # Get 
data for specific game 391 | url = urlmask.format(game.game_id) 392 | filepath = self.data_dir / filemask.format(lkey, skey, game.game_id) 393 | reader = self.get(url, filepath) 394 | logger.info( 395 | "[%s/%s] Retrieving game with id=%s", 396 | i + 1, 397 | len(iterator), 398 | game["game_id"], 399 | ) 400 | game_data = json.load(reader) 401 | 402 | # Get stats types 403 | all_stats = game_data["content"]["stats"]["Periods"]["All"]["stats"] 404 | try: 405 | selected_stats = next(stat for stat in all_stats if stat["title"] == stat_type) 406 | except StopIteration: 407 | raise ValueError(f"Invalid stat type: {stat_type}") 408 | 409 | df_raw_stats = pd.DataFrame(selected_stats["stats"]) 410 | game_teams = [game.home_team, game.away_team] 411 | for i, team in enumerate(game_teams): 412 | df_team_stats = df_raw_stats.copy() 413 | df_team_stats["stat"] = df_team_stats["stats"].apply(lambda x: x[i]) # noqa: B023 414 | df_team_stats["league"] = lkey 415 | df_team_stats["season"] = skey 416 | df_team_stats["game"] = gkey 417 | df_team_stats["team"] = team 418 | if not opponent_stats: 419 | df_team_stats = df_team_stats[df_team_stats.team.isin(teams_to_check)] 420 | df_team_stats.set_index(["league", "season", "game", "team"], inplace=True) 421 | df_team_stats = df_team_stats[df_team_stats["type"] != "title"] 422 | df_team_stats = df_team_stats.pivot(columns="title", values="stat").reset_index() 423 | df_team_stats.columns.name = None 424 | stats.append(df_team_stats) 425 | 426 | df = pd.concat(stats, axis=0) 427 | df = df.set_index(["league", "season", "game", "team"]).sort_index() 428 | # Split percentage values 429 | pct_cols = [col for col in df.columns if df[col].astype(str).str.contains("%").any()] 430 | for col in pct_cols: 431 | df[[col, col + " (%)"]] = df[col].str.split(expand=True) 432 | df[col + " (%)"] = df[col + " (%)"].str.extract(r"(\d+)").astype(float).div(100) 433 | return df 434 | 
-------------------------------------------------------------------------------- /soccerdata/match_history.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://www.football-data.co.uk/data.php.""" 2 | 3 | import itertools 4 | from pathlib import Path 5 | from typing import IO, Callable, Optional, Union 6 | 7 | import pandas as pd 8 | 9 | from ._common import BaseRequestsReader, make_game_id 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 11 | 12 | MATCH_HISTORY_DATA_DIR = DATA_DIR / "MatchHistory" 13 | MATCH_HISTORY_API = "https://www.football-data.co.uk" 14 | 15 | 16 | def _parse_csv(raw_data: IO[bytes], lkey: str, skey: str) -> pd.DataFrame: 17 | logger.info("Parsing league=%s season=%s", lkey, skey) 18 | if int(skey) >= 2425: 19 | # Since 2024-25, the CSV files are encoded in UTF-8-SIG 20 | df_games = pd.read_csv( 21 | raw_data, 22 | encoding="UTF-8-SIG", 23 | on_bad_lines="warn", 24 | ) 25 | else: 26 | df_games = pd.read_csv( 27 | raw_data, 28 | encoding="latin-1", 29 | on_bad_lines="warn", 30 | ) 31 | return df_games 32 | 33 | 34 | class MatchHistory(BaseRequestsReader): 35 | """Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php. 36 | 37 | Data will be downloaded as necessary and cached locally in 38 | ``~/soccerdata/data/MatchHistory``. 39 | 40 | Parameters 41 | ---------- 42 | leagues : string or iterable 43 | IDs of leagues to include. 44 | seasons : string, int or list 45 | Seasons to include. Supports multiple formats. 46 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 47 | proxy : 'tor' or or dict or list(dict) or callable, optional 48 | Use a proxy to hide your IP address. Valid options are: 49 | - "tor": Uses the Tor network. Tor should be running in 50 | the background on port 9050. 51 | - str: The address of the proxy server to use. 52 | - list(str): A list of proxies to choose from. 
A different proxy will 53 | be selected from this list after failed requests, allowing rotating 54 | proxies. 55 | - callable: A function that returns a valid proxy. This function will 56 | be called after failed requests, allowing rotating proxies. 57 | no_cache : bool 58 | If True, will not use cached data. 59 | no_store : bool 60 | If True, will not store downloaded data. 61 | data_dir : Path, optional 62 | Path to directory where data will be cached. 63 | """ 64 | 65 | def __init__( 66 | self, 67 | leagues: Optional[Union[str, list[str]]] = None, 68 | seasons: Optional[Union[str, int, list]] = None, 69 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 70 | no_cache: bool = NOCACHE, 71 | no_store: bool = NOSTORE, 72 | data_dir: Path = MATCH_HISTORY_DATA_DIR, 73 | ): 74 | super().__init__( 75 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 76 | ) 77 | self.seasons = seasons # type: ignore 78 | 79 | def read_games(self) -> pd.DataFrame: 80 | """Retrieve game history for the selected leagues and seasons. 
81 | 82 | Column names are explained here: http://www.football-data.co.uk/notes.txt 83 | 84 | Returns 85 | ------- 86 | pd.DataFrame 87 | """ 88 | urlmask = MATCH_HISTORY_API + "/mmz4281/{}/{}.csv" 89 | filemask = "{}_{}.csv" 90 | col_rename = { 91 | "Div": "league", 92 | "Date": "date", 93 | "Time": "time", 94 | "HomeTeam": "home_team", 95 | "AwayTeam": "away_team", 96 | "Referee": "referee", 97 | } 98 | 99 | df_list = [] 100 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 101 | filepath = self.data_dir / filemask.format(lkey, skey) 102 | url = urlmask.format(skey, lkey) 103 | current_season = not self._is_complete(lkey, skey) 104 | 105 | reader = self.get(url, filepath, no_cache=current_season) 106 | df_games = _parse_csv(reader, lkey, skey).assign(season=skey) 107 | 108 | if "Time" not in df_games.columns: 109 | df_games["Time"] = "12:00" 110 | df_games["Time"] = df_games["Time"].fillna("12:00") 111 | df_list.append(df_games) 112 | 113 | df = ( 114 | pd.concat(df_list, sort=False) 115 | .rename(columns=col_rename) 116 | .assign( 117 | date=lambda x: pd.to_datetime( 118 | x["date"] + " " + x["time"], format="mixed", dayfirst=True 119 | ) 120 | ) 121 | .drop("time", axis=1) 122 | .pipe(self._translate_league) 123 | .replace( 124 | { 125 | "home_team": TEAMNAME_REPLACEMENTS, 126 | "away_team": TEAMNAME_REPLACEMENTS, 127 | } 128 | ) 129 | .dropna(subset=["home_team", "away_team"]) 130 | ) 131 | 132 | df = df.loc[:, ~df.columns.str.contains("^Unnamed")] 133 | df["game"] = df.apply(make_game_id, axis=1) 134 | df.set_index(["league", "season", "game"], inplace=True) 135 | df.sort_index(inplace=True) 136 | return df 137 | -------------------------------------------------------------------------------- /soccerdata/sofascore.py: -------------------------------------------------------------------------------- 1 | """Scraper for https://www.sofascore.com/.""" 2 | 3 | import itertools 4 | import json 5 | from collections.abc import 
Iterable
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Optional, Union

import pandas as pd

from ._common import BaseRequestsReader, make_game_id
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS

SOFASCORE_DATADIR = DATA_DIR / "Sofascore"
SOFASCORE_API = "https://api.sofascore.com/api/v1/"


class Sofascore(BaseRequestsReader):
    """Provides pd.DataFrames from data available at http://www.sofascore.com.

    Data will be downloaded as necessary and cached locally in
    ``~/soccerdata/data/Sofascore``.

    Parameters
    ----------
    leagues : string or iterable, optional
        IDs of Leagues to include.
    seasons : string, int or list, optional
        Seasons to include. Supports multiple formats.
        Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]
    proxy : 'tor' or dict or list(dict) or callable, optional
        Use a proxy to hide your IP address. Valid options are:
            - "tor": Uses the Tor network. Tor should be running in
              the background on port 9050.
            - str: The address of the proxy server to use.
            - list(str): A list of proxies to choose from. A different proxy will
              be selected from this list after failed requests, allowing rotating
              proxies.
            - callable: A function that returns a valid proxy. This function will
              be called after failed requests, allowing rotating proxies.
    no_cache : bool
        If True, will not use cached data.
    no_store : bool
        If True, will not store downloaded data.
    data_dir : Path
        Path to directory where data will be cached.
    """

    def __init__(
        self,
        leagues: Optional[Union[str, list[str]]] = None,
        seasons: Optional[Union[str, int, Iterable[Union[str, int]]]] = None,
        proxy: Optional[Union[str, list[str], Callable[[], str]]] = None,
        no_cache: bool = NOCACHE,
        no_store: bool = NOSTORE,
        data_dir: Path = SOFASCORE_DATADIR,
    ):
        """Initialize the Sofascore reader."""
        super().__init__(
            leagues=leagues,
            proxy=proxy,
            no_cache=no_cache,
            no_store=no_store,
            data_dir=data_dir,
        )
        self.seasons = seasons  # type: ignore
        # Pre-create the on-disk cache layout used by the read_* methods below.
        if not self.no_store:
            (self.data_dir / "leagues").mkdir(parents=True, exist_ok=True)
            (self.data_dir / "seasons").mkdir(parents=True, exist_ok=True)
            (self.data_dir / "matches").mkdir(parents=True, exist_ok=True)

    def read_leagues(self) -> pd.DataFrame:
        """Retrieve the selected leagues from the datasource.

        Returns
        -------
        pd.DataFrame
        """
        url = SOFASCORE_API + "config/unique-tournaments/EN/football"
        filepath = self.data_dir / "leagues.json"
        reader = self.get(url, filepath)
        data = json.load(reader)
        leagues = []
        for k in data["uniqueTournaments"]:
            leagues.append(
                {
                    "league_id": k["id"],
                    "league": k["name"],
                }
            )
        # Canonical league names are "<region>-<name>"; derive the region from
        # the translated name, then filter on the user's selection.
        df = (
            pd.DataFrame(leagues)
            .pipe(self._translate_league)
            .assign(region=lambda x: x["league"].str.split("-").str[0])
            .set_index("league")
            .loc[self._selected_leagues.keys()]
            .sort_index()
        )
        return df[df.index.isin(self.leagues)]

    def read_seasons(self) -> pd.DataFrame:
        """Retrieve the selected seasons for the selected leagues.

        Returns
        -------
        pd.DataFrame
        """
        filemask = "leagues/{}.json"
        seasons = []
        df_leagues = self.read_leagues()
        for lkey, league in df_leagues.iterrows():
            url = SOFASCORE_API + "unique-tournament/{}/seasons"
            filepath = self.data_dir / filemask.format(lkey)
            reader = self.get(url.format(league.league_id), filepath)
            data = json.load(reader)["seasons"]
            for season in data:
                seasons.append(
                    {
                        "league": lkey,
                        "season": self._season_code.parse(season["year"]),
                        "league_id": league.league_id,
                        "season_id": season["id"],
                    }
                )
        df = pd.DataFrame(seasons).set_index(["league", "season"]).sort_index()

        # Keep only the (league, season) pairs the user selected.
        return df.loc[df.index.isin(list(itertools.product(self.leagues, self.seasons)))]

    def read_league_table(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the league table for the selected leagues.

        Parameters
        ----------
        force_cache : bool
            By default no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
        """
        # NOTE: despite the ".html" suffix, the cached payload is JSON.
        filemask = "seasons/{}_{}.html"
        urlmask = SOFASCORE_API + "unique-tournament/{}/season/{}/standings/total"

        idx = ["league", "season"]
        cols = ["team", "MP", "W", "D", "L", "GF", "GA", "GD", "Pts"]

        seasons = self.read_seasons()
        # collect league tables
        mult_tables = []
        for (lkey, skey), season in seasons.iterrows():
            filepath = self.data_dir / filemask.format(lkey, skey)
            url = urlmask.format(season.league_id, season.season_id)
            # Never serve the still-running season from cache, unless the
            # caller explicitly forces it.
            current_season = not self._is_complete(lkey, skey)
            reader = self.get(url, filepath, no_cache=current_season and not force_cache)
            season_data = json.load(reader)
            for row in season_data["standings"][0]["rows"]:
                mult_tables.append(
                    {
                        "league": lkey,
                        "season": skey,
                        "team": row["team"]["name"],
                        "MP": row["matches"],
                        "W": row["wins"],
                        "D": row["draws"],
                        "L": row["losses"],
                        "GF": row["scoresFor"],
                        "GA": row["scoresAgainst"],
                        "GD": row["scoresFor"] - row["scoresAgainst"],
                        "Pts": row["points"],
                    }
                )
        df = (
            pd.DataFrame(mult_tables)
            .set_index(idx)
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .sort_index()[cols]
        )
        return df

    def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the game schedule for the selected leagues and seasons.

        Parameters
        ----------
        force_cache : bool
            By default no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
        """
        urlmask1 = SOFASCORE_API + "unique-tournament/{}/season/{}/rounds"
        urlmask2 = SOFASCORE_API + "unique-tournament/{}/season/{}/events/round/{}"
        filemask1 = "matches/rounds_{}_{}.json"
        filemask2 = "matches/round_matches_{}_{}_{}.json"

        cols = [
            "round",
            "week",
            "date",
            "home_team",
            "away_team",
            "home_score",
            "away_score",
            "game_id",
        ]

        df_seasons = self.read_seasons()
        all_schedules = []
        for (lkey, skey), season in df_seasons.iterrows():
            filepath1 = self.data_dir / filemask1.format(lkey, skey)
            url1 = urlmask1.format(season["league_id"], season["season_id"])
            current_season = not self._is_complete(lkey, skey)
            reader1 = self.get(url1, filepath1, no_cache=current_season and not force_cache)
            season_data = json.load(reader1)
            rounds = season_data["rounds"]

            for round in rounds:  # noqa: A001
                filepath2 = self.data_dir / filemask2.format(lkey, skey, round["round"])
                url2 = urlmask2.format(season["league_id"], season["season_id"], round["round"])
                reader2 = self.get(url2, filepath2, no_cache=current_season and not force_cache)
                match_data = json.load(reader2)
                for _match in match_data["events"]:
                    # Only status codes 100 and 0 are kept; code 100 comes
                    # with a final score, code 0 does not (presumably
                    # "finished" and "not started" -- TODO confirm against
                    # the Sofascore API).  Everything else is skipped.
                    if _match["status"]["code"] == 100 or _match["status"]["code"] == 0:
                        if _match["status"]["code"] == 100:
                            home_score = int(_match["homeScore"]["current"])
                            away_score = int(_match["awayScore"]["current"])
                        else:
                            # No score available yet.
                            home_score = float("nan")  # type: ignore
                            away_score = float("nan")  # type: ignore

                        all_schedules.append(
                            {
                                "league": lkey,
                                "season": skey,
                                "round": round["round"],
                                "week": _match["roundInfo"]["round"],
                                "date": datetime.fromtimestamp(
                                    _match["startTimestamp"], tz=timezone.utc
                                ),
                                "home_team": _match["homeTeam"]["name"],
                                "away_team": _match["awayTeam"]["name"],
                                "home_score": home_score,
                                "away_score": away_score,
                                "game_id": _match["id"],
                            }
                        )

        df = pd.DataFrame(all_schedules).replace(
            {
                "home_team": TEAMNAME_REPLACEMENTS,
                "away_team": TEAMNAME_REPLACEMENTS,
            }
        )
        df["game"] = df.apply(make_game_id, axis=1)
        return df.set_index(["league", "season", "game"]).sort_index()[cols]
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
"""Test suite for the soccerdata package."""
-------------------------------------------------------------------------------- /tests/appdata/config/league_dict.json: --------------------------------------------------------------------------------
{}
-------------------------------------------------------------------------------- /tests/appdata/config/teamname_replacements.json: --------------------------------------------------------------------------------
{
    "Manchester City": ["Man City"],
    "Olympique Marseille": ["Marseille"],
    "Valencia CF": ["Valencia"],
    "FC Bayern Munich": ["FC Bayern München"]
}
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
"""Pytest fixtures for soccerdata package."""

import pytest

import soccerdata as sd


@pytest.fixture()
def espn_seriea() -> sd.ESPN:
    """Return a correctly initialized instance of ESPN filtered by league: Serie A."""
    return sd.ESPN("ITA-Serie A", "20-21")


@pytest.fixture()
def sofifa_bundesliga() -> sd.SoFIFA:
    """Return a correctly initialized instance of SoFIFA filtered by league: Bundesliga."""
    return sd.SoFIFA("GER-Bundesliga", versions=[230012])

20 | @pytest.fixture() 21 | def fbref_ligue1() -> sd.FBref: 22 | """Return a correctly initialized instance of FBref filtered by league: Ligue 1.""" 23 | return sd.FBref("FRA-Ligue 1", "20-21") 24 | 25 | 26 | @pytest.fixture() 27 | def fotmob_laliga(): 28 | """Return a correctly initialized instance of Fotmob filtered by league: La Liga.""" 29 | return sd.FotMob("ESP-La Liga", "20-21") 30 | 31 | 32 | @pytest.fixture() 33 | def elo() -> sd.ClubElo: 34 | """Return a correctly initialized ClubElo instance.""" 35 | return sd.ClubElo() 36 | 37 | 38 | @pytest.fixture() 39 | def match_epl_5y() -> sd.MatchHistory: 40 | """Return a MatchHistory instance for the last 5 years of the EPL.""" 41 | return sd.MatchHistory("ENG-Premier League", list(range(2019, 2025))) 42 | 43 | 44 | @pytest.fixture() 45 | def whoscored() -> sd.WhoScored: 46 | """Return a correctly initialized instance of WhoScored.""" 47 | return sd.WhoScored("ENG-Premier League", "20-21", headless=False) 48 | 49 | 50 | @pytest.fixture() 51 | def understat_epl_1516() -> sd.Understat: 52 | """Return a correctly initialized instance of Understat filtered by league: Premier League.""" 53 | return sd.Understat("ENG-Premier League", "15-16") 54 | 55 | 56 | @pytest.fixture() 57 | def understat_epl_9091() -> sd.Understat: 58 | """Return a correctly initialized instance of Understat filtered by league: Premier League.""" 59 | return sd.Understat("ENG-Premier League", "90-91") 60 | 61 | 62 | @pytest.fixture() 63 | def sofascore_epl_1516() -> sd.Sofascore: 64 | """Return a correctly initialized instance of Sofascore filtered by league: Premier League.""" 65 | return sd.Sofascore("ENG-Premier League", "15-16") 66 | -------------------------------------------------------------------------------- /tests/test_ClubElo.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ClubElo.""" 2 | 3 | import time 4 | from datetime import datetime, timedelta, timezone 5 | 
from pathlib import Path 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | from soccerdata import ClubElo 11 | 12 | 13 | def test_read_by_date(elo: ClubElo) -> None: 14 | """It should return a dataframe with the ELO ratings for all clubs at the specified date.""" 15 | assert isinstance(elo.read_by_date(), pd.DataFrame) 16 | assert isinstance(elo.read_by_date("2017-04-01"), pd.DataFrame) 17 | assert isinstance(elo.read_by_date(datetime(2017, 4, 1, tzinfo=timezone.utc)), pd.DataFrame) 18 | 19 | 20 | def test_read_by_date_bad_params(elo: ClubElo) -> None: 21 | """It should raise an error if the parameters are invalid.""" 22 | with pytest.raises(ValueError, match="time data '2017' does not match format '%Y-%m-%d'"): 23 | elo.read_by_date("2017") 24 | with pytest.raises( 25 | TypeError, match="'date' must be a datetime object or string like 'YYYY-MM-DD'" 26 | ): 27 | elo.read_by_date(1 / 4) # type: ignore 28 | 29 | 30 | def test_read_team_history(elo: ClubElo) -> None: 31 | """It should return a dataframe with the ELO history for the specified club.""" 32 | assert isinstance(elo.read_team_history("Feyenoord"), pd.DataFrame) 33 | assert isinstance(elo.read_team_history("Feyenoord", 2), pd.DataFrame) 34 | assert isinstance(elo.read_team_history("Feyenoord", timedelta(days=2)), pd.DataFrame) 35 | 36 | 37 | def test_read_team_history_max_age(elo: ClubElo) -> None: 38 | """It should not use cached data if it is older than max_age.""" 39 | max_age = timedelta(milliseconds=1) 40 | assert isinstance(elo.read_team_history("Feyenoord", max_age), pd.DataFrame) 41 | update_time = ( 42 | (Path(__file__).parent / "appdata" / "data" / "ClubElo" / "Feyenoord.csv").stat().st_mtime 43 | ) 44 | current_time = time.time() 45 | assert current_time - update_time < 5 46 | 47 | 48 | def test_read_team_history_replacement(elo: ClubElo) -> None: 49 | """It should use the replacement names from teamname_replacements.json.""" 50 | assert isinstance(elo.read_team_history("Manchester City"), 
pd.DataFrame) 51 | 52 | 53 | def test_read_team_history_bad_team(elo: ClubElo) -> None:
54 |     """It should raise an error if the team is not found."""
55 |     with pytest.raises(ValueError, match="No data found for team FC Knudde"):
56 |         elo.read_team_history("FC Knudde")
57 | 
58 | 
59 | def test_read_team_history_bad_params(elo: ClubElo) -> None:
60 |     """It should raise an error if the parameters are invalid."""
61 |     with pytest.raises(TypeError, match="'max_age' must be of type int or datetime.timedelta"):
62 |         elo.read_team_history("Feyenoord", max_age=datetime.now(tz=timezone.utc))  # type: ignore
63 | 
-------------------------------------------------------------------------------- /tests/test_ESPN.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ESPN.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from soccerdata.espn import ESPN 7 | 8 | 
9 | def test_read_schedule(espn_seriea: ESPN) -> None:
10 |     """It should return a dataframe with the schedule of the season."""
11 |     assert isinstance(espn_seriea.read_schedule(), pd.DataFrame)
12 | 
13 | 
14 | def test_read_matchsheet(espn_seriea: ESPN) -> None:
15 |     """It should return a dataframe with the matchsheet data."""
16 |     assert isinstance(espn_seriea.read_matchsheet(match_id=554204), pd.DataFrame)
17 | 
18 | 
19 | def test_read_matchsheet_bad_id(espn_seriea: ESPN) -> None:
20 |     """It should raise a ValueError if the selected game is not in the specified season."""
21 |     with pytest.raises(
22 |         ValueError,
23 |         match="No games with the given IDs found for the selected seasons and leagues.",
24 |     ):
25 |         assert isinstance(espn_seriea.read_matchsheet(match_id=123), pd.DataFrame)  # NOTE(review): this assert never executes — the call is expected to raise inside pytest.raises
26 | 
27 | 
28 | def test_read_lineups(espn_seriea: ESPN) -> None:
29 |     """It should return a dataframe with the lineups."""
30 |     assert isinstance(espn_seriea.read_lineup(match_id=554204), pd.DataFrame)
31 | 
32 | 
33 | def test_id_not_in_season(espn_seriea: ESPN) -> None: 
34 | """It should raise a ValueError if the selected game is not in the specified season.""" 35 | with pytest.raises( 36 | ValueError, 37 | match="No games with the given IDs found for the selected seasons and leagues.", 38 | ): 39 | assert isinstance(espn_seriea.read_lineup(match_id=123), pd.DataFrame) 40 | -------------------------------------------------------------------------------- /tests/test_FBref.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FBref.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as sd 7 | from soccerdata.fbref import FBref, _concat 8 | 9 | 10 | def test_available_leagues() -> None: 11 | assert sd.FBref.available_leagues() == [ 12 | "Big 5 European Leagues Combined", 13 | "ENG-Premier League", 14 | "ESP-La Liga", 15 | "FRA-Ligue 1", 16 | "GER-Bundesliga", 17 | "INT-European Championship", 18 | "INT-Women's World Cup", 19 | "INT-World Cup", 20 | "ITA-Serie A", 21 | ] 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "stat_type", 26 | [ 27 | "standard", 28 | "keeper", 29 | "keeper_adv", 30 | "shooting", 31 | "passing", 32 | "passing_types", 33 | "goal_shot_creation", 34 | "defense", 35 | "possession", 36 | "playing_time", 37 | "misc", 38 | ], 39 | ) 40 | def test_read_team_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 41 | assert isinstance(fbref_ligue1.read_team_season_stats(stat_type), pd.DataFrame) 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "stat_type", 46 | [ 47 | "schedule", 48 | "shooting", 49 | "keeper", 50 | "passing", 51 | "passing_types", 52 | "goal_shot_creation", 53 | "defense", 54 | "possession", 55 | "misc", 56 | ], 57 | ) 58 | def test_read_team_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: 59 | assert isinstance(fbref_ligue1.read_team_match_stats(stat_type), pd.DataFrame) 60 | 61 | 62 | def test_read_team_match_stats_alt_names(fbref_ligue1: FBref) -> None: 63 | # Test with FBref team name 64 | assert 
isinstance( 65 | fbref_ligue1.read_team_match_stats(stat_type="schedule", team="Olympique Marseille"), 66 | pd.DataFrame, 67 | ) 68 | # Test with standardized team name 69 | assert isinstance( 70 | fbref_ligue1.read_team_match_stats(stat_type="schedule", team="Marseille"), 71 | pd.DataFrame, 72 | ) 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "stat_type", 77 | [ 78 | "standard", 79 | "shooting", 80 | "passing", 81 | "passing_types", 82 | "goal_shot_creation", 83 | "defense", 84 | "possession", 85 | "playing_time", 86 | "misc", 87 | "keeper", 88 | "keeper_adv", 89 | ], 90 | ) 91 | def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 92 | assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) 93 | 94 | 95 | def test_read_schedule(fbref_ligue1: FBref) -> None: 96 | assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame) 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "stat_type", 101 | [ 102 | "summary", 103 | "keepers", 104 | "passing", 105 | "passing_types", 106 | "defense", 107 | "possession", 108 | "misc", 109 | ], 110 | ) 111 | def test_read_player_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: 112 | assert isinstance( 113 | fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame 114 | ) 115 | 116 | 117 | def test_read_events(fbref_ligue1: FBref) -> None: 118 | assert isinstance(fbref_ligue1.read_events(match_id="796787da"), pd.DataFrame) 119 | 120 | 121 | def test_read_events_yellow_for_manager() -> None: 122 | """When a yellow card given to the manager, there is no tag.""" 123 | fbref_laliga = sd.FBref("ESP-La Liga", "23-24") 124 | events = fbref_laliga.read_events(match_id="e8867e6b") 125 | yellow_cards = events[events["event_type"] == "yellow_card"] 126 | assert "Pepe Bordalás" in yellow_cards["player1"].tolist() 127 | 128 | 129 | def test_missing_events() -> None: 130 | fbref = sd.FBref("FRA-Ligue 1", "19-20") 131 | events = 
fbref.read_events(match_id="1d845950") 132 | assert len(events) == 0 133 | 134 | 135 | def test_read_shot_events(fbref_ligue1: FBref) -> None: 136 | assert isinstance(fbref_ligue1.read_shot_events(match_id="796787da"), pd.DataFrame) 137 | 138 | 139 | def test_read_lineup(fbref_ligue1: FBref) -> None: 140 | assert isinstance(fbref_ligue1.read_lineup(match_id="796787da"), pd.DataFrame) 141 | 142 | 143 | def test_concat() -> None: 144 | df1 = pd.DataFrame( 145 | columns=pd.MultiIndex.from_tuples( 146 | [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")] 147 | ) 148 | ) 149 | df2 = pd.DataFrame( 150 | columns=pd.MultiIndex.from_tuples( 151 | [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")] 152 | ) 153 | ) 154 | df3 = pd.DataFrame( 155 | columns=pd.MultiIndex.from_tuples( 156 | [("Unnamed: a", "player"), ("Goals", "Unnamed: b"), ("Performance", "Assists")] 157 | ) 158 | ) 159 | res = _concat([df1, df2, df3], key=["player"]) 160 | assert res.columns.equals( 161 | pd.MultiIndex.from_tuples( 162 | [("player", ""), ("Performance", "Goals"), ("Performance", "Assists")] 163 | ) 164 | ) 165 | res = _concat([df3, df1, df2], key=["player"]) 166 | assert res.columns.equals( 167 | pd.MultiIndex.from_tuples( 168 | [("player", ""), ("Performance", "Goals"), ("Performance", "Assists")] 169 | ) 170 | ) 171 | 172 | 173 | def test_concat_with_forfeited_game() -> None: 174 | fbref_seriea = sd.FBref(["ITA-Serie A"], 2021) 175 | df_1 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "34e95e35"]) 176 | df_2 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "a3e10e13"]) 177 | assert isinstance(df_1, pd.DataFrame) 178 | assert isinstance(df_2, pd.DataFrame) 179 | # Regardless of the order in which the matches are read, the result should be the same. 
180 | assert df_1.columns.equals(df_2.columns) 181 | 182 | 183 | def test_combine_big5() -> None: 184 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 185 | assert len(fbref_bigfive.read_leagues(split_up_big5=False)) == 1 186 | assert len(fbref_bigfive.read_seasons(split_up_big5=False)) == 1 187 | assert len(fbref_bigfive.read_leagues(split_up_big5=True)) == 5 188 | assert len(fbref_bigfive.read_seasons(split_up_big5=True)) == 5 189 | # by default, split_up_big5 should be False 190 | assert len(fbref_bigfive.read_leagues()) == 1 191 | assert len(fbref_bigfive.read_seasons()) == 1 192 | 193 | 194 | @pytest.mark.parametrize( 195 | "stat_type", 196 | [ 197 | "standard", 198 | "keeper", 199 | # "keeper_adv", disabled because of inconsistent data on FBref 200 | "shooting", 201 | "passing", 202 | "passing_types", 203 | "goal_shot_creation", 204 | "defense", 205 | "possession", 206 | "playing_time", 207 | "misc", 208 | ], 209 | ) 210 | def test_combine_big5_team_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 211 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 212 | ligue1 = fbref_ligue1.read_team_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 213 | bigfive = fbref_bigfive.read_team_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 214 | cols = _concat([ligue1, bigfive], key=["season"]).columns 215 | ligue1.columns = cols 216 | bigfive.columns = cols 217 | pd.testing.assert_frame_equal( 218 | ligue1, 219 | bigfive, 220 | ) 221 | 222 | 223 | @pytest.mark.parametrize( 224 | "stat_type", 225 | [ 226 | "standard", 227 | "shooting", 228 | "passing", 229 | "passing_types", 230 | "goal_shot_creation", 231 | "defense", 232 | "possession", 233 | "playing_time", 234 | "misc", 235 | "keeper", 236 | "keeper_adv", 237 | ], 238 | ) 239 | def test_combine_big5_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 240 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 241 | ligue1 = 
fbref_ligue1.read_player_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 242 | bigfive = fbref_bigfive.read_player_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 243 | cols = _concat([ligue1, bigfive], key=["season"]).columns 244 | ligue1.columns = cols 245 | bigfive.columns = cols 246 | pd.testing.assert_frame_equal( 247 | ligue1, 248 | bigfive, 249 | ) 250 | -------------------------------------------------------------------------------- /tests/test_FotMob.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FotMob.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # import soccerdata as sd 7 | from soccerdata.fotmob import FotMob 8 | 9 | # Unittests ------------------------------------------------------------------- 10 | 11 | 12 | @pytest.mark.fails_gha() 13 | def test_read_league_table(fotmob_laliga: FotMob) -> None: 14 | assert isinstance(fotmob_laliga.read_league_table(), pd.DataFrame) 15 | 16 | 17 | @pytest.mark.fails_gha() 18 | def test_read_schedule(fotmob_laliga: FotMob) -> None: 19 | assert isinstance(fotmob_laliga.read_schedule(), pd.DataFrame) 20 | 21 | 22 | @pytest.mark.fails_gha() 23 | @pytest.mark.parametrize( 24 | "stat_type", 25 | ["Top stats", "Shots", "Expected goals (xG)", "Passes", "Defence", "Duels", "Discipline"], 26 | ) 27 | def test_read_team_match_stats(fotmob_laliga: FotMob, stat_type: str) -> None: 28 | assert isinstance( 29 | fotmob_laliga.read_team_match_stats(stat_type, team="Valencia"), pd.DataFrame 30 | ) 31 | 32 | 33 | @pytest.mark.fails_gha() 34 | def test_read_team_match_stats_alt_names(fotmob_laliga: FotMob) -> None: 35 | # Test with Fotmob team name 36 | assert isinstance( 37 | fotmob_laliga.read_team_match_stats(stat_type="Top stats", team="Valencia"), pd.DataFrame 38 | ) 39 | # Test with standardized team name 40 | assert isinstance( 41 | fotmob_laliga.read_team_match_stats(stat_type="Top stats", team="Valencia CF"), 42 | 
pd.DataFrame, 43 | ) 44 | -------------------------------------------------------------------------------- /tests/test_Integration.py: -------------------------------------------------------------------------------- 1 | """Integration tests for soccerdata package.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # TODO: integration tests 9 | # Names of common leagues equal for all classes 10 | # Number of clubs equal for all common leagues over classes 11 | # Clubnames equal for all common leagues over classes 12 | # Number of games equal for all common leagues/seasons over classes 13 | # Scores per game equal for all common leagues over classes 14 | 15 | 16 | @pytest.mark.e2e() 17 | def test_mh_vs_elo(): 18 | """We should be able to retrieve the Elo history for all teams in these leagues.""" 19 | league_sel = [ 20 | "ENG-Premier League", 21 | "ESP-La Liga", 22 | "FRA-Ligue 1", 23 | "GER-Bundesliga", 24 | "ITA-Serie A", 25 | ] 26 | 27 | mh = foo.MatchHistory(leagues=league_sel, seasons="1819") 28 | mh_games = mh.read_games() 29 | 30 | elo = foo.ClubElo() 31 | elo_hist = pd.concat([elo.read_team_history(team) for team in set(mh_games["home_team"])]) 32 | 33 | assert set(mh_games["home_team"]) - set(elo_hist["team"]) == set() 34 | -------------------------------------------------------------------------------- /tests/test_MatchHistory.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.MatchHistory.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.match_history import MatchHistory 6 | 7 | 8 | def test_read_games(match_epl_5y: MatchHistory) -> None: 9 | """It should return a DataFrame with all games from the selected leagues and seasons.""" 10 | df = match_epl_5y.read_games() 11 | assert isinstance(df, pd.DataFrame) 12 | assert len(df.index.get_level_values("season").unique()) == 5 13 | assert len(df) > 0 14 | assert not any("" in c for c in 
df.columns) 15 | -------------------------------------------------------------------------------- /tests/test_SoFIFA.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.SoFIFA.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.sofifa import SoFIFA 6 | 7 | 8 | def test_read_players(sofifa_bundesliga: SoFIFA) -> None: 9 | """It should use the replacement names from teamname_replacements.json.""" 10 | assert isinstance(sofifa_bundesliga.read_players(team="FC Bayern München"), pd.DataFrame) 11 | 12 | 13 | def test_read_players_replacement(sofifa_bundesliga: SoFIFA) -> None: 14 | """It should use the replacement names from teamname_replacements.json.""" 15 | assert isinstance(sofifa_bundesliga.read_players(team="FC Bayern Munich"), pd.DataFrame) 16 | 17 | 18 | def test_read_team_ratings(sofifa_bundesliga: SoFIFA) -> None: 19 | """It should return a dataframe with the team ratings.""" 20 | assert isinstance(sofifa_bundesliga.read_team_ratings(), pd.DataFrame) 21 | 22 | 23 | def test_read_player_ratings(sofifa_bundesliga: SoFIFA) -> None: 24 | """It should return a dataframe with the player ratings.""" 25 | assert isinstance(sofifa_bundesliga.read_player_ratings(player=189596), pd.DataFrame) 26 | -------------------------------------------------------------------------------- /tests/test_Sofascore.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.Sofascore.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.sofascore import Sofascore 6 | 7 | 8 | def test_read_leagues(sofascore_epl_1516: Sofascore) -> None: 9 | leagues = sofascore_epl_1516.read_leagues() 10 | assert isinstance(leagues, pd.DataFrame) 11 | assert len(leagues) == 1 12 | 13 | 14 | def test_read_seasons(sofascore_epl_1516: Sofascore) -> None: 15 | seasons = sofascore_epl_1516.read_seasons() 16 | assert isinstance(seasons, pd.DataFrame) 17 | assert len(seasons) 
== 1 18 | 19 | 20 | def test_read_seasons_empty() -> None: 21 | sofascore_instance = Sofascore("ENG-Premier League", "90-91") 22 | seasons = sofascore_instance.read_seasons() 23 | assert isinstance(seasons, pd.DataFrame) 24 | assert len(seasons) == 0 25 | 26 | 27 | def test_read_schedule(sofascore_epl_1516: Sofascore) -> None: 28 | schedule = sofascore_epl_1516.read_schedule() 29 | assert isinstance(schedule, pd.DataFrame) 30 | assert len(schedule) == 380 31 | 32 | 33 | def test_read_league_table(sofascore_epl_1516: Sofascore) -> None: 34 | league_table = sofascore_epl_1516.read_league_table() 35 | assert isinstance(league_table, pd.DataFrame) 36 | assert len(league_table) == 20 37 | -------------------------------------------------------------------------------- /tests/test_Understat.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.Understat.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from soccerdata.understat import Understat 7 | 8 | 9 | def test_read_leagues(understat_epl_1516: Understat) -> None: 10 | leagues = understat_epl_1516.read_leagues() 11 | assert isinstance(leagues, pd.DataFrame) 12 | assert len(leagues) == 1 13 | 14 | 15 | def test_read_seasons(understat_epl_1516: Understat) -> None: 16 | seasons = understat_epl_1516.read_seasons() 17 | assert isinstance(seasons, pd.DataFrame) 18 | assert len(seasons) == 1 19 | 20 | 21 | def test_read_seasons_empty(understat_epl_9091: Understat) -> None: 22 | seasons = understat_epl_9091.read_seasons() 23 | assert isinstance(seasons, pd.DataFrame) 24 | assert len(seasons) == 0 25 | 26 | 27 | def test_read_schedule(understat_epl_1516: Understat) -> None: 28 | schedule = understat_epl_1516.read_schedule() 29 | assert isinstance(schedule, pd.DataFrame) 30 | assert len(schedule) == 380 31 | 32 | 33 | def test_read_team_match_stats(understat_epl_1516: Understat) -> None: 34 | team_match_stats = 
understat_epl_1516.read_team_match_stats() 35 | assert isinstance(team_match_stats, pd.DataFrame) 36 | assert len(team_match_stats) == 380 37 | 38 | 39 | def test_read_player_season_stats(understat_epl_1516: Understat) -> None: 40 | player_season_stats = understat_epl_1516.read_player_season_stats() 41 | assert isinstance(player_season_stats, pd.DataFrame) 42 | assert len(player_season_stats) == 550 43 | 44 | 45 | def test_read_player_match_stats(understat_epl_1516: Understat) -> None: 46 | player_match_stats = understat_epl_1516.read_player_match_stats() 47 | assert isinstance(player_match_stats, pd.DataFrame) 48 | 49 | 50 | def test_read_player_match_stats_new_columns(understat_epl_1516: Understat) -> None: 51 | player_match_stats = understat_epl_1516.read_player_match_stats() 52 | assert "assists" in player_match_stats.columns 53 | assert "key_passes" in player_match_stats.columns 54 | assert "yellow_cards" in player_match_stats.columns 55 | assert "red_cards" in player_match_stats.columns 56 | 57 | 58 | def test_read_shots(understat_epl_1516: Understat) -> None: 59 | shots_all = understat_epl_1516.read_shot_events() 60 | assert isinstance(shots_all, pd.DataFrame) 61 | assert len(shots_all) == 9_819 62 | shots_utd_bou = understat_epl_1516.read_shot_events(460) 63 | assert isinstance(shots_utd_bou, pd.DataFrame) 64 | assert len(shots_utd_bou) == 20 65 | with pytest.raises( 66 | ValueError, match="No matches found with the given IDs in the selected seasons." 
67 | ): 68 | understat_epl_1516.read_shot_events(42) 69 | -------------------------------------------------------------------------------- /tests/test_Whoscored.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.WhoScored.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | @pytest.mark.fails_gha() 10 | def test_whoscored_missing_players(whoscored): 11 | assert isinstance(whoscored.read_missing_players(1485184), pd.DataFrame) 12 | 13 | 14 | @pytest.mark.fails_gha() 15 | def test_whoscored_events(whoscored): 16 | assert isinstance(whoscored.read_events(1485184), pd.DataFrame) 17 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._common.""" 2 | 3 | import json 4 | from datetime import datetime, timezone 5 | 6 | import pandas as pd 7 | import pytest 8 | import time_machine 9 | 10 | import soccerdata 11 | from soccerdata._common import ( 12 | BaseRequestsReader, 13 | SeasonCode, 14 | add_alt_team_names, 15 | add_standardized_team_name, 16 | make_game_id, 17 | standardize_colnames, 18 | ) 19 | 20 | # _download_and_save 21 | 22 | 23 | def test_download_and_save_not_cached(tmp_path): 24 | reader = BaseRequestsReader() 25 | url = "http://api.clubelo.com/Barcelona" 26 | filepath = tmp_path / "Barcelona.csv" 27 | data = reader._download_and_save(url, filepath) 28 | assert isinstance(pd.read_csv(data), pd.DataFrame) 29 | 30 | 31 | def test_download_and_save_cached(tmp_path): 32 | reader = BaseRequestsReader() 33 | url = "http://api.clubelo.com/Barcelona" 34 | filepath = tmp_path / "Barcelona.csv" 35 | data = reader._download_and_save(url, filepath) 36 | data = reader._download_and_save(url, filepath) 37 | assert isinstance(pd.read_csv(data), 
pd.DataFrame) 38 | 39 | 40 | def test_download_and_save_no_cache(tmp_path): 41 | reader = BaseRequestsReader(no_cache=True) 42 | url = "http://api.clubelo.com/Barcelona" 43 | filepath = tmp_path / "Barcelona.csv" 44 | filepath.write_text("bogus") 45 | data = reader._download_and_save(url, filepath) 46 | assert len(pd.read_csv(data)) > 1 47 | 48 | 49 | def test_download_and_save_no_store_no_filepath(): 50 | reader = BaseRequestsReader(no_store=True) 51 | url = "http://api.clubelo.com/Barcelona" 52 | data = reader._download_and_save(url, filepath=None) 53 | assert isinstance(pd.read_csv(data), pd.DataFrame) 54 | 55 | 56 | def test_download_and_save_no_cache_filepath(tmp_path): 57 | reader = BaseRequestsReader(no_store=True) 58 | url = "http://api.clubelo.com/Barcelona" 59 | filepath = tmp_path / "Barcelona.csv" 60 | data = reader._download_and_save(url, filepath) 61 | assert isinstance(pd.read_csv(data), pd.DataFrame) 62 | assert not filepath.exists() 63 | 64 | 65 | def test_download_and_save_variable_no_store_no_filepath(): 66 | reader = BaseRequestsReader(no_store=True) 67 | url = "https://understat.com/" 68 | data = reader._download_and_save(url, filepath=None, var="statData") 69 | stats = json.load(data) 70 | assert isinstance(stats, dict) 71 | assert "statData" in stats 72 | 73 | 74 | # def test_download_and_save_requests_tor(tmp_path): 75 | # url = "https://check.torproject.org/api/ip" 76 | # reader = BaseRequestsReader(proxy=None) 77 | # ip_without_proxy = reader.get(url, tmp_path / "myip.txt") 78 | # ip_without_proxy = json.load(ip_without_proxy) 79 | # proxy_reader = BaseRequestsReader(proxy="tor") 80 | # ip_with_proxy = proxy_reader.get(url, tmp_path / "myproxyip.txt") 81 | # ip_with_proxy = json.load(ip_with_proxy) 82 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 83 | # assert ip_with_proxy["IsTor"] 84 | # 85 | # 86 | # def test_download_and_save_selenium_tor(tmp_path): 87 | # url = "https://check.torproject.org/api/ip" 88 | # reader = 
BaseSeleniumReader(proxy=None).get(url, tmp_path / "myip.txt") 89 | # ip_without_proxy = html.parse(reader).xpath("//pre")[0].text 90 | # ip_without_proxy = json.loads(ip_without_proxy) 91 | # proxy_reader = BaseSeleniumReader(proxy="tor").get(url, tmp_path / "myproxyip.txt") 92 | # ip_with_proxy = html.parse(proxy_reader).xpath("//pre")[0].text 93 | # ip_with_proxy = json.loads(ip_with_proxy) 94 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 95 | # assert ip_with_proxy["IsTor"] 96 | # 97 | 98 | # make_game_id 99 | 100 | 101 | def test_make_game_id(): 102 | s = pd.Series( 103 | { 104 | "date": datetime(1993, 7, 30, tzinfo=timezone.utc), 105 | "home_team": "Barcelona", 106 | "away_team": "Real Madrid", 107 | } 108 | ) 109 | game_id = make_game_id(s) 110 | assert game_id == "1993-07-30 Barcelona-Real Madrid" 111 | 112 | 113 | # add_alt_team_names 114 | 115 | 116 | def test_add_alt_team_names(): 117 | # "Valencia" is replaced by "Valencia CF" 118 | assert add_alt_team_names("Valencia CF") == {"Valencia", "Valencia CF"} 119 | # "Real Madrid" is not replaced 120 | assert add_alt_team_names("Real Madrid") == {"Real Madrid"} 121 | 122 | 123 | def test_add_standardize_team_name(): 124 | # "Valencia" is replaced by "Valencia CF" 125 | assert add_standardized_team_name("Valencia") == {"Valencia", "Valencia CF"} 126 | # "Real Madrid" is not replaced 127 | assert add_standardized_team_name("Real Madrid") == {"Real Madrid"} 128 | 129 | 130 | # standardize_colnames 131 | 132 | 133 | def test_standardize_colnames(): 134 | df = pd.DataFrame( 135 | columns=[ 136 | "First Test", 137 | "SecondTest", 138 | "thirdTest", 139 | "Fourthtest", 140 | "Fifth-test", 141 | "TestSix", 142 | ] 143 | ) 144 | df = standardize_colnames( 145 | df, cols=["First Test", "SecondTest", "thirdTest", "Fourthtest", "Fifth-test"] 146 | ) 147 | assert df.columns.tolist() == [ 148 | "first_test", 149 | "second_test", 150 | "third_test", 151 | "fourthtest", 152 | "fifth_test", 153 | "TestSix", 154 | ] 
155 | 156 | 157 | # is_complete 158 | 159 | 160 | def test_is_complete(): 161 | reader = BaseRequestsReader(no_store=True) 162 | with time_machine.travel(datetime(2020, 12, 25, 1, 24, tzinfo=timezone.utc)): 163 | assert reader._is_complete("ENG-Premier League", "1920") 164 | assert not reader._is_complete("ENG-Premier League", "2021") 165 | with time_machine.travel(datetime(2021, 2, 25, 1, 24, tzinfo=timezone.utc)): 166 | assert reader._is_complete("ENG-Premier League", "1920") 167 | assert not reader._is_complete("ENG-Premier League", "2021") 168 | with time_machine.travel(datetime(2021, 7, 1, 1, 24, tzinfo=timezone.utc)): 169 | assert reader._is_complete("ENG-Premier League", "1920") 170 | assert reader._is_complete("ENG-Premier League", "2021") 171 | assert not reader._is_complete("ENG-Premier League", "2122") 172 | 173 | 174 | def test_is_complete_default_value(mocker): 175 | mocker.patch.object(soccerdata._common, "LEAGUE_DICT", {"FAKE-Dummy League": {}}) 176 | reader = BaseRequestsReader(no_store=True) 177 | with time_machine.travel(datetime(2020, 12, 25, 1, 24, tzinfo=timezone.utc)): 178 | assert reader._is_complete("FAKE-Dummy League", "1920") 179 | 180 | 181 | def test_is_complete_undefined_league(mocker): # noqa: ARG001 182 | reader = BaseRequestsReader(no_store=True) 183 | with pytest.raises( 184 | ValueError, 185 | match="Invalid league 'FAKE-Dummy League'", 186 | ): 187 | reader._is_complete("FAKE-Dummy League", "1920") 188 | 189 | 190 | # Season codes 191 | def test_season_pattern1a(): 192 | assert SeasonCode.MULTI_YEAR.parse("9495") == "9495" 193 | assert SeasonCode.SINGLE_YEAR.parse("9495") == "1994" 194 | 195 | 196 | def test_season_pattern1a_warn(): 197 | with pytest.warns(UserWarning) as record: 198 | assert SeasonCode.MULTI_YEAR.parse("2021") == "2021" 199 | 200 | # check that only one warning was raised 201 | assert len(record) == 1 202 | # check that the message matches 203 | msg = 'Season id "2021" is ambiguous: interpreting as "20-21"' 204 | 
assert record[0].message.args[0] == msg # type: ignore 205 | 206 | 207 | def test_season_pattern1b(): 208 | my_season = check_post = "1998" 209 | assert SeasonCode.MULTI_YEAR.parse(my_season) == "9899" 210 | assert SeasonCode.SINGLE_YEAR.parse(my_season) == "1998" 211 | assert my_season == check_post 212 | 213 | 214 | def test_season_pattern1c(): 215 | assert SeasonCode.MULTI_YEAR.parse("1999") == "9900" 216 | assert SeasonCode.SINGLE_YEAR.parse("1999") == "1999" 217 | 218 | 219 | def test_season_pattern2(): 220 | assert SeasonCode.MULTI_YEAR.parse("11") == "1112" 221 | assert SeasonCode.SINGLE_YEAR.parse("11") == "2011" 222 | assert SeasonCode.MULTI_YEAR.parse("99") == "9900" 223 | assert SeasonCode.SINGLE_YEAR.parse("99") == "1999" 224 | 225 | 226 | def test_season_pattern3(): 227 | assert SeasonCode.MULTI_YEAR.parse("2011-2012") == "1112" 228 | assert SeasonCode.SINGLE_YEAR.parse("2011-2012") == "2011" 229 | assert SeasonCode.MULTI_YEAR.parse("1999-2000") == "9900" 230 | assert SeasonCode.SINGLE_YEAR.parse("1999-2000") == "1999" 231 | 232 | 233 | def test_season_pattern4(): 234 | assert SeasonCode.MULTI_YEAR.parse("2011-12") == "1112" 235 | assert SeasonCode.SINGLE_YEAR.parse("2011-12") == "2011" 236 | assert SeasonCode.MULTI_YEAR.parse("1999-00") == "9900" 237 | assert SeasonCode.SINGLE_YEAR.parse("1999-00") == "1999" 238 | 239 | 240 | def test_season_pattern5(): 241 | assert SeasonCode.MULTI_YEAR.parse("13-14") == "1314" 242 | assert SeasonCode.SINGLE_YEAR.parse("13-14") == "2013" 243 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._config.""" 2 | 3 | import json 4 | import logging 5 | from importlib import reload 6 | 7 | from soccerdata import _config as conf 8 | 9 | 10 | def test_env_soccerdata_dir(monkeypatch, tmp_path): 11 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 12 | 
reload(conf) 13 | assert tmp_path == conf.BASE_DIR 14 | 15 | 16 | def test_env_nocache(monkeypatch): 17 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "t") 18 | reload(conf) 19 | assert conf.NOCACHE is True 20 | 21 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "true") 22 | reload(conf) 23 | assert conf.NOCACHE is True 24 | 25 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "f") 26 | reload(conf) 27 | assert conf.NOCACHE is False 28 | 29 | 30 | def test_env_nostore(monkeypatch): 31 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "t") 32 | reload(conf) 33 | assert conf.NOSTORE is True 34 | 35 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "true") 36 | reload(conf) 37 | assert conf.NOSTORE is True 38 | 39 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "f") 40 | reload(conf) 41 | assert conf.NOSTORE is False 42 | 43 | 44 | def test_env_loglevel(monkeypatch): 45 | monkeypatch.setenv("SOCCERDATA_LOGLEVEL", "DEBUG") 46 | reload(conf) 47 | assert conf.logger.level == logging.DEBUG 48 | 49 | 50 | def test_read_teamnname_replacements(monkeypatch, tmp_path): 51 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 52 | # no teamname_replacements.json 53 | reload(conf) 54 | assert {} == conf.TEAMNAME_REPLACEMENTS 55 | fp = tmp_path / "config" / "teamname_replacements.json" 56 | with fp.open("w", encoding="utf8") as outfile: 57 | json.dump({"Celta de Vigo": ["Celta Vigo", "Celta"]}, outfile) 58 | # correctly parse teamname_replacements.json 59 | reload(conf) 60 | assert { 61 | "Celta Vigo": "Celta de Vigo", 62 | "Celta": "Celta de Vigo", 63 | } == conf.TEAMNAME_REPLACEMENTS 64 | 65 | 66 | def test_read_league_dict(monkeypatch, tmp_path): 67 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 68 | # no league_dict.json 69 | reload(conf) 70 | nb_default = len(conf.LEAGUE_DICT) 71 | fp = tmp_path / "config" / "league_dict.json" 72 | with fp.open("w", encoding="utf8") as outfile: 73 | json.dump({"ABC-Fake": {"WhoScored": "Fake"}}, outfile) 74 | # correctly parse league_dict.json 75 | reload(conf) 76 | 
assert len(conf.LEAGUE_DICT) == nb_default + 1 77 | assert conf.LEAGUE_DICT["ABC-Fake"] == {"WhoScored": "Fake"} 78 | --------------------------------------------------------------------------------