├── .bumpversion.cfg ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── labels.yml ├── release-drafter.yml ├── renovate.json └── workflows │ ├── ci.yml │ ├── constraints.txt │ ├── labeler.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CONTRIBUTING.rst ├── LICENSE.rst ├── Makefile ├── README.rst ├── docs ├── _static │ ├── artwork.ai │ ├── default.css │ ├── favicon.ico │ ├── logo.png │ └── logo2.png ├── conf.py ├── contributing.rst ├── datasources │ ├── ClubElo.ipynb │ ├── ESPN.ipynb │ ├── FBref.ipynb │ ├── FotMob.ipynb │ ├── MatchHistory.ipynb │ ├── SoFIFA.ipynb │ ├── Sofascore.ipynb │ ├── Understat.ipynb │ ├── WhoScored.ipynb │ └── index.rst ├── examples │ ├── ClubElo - Evolution of current top teams.ipynb │ ├── MatchHistory - Home advantage.ipynb │ └── index.rst ├── faq.rst ├── howto │ ├── custom-leagues.rst │ ├── index.rst │ └── proxy.rst ├── index.rst ├── intro.rst ├── license.rst ├── output.csv ├── reference │ ├── base.rst │ ├── clubelo.rst │ ├── espn.rst │ ├── fbref.rst │ ├── fotmob.rst │ ├── index.rst │ ├── matchhistory.rst │ ├── sofascore.rst │ ├── sofifa.rst │ ├── understat.rst │ ├── utils.rst │ └── whoscored.rst ├── requirements.txt └── topics │ └── index.rst ├── noxfile.py ├── poetry.lock ├── pyproject.toml ├── soccerdata ├── __init__.py ├── _common.py ├── _config.py ├── clubelo.py ├── espn.py ├── fbref.py ├── fotmob.py ├── match_history.py ├── sofascore.py ├── sofifa.py ├── understat.py └── whoscored.py └── tests ├── __init__.py ├── appdata └── config │ ├── league_dict.json │ └── teamname_replacements.json ├── conftest.py ├── test_ClubElo.py ├── test_ESPN.py ├── test_FBref.py ├── test_FotMob.py ├── test_Integration.py ├── test_MatchHistory.py ├── test_SoFIFA.py ├── test_Sofascore.py ├── test_Understat.py ├── test_Whoscored.py ├── test_common.py └── test_config.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | 
current_version = 1.8.7 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:docs/conf.py] 11 | search = release = "{current_version}" 12 | replace = release = "{new_version}" 13 | 14 | [bumpversion:file:soccerdata/__init__.py] 15 | search = __version__ = "{current_version}" 16 | replace = __version__ = "{new_version}" 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is and the expected behavior. 12 | 13 | **Affected scrapers** 14 | This affects the following scrapers: 15 | - [ ] ClubElo 16 | - [ ] ESPN 17 | - [ ] FBref 18 | - [ ] FotMob 19 | - [ ] Match History 20 | - [ ] Sofascore 21 | - [ ] SoFIFA 22 | - [ ] Understat 23 | - [ ] WhoScored 24 | 25 | **Code example** 26 | A minimal code example that fails. Use `no_cache=True` to make sure an invalid cached file does not cause the bug and make sure you have the latest version of soccerdata installed. 27 | 28 | ```python 29 | import soccerdata as sd 30 | fbref = sd.FBref(leagues="ENG-Premier League", seasons="24/25", no_cache=True) 31 | fbref.read_schedule() 32 | ``` 33 | 34 | **Error message** 35 | 36 | ``` 37 | 38 | ``` 39 | 40 | **Additional context** 41 | Add any other context about the problem here. 42 | 43 | **Contributor Action Plan** 44 | 45 | - [ ] I can fix this issue and will submit a pull request. 46 | - [ ] I’m unsure how to fix this, but I'm willing to work on it with guidance. 47 | - [ ] I’m not able to fix this issue. 
48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Label names are important as they are used by Release Drafter to decide 3 | # where to record them in the changelog or whether to skip them. 4 | # 5 | # The repository labels will be automatically configured using this file and 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
7 | - name: breaking 8 | description: Breaking Changes 9 | color: bfd4f2 10 | - name: bug 11 | description: Something isn't working 12 | color: d73a4a 13 | - name: build 14 | description: Build System and Dependencies 15 | color: bfdadc 16 | - name: ci 17 | description: Continuous Integration 18 | color: 4a97d6 19 | - name: dependencies 20 | description: Pull requests that update a dependency file 21 | color: 0366d6 22 | - name: documentation 23 | description: Improvements or additions to documentation 24 | color: 0075ca 25 | - name: duplicate 26 | description: This issue or pull request already exists 27 | color: cfd3d7 28 | - name: enhancement 29 | description: New feature or request 30 | color: a2eeef 31 | - name: github_actions 32 | description: Pull requests that update Github_actions code 33 | color: "000000" 34 | - name: good first issue 35 | description: Good for newcomers 36 | color: 7057ff 37 | - name: help wanted 38 | description: Extra attention is needed 39 | color: 008672 40 | - name: invalid 41 | description: This doesn't seem right 42 | color: e4e669 43 | - name: performance 44 | description: Performance 45 | color: "016175" 46 | - name: question 47 | description: Further information is requested 48 | color: d876e3 49 | - name: refactoring 50 | description: Refactoring 51 | color: ef67c4 52 | - name: removal 53 | description: Removals and Deprecations 54 | color: 9ae7ea 55 | - name: testing 56 | description: Testing 57 | color: b1fc6f 58 | - name: wontfix 59 | description: This will not be worked on 60 | color: ffffff 61 | - name: common 62 | description: Issue or pull request related to all scrapers 63 | color: F1C40F 64 | - name: ClubElo 65 | description: Issue or pull request related to the ClubElo scraper 66 | color: "273746" 67 | - name: ESPN 68 | description: Issue or pull request related to the ESPN scraper 69 | color: "943126" 70 | - name: FBref 71 | description: Issue or pull request related to the FBref scraper 72 | color: 145A32 73 | - 
name: FiveThirtyEight 74 | description: Issue or pull request related to the FiveThirtyEight scraper 75 | color: E67E22 76 | - name: FotMob 77 | description: Issue or pull request related to the FotMob scraper 78 | color: 228B22 79 | - name: MatchHistory 80 | description: Issue or pull request related to the MatchHistory scraper 81 | color: 1A5276 82 | - name: Sofascore 83 | description: Issue or pull request related to the Sofascore scraper 84 | color: 3740F5 85 | - name: SoFIFA 86 | description: Issue or pull request related to the SoFIFA scraper 87 | color: 138D75 88 | - name: WhoScored 89 | description: Issue or pull request related to the WhoScored scraper 90 | color: 76448A 91 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: ":boom: Breaking Changes" 3 | label: "breaking" 4 | - title: ":rocket: Features" 5 | label: "enhancement" 6 | - title: ":fire: Removals and Deprecations" 7 | label: "removal" 8 | - title: ":beetle: Fixes" 9 | label: "bug" 10 | - title: ":racehorse: Performance" 11 | label: "performance" 12 | - title: ":rotating_light: Testing" 13 | label: "testing" 14 | - title: ":construction_worker: Continuous Integration" 15 | label: "ci" 16 | - title: ":books: Documentation" 17 | label: "documentation" 18 | - title: ":hammer: Refactoring" 19 | label: "refactoring" 20 | - title: ":lipstick: Style" 21 | label: "style" 22 | - title: ":package: Dependencies" 23 | labels: 24 | - "dependencies" 25 | - "build" 26 | change-template: "* $TITLE (#$NUMBER) @$AUTHOR" 27 | template: | 28 | ## Changes 29 | 30 | $CHANGES 31 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":automergePatch"], 3 | "stabilityDays": 7, 4 | 
"addLabels": ["dependencies"], 5 | "pip_requirements": { 6 | "fileMatch": ["constraints.txt"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | tests: 7 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - {python: "3.11", os: "ubuntu-latest", session: "pre-commit"} 14 | - {python: "3.11", os: "ubuntu-latest", session: "mypy"} 15 | - {python: "3.11", os: "ubuntu-latest", session: "tests"} 16 | - {python: "3.10", os: "ubuntu-latest", session: "tests"} 17 | - {python: "3.9", os: "ubuntu-latest", session: "tests"} 18 | - {python: "3.11", os: "windows-latest", session: "tests"} 19 | - {python: "3.11", os: "macos-latest", session: "tests"} 20 | - {python: "3.11", os: "ubuntu-latest", session: "docs-build"} 21 | env: 22 | NOXSESSION: ${{ matrix.session }} 23 | FORCE_COLOR: "1" 24 | PRE_COMMIT_COLOR: "always" 25 | steps: 26 | - name: Check out the repository 27 | uses: actions/checkout@v4.2.2 28 | - name: Restore data cache 29 | if: matrix.session == 'tests' 30 | id: cache-data 31 | uses: actions/cache@v4 32 | with: 33 | path: tests/appdata/data 34 | key: cache-data-${{ runner.os }}-${{ matrix.python }} 35 | - name: Set up Python ${{ matrix.python }} 36 | uses: actions/setup-python@v5.6.0 37 | with: 38 | python-version: ${{ matrix.python }} 39 | - name: Upgrade pip 40 | run: | 41 | pip install --constraint=.github/workflows/constraints.txt pip 42 | pip --version 43 | - name: Upgrade pip in virtual environments 44 | shell: python 45 | run: | 46 | import os 47 | import pip 48 | 49 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 50 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 51 | - name: Install Poetry 52 | run: | 53 | pipx 
install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt poetry 54 | poetry --version 55 | - name: Install Nox 56 | run: | 57 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox 58 | pipx inject --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox nox-poetry 59 | nox --version 60 | - name: Compute pre-commit cache key 61 | if: matrix.session == 'pre-commit' 62 | id: pre-commit-cache 63 | shell: python 64 | run: | 65 | import hashlib 66 | import sys 67 | 68 | python = "py{}.{}".format(*sys.version_info[:2]) 69 | payload = sys.version.encode() + sys.executable.encode() 70 | digest = hashlib.sha256(payload).hexdigest() 71 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 72 | 73 | print("::set-output name=result::{}".format(result)) 74 | - name: Restore pre-commit cache 75 | uses: actions/cache@v4.2.3 76 | if: matrix.session == 'pre-commit' 77 | with: 78 | path: ~/.cache/pre-commit 79 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 80 | restore-keys: | 81 | ${{ steps.pre-commit-cache.outputs.result }}- 82 | - name: Install pandoc 83 | if: matrix.session == 'docs-build' 84 | run: sudo apt-get install -y pandoc 85 | - name: Run Nox 86 | run: | 87 | nox --force-color --python=${{ matrix.python }} 88 | - name: Upload coverage data 89 | if: always() && matrix.session == 'tests' 90 | uses: actions/upload-artifact@v4 91 | with: 92 | name: coverage-data-${{ matrix.os }}-${{ matrix.python }} 93 | path: ".coverage.*" 94 | include-hidden-files: true 95 | - name: Upload documentation 96 | if: matrix.session == 'docs-build' 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: docs 100 | path: docs/_build 101 | coverage: 102 | runs-on: ubuntu-latest 103 | needs: tests 104 | steps: 105 | - name: Check out the repository 106 | uses: actions/checkout@v4.2.2 107 | - name: Set up Python 108 | 
uses: actions/setup-python@v5.6.0 109 | with: 110 | python-version: "3.11" 111 | - name: Upgrade pip 112 | run: | 113 | pip install --constraint=.github/workflows/constraints.txt pip 114 | pip --version 115 | - name: Install Poetry 116 | run: | 117 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt poetry 118 | poetry --version 119 | - name: Install Nox 120 | run: | 121 | pipx install --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox 122 | pipx inject --pip-args=--constraint=${{ github.workspace }}/.github/workflows/constraints.txt nox nox-poetry 123 | nox --version 124 | - name: Download coverage data 125 | uses: actions/download-artifact@v4 126 | with: 127 | pattern: coverage-data-* 128 | merge-multiple: true 129 | - name: Combine coverage data and display human readable report 130 | run: | 131 | nox --force-color --session=coverage 132 | - name: Create coverage report 133 | run: | 134 | nox --force-color --session=coverage -- xml 135 | - name: Upload coverage report 136 | uses: codecov/codecov-action@v5.4.2 137 | -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | pip==25.1.1 2 | nox==2025.5.1 3 | nox-poetry==1.1.0 4 | poetry==1.8.5 5 | virtualenv==20.31.2 6 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: Labeler 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | labeler: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out the repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Run Labeler 17 | uses: crazy-max/ghaction-github-labeler@v5.3.0 18 | with: 19 | skip-delete: true 20 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v4.2.2 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5.6.0 21 | with: 22 | python-version: "3.8" 23 | 24 | - name: Upgrade pip 25 | run: | 26 | pip install --constraint=.github/workflows/constraints.txt pip 27 | pip --version 28 | 29 | - name: Install Poetry 30 | run: | 31 | pip install --constraint=.github/workflows/constraints.txt poetry 32 | poetry --version 33 | 34 | - name: Check if there is a parent commit 35 | id: check-parent-commit 36 | run: | 37 | echo "::set-output name=sha::$(git rev-parse --verify --quiet HEAD^)" 38 | 39 | - name: Detect and tag new version 40 | id: check-version 41 | if: steps.check-parent-commit.outputs.sha 42 | uses: salsify/action-detect-and-tag-new-version@v2.0.3 43 | with: 44 | version-command: | 45 | bash -o pipefail -c "poetry version | awk '{ print \$2 }'" 46 | 47 | - name: Bump version for developmental release 48 | if: "! steps.check-version.outputs.tag" 49 | run: | 50 | poetry version patch && 51 | version=$(poetry version | awk '{ print $2 }') && 52 | poetry version $version.dev.$(date +%s) 53 | 54 | - name: Build package 55 | run: | 56 | poetry build --ansi 57 | 58 | - name: Publish package on PyPI 59 | if: steps.check-version.outputs.tag 60 | uses: pypa/gh-action-pypi-publish@v1.12.4 61 | with: 62 | user: __token__ 63 | password: ${{ secrets.PYPI_TOKEN }} 64 | 65 | - name: Publish package on TestPyPI 66 | if: "! 
steps.check-version.outputs.tag" 67 | uses: pypa/gh-action-pypi-publish@v1.12.4 68 | with: 69 | user: __token__ 70 | password: ${{ secrets.TEST_PYPI_TOKEN }} 71 | repository_url: https://test.pypi.org/legacy/ 72 | 73 | - name: Publish the release notes 74 | uses: release-drafter/release-drafter@v6.1.0 75 | with: 76 | publish: ${{ steps.check-version.outputs.tag != '' }} 77 | tag: ${{ steps.check-version.outputs.tag }} 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | logs 3 | config 4 | notebooks/data 5 | notebooks_priv 6 | 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | 35 | # Translations 36 | *.mo 37 | 38 | # Data 39 | .ipynb_checkpoints 40 | 41 | # Sphinx documentation 42 | docs/_build/ 43 | docs/modules/generated/ 44 | 45 | # Hidden files 46 | .* 47 | 48 | # ...except these 49 | !.gitignore 50 | !.travis.yml 51 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: check-added-large-files 5 | name: Check for added large files 6 | entry: check-added-large-files 7 | language: system 8 | - id: check-toml 9 | name: Check Toml 10 | entry: check-toml 11 | language: system 12 | types: [toml] 13 | - id: check-yaml 14 | name: Check Yaml 15 | entry: check-yaml 16 | language: system 17 | types: [yaml] 18 | - id: darglint 19 | name: darglint 20 | entry: darglint 21 | language: system 22 | 
types: [python] 23 | stages: [manual] 24 | - id: end-of-file-fixer 25 | name: Fix End of Files 26 | entry: end-of-file-fixer 27 | language: system 28 | types: [text] 29 | stages: [commit, push, manual] 30 | - id: pyupgrade 31 | name: pyupgrade 32 | description: Automatically upgrade syntax for newer versions. 33 | entry: pyupgrade 34 | language: system 35 | types: [python] 36 | args: [--py39-plus] 37 | - id: trailing-whitespace 38 | name: Trim Trailing Whitespace 39 | entry: trailing-whitespace-fixer 40 | language: system 41 | types: [text] 42 | stages: [commit, push, manual] 43 | - repo: https://github.com/astral-sh/ruff-pre-commit 44 | # Ruff version. 45 | rev: v0.4.9 46 | hooks: 47 | # Run the linter. 48 | - id: ruff 49 | args: [--fix] 50 | # Run the formatter. 51 | - id: ruff-format 52 | - repo: https://github.com/pre-commit/mirrors-prettier 53 | rev: v3.1.0 54 | hooks: 55 | - id: prettier 56 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Configuration for the documentation build process. 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF 19 | formats: all 20 | 21 | # Optionally set the version of Python and requirements required to build your docs 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Contributor Guide 3 | ================= 4 | 5 | This document lays out guidelines and advice for contributing to this project. 6 | If you're thinking of contributing, please start by reading this document and 7 | getting a feel for how contributing to this project works. If you have any 8 | questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer. 9 | 10 | .. _Pieter Robberechts: https://people.cs.kuleuven.be/~pieter.robberechts/ 11 | 12 | The guide is split into sections based on the type of contribution you're 13 | thinking of making. 14 | 15 | 16 | .. _bug-reports: 17 | 18 | Bug Reports 19 | ----------- 20 | 21 | Bug reports are hugely important! Before you raise one, though, please check 22 | through the `GitHub issues`_, **both open and closed**, to confirm that the bug 23 | hasn't been reported before. 24 | 25 | When filing an issue, make sure to answer these questions: 26 | 27 | - Which Python version are you using? 28 | - Which version of soccerdata are you using? 29 | - What did you do? 30 | - What did you expect to see? 31 | - What did you see instead? 32 | 33 | The best way to get your bug fixed is to provide a test case, 34 | and/or steps to reproduce the issue. 35 | 36 | .. _GitHub issues: https://github.com/probberechts/soccerdata/issues 37 | 38 | 39 | Feature Requests 40 | ---------------- 41 | 42 | If you believe there is a feature missing, feel free to raise a feature 43 | request on the `Issue Tracker`_. 44 | 45 | .. _Issue tracker: https://github.com/probberechts/soccerdata/issues 46 | 47 | 48 | Documentation Contributions 49 | --------------------------- 50 | 51 | Documentation improvements are always welcome! The documentation files live in 52 | the ``docs/`` directory of the codebase. 
They're written in 53 | `reStructuredText`_, and use `Sphinx`_ to generate the full suite of 54 | documentation. 55 | 56 | You do not have to set up a development environment to make small changes to 57 | the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes. 58 | 59 | When contributing documentation, please do your best to follow the style of the 60 | documentation files. This means a soft-limit of 79 characters wide in your text 61 | files and a semi-formal, yet friendly and approachable, prose style. 62 | 63 | When presenting Python code, use single-quoted strings (``'hello'`` instead of 64 | ``"hello"``). 65 | 66 | .. _reStructuredText: http://docutils.sourceforge.net/rst.html 67 | .. _Sphinx: http://sphinx-doc.org/index.html 68 | .. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files 69 | 70 | 71 | Code Contributions 72 | ------------------ 73 | 74 | If you intend to contribute code, do not feel the need to sit on your 75 | contribution until it is perfectly polished and complete. It helps everyone 76 | involved for you to seek feedback as early as you possibly can. Submitting an 77 | early, unfinished version of your contribution for feedback can save you from 78 | putting a lot of work into a contribution that is not suitable for the 79 | project. 80 | 81 | Setting up your development environment 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | 84 | You need Python 3.9+ and the following tools: 85 | 86 | - Poetry_ 87 | - Nox_ 88 | - nox-poetry_ 89 | 90 | Install the package with development requirements: 91 | 92 | .. code:: console 93 | 94 | $ poetry install 95 | $ poetry self add poetry-plugin-export 96 | 97 | You can now run an interactive Python session. 98 | 99 | .. code:: console 100 | 101 | $ poetry run python 102 | 103 | .. _Poetry: https://python-poetry.org/ 104 | .. _Nox: https://nox.thea.codes/ 105 | ..
_nox-poetry: https://nox-poetry.readthedocs.io/ 106 | 107 | Steps for submitting Code 108 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 109 | 110 | When contributing code, you'll want to follow this checklist: 111 | 112 | 1. Fork the repository on GitHub. 113 | 2. Run the tests to confirm they all pass on your system. If they don't, you'll 114 | need to investigate why they fail. If you're unable to diagnose this 115 | yourself, raise it as a bug report. 116 | 3. Write tests that demonstrate your bug or feature. Ensure that they fail. 117 | 4. Make your change. 118 | 5. Run the entire test suite again, confirming that all tests pass *including 119 | the ones you just added*. 120 | 6. Make sure your code follows the code style discussed below. 121 | 7. Send a GitHub Pull Request to the main repository's ``master`` branch. 122 | GitHub Pull Requests are the expected method of code collaboration on this 123 | project. 124 | 125 | Testing the project 126 | ~~~~~~~~~~~~~~~~~~~ 127 | 128 | Run the full test suite: 129 | 130 | .. code:: console 131 | 132 | $ nox 133 | 134 | List the available Nox sessions: 135 | 136 | .. code:: console 137 | 138 | $ nox --list-sessions 139 | 140 | You can also run a specific Nox session. 141 | For example, invoke the unit test suite like this: 142 | 143 | .. code:: console 144 | 145 | $ nox --session=tests 146 | 147 | Unit tests are located in the ``tests`` directory, 148 | and are written using the pytest_ testing framework. 149 | 150 | .. _pytest: https://pytest.readthedocs.io/ 151 | 152 | Code style 153 | ~~~~~~~~~~~ 154 | 155 | The soccerdata codebase uses the `PEP 8`_ code style. In addition, we have 156 | a few guidelines: 157 | 158 | - Line-length can exceed 79 characters, to 100, when convenient. 159 | - Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient. 160 | - Always use double-quoted strings (e.g. ``"#soccer"``), unless a double-quote occurs within the string. 
161 | 162 | To ensure all code conforms to this format. You can format the code using the 163 | pre-commit hooks. 164 | 165 | .. code:: console 166 | 167 | $ nox --session=pre-commit 168 | 169 | Docstrings are to follow the `numpydoc guidelines`_. 170 | 171 | .. _PEP 8: https://pep8.org/ 172 | .. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html 173 | 174 | Submitting changes 175 | ~~~~~~~~~~~~~~~~~~ 176 | 177 | Open a `pull request`_ to submit changes to this project. 178 | 179 | Your pull request needs to meet the following guidelines for acceptance: 180 | 181 | - The Nox test suite must pass without errors and warnings. 182 | - Include unit tests. 183 | - If your changes add functionality, update the documentation accordingly. 184 | 185 | Feel free to submit early, though. We can always iterate on this. 186 | 187 | To run linting and code formatting checks before committing your change, you 188 | can install pre-commit as a Git hook by running the following command: 189 | 190 | .. code:: console 191 | 192 | $ nox --session=pre-commit -- install 193 | 194 | It is recommended to open an issue before starting work on anything. 195 | 196 | .. _pull request: https://github.com/probberechts/soccerdata/pulls 197 | .. github-only 198 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Apache License 2 | ============== 3 | 4 | Copyright (c) 2021 Pieter Robberechts 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | This file incorporates code of the `footballdata`_ software package covered 19 | by the following copyright and permission notice: 20 | 21 | Copyright (c) 2017 skagr 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | .. 
_footballdata: https://github.com/skagr/footballdata 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init test lint pretty 2 | 3 | BIN = .venv/bin/ 4 | CODE = soccerdata 5 | PY = 3.9 6 | 7 | init: 8 | python3 -m venv .venv 9 | poetry install 10 | 11 | test: 12 | nox -rs tests-$(PY) -- $(args) 13 | 14 | mypy: 15 | nox -rs mypy-$(PY) -- $(args) 16 | 17 | lint: 18 | nox -rs pre-commit -- $(args) 19 | 20 | precommit_install: 21 | nox -rs pre-commit -- install 22 | 23 | bump_major: 24 | $(BIN)bumpversion major 25 | 26 | bump_minor: 27 | $(BIN)bumpversion minor 28 | 29 | bump_patch: 30 | $(BIN)bumpversion patch 31 | 32 | clean: 33 | find . -type f -name "*.py[co]" -delete 34 | find . -type d -name "__pycache__" -delete 35 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/probberechts/soccerdata/master/docs/_static/logo2.png 2 | :align: center 3 | :alt: SoccerData 4 | :width: 600px 5 | 6 | .. badges-begin 7 | 8 | |Downloads| |PyPI| |Python Version| |License| |Read the Docs| |Tests| |Codecov| |pre-commit| |Black| 9 | 10 | .. |Downloads| image:: https://static.pepy.tech/badge/soccerdata/month 11 | :target: https://pepy.tech/project/soccerdata 12 | :alt: Downloads Per Month 13 | .. |PyPI| image:: https://img.shields.io/pypi/v/soccerdata.svg 14 | :target: https://pypi.org/project/soccerdata/ 15 | :alt: PyPI 16 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/soccerdata 17 | :target: https://pypi.org/project/soccerdata 18 | :alt: Python Version 19 | .. |License| image:: https://img.shields.io/pypi/l/soccerdata.svg 20 | :target: https://opensource.org/licenses/Apache-2.0 21 | :alt: License 22 | .. 
|Read the Docs| image:: https://img.shields.io/readthedocs/soccerdata/latest.svg?label=Read%20the%20Docs 23 | :target: https://soccerdata.readthedocs.io/ 24 | :alt: Read the documentation at https://soccerdata.readthedocs.io/ 25 | .. |Tests| image:: https://github.com/probberechts/soccerdata/workflows/CI/badge.svg 26 | :target: https://github.com/probberechts/soccerdata/actions?workflow=CI 27 | :alt: Tests 28 | .. |Codecov| image:: https://codecov.io/gh/probberechts/soccerdata/branch/master/graph/badge.svg 29 | :target: https://app.codecov.io/gh/probberechts/soccerdata 30 | :alt: Codecov 31 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 32 | :target: https://github.com/pre-commit/pre-commit 33 | :alt: pre-commit 34 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 35 | :target: https://github.com/psf/black 36 | :alt: Black 37 | 38 | .. badges-end 39 | 40 | SoccerData is a collection of scrapers to gather soccer data from popular 41 | websites, including `Club Elo`_, `ESPN`_, `FBref`_, 42 | `Football-Data.co.uk`_, `FotMob`_, `Sofascore`_, `SoFIFA`_, `Understat`_ and `WhoScored`_. 43 | You get Pandas DataFrames with sensible, matching column names and identifiers 44 | across datasets. Data is downloaded when needed and cached locally. 45 | 46 | .. code:: python 47 | 48 | import soccerdata as sd 49 | 50 | # Create a scraper class instance for the 2020/21 Premier League 51 | fbref = sd.FBref('ENG-Premier League', '2021') 52 | 53 | # Fetch data 54 | games = fbref.read_schedule() 55 | team_season_stats = fbref.read_team_season_stats(stat_type="passing") 56 | player_season_stats = fbref.read_player_season_stats(stat_type="standard") 57 | 58 | To learn how to install, configure and use SoccerData, see the 59 | `Quickstart guide `__. For documentation on each of the 60 | supported data sources, see the `example notebooks `__ 61 | and `API reference `__. 62 | 63 | .. 
_Club Elo: https://www.clubelo.com/ 64 | .. _ESPN: https://www.espn.com/soccer/ 65 | .. _FBref: https://www.fbref.com/en/ 66 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 67 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 68 | .. _FotMob: https://fotmob.com/ 69 | .. _Sofascore: https://www.sofascore.com/ 70 | .. _SoFIFA: https://sofifa.com/ 71 | .. _Understat: https://understat.com/ 72 | .. _WhoScored: https://www.whoscored.com/ 73 | 74 | **Usage Notice:** Please use this web scraping tool responsibly and in compliance with the terms of service of the 75 | websites you intend to scrape. The software is provided as-is, without any warranty or guarantees of any kind. The 76 | developers disclaim any responsibility for misuse, legal consequences, or damages resulting from its use. It is 77 | your responsibility to use the software in accordance with the laws and regulations of your jurisdiction. 78 | 79 | **Contribution and Issues:** As SoccerData relies on web scraping, any changes to the 80 | scraped websites will break the package. Hence, do not expect that all code 81 | will work all the time. If you spot any bugs, then please `fork it and start 82 | a pull request `__. 
83 | -------------------------------------------------------------------------------- /docs/_static/artwork.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/artwork.ai -------------------------------------------------------------------------------- /docs/_static/default.css: -------------------------------------------------------------------------------- 1 | .toctree-l1 a:active, 2 | .toctree-l1 a:hover { 3 | background-color: #676767; 4 | } 5 | 6 | .sidebar-logo { 7 | max-width: 100%; 8 | } 9 | 10 | .sidebar-drawer { 11 | width: calc(50% - 25em); 12 | min-width: 22em; 13 | } 14 | 15 | .sidebar-drawer .sidebar-container { 16 | width: 23em; 17 | } 18 | 19 | li.toctree-l2 { 20 | font-size: 80%; 21 | } 22 | 23 | @media (max-width: 67em) { 24 | .sidebar-drawer { 25 | width: 22em; 26 | left: -22em; 27 | } 28 | .sidebar-drawer .sidebar-container { 29 | width: 22em; 30 | } 31 | li.toctree-l2 { 32 | font-size: 75%; 33 | } 34 | } 35 | 36 | /* autosummary table text */ 37 | article .align-center, 38 | article .align-default { 39 | text-align: left; 40 | } 41 | 42 | dt { 43 | font-weight: bold !important; 44 | } 45 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/logo2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/probberechts/soccerdata/519e2316b441903c759dfcb217421b4410739ba7/docs/_static/logo2.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | 3 | # -- Project information ----------------------------------------------------- 4 | 5 | project = "soccerdata" 6 | author = "Pieter Robberechts" 7 | copyright = f"2021, {author}" # noqa: A001 8 | 9 | # The full version, including alpha/beta/rc tags 10 | release = "1.8.7" 11 | 12 | # -- General configuration ------------------------------------------------ 13 | 14 | extensions = [ 15 | "sphinx.ext.autodoc", 16 | "sphinx.ext.napoleon", 17 | "nbsphinx", 18 | # 'sphinx_gallery.load_style', 19 | ] 20 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 21 | autodoc_typehints = "description" 22 | autodoc_member_order = "bysource" 23 | 24 | # -- Options for HTML output ------------------------------------------------- 25 | 26 | html_theme = "furo" 27 | html_logo = "_static/logo2.png" 28 | html_favicon = "_static/favicon.ico" 29 | html_theme_options = { 30 | "sidebar_hide_name": True, 31 | "light_css_variables": { 32 | "color-brand-primary": "#2F3C7E", 33 | "color-brand-content": "#2F3C7E", 34 | "color-sidebar-background": "#fdf3f4", 35 | # "color-api-name": "#7bb5b2", 36 | # "color-api-pre-name": "#7bb5b2", 37 | }, 38 | "dark_css_variables": { 39 | "color-brand-primary": "#7C4DFF", 40 | "color-brand-content": "#7C4DFF", 41 | }, 42 | } 43 | 44 | html_static_path = ["_static"] 45 | html_css_files = ["default.css"] 46 | 47 | # -- Options for nbsphinx --------------------------------------------------- 48 | 49 | nbsphinx_thumbnails = { 50 | "examples/datasources/ClubElo": "_static/ClubElo-logo.png", 51 | "examples/datasources/ESPN": "_static/ESPN-logo.png", 52 | 
"examples/datasources/WhoScored": "_static/WhoScored-logo.png", 53 | "examples/datasources/FBref": "_static/FBref-logo.png", 54 | "examples/datasources/FiveThirtyEight": "_static/FiveThirtyEight-logo.png", 55 | "examples/datasources/MatchHistory": "_static/FootballData-logo.jpg", 56 | "examples/datasources/SoFIFA": "_static/SoFIFA-logo.png", 57 | "examples/datasources/Understat": "_static/Understat-logo.png", 58 | } 59 | 60 | # This is processed by Jinja2 and inserted before each notebook 61 | nbsphinx_prolog = r""" 62 | {% set docname = 'doc/' + env.doc2path(env.docname, base=None) %} 63 | 64 | .. raw:: html 65 | 66 |
67 | This page was generated from 68 | {{ docname|e }}.
69 | You can download the notebook, 70 | 85 |
86 | 87 | .. raw:: latex 88 | 89 | \nbsphinxstartnotebook{\scriptsize\noindent\strut 90 | \textcolor{gray}{The following section was generated from 91 | \sphinxcode{\sphinxupquote{\strut {{ docname | escape_latex }}}} \dotfill}} 92 | """ # noqa 93 | 94 | # This is processed by Jinja2 and inserted after each notebook 95 | nbsphinx_epilog = r""" 96 | {% set docname = 'doc/' + env.doc2path(env.docname, base=None) %} 97 | .. raw:: latex 98 | 99 | \nbsphinxstopnotebook{\scriptsize\noindent\strut 100 | \textcolor{gray}{\dotfill\ \sphinxcode{\sphinxupquote{\strut 101 | {{ docname | escape_latex }}}} ends here.}} 102 | """ 103 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | .. include:: ../CONTRIBUTING.rst 3 | -------------------------------------------------------------------------------- /docs/datasources/ClubElo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 16 | "env: SOCCERDATA_NOCACHE=True\n", 17 | "env: SOCCERDATA_NOSTORE=True\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "%env SOCCERDATA_NOCACHE=True\n", 24 | "%env SOCCERDATA_NOSTORE=True" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "2454afe6", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import soccerdata as sd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "b5784f2d", 40 | "metadata": {}, 41 | "source": [ 42 | "# ClubElo" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": 
"8dab5be9", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Provides pd.DataFrames from CSV API at http://api.clubelo.com.\n", 56 | "\n", 57 | " Data will be downloaded as necessary and cached locally in\n", 58 | " ``~/soccerdata/data/ClubElo``.\n", 59 | "\n", 60 | " Since the source does not provide league names, this class will not filter\n", 61 | " by league. League names will be inserted from the other sources where\n", 62 | " available. Leagues that are only covered by clubelo.com will have NaN\n", 63 | " values.\n", 64 | "\n", 65 | " Parameters\n", 66 | " ----------\n", 67 | " proxy : 'tor' or or dict or list(dict) or callable, optional\n", 68 | " Use a proxy to hide your IP address. Valid options are:\n", 69 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 70 | " the background on port 9050.\n", 71 | " - dict: A dictionary with the proxy to use. The dict should be\n", 72 | " a mapping of supported protocols to proxy addresses. For example::\n", 73 | "\n", 74 | " {\n", 75 | " 'http': 'http://10.10.1.10:3128',\n", 76 | " 'https': 'http://10.10.1.10:1080',\n", 77 | " }\n", 78 | "\n", 79 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 80 | " be selected from this list after failed requests, allowing rotating\n", 81 | " proxies.\n", 82 | " - callable: A function that returns a valid proxy. 
This function will\n", 83 | " be called after failed requests, allowing rotating proxies.\n", 84 | " no_cache : bool\n", 85 | " If True, will not use cached data.\n", 86 | " no_store : bool\n", 87 | " If True, will not store downloaded data.\n", 88 | " data_dir : Path\n", 89 | " Path to directory where data will be cached.\n", 90 | " \n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "elo = sd.ClubElo()\n", 96 | "print(elo.__doc__)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "3a4c2916", 102 | "metadata": {}, 103 | "source": [ 104 | "## ELO scores for all teams at specified date" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "745be31a", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [ 116 | "
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
rankcountrylevelelofromtoleague
team
Liverpool1.0ENG12047.0838622022-04-202022-04-24ENG-Premier League
Man City2.0ENG12037.0599372022-04-212022-04-23ENG-Premier League
Bayern3.0GER11984.7753912022-04-182022-04-23GER-Bundesliga
Real Madrid4.0ESP11969.5843512022-04-212022-04-26ESP-La Liga
Chelsea5.0ENG11921.1014402022-04-212022-04-24ENG-Premier League
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " rank country level elo from to \\\n", 210 | "team \n", 211 | "Liverpool 1.0 ENG 1 2047.083862 2022-04-20 2022-04-24 \n", 212 | "Man City 2.0 ENG 1 2037.059937 2022-04-21 2022-04-23 \n", 213 | "Bayern 3.0 GER 1 1984.775391 2022-04-18 2022-04-23 \n", 214 | "Real Madrid 4.0 ESP 1 1969.584351 2022-04-21 2022-04-26 \n", 215 | "Chelsea 5.0 ENG 1 1921.101440 2022-04-21 2022-04-24 \n", 216 | "\n", 217 | " league \n", 218 | "team \n", 219 | "Liverpool ENG-Premier League \n", 220 | "Man City ENG-Premier League \n", 221 | "Bayern GER-Bundesliga \n", 222 | "Real Madrid ESP-La Liga \n", 223 | "Chelsea ENG-Premier League " 224 | ] 225 | }, 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "current_elo = elo.read_by_date()\n", 233 | "current_elo.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "246ca661", 239 | "metadata": {}, 240 | "source": [ 241 | "## Full ELO history for one club" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "1c87e14a", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
rankteamcountryleveleloto
from
1939-10-22NaNBarcelonaESP11636.7045901939-12-03
1939-12-04NaNBarcelonaESP11626.1021731939-12-10
1939-12-11NaNBarcelonaESP11636.7282711939-12-17
1939-12-18NaNBarcelonaESP11646.9516601939-12-24
1939-12-25NaNBarcelonaESP11637.4243161939-12-31
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " rank team country level elo to\n", 340 | "from \n", 341 | "1939-10-22 NaN Barcelona ESP 1 1636.704590 1939-12-03\n", 342 | "1939-12-04 NaN Barcelona ESP 1 1626.102173 1939-12-10\n", 343 | "1939-12-11 NaN Barcelona ESP 1 1636.728271 1939-12-17\n", 344 | "1939-12-18 NaN Barcelona ESP 1 1646.951660 1939-12-24\n", 345 | "1939-12-25 NaN Barcelona ESP 1 1637.424316 1939-12-31" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "barca_elo = elo.read_team_history(\"Barcelona\")\n", 355 | "barca_elo.head()" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "soccerdata", 362 | "language": "python", 363 | "name": "soccerdata" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.9.6" 376 | }, 377 | "toc": { 378 | "base_numbering": 1, 379 | "nav_menu": {}, 380 | "number_sections": true, 381 | "sideBar": true, 382 | "skip_h1_title": false, 383 | "title_cell": "Table of Contents", 384 | "title_sidebar": "Contents", 385 | "toc_cell": false, 386 | "toc_position": {}, 387 | "toc_section_display": true, 388 | "toc_window_display": true 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 5 393 | } 394 | -------------------------------------------------------------------------------- /docs/datasources/Sofascore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.set_option('display.max_columns', None)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | 
"execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "env: SOCCERDATA_NOCACHE=True\n", 24 | "env: SOCCERDATA_NOSTORE=True\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 30 | "%env SOCCERDATA_NOCACHE=True\n", 31 | "%env SOCCERDATA_NOSTORE=True" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import soccerdata as sd" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "# Sofascore" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Provides pd.DataFrames from data available at http://www.sofascore.com.\n", 60 | "\n", 61 | " Data will be downloaded as necessary and cached locally in\n", 62 | " ``~/soccerdata/data/Sofascore``.\n", 63 | "\n", 64 | " Parameters\n", 65 | " ----------\n", 66 | " leagues : string or iterable, optional\n", 67 | " IDs of Leagues to include.\n", 68 | " seasons : string, int or list, optional\n", 69 | " Seasons to include. Supports multiple formats.\n", 70 | " Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]\n", 71 | " proxy : 'tor' or dict or list(dict) or callable, optional\n", 72 | " Use a proxy to hide your IP address. Valid options are:\n", 73 | " - 'tor': Uses the Tor network. Tor should be running in\n", 74 | " the background on port 9050.\n", 75 | " - dict: A dictionary with the proxy to use. The dict should be\n", 76 | " a mapping of supported protocols to proxy addresses. For example::\n", 77 | "\n", 78 | " {\n", 79 | " 'http': 'http://10.10.1.10:3128',\n", 80 | " 'https': 'http://10.10.1.10:1080',\n", 81 | " }\n", 82 | "\n", 83 | " - list(dict): A list of proxies to choose from. 
A different proxy will\n", 84 | " be selected from this list after failed requests, allowing rotating\n", 85 | " proxies.\n", 86 | " - callable: A function that returns a valid proxy. This function will\n", 87 | " be called after failed requests, allowing rotating proxies.\n", 88 | " no_cache : bool\n", 89 | " If True, will not use cached data.\n", 90 | " no_store : bool\n", 91 | " If True, will not store downloaded data.\n", 92 | " data_dir : Path\n", 93 | " Path to directory where data will be cached.\n", 94 | " \n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "sofascore = sd.Sofascore(leagues='ESP-La Liga', seasons='2022/2023')\n", 100 | "print(sofascore.__doc__)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Read league table" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
teamMPWDLGFGAGDPts
leagueseason
ESP-La Liga2223Barcelona38284670205088
2223Real Madrid38246875363978
2223Atlético Madrid38238770333777
2223Real Sociedad38218951351671
2223Villarreal381971259401964
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " team MP W D L GF GA GD Pts\n", 229 | "league season \n", 230 | "ESP-La Liga 2223 Barcelona 38 28 4 6 70 20 50 88\n", 231 | " 2223 Real Madrid 38 24 6 8 75 36 39 78\n", 232 | " 2223 Atlético Madrid 38 23 8 7 70 33 37 77\n", 233 | " 2223 Real Sociedad 38 21 8 9 51 35 16 71\n", 234 | " 2223 Villarreal 38 19 7 12 59 40 19 64" 235 | ] 236 | }, 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "league_table = sofascore.read_league_table()\n", 244 | "league_table.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## Read schedule" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 8, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | "
roundweekdatehome_teamaway_teamhome_scoreaway_scoregame_id
leagueseasongame
ESP-La Liga22232022-08-12 Osasuna-SevillaNone12022-08-12 15:00:00OsasunaSevilla2110408559
2022-08-13 Barcelona-Rayo VallecanoNone12022-08-13 15:00:00BarcelonaRayo Vallecano0010408557
2022-08-13 Celta Vigo-EspanyolNone12022-08-13 11:00:00Celta VigoEspanyol2210408645
2022-08-13 Real Valladolid-VillarrealNone12022-08-13 13:00:00Real ValladolidVillarreal0310408563
2022-08-14 Almería-Real MadridNone12022-08-14 16:00:00AlmeríaReal Madrid1210408712
\n", 365 | "
" 366 | ], 367 | "text/plain": [ 368 | " round week \\\n", 369 | "league season game \n", 370 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla None 1 \n", 371 | " 2022-08-13 Barcelona-Rayo Vallecano None 1 \n", 372 | " 2022-08-13 Celta Vigo-Espanyol None 1 \n", 373 | " 2022-08-13 Real Valladolid-Villarreal None 1 \n", 374 | " 2022-08-14 Almería-Real Madrid None 1 \n", 375 | "\n", 376 | " date \\\n", 377 | "league season game \n", 378 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 2022-08-12 15:00:00 \n", 379 | " 2022-08-13 Barcelona-Rayo Vallecano 2022-08-13 15:00:00 \n", 380 | " 2022-08-13 Celta Vigo-Espanyol 2022-08-13 11:00:00 \n", 381 | " 2022-08-13 Real Valladolid-Villarreal 2022-08-13 13:00:00 \n", 382 | " 2022-08-14 Almería-Real Madrid 2022-08-14 16:00:00 \n", 383 | "\n", 384 | " home_team \\\n", 385 | "league season game \n", 386 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla Osasuna \n", 387 | " 2022-08-13 Barcelona-Rayo Vallecano Barcelona \n", 388 | " 2022-08-13 Celta Vigo-Espanyol Celta Vigo \n", 389 | " 2022-08-13 Real Valladolid-Villarreal Real Valladolid \n", 390 | " 2022-08-14 Almería-Real Madrid Almería \n", 391 | "\n", 392 | " away_team \\\n", 393 | "league season game \n", 394 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla Sevilla \n", 395 | " 2022-08-13 Barcelona-Rayo Vallecano Rayo Vallecano \n", 396 | " 2022-08-13 Celta Vigo-Espanyol Espanyol \n", 397 | " 2022-08-13 Real Valladolid-Villarreal Villarreal \n", 398 | " 2022-08-14 Almería-Real Madrid Real Madrid \n", 399 | "\n", 400 | " home_score \\\n", 401 | "league season game \n", 402 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 2 \n", 403 | " 2022-08-13 Barcelona-Rayo Vallecano 0 \n", 404 | " 2022-08-13 Celta Vigo-Espanyol 2 \n", 405 | " 2022-08-13 Real Valladolid-Villarreal 0 \n", 406 | " 2022-08-14 Almería-Real Madrid 1 \n", 407 | "\n", 408 | " away_score game_id \n", 409 | "league season game \n", 410 | "ESP-La Liga 2223 2022-08-12 Osasuna-Sevilla 1 10408559 \n", 411 | " 2022-08-13 
Barcelona-Rayo Vallecano 0 10408557 \n", 412 | " 2022-08-13 Celta Vigo-Espanyol 2 10408645 \n", 413 | " 2022-08-13 Real Valladolid-Villarreal 3 10408563 \n", 414 | " 2022-08-14 Almería-Real Madrid 2 10408712 " 415 | ] 416 | }, 417 | "execution_count": 8, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "schedule = sofascore.read_schedule()\n", 424 | "schedule.head()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [] 433 | } 434 | ], 435 | "metadata": { 436 | "kernelspec": { 437 | "display_name": "srcftbl", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.12.1" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 2 456 | } 457 | -------------------------------------------------------------------------------- /docs/datasources/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _datasources: 3 | 4 | .. currentmodule:: soccerdata 5 | .. highlight:: python 6 | 7 | ======================== 8 | Overview of Data Sources 9 | ======================== 10 | 11 | Currently, the following data sources are supported. 12 | 13 | .. rst-class:: datasources 14 | 15 | ----- 16 | 17 | ClubElo 18 | `URL `__ | 19 | :doc:`Example usage ` | 20 | :doc:`API reference ` 21 | 22 | .. code:: 23 | 24 | from soccerdata import ClubElo 25 | 26 | Team's relative strengths as Elo ratings, for most European leagues. Recalculated after every round, includes history. 
27 | 28 | ----- 29 | 30 | ESPN 31 | `URL `__ | 32 | :doc:`Example usage ` | 33 | :doc:`API reference ` 34 | 35 | .. code:: 36 | 37 | from soccerdata import ESPN 38 | 39 | Historical results, statistics and lineups. 40 | 41 | ----- 42 | 43 | FBref 44 | `URL `__ | 45 | :doc:`Example usage ` | 46 | :doc:`API reference ` 47 | 48 | .. code:: 49 | 50 | from soccerdata import FBref 51 | 52 | Historical results, lineups, and detailed aggregated statistics for teams and individual players based on Opta data. 53 | 54 | ----- 55 | 56 | FotMob 57 | `URL `__ | 58 | :doc:`Example usage ` | 59 | :doc:`API reference ` 60 | 61 | .. code:: 62 | 63 | from soccerdata import FotMob 64 | 65 | Historical results, lineups, and detailed aggregated statistics for teams and individual players based on Opta data. 66 | 67 | ----- 68 | 69 | Football-Data.co.uk 70 | `URL `__ | 71 | :doc:`Example usage ` | 72 | :doc:`API reference ` 73 | 74 | .. code:: 75 | 76 | from soccerdata import MatchHistory 77 | 78 | Historical results, betting odds and match statistics. Level of detail depends on league. 79 | 80 | ----- 81 | 82 | Sofascore 83 | `URL `__ | 84 | :doc:`Example usage ` | 85 | :doc:`API reference ` 86 | 87 | .. code:: 88 | 89 | from soccerdata import Sofascore 90 | 91 | Results, schedules, lineups, and detailed statistics for teams and individual players. 92 | 93 | ----- 94 | 95 | SoFIFA 96 | `URL `__ | 97 | :doc:`Example usage ` | 98 | :doc:`API reference ` 99 | 100 | .. code:: 101 | 102 | from soccerdata import SoFIFA 103 | 104 | Detailed scores on all player's abilities from EA Sports FC. 105 | 106 | ----- 107 | 108 | Understat 109 | `URL `__ | 110 | :doc:`Example usage ` | 111 | :doc:`API reference ` 112 | 113 | .. code:: 114 | 115 | from soccerdata import Understat 116 | 117 | Advanced statistics such as xG, xGBuildup and xGChain, and shot events with associated xG values for the top European leagues. 
118 | 119 | ----- 120 | 121 | WhoScored 122 | `URL `__ | 123 | :doc:`Example usage ` | 124 | :doc:`API reference ` 125 | 126 | .. code:: 127 | 128 | from soccerdata import WhoScored 129 | 130 | Historical results, match preview data and detailed Opta event stream data for major leagues. 131 | 132 | .. toctree:: 133 | :hidden: 134 | 135 | ClubElo 136 | ESPN 137 | FBref 138 | Sofascore 139 | FotMob 140 | MatchHistory 141 | SoFIFA 142 | Understat 143 | WhoScored 144 | -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Data Analysis Examples 3 | ====================== 4 | 5 | Below is a gallery of examples of data analysis using the ``soccerdata`` package. 6 | If you have an example you would like to share, please submit a pull request to the 7 | SoccerData GitHub repository. 8 | 9 | .. nbgallery:: 10 | :glob: 11 | :hidden: 12 | 13 | * 14 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | .. _faq: 2 | 3 | FAQ 4 | ===== 5 | 6 | **Is web scraping legal?** 7 | 8 | Even though web scraping is ubiquitous, its legal status remains unclear. That 9 | is because whether web scraping is legal will depend on many aspects. 10 | It is always best to consult with a lawyer or legal expert to ensure that your 11 | web scraping activities are legal and comply with all applicable laws and 12 | regulations. 13 | 14 | .. Currently, web scraping is not per se prohibited in the European Union but the 15 | .. use of data mining tools is legally risky. 16 | .. 17 | .. The sui generis database right protects the content of a database. What does 18 | .. it mean for web scrapers? That you can scrape such data (and, therefore, copy 19 | ..
and collect contents of the protected database – which falls under the 20 | .. definition of “extraction” under the analyzed Directive) as long as (a) you 21 | .. don’t scrape a ‘substantial part, evaluated qualitatively and/or 22 | .. quantitatively, of the contents of that database’ and you don’t re-use it 23 | .. (meaning basically selling or publishing it); or (b) scraping falls under TDM 24 | .. exception described below; or (c) you’ve received an appropriate licence. 25 | .. 26 | .. However, the TDM exception is limited: the database owners are granted the 27 | .. possibility to restrict the reproduction and extraction of the databases and 28 | .. their content. That restriction must be made in a manner that will allow bots 29 | .. and crawlers etc. to see that restriction (therefore, on a website there 30 | .. should be installed for example a special program communicating visiting 31 | .. scraping programs that scraping is prohibited). Any such restriction should, 32 | .. in any case, permit scraping made for scientific research purposes (see art. 33 | .. 3 (1) and 7(1) of the DSM Directive). 34 | .. 35 | .. But there are more traps on your way. One of them is the possibility of 36 | .. breaching the website’s Terms of Use if they prohibit web scraping. 37 | .. As the situation is highly uncertain, it is advisable to be careful and, if 38 | .. possible, rather avoid breaching terms of use made available in any form. 39 | 40 | .. To minimize concerns, scraping should be discreet, respect websites’ terms of 41 | .. service, check whether sites are using the robots.txt protocol to communicate 42 | .. that scraping is prohibited, avoid personal data scraping and, if it is 43 | .. necessary, make sure no GDPR violations are made and avoid scraping private or 44 | .. classified information. If possible, it would be advisable to get a licence 45 | .. for scraping. 46 | 47 | 48 | **Something doesn’t work** 49 | 50 | 1. 
Have you updated to the newest version of soccerdata? 51 | 2. Clear the cache or run your script without caching enabled. 52 | 3. Does the log produce any warnings that sound like they might be related? 53 | Maybe the data you are looking for is not available or can not be processed 54 | correctly. 55 | 4. Open an issue on GitHub. 56 | -------------------------------------------------------------------------------- /docs/howto/custom-leagues.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | How to add custom leagues 3 | =========================== 4 | 5 | SoccerData has built-in support to scrape data from the top-5 European leagues 6 | and the major international tournaments. The leagues available for each source 7 | can be listed with the :meth:`~soccerdata.FBref.available_leagues` class method. 8 | 9 | .. code:: python 10 | 11 | import soccerdata as sd 12 | sd.FBref.available_leagues() 13 | >>> ['ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A'] 14 | 15 | This documentation explains how to add custom leagues. 16 | 17 | 18 | .. warning:: 19 | 20 | Note that you might encounter errors when trying to scrape data for the 21 | leagues you added yourself. This is because the data provided for these 22 | leagues might have a different structure. If you encounter such an error, 23 | please do not open an issue on GitHub, but try to fix it yourself. 24 | 25 | 26 | 27 | Adding a new league 28 | ------------------- 29 | 30 | Additional leagues can configured in ``SOCCERDATA_DIR/config/league_dict.json``. 31 | This file should contain a mapping between a generic name for the league and 32 | the identifier used internally by each data source (see below) that you want 33 | to support. For example, for the Dutch Eredivisie this would be: 34 | 35 | .. 
code-block:: json 36 | 37 | { 38 | "NED-Eredivisie": { 39 | "ClubElo": "NED_1", 40 | "MatchHistory": "N1", 41 | "SoFIFA": "[Netherlands] Eredivisie", 42 | "FBref": "Eredivisie", 43 | "ESPN": "ned.1", 44 | "FiveThirtyEight": "eredivisie", 45 | "WhoScored": "Netherlands - Eredivisie", 46 | "Sofascore": "Eredivisie", 47 | "season_start": "Aug", 48 | "season_end": "May" 49 | } 50 | } 51 | 52 | The ``season_end`` and ``season_start`` fields are optional. This should be 53 | the month in which the last game and first game of a season are played, 54 | respectively. If they are not provided, June is used as the last month of the 55 | season and July as the first one. 56 | 57 | Now, restart your Python session and check whether it is added to available 58 | leagues by running the command below. 59 | 60 | .. code:: python 61 | 62 | >>> import soccerdata as sd 63 | >>> sd.FBref.available_leagues() 64 | [..., 'NED-Eredivisie', ...] 65 | 66 | 67 | 68 | Internal identifiers 69 | -------------------- 70 | 71 | Below are instructions on how to find the internal identifiers for each data 72 | source. 73 | 74 | **ClubElo** 75 | The internal identifier has the format ``{country_code}_{level}``. To get 76 | the country code, go to https://clubelo.com/, click on the league you want 77 | to add and take the three-letter code in the URL. For example, the URL for 78 | the Dutch Eredivisie is http://clubelo.com/NED which means that the country 79 | identifier is ``NED``. The level is the number of the league, starting with 80 | 1 for the top league. The internal identifier for the Dutch Eredivisie is 81 | therefore ``NED_1``. 82 | 83 | **MatchHistory** 84 | The internal identifier has the format ``{country_code}{level}``. Download 85 | the CSV file corresponding to the league you would like to add 86 | from https://www.football-data.co.uk/data.php and take the value in the 87 | ``Div`` column. 
88 | 89 | **SoFIFA** 90 | The internal identifier has the format ``[{region}] {league name}``. Go to 91 | https://sofifa.com/api/league to get the list of available leagues. The 92 | ``{region}`` corresponds to the ``nationName`` field in the JSON response. The 93 | ``{league name}`` corresponds to the ``value`` field. 94 | 95 | **FBref** 96 | Go to https://fbref.com/en/comps/ and take the value in the ``Competition 97 | Name`` column. 98 | 99 | **ESPN** 100 | The internal identifier has the format ``{country_code}.{level}``. Go to 101 | https://www.espn.com/soccer/competitions, click on the league you want 102 | to add and take the value in the URL after ``/league/_/name/``. 103 | 104 | **FiveThirtyEight** 105 | Go to https://projects.fivethirtyeight.com/soccer-predictions/, select the 106 | relevant league and take the value in the URL after 107 | ``/soccer-predictions/``. 108 | 109 | **WhoScored** 110 | Go to https://www.whoscored.com and use the JavaScript console to get the 111 | value of the ``allRegions`` variable. The internal identifier has the format 112 | ``{region name} - {league name}``. 113 | 114 | **FotMob** 115 | The internal identifier is identical in style to the general format: 116 | ``[{region}]-{league name}``. Go to https://www.fotmob.com/api/allLeagues 117 | to get the list of available leagues. The ``{region}`` corresponds to the 118 | ``ccode`` field in the JSON response. The ``{league name}`` corresponds to 119 | the ``name`` field. 120 | 121 | **Sofascore** 122 | Go to https://api.sofascore.com/api/v1/config/unique-tournaments/EN/football 123 | to get the list of major leagues and tournaments. Access ``uniqueTournaments`` 124 | in the JSON response, and the ``{league name}`` corresponds to the ``name`` 125 | field. 126 | 127 | Troubleshooting 128 | --------------- 129 | 130 | If you add a new league and it doesn't show up in the list of available leagues, 131 | there are a few things you can do to debug the problem. 132 | 133 | 1. 
Make sure to reload the soccerdata module after you modify the 134 | ``league_dict.json`` file. The most straightforward way to do this is to 135 | restart your notebook or Python interpreter. 136 | 137 | 2. Check whether your ``league_dict.json`` file is at the correct location. If 138 | so, you should see this appear in the log messages when importing the 139 | soccerdata library. 140 | 141 | .. code:: python 142 | 143 | >>> import soccerdata as sd 144 | [11/25/22 11:49:12] INFO Custom team name replacements loaded from /teamname_replacements.json. _config.py:83 145 | INFO Custom league dict loaded from /league_dict.json. _config.py:153 146 | 147 | 148 | 3. Check whether the content of your ``league_dict.json`` file is valid JSON. 149 | You can check the file's syntax using Python's built-in ``json.tool`` 150 | module. 151 | 152 | .. code:: sh 153 | 154 | $ cat config/league_dict.json | python -m json.tool 155 | Expecting ',' delimiter: line 1 column 10 (char 9) 156 | -------------------------------------------------------------------------------- /docs/howto/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | How-to Guides 3 | =============== 4 | 5 | Here you'll find short answers to "How do I...?" types of questions. These 6 | how-to guides don't cover topics in depth -- you'll find that material in the 7 | :doc:`/reference/index`. However, these guides will help you quickly 8 | accomplish common tasks. 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | custom-leagues 14 | proxy 15 | -------------------------------------------------------------------------------- /docs/howto/proxy.rst: -------------------------------------------------------------------------------- 1 | How to use a proxy server 2 | ------------------------- 3 | 4 | You can setup a SOCKS5 proxy with Tor. 5 | Checkout the `installation guide`_ on the Tor website for installation 6 | instructions. 
After installing Tor, make sure to start it up before scraping. 7 | This can easily be done by running the ``tor`` command from your terminal (in 8 | a separate window), Tor will start up and run on “localhost:9050” by default. 9 | Once Tor is running, you can enable the extension by setting ``proxy='tor'``. 10 | 11 | .. code:: python 12 | 13 | ws = sd.WhoScored(proxy='tor') 14 | 15 | The code snippet above assumes you have a Tor proxy running on 16 | "localhost:9050". Many distributions indeed default to having a SOCKS proxy 17 | listening on port 9050, but some may not. In particular, the Tor Browser 18 | Bundle defaults to listening on port 9150. You can specify a custom host and 19 | port as 20 | 21 | .. code:: python 22 | 23 | ws = sd.WhoScored(proxy={ 24 | "http": "socks5://127.0.0.1:9150", 25 | "https": "socks5://127.0.0.1:9150", 26 | }) 27 | 28 | 29 | .. _installation guide: https://community.torproject.org/onion-services/setup/install/ 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Welcome to SoccerData's docs! 3 | ============================= 4 | 5 | Release v\ |release|. (``pip install soccerdata``) 6 | 7 | 8 | .. image:: https://pepy.tech/badge/soccerdata/month 9 | :target: https://pepy.tech/project/soccerdata 10 | :alt: SoccerData Downloads Per Month Badge 11 | 12 | .. image:: https://img.shields.io/pypi/l/soccerdata.svg 13 | :target: https://pypi.org/project/soccerdata/ 14 | :alt: License Badge 15 | 16 | .. 
image:: https://img.shields.io/pypi/pyversions/soccerdata.svg 17 | :target: https://pypi.org/project/soccerdata/ 18 | :alt: Python Version Support Badge 19 | 20 | 21 | **SoccerData** is a collection of scrapers to gather soccer data from popular 22 | websites, including `Club Elo`_, `ESPN`_, `FBref`_, `FiveThirtyEight`_, 23 | `Football-Data.co.uk`_, `FotMob`_, `Sofascore`_, `SoFIFA`_, `Understat`_ and `WhoScored`_. 24 | 25 | .. code:: python 26 | 27 | import soccerdata as sd 28 | 29 | # Create a scraper class instance for the 2020/21 Premier League 30 | fbref = sd.FBref('ENG-Premier League', '2021') 31 | 32 | # Fetch data 33 | games = fbref.read_schedule() 34 | team_season_stats = fbref.read_team_season_stats(stat_type="passing") 35 | player_season_stats = fbref.read_player_season_stats(stat_type="standard") 36 | 37 | 38 | ------------------- 39 | 40 | **Main features** 41 | 42 | - Access current and historical soccer fixtures, forecasts, detailed match 43 | stats, event stream data and more. 44 | - All data is provided in the form of Pandas DataFrames with sensible, 45 | matching column names and identifiers across datasets to make working with 46 | the data and combining data from multiple sources easy. 47 | - Data is only downloaded when needed and cached locally to speed up your 48 | analyis scripts. 49 | - Integrates with the `socceraction`_ package to allow analysis of event stream 50 | data. 51 | 52 | Do you like it? :doc:`Let's dive in! ` 53 | 54 | .. toctree:: 55 | :hidden: 56 | :maxdepth: 1 57 | 58 | intro 59 | datasources/index 60 | howto/index 61 | examples/index 62 | reference/index 63 | faq 64 | contributing 65 | License 66 | Changelog 67 | 68 | .. _socceraction: https://socceraction.readthedocs.io/en/latest/documentation/data/opta.html#whoscored 69 | .. _Club Elo: https://www.clubelo.com/ 70 | .. _ESPN: https://www.espn.com/soccer/ 71 | .. _FBref: https://www.fbref.com/en/ 72 | .. 
_FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 73 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 74 | .. _FotMob: https://fotmob.com/ 75 | .. _Sofascore: https://www.sofascore.com/ 76 | .. _SoFIFA: https://sofifa.com/ 77 | .. _Understat: https://understat.com/ 78 | .. _WhoScored: https://www.whoscored.com/ 79 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | =============== 4 | Getting Started 5 | =============== 6 | 7 | New to `soccerdata`? Well, you came to the right place: this tutorial will walk 8 | you through installing, configuring, and using the library. By the end of this 9 | tutorial, you will be able to scrape data from the top-5 European leagues and 10 | use it to create your own data-driven analyses. 11 | 12 | 13 | Installation 14 | ------------ 15 | 16 | SoccerData can be easily installed via `pip `__: 17 | 18 | .. code:: bash 19 | 20 | python3 -m pip install soccerdata 21 | 22 | 23 | Scraping data 24 | ------------- 25 | 26 | Each of the :ref:`supported data sources ` has its corresponding 27 | class for fetching data with a uniform API. For example, the 28 | :class:`~soccerdata.FBref` class is used to fetch data from `fbref.com 29 | `__. 30 | 31 | .. code:: python 32 | 33 | import soccerdata as sd 34 | 35 | # Create scraper class instance 36 | fbref = sd.FBref() 37 | 38 | 39 | Once you have a scraper class instance, you can use it to fetch data. See the 40 | the :ref:`examples ` and :ref:`API reference ` for the full 41 | list of options available for each scraper. For example, to fetch aggregated 42 | shooting stats for all teams: 43 | 44 | .. code:: python 45 | 46 | # Create dataframes 47 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 48 | 49 | 50 | The data is always returned as a convenient Pandas DataFrame. 51 | 52 | .. 
csv-table:: 53 | :file: output.csv 54 | :header-rows: 1 55 | 56 | By default, the data for all available leagues and the five most recent 57 | seasons will be retrieved. However, in most cases, you would want to limit the 58 | data to specific leagues and / or seasons. This can be done by passing a list 59 | of leagues and seasons to the constructor of the scraper class. For example: 60 | 61 | .. code:: python 62 | 63 | # Create scraper class instance filtering on specific leagues and seasons 64 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['1718', '1819']) 65 | # Retrieve data for the specified leagues and seasons 66 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 67 | 68 | 69 | Note that only a limited number of leagues are supported out-of-the-box. The 70 | leagues available for each source can be listed with the 71 | :meth:`~soccerdata.FBref.available_leagues` class method. 72 | 73 | .. code:: python 74 | 75 | sd.FBref.available_leagues() 76 | >>> ['Big 5 European Leagues Combined', 'ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A', 'INT-World Cup', "INT-Women's World Cup"] 77 | 78 | 79 | You can :doc:`add more leagues ` but there are no 80 | guarantees that they will be scraped correctly. 81 | 82 | 83 | Data caching 84 | ------------ 85 | 86 | Data caching is used to speed up the runtime and to prevent exceeding the rate 87 | limit of web servers. By default, all downloaded data is cached to 88 | ``~/soccerdata`` on Linux and Mac OS, and to ``C:\Users\yourusername\soccerdata`` 89 | on Windows. A custom location can be set if desired. You can configure this 90 | using environment variables (see below) or on the level of an individual 91 | scraper by setting the ``data_dir`` parameter when creating the scraper class 92 | instance: 93 | 94 | .. 
code:: python 95 | 96 | # Create scraper class instance with custom caching directory 97 | fbref = sd.FBref(data_dir="/tmp/FBref") 98 | 99 | 100 | This directory can be deleted at any time to reclaim disk space. 101 | However, this also means you will have to redownload the same data again if 102 | you need it, which will lead to reduced performance. 103 | 104 | SoccerData has no knowledge of when the data on the server changes, so it is 105 | up to the user to decide when to refresh the cache. This can be done by 106 | deleting the cache directory or by setting the ``no_cache`` option to ``True`` 107 | when creating the scraper class instance: 108 | 109 | .. code:: python 110 | 111 | # Create scraper class instance which always re-downloads the latest data 112 | fbref = sd.FBref(no_cache=True) 113 | 114 | 115 | Some methods will assume the cache is always out-of-date (for example, when 116 | scraping the fixture of the current season). Typically, these methods will 117 | have a ``force_cache`` option that can be set to ``True`` to force the cached 118 | data to be used. For example: 119 | 120 | .. code:: python 121 | 122 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['2324']) 123 | fbref.read_schedule(force_cache=True) 124 | 125 | 126 | Caching can also be disabled entirely by setting the ``no_store`` option to 127 | ``True`` when creating the scraper class instance. However, it should almost 128 | always be left enabled. 129 | 130 | .. code:: python 131 | 132 | # Create scraper class instance with caching disabled 133 | fbref = sd.FBref(no_store=True) 134 | 135 | 136 | Global configuration 137 | --------------------- 138 | 139 | Several settings can be configured globally using the following environment 140 | variables: 141 | 142 | ``SOCCERDATA_DIR`` 143 | The directory where the downloaded data is cached and where logs are 144 | stored. 
By default, all data is stored to ``~/soccerdata`` on Linux / Mac 145 | OS and ``C:\Users\yourusername\soccerdata`` on Windows. 146 | ``SOCCERDATA_NOCACHE`` 147 | If set to "true", no cached data is returned. Note that no-cache does not 148 | mean "don't cache". All downloaded data is still cached and overwrites 149 | existing caches. If the sense of "don't cache" that you want is actually 150 | "don't store", then ``SOCCERDATA_NOSTORE`` is the option to use. By 151 | default, data is retrieved from the cache. 152 | ``SOCCERDATA_NOSTORE`` 153 | If set to "true", no data is stored. By default, data is cached. 154 | ``SOCCERDATA_MAXAGE`` 155 | The maximum age of cached data in seconds. If the cached data is older 156 | than this, it will be re-downloaded. By default, this is set to infinity. 157 | ``SOCCERDATA_LOGLEVEL`` 158 | The level of logging to use. By default, this is set to "INFO". 159 | 160 | Example: 161 | 162 | .. code-block:: bash 163 | 164 | # bash 165 | export SOCCERDATA_DIR="~/soccerdata" 166 | export SOCCERDATA_NOCACHE="False" 167 | export SOCCERDATA_NOSTORE="False" 168 | export SOCCERDATA_LOGLEVEL="INFO" 169 | 170 | 171 | Uniform team names 172 | ------------------ 173 | 174 | Each data source uses a different set of team names, which makes it difficult 175 | to combine data from multiple sources. To mitigate this, SoccerData allows 176 | translating the team names to uniform names. This is done by providing 177 | a ``SOCCERDATA_DIR/config/teamname_replacements.json`` file. This file should contain a 178 | mapping between a generic name for each team and the team name used by each 179 | data source that you want to support. The example below will map "Tottenham 180 | Hotspur", "Tottenham Hotspur FC" and "Spurs" to "Tottenham" in all scraped 181 | data. 182 | 183 | .. 
code-block:: json 184 | 185 | { 186 | "Tottenham": ["Tottenham Hotspur", "Tottenham Hotspur FC", "Spurs"] 187 | } 188 | 189 | Additional setup for scraping WhoScored data 190 | -------------------------------------------- 191 | 192 | WhoScored implements strong protection against scraping using Incapsula. To 193 | circumvent this, this scraper uses Selenium with the ChromeDriver extension to 194 | emulate a real user. Before using this scraper, you will have to `install 195 | Chrome`_. A Selenium driver matching your Chrome version will be downloaded 196 | automatically when you run the scraper. 197 | 198 | Next steps 199 | ---------- 200 | Look at you! You’re now basically an expert at SoccerData! ✨ 201 | 202 | From this point you can: 203 | 204 | - Look at the example notebooks for each :ref:`Data source `. 205 | - Take a deep dive into the :ref:`API `. 206 | - Give us feedback or contribute, see :ref:`Contributing `. 207 | 208 | Have fun! 🎉 209 | 210 | 211 | .. _install Chrome: https://www.google.com/chrome/ 212 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../LICENSE.rst 2 | -------------------------------------------------------------------------------- /docs/output.csv: -------------------------------------------------------------------------------- 1 | league,season,team,#Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG 2 | ENG-Premier League,2021,Arsenal,29,38.0,53,455,141,31.0,11.97,3.71,0.1,0.33,16.9,23,6,6,53.5,49.0,0.11,-0.5,-2.0 3 | ,,Aston Villa,24,38.0,52,518,179,34.6,13.63,4.71,0.09,0.26,16.5,15,5,6,52.9,48.5,0.1,-0.9,-1.5 4 | ,,Brighton,27,38.0,39,476,129,27.1,12.53,3.39,0.07,0.26,16.6,14,6,9,51.6,44.8,0.1,-12.6,-11.8 5 | ,,Burnley,25,38.0,32,383,125,32.6,10.08,3.29,0.08,0.23,16.6,15,3,3,39.9,37.6,0.1,-7.9,-8.6 6 | ,,Chelsea,27,38.0,56,553,194,35.1,14.55,5.11,0.09,0.25,16.3,16,8,10,64.0,56.4,0.1,-8.0,-8.4 7 | -------------------------------------------------------------------------------- /docs/reference/base.rst: -------------------------------------------------------------------------------- 1 | .. _api-base: 2 | 3 | Base Readers 4 | ============ 5 | 6 | The logic for downloading data from the web is implemented in the base classes 7 | that are documented here. The base classes are not intended to be used directly 8 | but rather to be subclassed by the specific readers which implement the logic 9 | to parse the data. 10 | 11 | The :class:`BaseRequestsReader` is a wrapper around the `requests` library 12 | and is used by scrapers that do not require JavaScript to be executed. The 13 | :class:`BaseSeleniumReader` is a wrapper around the `selenium` library and is 14 | used by scrapers that require JavaScript to be executed. 15 | 16 | .. autoclass:: soccerdata._common.BaseRequestsReader 17 | :inherited-members: 18 | :members: 19 | 20 | .. 
autoclass:: soccerdata._common.BaseSeleniumReader 21 | :inherited-members: 22 | :members: 23 | -------------------------------------------------------------------------------- /docs/reference/clubelo.rst: -------------------------------------------------------------------------------- 1 | .. _api-clubelo: 2 | 3 | Club Elo 4 | ======== 5 | 6 | .. autoclass:: soccerdata.ClubElo 7 | :inherited-members: available_leagues 8 | :members: read_by_date, read_team_history 9 | -------------------------------------------------------------------------------- /docs/reference/espn.rst: -------------------------------------------------------------------------------- 1 | .. _api-espn: 2 | 3 | ESPN 4 | ===== 5 | 6 | .. autoclass:: soccerdata.ESPN 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/fbref.rst: -------------------------------------------------------------------------------- 1 | .. _api-fbref: 2 | 3 | FBref 4 | ===== 5 | 6 | .. autoclass:: soccerdata.FBref 7 | :members: available_leagues, read_leagues, read_seasons, 8 | read_team_season_stats, read_team_match_stats, 9 | read_player_season_stats, read_player_match_stats, 10 | read_schedule, read_lineup, read_events, read_shot_events 11 | -------------------------------------------------------------------------------- /docs/reference/fotmob.rst: -------------------------------------------------------------------------------- 1 | .. _api-fotmob: 2 | 3 | FotMob 4 | ====== 5 | 6 | .. autoclass:: soccerdata.FotMob 7 | :members: available_leagues, read_leagues, read_seasons, 8 | read_league_table, read_schedule, read_team_match_stats, 9 | -------------------------------------------------------------------------------- /docs/reference/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _api: 3 | 4 | .. 
currentmodule:: soccerdata 5 | 6 | API Reference 7 | ============= 8 | 9 | This part of the documentation covers all the interfaces of the implemented 10 | data scrapers. 11 | 12 | .. toctree:: 13 | 14 | clubelo 15 | espn 16 | fbref 17 | 18 | fotmob 19 | matchhistory 20 | sofascore 21 | sofifa 22 | understat 23 | whoscored 24 | 25 | If you would like to extend the functionality of soccerdata, you might also be 26 | interested in the following modules: 27 | 28 | .. toctree:: 29 | 30 | base 31 | utils 32 | -------------------------------------------------------------------------------- /docs/reference/matchhistory.rst: -------------------------------------------------------------------------------- 1 | .. _api-matchhistory: 2 | 3 | MatchHistory 4 | ============= 5 | 6 | .. autoclass:: soccerdata.MatchHistory 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/sofascore.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofascore: 2 | 3 | Sofascore 4 | ========= 5 | 6 | .. autoclass:: soccerdata.Sofascore 7 | :members: read_leagues, read_seasons, 8 | read_league_table, read_schedule, 9 | -------------------------------------------------------------------------------- /docs/reference/sofifa.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofifa: 2 | 3 | SoFIFA 4 | ======== 5 | 6 | .. autoclass:: soccerdata.SoFIFA 7 | :members: read_leagues, read_versions, read_teams, read_players, 8 | read_team_ratings, read_player_ratings, available_leagues 9 | -------------------------------------------------------------------------------- /docs/reference/understat.rst: -------------------------------------------------------------------------------- 1 | .. _api-understat: 2 | 3 | Understat 4 | ========= 5 | 6 | .. 
autoclass:: soccerdata.Understat 7 | :inherited-members: available_leagues 8 | :members: read_leagues, read_seasons, read_schedule, 9 | read_team_match_stats, read_player_season_stats, 10 | read_player_match_stats, read_shot_events 11 | -------------------------------------------------------------------------------- /docs/reference/utils.rst: -------------------------------------------------------------------------------- 1 | .. _api-utils: 2 | 3 | Utilities 4 | ============ 5 | 6 | .. automethod:: soccerdata._common.season_code 7 | .. automethod:: soccerdata._common.make_game_id 8 | .. automethod:: soccerdata._common.standardize_colnames 9 | .. automethod:: soccerdata._common.get_proxy 10 | .. automethod:: soccerdata._common.check_proxy 11 | -------------------------------------------------------------------------------- /docs/reference/whoscored.rst: -------------------------------------------------------------------------------- 1 | .. _api-whoscored: 2 | 3 | WhoScored 4 | ========= 5 | 6 | .. autoclass:: soccerdata.WhoScored 7 | :members: available_leagues, read_schedule, read_missing_players, read_events 8 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2024.8.6 2 | sphinx==7.4.7 3 | nbsphinx==0.9.7 4 | -------------------------------------------------------------------------------- /docs/topics/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Advanced Usage 3 | =============== 4 | 5 | Introductions to all the key parts of SoccerData you'll need to know: 6 | 7 | .. 
toctree:: 8 | :maxdepth: 2 9 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | 3 | import os 4 | import shlex 5 | import shutil 6 | import sys 7 | from pathlib import Path 8 | from textwrap import dedent 9 | 10 | import nox 11 | 12 | try: 13 | from nox_poetry import Session, session 14 | except ImportError: 15 | message = f"""\ 16 | Nox failed to import the 'nox-poetry' package. 17 | 18 | Please install it using the following command: 19 | 20 | {sys.executable} -m pip install nox-poetry""" 21 | raise SystemExit(dedent(message)) from None 22 | 23 | 24 | package = "soccerdata" 25 | python_versions = ["3.11", "3.10", "3.9", "3.8"] 26 | nox.needs_version = ">= 2021.6.6" 27 | nox.options.sessions = ( 28 | "pre-commit", 29 | "mypy", 30 | "tests", 31 | "docs-build", 32 | ) 33 | 34 | 35 | def activate_virtualenv_in_precommit_hooks(session: Session) -> None: 36 | """Activate virtualenv in hooks installed by pre-commit. 37 | 38 | This function patches git hooks installed by pre-commit to activate the 39 | session's virtual environment. This allows pre-commit to locate hooks in 40 | that environment when invoked from git. 41 | 42 | Parameters 43 | ---------- 44 | session : Session 45 | The Session object. 46 | """ 47 | assert session.bin is not None 48 | 49 | # Only patch hooks containing a reference to this session's bindir. Support 50 | # quoting rules for Python and bash, but strip the outermost quotes so we 51 | # can detect paths within the bindir, like /python. 
52 | bindirs = [ 53 | bindir[1:-1] if bindir[0] in "'\"" else bindir 54 | for bindir in (repr(session.bin), shlex.quote(session.bin)) 55 | ] 56 | 57 | virtualenv = session.env.get("VIRTUAL_ENV") 58 | if virtualenv is None: 59 | return 60 | 61 | headers = { 62 | # pre-commit < 2.16.0 63 | "python": f"""\ 64 | import os 65 | os.environ["VIRTUAL_ENV"] = {virtualenv!r} 66 | os.environ["PATH"] = os.pathsep.join(( 67 | {session.bin!r}, 68 | os.environ.get("PATH", ""), 69 | )) 70 | """, 71 | # pre-commit >= 2.16.0 72 | "bash": f"""\ 73 | VIRTUAL_ENV={shlex.quote(virtualenv)} 74 | PATH={shlex.quote(session.bin)}"{os.pathsep}$PATH" 75 | """, 76 | } 77 | 78 | hookdir = Path(".git") / "hooks" 79 | if not hookdir.is_dir(): 80 | return 81 | 82 | for hook in hookdir.iterdir(): 83 | if hook.name.endswith(".sample") or not hook.is_file(): 84 | continue 85 | 86 | if not hook.read_bytes().startswith(b"#!"): 87 | continue 88 | 89 | text = hook.read_text() 90 | 91 | if not any( 92 | Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text 93 | for bindir in bindirs 94 | ): 95 | continue 96 | 97 | lines = text.splitlines() 98 | 99 | for executable, header in headers.items(): 100 | if executable in lines[0].lower(): 101 | lines.insert(1, dedent(header)) 102 | hook.write_text("\n".join(lines)) 103 | break 104 | 105 | 106 | @session(name="pre-commit", python=python_versions[0]) 107 | def precommit(session: Session) -> None: 108 | """Lint using pre-commit.""" 109 | args = session.posargs or ["run", "--all-files", "--show-diff-on-failure"] 110 | session.install( 111 | "bandit", 112 | "darglint", 113 | "ruff", 114 | "pep8-naming", 115 | "pre-commit", 116 | "pre-commit-hooks", 117 | "pyupgrade", 118 | ) 119 | session.run("pre-commit", *args) 120 | if args and args[0] == "install": 121 | activate_virtualenv_in_precommit_hooks(session) 122 | 123 | 124 | @session(python=python_versions) 125 | def mypy(session: Session) -> None: 126 | """Type-check using mypy.""" 127 | args = 
session.posargs or ["soccerdata", "tests", "docs/conf.py"] 128 | session.install(".") 129 | session.install("mypy", "pytest") 130 | session.run("mypy", "--install-types", "--non-interactive", *args) 131 | if not session.posargs: 132 | session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py") 133 | 134 | 135 | @session(python=python_versions) 136 | def tests(session: Session) -> None: 137 | """Run the test suite.""" 138 | args = session.posargs or ["-m", "not e2e and not fails_gha"] 139 | session.install(".") 140 | session.install("coverage[toml]", "pytest", "pytest-mock", "time-machine", "pygments") 141 | try: 142 | session.run( 143 | "coverage", 144 | "run", 145 | "--parallel", 146 | "-m", 147 | "pytest", 148 | *args, 149 | env={ 150 | "SOCCERDATA_DIR": str(Path(__file__).parent / "tests" / "appdata"), 151 | "MAXAGE": "604800", 152 | }, 153 | ) 154 | finally: 155 | if session.interactive: 156 | session.notify("coverage", posargs=[]) 157 | 158 | 159 | @session(python=python_versions[0]) 160 | def coverage(session: Session) -> None: 161 | """Produce the coverage report.""" 162 | args = session.posargs or ["report"] 163 | 164 | session.install("coverage[toml]") 165 | 166 | if not session.posargs and any(Path().glob(".coverage.*")): 167 | session.run("coverage", "combine") 168 | 169 | session.run("coverage", *args) 170 | 171 | 172 | @session(name="docs-build", python=python_versions[0]) 173 | def docs_build(session: Session) -> None: 174 | """Build the documentation.""" 175 | args = session.posargs or ["docs", "docs/_build"] 176 | if not session.posargs and "FORCE_COLOR" in os.environ: 177 | args.insert(0, "--color") 178 | 179 | session.install(".") 180 | session.install("sphinx", "sphinx-click", "furo", "nbsphinx", "ipython") 181 | 182 | build_dir = Path("docs", "_build") 183 | if build_dir.exists(): 184 | shutil.rmtree(build_dir) 185 | 186 | session.run("sphinx-build", *args, env={"SOCCERDATA_DIR": str(Path.home() / "soccerdata")}) 187 | 188 | 189 | 
@session(python=python_versions[0])
190 | def docs(session: Session) -> None:
191 | """Build and serve the documentation with live reloading on file changes."""
192 | args = session.posargs or ["--host=0.0.0.0", "docs", "docs/_build"]
193 | session.install(".")
194 | session.install("sphinx", "sphinx-autobuild", "furo", "nbsphinx", "ipython")
195 |
196 | build_dir = Path("docs", "_build")
197 | if build_dir.exists():
198 | shutil.rmtree(build_dir)
199 |
200 | session.run(
201 | "sphinx-autobuild",
202 | *args,
203 | env={"SOCCERDATA_DIR": str(Path.home() / "soccerdata")},
204 | )
205 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry]
2 | name = "soccerdata"
3 | version = "1.8.7"
4 | description = "A collection of wrappers over soccer data from various websites / APIs."
5 | authors = ["Pieter Robberechts "]
6 | license = "Apache-2.0"
7 | readme = 'README.rst'
8 | homepage = "https://github.com/probberechts/soccerdata"
9 | repository = "https://github.com/probberechts/soccerdata"
10 | documentation = "https://soccerdata.readthedocs.io"
11 | keywords = ["soccer", "football", "soccer data", "web scraping", "soccer analytics"]
12 | classifiers = [
13 | "Programming Language :: Python :: 3",
14 | "License :: OSI Approved :: Apache Software License",
15 | "Operating System :: OS Independent"
16 | ]
17 |
18 | [tool.poetry.urls]
19 | Changelog = "https://github.com/probberechts/soccerdata/releases"
20 |
21 | [tool.poetry.dependencies]
22 | python = ">=3.9,<3.13"
23 | html5lib = "^1.1"
24 | pandas = "^2.0.0, !=2.1.0"
25 | lxml = "^4.9.3"
26 | socceraction = {version="^1.5.3", optional=true}
27 | wrapper-tls-requests = "^1.1.4"
28 | tqdm = "^4.67.1"
29 | rich = "^14.0.0"
30 | seleniumbase = "^4.38.2"
31 | unidecode = "^1.4.0"
32 | urllib3 = "<2"
33 |
34 | [tool.poetry.extras]
35 | socceraction = ["socceraction"]
36 |
37 |
[tool.poetry.group.test.dependencies] 38 | pytest = "^8.0.0" 39 | pytest-deadfixtures = "^2.2.1" 40 | pytest-mock = "^3.10.0" 41 | coverage = {version = "^7.0", extras = ["toml"]} 42 | time-machine = "^2.8.2" 43 | 44 | [tool.poetry.group.dev.dependencies] 45 | Pygments = "^2.13.0" 46 | bumpversion = "^0.6.0" 47 | darglint = "^1.8.1" 48 | ruff = "*" 49 | mypy = "*" 50 | pep8-naming = "^0.15.0" 51 | pre-commit = "^4.0.0" 52 | pre-commit-hooks = "^5.0.0" 53 | 54 | [tool.poetry.group.docs.dependencies] 55 | Sphinx = "^7.0.0" 56 | furo = "^2024.0.0" 57 | nbsphinx = "^0.9.0" 58 | sphinx-autobuild = "^2024.0.0" 59 | 60 | [tool.coverage.paths] 61 | source = ["soccerdata", "*/site-packages"] 62 | tests = ["tests", "*/tests"] 63 | 64 | [tool.coverage.run] 65 | branch = true 66 | source = ["soccerdata", "tests"] 67 | 68 | [tool.coverage.report] 69 | show_missing = true 70 | ignore_errors = true 71 | 72 | [tool.mypy] 73 | ignore_missing_imports = true 74 | disallow_untyped_defs = true 75 | disallow_incomplete_defs = true 76 | no_implicit_optional = true 77 | check_untyped_defs = true 78 | show_error_codes = true 79 | warn_unused_ignores = true 80 | 81 | [[tool.mypy.overrides]] 82 | module = ["tests.*"] 83 | disallow_untyped_defs = false 84 | 85 | [tool.ruff] 86 | src = ['soccerdata', 'tests'] 87 | line-length = 99 88 | target-version = 'py39' 89 | 90 | [tool.ruff.lint] 91 | select = [ 92 | 'A', 93 | 'ARG', 94 | 'B', 95 | 'B9', 96 | 'C', 97 | 'C4', 98 | 'D', 99 | 'DTZ', 100 | 'E', 101 | 'F', 102 | 'I', 103 | 'N', 104 | 'PIE', 105 | 'PT', 106 | 'PTH', 107 | 'Q', 108 | 'RET', 109 | 'RUF', 110 | 'SIM', 111 | 'SLF', 112 | 'T10', 113 | 'TCH', 114 | 'UP', 115 | 'W', 116 | ] 117 | ignore = ['B904'] 118 | 119 | [tool.ruff.lint.per-file-ignores] 120 | "__init__.py" = ['F401'] 121 | "tests/*" = [ 122 | 'S', 123 | 'D212', 124 | 'D415', 125 | 'D205', 126 | 'D103', 127 | 'D104', 128 | 'N999', 129 | 'SLF001', 130 | ] 131 | 132 | [tool.ruff.lint.mccabe] 133 | max-complexity = 10 134 | 135 | 
[tool.ruff.lint.pydocstyle] 136 | convention = 'numpy' 137 | 138 | [tool.ruff.lint.isort] 139 | known-first-party = ["soccerdata", "tests"] 140 | 141 | [build-system] 142 | requires = ["poetry-core>=1.0.0"] 143 | build-backend = "poetry.core.masonry.api" 144 | -------------------------------------------------------------------------------- /soccerdata/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of tools to read and process soccer data from various sources.""" 2 | 3 | __version__ = "1.8.7" 4 | 5 | __all__ = [ 6 | "ClubElo", 7 | "ESPN", 8 | "FBref", 9 | "FotMob", 10 | "MatchHistory", 11 | "Sofascore", 12 | "SoFIFA", 13 | "Understat", 14 | "WhoScored", 15 | ] 16 | 17 | from .clubelo import ClubElo 18 | from .espn import ESPN 19 | from .fbref import FBref 20 | from .fotmob import FotMob 21 | from .match_history import MatchHistory 22 | from .sofascore import Sofascore 23 | from .sofifa import SoFIFA 24 | from .understat import Understat 25 | from .whoscored import WhoScored 26 | -------------------------------------------------------------------------------- /soccerdata/_config.py: -------------------------------------------------------------------------------- 1 | """Configurations.""" 2 | 3 | import json 4 | import logging 5 | import logging.config 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | from rich.logging import RichHandler 11 | 12 | # Configuration 13 | NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", "False").lower() in ("true", "1", "t") 14 | NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", "False").lower() in ("true", "1", "t") 15 | MAXAGE = None 16 | if os.environ.get("SOCCERDATA_MAXAGE") is not None: 17 | MAXAGE = int(os.environ.get("SOCCERDATA_MAXAGE", 0)) 18 | LOGLEVEL = os.environ.get("SOCCERDATA_LOGLEVEL", "INFO").upper() 19 | 20 | # Directories 21 | BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata")) 22 | LOGS_DIR = Path(BASE_DIR, "logs") 
23 | DATA_DIR = Path(BASE_DIR, "data")
24 | CONFIG_DIR = Path(BASE_DIR, "config")
25 |
26 | # Create dirs
27 | LOGS_DIR.mkdir(parents=True, exist_ok=True)
28 | DATA_DIR.mkdir(parents=True, exist_ok=True)
29 | CONFIG_DIR.mkdir(parents=True, exist_ok=True)
30 |
31 | # Logger
32 | logging_config = {
33 | "version": 1,
34 | "disable_existing_loggers": False,
35 | "formatters": {
36 | "minimal": {"format": "%(message)s"},
37 | "detailed": {
38 | "format": "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n" # noqa: E501
39 | },
40 | },
41 | "handlers": {
42 | "console": {
43 | "class": "logging.StreamHandler",
44 | "stream": sys.stdout,
45 | "formatter": "minimal",
46 | "level": logging.DEBUG,
47 | },
48 | "info": {
49 | "class": "logging.handlers.RotatingFileHandler",
50 | "filename": Path(LOGS_DIR, "info.log"),
51 | "maxBytes": 10485760, # 10 MB per file before rollover
52 | "backupCount": 10,
53 | "formatter": "detailed",
54 | "level": logging.INFO,
55 | },
56 | "error": {
57 | "class": "logging.handlers.RotatingFileHandler",
58 | "filename": Path(LOGS_DIR, "error.log"),
59 | "maxBytes": 10485760, # 10 MB per file before rollover
60 | "backupCount": 10,
61 | "formatter": "detailed",
62 | "level": logging.ERROR,
63 | },
64 | },
65 | "loggers": {
66 | "root": {
67 | "handlers": ["console", "info", "error"],
68 | "level": LOGLEVEL,
69 | "propagate": True,
70 | },
71 | },
72 | }
73 | logging.config.dictConfig(logging_config)
74 | logging.captureWarnings(True)
75 | logger = logging.getLogger("root")
76 | logger.handlers[0] = RichHandler(markup=True)
77 |
78 | # Team name replacements
79 | TEAMNAME_REPLACEMENTS = {}
80 | _f_custom_teamnname_replacements = CONFIG_DIR / "teamname_replacements.json"
81 | if _f_custom_teamnname_replacements.is_file():
82 | with _f_custom_teamnname_replacements.open(encoding="utf8") as json_file:
83 | for team, to_replace_list in json.load(json_file).items():
84 | for to_replace in to_replace_list:
85 | TEAMNAME_REPLACEMENTS[to_replace] = team
86 | logger.info(
87 |
"Custom team name replacements loaded from %s.", 88 | _f_custom_teamnname_replacements, 89 | ) 90 | else: 91 | logger.info( 92 | "No custom team name replacements found. You can configure these in %s.", 93 | _f_custom_teamnname_replacements, 94 | ) 95 | 96 | 97 | # League dict 98 | LEAGUE_DICT = { 99 | "ENG-Premier League": { 100 | "ClubElo": "ENG_1", 101 | "MatchHistory": "E0", 102 | "FiveThirtyEight": "premier-league", 103 | "FBref": "Premier League", 104 | "FotMob": "ENG-Premier League", 105 | "ESPN": "eng.1", 106 | "Sofascore": "Premier League", 107 | "SoFIFA": "[England] Premier League", 108 | "Understat": "EPL", 109 | "WhoScored": "England - Premier League", 110 | "season_start": "Aug", 111 | "season_end": "May", 112 | }, 113 | "ESP-La Liga": { 114 | "ClubElo": "ESP_1", 115 | "MatchHistory": "SP1", 116 | "FiveThirtyEight": "la-liga", 117 | "FBref": "La Liga", 118 | "FotMob": "ESP-LaLiga", 119 | "ESPN": "esp.1", 120 | "Sofascore": "LaLiga", 121 | "SoFIFA": "[Spain] La Liga", 122 | "Understat": "La liga", 123 | "WhoScored": "Spain - LaLiga", 124 | "season_start": "Aug", 125 | "season_end": "May", 126 | }, 127 | "ITA-Serie A": { 128 | "ClubElo": "ITA_1", 129 | "MatchHistory": "I1", 130 | "FiveThirtyEight": "serie-a", 131 | "FBref": "Serie A", 132 | "FotMob": "ITA-Serie A", 133 | "ESPN": "ita.1", 134 | "Sofascore": "Serie A", 135 | "SoFIFA": "[Italy] Serie A", 136 | "Understat": "Serie A", 137 | "WhoScored": "Italy - Serie A", 138 | "season_start": "Aug", 139 | "season_end": "May", 140 | }, 141 | "GER-Bundesliga": { 142 | "ClubElo": "GER_1", 143 | "MatchHistory": "D1", 144 | "FiveThirtyEight": "bundesliga", 145 | "FBref": "Fußball-Bundesliga", 146 | "FotMob": "GER-Bundesliga", 147 | "ESPN": "ger.1", 148 | "Sofascore": "Bundesliga", 149 | "SoFIFA": "[Germany] Bundesliga", 150 | "Understat": "Bundesliga", 151 | "WhoScored": "Germany - Bundesliga", 152 | "season_start": "Aug", 153 | "season_end": "May", 154 | }, 155 | "FRA-Ligue 1": { 156 | "ClubElo": "FRA_1", 157 | 
"MatchHistory": "F1", 158 | "FiveThirtyEight": "ligue-1", 159 | "FBref": "Ligue 1", 160 | "FotMob": "FRA-Ligue 1", 161 | "ESPN": "fra.1", 162 | "Sofascore": "Ligue 1", 163 | "SoFIFA": "[France] Ligue 1", 164 | "Understat": "Ligue 1", 165 | "WhoScored": "France - Ligue 1", 166 | "season_start": "Aug", 167 | "season_end": "May", 168 | }, 169 | "INT-World Cup": { 170 | "FBref": "FIFA World Cup", 171 | "FotMob": "INT-World Cup", 172 | "WhoScored": "International - FIFA World Cup", 173 | "season_code": "single-year", 174 | }, 175 | "INT-European Championship": { 176 | "FBref": "UEFA European Football Championship", 177 | "FotMob": "INT-EURO", 178 | "Sofascore": "EURO", 179 | "WhoScored": "International - European Championship", 180 | "season_start": "Jun", 181 | "season_end": "Jul", 182 | "season_code": "single-year", 183 | }, 184 | "INT-Women's World Cup": { 185 | "FBref": "FIFA Women's World Cup", 186 | "FotMob": "INT-Women's World Cup", 187 | "WhoScored": "International - FIFA Women's World Cup", 188 | "season_code": "single-year", 189 | }, 190 | } 191 | _f_custom_league_dict = CONFIG_DIR / "league_dict.json" 192 | if _f_custom_league_dict.is_file(): 193 | with _f_custom_league_dict.open(encoding="utf8") as json_file: 194 | LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)} 195 | logger.info("Custom league dict loaded from %s.", _f_custom_league_dict) 196 | else: 197 | logger.info( 198 | "No custom league dict found. 
You can configure additional leagues in %s.", 199 | _f_custom_league_dict, 200 | ) 201 | -------------------------------------------------------------------------------- /soccerdata/clubelo.py: -------------------------------------------------------------------------------- 1 | """Scraper for api.clubelo.com.""" 2 | 3 | import re 4 | from datetime import datetime, timedelta, timezone 5 | from pathlib import Path 6 | from typing import IO, Callable, Optional, Union 7 | 8 | import pandas as pd 9 | from unidecode import unidecode 10 | 11 | from ._common import BaseRequestsReader, add_alt_team_names, standardize_colnames 12 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 13 | 14 | CLUB_ELO_DATADIR = DATA_DIR / "ClubElo" 15 | CLUB_ELO_API = "http://api.clubelo.com" 16 | 17 | 18 | def _parse_csv(data: IO[bytes]) -> pd.DataFrame: 19 | return pd.read_csv(data, parse_dates=["From", "To"], date_format="%Y-%m-%d") 20 | 21 | 22 | class ClubElo(BaseRequestsReader): 23 | """Provides pd.DataFrames from CSV API at http://api.clubelo.com. 24 | 25 | Data will be downloaded as necessary and cached locally in 26 | ``~/soccerdata/data/ClubElo``. 27 | 28 | Since the source does not provide league names, this class will not filter 29 | by league. League names will be inserted from the other sources where 30 | available. Leagues that are only covered by clubelo.com will have NaN 31 | values. 32 | 33 | Parameters 34 | ---------- 35 | proxy : 'tor' or or dict or list(dict) or callable, optional 36 | Use a proxy to hide your IP address. Valid options are: 37 | - "tor": Uses the Tor network. Tor should be running in 38 | the background on port 9050. 39 | - str: The address of the proxy server to use. 40 | - list(str): A list of proxies to choose from. A different proxy will 41 | be selected from this list after failed requests, allowing rotating 42 | proxies. 43 | - callable: A function that returns a valid proxy. 
This function will 44 | be called after failed requests, allowing rotating proxies. 45 | no_cache : bool 46 | If True, will not use cached data. 47 | no_store : bool 48 | If True, will not store downloaded data. 49 | data_dir : Path 50 | Path to directory where data will be cached. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 56 | no_cache: bool = NOCACHE, 57 | no_store: bool = NOSTORE, 58 | data_dir: Path = CLUB_ELO_DATADIR, 59 | ): 60 | """Initialize a new ClubElo reader.""" 61 | super().__init__(proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir) 62 | 63 | def read_by_date(self, date: Optional[Union[str, datetime]] = None) -> pd.DataFrame: 64 | """Retrieve ELO scores for all teams at specified date. 65 | 66 | Elo scores are available as early as 1939. Values before 1960 should 67 | be considered provisional. 68 | 69 | Parameters 70 | ---------- 71 | date : datetime object or string like 'YYYY-MM-DD' 72 | Date for which to retrieve ELO scores. If no date is specified, 73 | get today's scores. 74 | 75 | Raises 76 | ------ 77 | TypeError 78 | If date is not a date string or datetime object. 79 | ValueError 80 | If data is an invalid date string. 
81 | 82 | Returns 83 | ------- 84 | pd.DataFrame 85 | """ 86 | if not date: 87 | date = datetime.now(tz=timezone.utc) 88 | elif isinstance(date, str): 89 | date = datetime.strptime(date, "%Y-%m-%d").astimezone(timezone.utc) 90 | 91 | if not isinstance(date, datetime): 92 | raise TypeError("'date' must be a datetime object or string like 'YYYY-MM-DD'") 93 | 94 | datestring = date.strftime("%Y-%m-%d") 95 | filepath = self.data_dir / f"{datestring}.csv" 96 | url = f"{CLUB_ELO_API}/{datestring}" 97 | 98 | data = self.get(url, filepath) 99 | 100 | return ( 101 | _parse_csv(data) 102 | .pipe(standardize_colnames) 103 | .rename(columns={"club": "team"}) 104 | .replace({"team": TEAMNAME_REPLACEMENTS}) 105 | .replace("None", float("nan")) 106 | .assign(rank=lambda x: x["rank"].astype("float")) 107 | .assign(league=lambda x: x["country"] + "_" + x["level"].astype(str)) 108 | .pipe(self._translate_league) 109 | .reset_index(drop=True) 110 | .set_index("team") 111 | ) 112 | 113 | def read_team_history( 114 | self, team: str, max_age: Union[int, timedelta] = 1 115 | ) -> Optional[pd.DataFrame]: 116 | """Retrieve full ELO history for one club. 117 | 118 | For the exact spelling of a club's name, check the result of 119 | :func:`~soccerdata.ClubElo.read_by_date` or `clubelo.com 120 | `__. You can also use alternative team 121 | names specified in `teamname_replacements.json`. Values before 1960 122 | should be considered provisional. 123 | 124 | Parameters 125 | ---------- 126 | team : str 127 | The club's name. 128 | max_age : int for age in days, or timedelta object 129 | The max. age of locally cached file before re-download. 130 | 131 | Raises 132 | ------ 133 | TypeError 134 | If max_age is not an integer or timedelta object. 135 | ValueError 136 | If no ratings for the given team are available. 
137 | 138 | Returns 139 | ------- 140 | pd.DataFrame 141 | """ 142 | teams_to_check = add_alt_team_names(team) 143 | teams_to_check = {re.sub(r"[\s']", "", unidecode(team)) for team in teams_to_check} 144 | 145 | for _team in teams_to_check: 146 | filepath = self.data_dir / f"{_team}.csv" 147 | url = f"{CLUB_ELO_API}/{_team}" 148 | data = self.get(url, filepath, max_age) 149 | 150 | df = ( 151 | _parse_csv(data) 152 | .pipe(standardize_colnames) 153 | .rename(columns={"club": "team"}) 154 | .replace("None", float("nan")) 155 | .assign(rank=lambda x: x["rank"].astype("float")) 156 | .set_index("from") 157 | .sort_index() 158 | ) 159 | 160 | if len(df) > 0: 161 | # clubelo.com returns a CSV with just a header for nonexistent club 162 | df.replace({"team": TEAMNAME_REPLACEMENTS}, inplace=True) 163 | return df 164 | 165 | raise ValueError(f"No data found for team {team}") 166 | -------------------------------------------------------------------------------- /soccerdata/espn.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://site.api.espn.com/apis/site/v2/sports/soccer.""" 2 | 3 | import itertools 4 | import json 5 | import re 6 | from datetime import datetime, timezone 7 | from pathlib import Path 8 | from typing import Callable, Optional, Union 9 | 10 | import pandas as pd 11 | 12 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 14 | 15 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/summary?event=513466 16 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/scoreboard?dates=20180901 17 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/news 18 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/teams 19 | 20 | ESPN_DATADIR = DATA_DIR / "ESPN" 21 | ESPN_API = "http://site.api.espn.com/apis/site/v2/sports/soccer" 22 | 23 | 24 | class 
ESPN(BaseRequestsReader): 25 | """Provides pd.DataFrames from JSON api available at http://site.api.espn.com. 26 | 27 | Data will be downloaded as necessary and cached locally in 28 | ``~/soccerdata/data/ESPN``. 29 | 30 | Parameters 31 | ---------- 32 | leagues : string or iterable, optional 33 | IDs of leagues to include. 34 | seasons : string, int or list, optional 35 | Seasons to include. Supports multiple formats. 36 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 37 | proxy : 'tor' or or dict or list(dict) or callable, optional 38 | Use a proxy to hide your IP address. Valid options are: 39 | - "tor": Uses the Tor network. Tor should be running in 40 | the background on port 9050. 41 | - str: The address of the proxy server to use. 42 | - list(str): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | leagues: Optional[Union[str, list[str]]] = None, 58 | seasons: Optional[Union[str, int, list]] = None, 59 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = ESPN_DATADIR, 63 | ): 64 | """Initialize a new ESPN reader.""" 65 | super().__init__( 66 | leagues=leagues, 67 | proxy=proxy, 68 | no_cache=no_cache, 69 | no_store=no_store, 70 | data_dir=data_dir, 71 | ) 72 | self.seasons = seasons # type: ignore 73 | 74 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 75 | """Retrieve the game schedule for the selected leagues and seasons. 
76 | 77 | Parameters 78 | ---------- 79 | force_cache : bool 80 | By default no cached data is used for the current season. 81 | If True, will force the use of cached data anyway. 82 | 83 | Returns 84 | ------- 85 | pd.DataFrame 86 | """ 87 | urlmask = ESPN_API + "/{}/scoreboard?dates={}" 88 | filemask = "Schedule_{}_{}.json" 89 | 90 | df_list = [] 91 | # Get match days 92 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 93 | if int(skey[:2]) > int(str(datetime.now(tz=timezone.utc).year + 1)[-2:]): 94 | start_date = "".join(["19", skey[:2], "07", "01"]) 95 | else: 96 | start_date = "".join(["20", skey[:2], "07", "01"]) 97 | 98 | url = urlmask.format(lkey, start_date) 99 | filepath = self.data_dir / filemask.format(lkey, start_date) 100 | reader = self.get(url, filepath) 101 | data = json.load(reader) 102 | 103 | match_dates = [ 104 | datetime.strptime(d, "%Y-%m-%dT%H:%MZ").strftime("%Y%m%d") # noqa: DTZ007 105 | for d in data["leagues"][0]["calendar"] 106 | ] 107 | for date in match_dates: 108 | url = urlmask.format(lkey, date) 109 | filepath = self.data_dir / filemask.format(lkey, date) 110 | current_season = not self._is_complete(lkey, skey) 111 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 112 | 113 | data = json.load(reader) 114 | df_list.extend( 115 | [ 116 | { 117 | "league": lkey, 118 | "season": skey, 119 | "date": e["date"], 120 | "home_team": e["competitions"][0]["competitors"][0]["team"]["name"], 121 | "away_team": e["competitions"][0]["competitors"][1]["team"]["name"], 122 | "game_id": int(e["id"]), 123 | "league_id": lkey, 124 | } 125 | for e in data["events"] 126 | ] 127 | ) 128 | return ( 129 | pd.DataFrame(df_list) 130 | .pipe(self._translate_league) 131 | .replace({"home_team": TEAMNAME_REPLACEMENTS, "away_team": TEAMNAME_REPLACEMENTS}) 132 | .assign(date=lambda x: pd.to_datetime(x["date"])) 133 | .dropna(subset=["home_team", "away_team", "date"]) 134 | .assign(game=lambda df: 
df.apply(make_game_id, axis=1)) 135 | .set_index(["league", "season", "game"]) 136 | .sort_index() 137 | ) 138 | 139 | def read_matchsheet(self, match_id: Optional[Union[int, list[int]]] = None) -> pd.DataFrame: 140 | """Retrieve match sheets for the selected leagues and seasons. 141 | 142 | Parameters 143 | ---------- 144 | match_id : int or list of int, optional 145 | Retrieve the match sheet for a specific game. 146 | 147 | Raises 148 | ------ 149 | ValueError 150 | If no games with the given IDs were found for the selected seasons and leagues. 151 | 152 | Returns 153 | ------- 154 | pd.DataFrame. 155 | """ 156 | urlmask = ESPN_API + "/{}/summary?event={}" 157 | filemask = "Summary_{}.json" 158 | 159 | df_schedule = self.read_schedule().reset_index() 160 | if match_id is not None: 161 | iterator = df_schedule[ 162 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 163 | ] 164 | if len(iterator) == 0: 165 | raise ValueError( 166 | "No games with the given IDs found for the selected seasons and leagues." 
167 | ) 168 | else: 169 | iterator = df_schedule 170 | 171 | df_list = [] 172 | for i, match in iterator.iterrows(): 173 | url = urlmask.format(match["league_id"], match["game_id"]) 174 | filepath = self.data_dir / filemask.format(match["game_id"]) 175 | reader = self.get(url, filepath) 176 | 177 | data = json.load(reader) 178 | for i in range(2): 179 | match_sheet = { 180 | "game": match["game"], 181 | "league": match["league"], 182 | "season": match["season"], 183 | "team": data["boxscore"]["form"][i]["team"]["displayName"], 184 | "is_home": (i == 0), 185 | "venue": ( 186 | data["gameInfo"]["venue"]["fullName"] 187 | if "venue" in data["gameInfo"] 188 | else None 189 | ), 190 | "attendance": data["gameInfo"].get("attendance"), 191 | "capacity": ( 192 | data["gameInfo"]["venue"].get("capacity") 193 | if "venue" in data["gameInfo"] 194 | else None 195 | ), 196 | "roster": data["rosters"][i].get("roster", None), 197 | } 198 | if "statistics" in data["boxscore"]["teams"][i]: 199 | for stat in data["boxscore"]["teams"][i]["statistics"]: 200 | match_sheet[stat["name"]] = stat["displayValue"] 201 | df_list.append(match_sheet) 202 | return ( 203 | pd.DataFrame(df_list) 204 | .replace({"team": TEAMNAME_REPLACEMENTS}) 205 | .pipe(standardize_colnames) 206 | .set_index(["league", "season", "game", "team"]) 207 | .sort_index() 208 | ) 209 | 210 | def read_lineup( # noqa: C901 211 | self, match_id: Optional[Union[int, list[int]]] = None 212 | ) -> pd.DataFrame: 213 | """Retrieve lineups for the selected leagues and seasons. 214 | 215 | Parameters 216 | ---------- 217 | match_id : int or list of int, optional 218 | Retrieve the lineup for a specific game. 219 | 220 | Raises 221 | ------ 222 | ValueError 223 | If no games with the given IDs were found for the selected seasons and leagues. 224 | 225 | Returns 226 | ------- 227 | pd.DataFrame. 
228 | """ 229 | urlmask = ESPN_API + "/{}/summary?event={}" 230 | filemask = "Summary_{}.json" 231 | 232 | df_schedule = self.read_schedule().reset_index() 233 | if match_id is not None: 234 | iterator = df_schedule[ 235 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 236 | ] 237 | if len(iterator) == 0: 238 | raise ValueError( 239 | "No games with the given IDs found for the selected seasons and leagues." 240 | ) 241 | else: 242 | iterator = df_schedule 243 | 244 | df_list = [] 245 | for i, match in iterator.iterrows(): 246 | url = urlmask.format(match["league_id"], match["game_id"]) 247 | filepath = self.data_dir / filemask.format(match["game_id"]) 248 | reader = self.get(url, filepath) 249 | 250 | data = json.load(reader) 251 | for i in range(2): 252 | if "roster" not in data["rosters"][i]: 253 | logger.info( 254 | "No lineup info found for team %d in game with ID=%s", 255 | i + 1, 256 | match["game_id"], 257 | ) 258 | continue 259 | for p in data["rosters"][i]["roster"]: 260 | match_sheet = { 261 | "game": match["game"], 262 | "league": match["league"], 263 | "season": match["season"], 264 | "team": data["boxscore"]["form"][i]["team"]["displayName"], 265 | "is_home": (i == 0), 266 | "player": p["athlete"]["displayName"], 267 | "position": p["position"]["name"] if "position" in p else None, 268 | "formation_place": p.get("formationPlace", None), 269 | } 270 | subbed_in = ( 271 | p["subbedIn"] 272 | if isinstance(p["subbedIn"], bool) 273 | else p["subbedIn"]["didSub"] 274 | ) 275 | subbed_out = ( 276 | p["subbedOut"] 277 | if isinstance(p["subbedOut"], bool) 278 | else p["subbedOut"]["didSub"] 279 | ) 280 | subbed_events = [] 281 | if isinstance(p["subbedIn"], bool) and (subbed_in or subbed_out): 282 | subbed_events = ( 283 | [e for e in p["plays"] if e["substitution"]] 284 | if isinstance(p["subbedIn"], bool) 285 | else [p["subbedIn"], p["subbedOut"]] 286 | ) 287 | else: 288 | if subbed_in: 289 | 
subbed_events.append(p["subbedIn"]) 290 | if subbed_out: 291 | subbed_events.append(p["subbedOut"]) 292 | 293 | if p["starter"]: 294 | match_sheet["sub_in"] = "start" 295 | elif subbed_in: 296 | match_sheet["sub_in"] = sum( 297 | map( 298 | int, 299 | re.findall( 300 | r"(\d{1,3})", 301 | subbed_events[0]["clock"]["displayValue"], 302 | ), 303 | ) 304 | ) 305 | else: 306 | match_sheet["sub_in"] = None 307 | 308 | if (p["starter"] or subbed_in) and not subbed_out: 309 | match_sheet["sub_out"] = "end" 310 | elif subbed_out: 311 | j = 0 if not subbed_in else 1 312 | match_sheet["sub_out"] = sum( 313 | map( 314 | int, 315 | re.findall( 316 | r"(\d{1,3})", 317 | subbed_events[j]["clock"]["displayValue"], 318 | ), 319 | ) 320 | ) 321 | else: 322 | match_sheet["sub_out"] = None 323 | 324 | if "stats" in p: 325 | for stat in p["stats"]: 326 | match_sheet[stat["name"]] = stat["value"] 327 | 328 | df_list.append(match_sheet) 329 | 330 | if len(df_list) == 0: 331 | return pd.DataFrame() 332 | 333 | return ( 334 | pd.DataFrame(df_list) 335 | .replace({"team": TEAMNAME_REPLACEMENTS}) 336 | .pipe(standardize_colnames) 337 | .set_index(["league", "season", "game", "team", "player"]) 338 | .sort_index() 339 | ) 340 | -------------------------------------------------------------------------------- /soccerdata/fotmob.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://fotmob.com.""" 2 | 3 | import itertools 4 | import json 5 | from collections.abc import Iterable 6 | from pathlib import Path 7 | from typing import Callable, Optional, Union 8 | 9 | import pandas as pd 10 | import tls_requests 11 | 12 | from ._common import BaseRequestsReader, add_standardized_team_name, make_game_id 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 14 | 15 | FOTMOB_DATADIR = DATA_DIR / "FotMob" 16 | FOTMOB_API = "https://www.fotmob.com/api/" 17 | 18 | 19 | class FotMob(BaseRequestsReader): 20 | """Provides 
pd.DataFrames from data available at http://www.fotmob.com. 21 | 22 | Data will be downloaded as necessary and cached locally in 23 | ``~/soccerdata/data/FotMob``. 24 | 25 | Parameters 26 | ---------- 27 | leagues : string or iterable, optional 28 | IDs of Leagues to include. 29 | seasons : string, int or list, optional 30 | Seasons to include. Supports multiple formats. 31 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 32 | proxy : 'tor' or or dict or list(dict) or callable, optional 33 | Use a proxy to hide your IP address. Valid options are: 34 | - "tor": Uses the Tor network. Tor should be running in 35 | the background on port 9050. 36 | - str: The address of the proxy server to use. 37 | - list(str): A list of proxies to choose from. A different proxy will 38 | be selected from this list after failed requests, allowing rotating 39 | proxies. 40 | - callable: A function that returns a valid proxy. This function will 41 | be called after failed requests, allowing rotating proxies. 42 | no_cache : bool 43 | If True, will not use cached data. 44 | no_store : bool 45 | If True, will not store downloaded data. 46 | data_dir : Path 47 | Path to directory where data will be cached. 
48 | """ 49 | 50 | def __init__( 51 | self, 52 | leagues: Optional[Union[str, list[str]]] = None, 53 | seasons: Optional[Union[str, int, Iterable[Union[str, int]]]] = None, 54 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 55 | no_cache: bool = NOCACHE, 56 | no_store: bool = NOSTORE, 57 | data_dir: Path = FOTMOB_DATADIR, 58 | ): 59 | """Initialize the FotMob reader.""" 60 | super().__init__( 61 | leagues=leagues, 62 | proxy=proxy, 63 | no_cache=no_cache, 64 | no_store=no_store, 65 | data_dir=data_dir, 66 | ) 67 | self.seasons = seasons # type: ignore 68 | if not self.no_store: 69 | (self.data_dir / "leagues").mkdir(parents=True, exist_ok=True) 70 | (self.data_dir / "seasons").mkdir(parents=True, exist_ok=True) 71 | (self.data_dir / "matches").mkdir(parents=True, exist_ok=True) 72 | 73 | def _init_session(self) -> tls_requests.Client: 74 | session = super()._init_session() 75 | try: 76 | r = tls_requests.get("http://46.101.91.154:6006/") 77 | r.raise_for_status() 78 | except tls_requests.exceptions.HTTPError: 79 | raise ConnectionError("Unable to connect to the session cookie server.") 80 | result = r.json() 81 | session.headers.update(result) 82 | return session 83 | 84 | @property 85 | def leagues(self) -> list[str]: 86 | """Return a list of selected leagues.""" 87 | return list(self._leagues_dict.keys()) 88 | 89 | def read_leagues(self) -> pd.DataFrame: 90 | """Retrieve the selected leagues from the datasource. 
91 | 92 | Returns 93 | ------- 94 | pd.DataFrame 95 | """ 96 | url = FOTMOB_API + "allLeagues" 97 | filepath = self.data_dir / "allLeagues.json" 98 | reader = self.get(url, filepath) 99 | data = json.load(reader) 100 | leagues = [] 101 | for k, v in data.items(): 102 | if k == "international": 103 | for int_league in v[0]["leagues"]: 104 | leagues.append( 105 | { 106 | "region": v[0]["ccode"], 107 | "league_id": int_league["id"], 108 | "league": int_league["name"], 109 | "url": "https://fotmob.com" + int_league["pageUrl"], 110 | } 111 | ) 112 | elif k not in ("favourite", "popular", "userSettings"): 113 | for country in v: 114 | for dom_league in country["leagues"]: 115 | leagues.append( 116 | { 117 | "region": country["ccode"], 118 | "league": dom_league["name"], 119 | "league_id": dom_league["id"], 120 | "url": "https://fotmob.com" + dom_league["pageUrl"], 121 | } 122 | ) 123 | df = ( 124 | pd.DataFrame(leagues) 125 | .assign(league=lambda x: x.region + "-" + x.league) 126 | .pipe(self._translate_league) 127 | .set_index("league") 128 | .loc[self._selected_leagues.keys()] 129 | .sort_index() 130 | ) 131 | return df[df.index.isin(self.leagues)] 132 | 133 | def read_seasons(self) -> pd.DataFrame: 134 | """Retrieve the selected seasons for the selected leagues. 
135 | 136 | Returns 137 | ------- 138 | pd.DataFrame 139 | """ 140 | filemask = "leagues/{}.json" 141 | urlmask = FOTMOB_API + "leagues?id={}" 142 | df_leagues = self.read_leagues() 143 | seasons = [] 144 | for lkey, league in df_leagues.iterrows(): 145 | url = urlmask.format(league.league_id) 146 | filepath = self.data_dir / filemask.format(lkey) 147 | reader = self.get(url, filepath) 148 | data = json.load(reader) 149 | # extract season IDs 150 | avail_seasons = data["allAvailableSeasons"] 151 | for season in avail_seasons: 152 | seasons.append( 153 | { 154 | "league": lkey, 155 | "season": self._season_code.parse(season), 156 | "league_id": league.league_id, 157 | "season_id": season, 158 | "url": league.url + "?season=" + season, 159 | } 160 | ) 161 | # Change season id for 2122 season manually (gross) 162 | df = pd.DataFrame(seasons).set_index(["league", "season"]).sort_index() 163 | return df.loc[df.index.isin(list(itertools.product(self.leagues, self.seasons)))] 164 | 165 | def read_league_table(self, force_cache: bool = False) -> pd.DataFrame: # noqa: C901 166 | """Retrieve the league table for the selected leagues. 167 | 168 | Parameters 169 | ---------- 170 | force_cache : bool 171 | By default no cached data is used for the current season. 172 | If True, will force the use of cached data anyway. 
173 | 174 | Returns 175 | ------- 176 | pd.DataFrame 177 | """ 178 | filemask = "seasons/{}_{}.html" 179 | urlmask = FOTMOB_API + "leagues?id={}&season={}" 180 | 181 | idx = ["league", "season"] 182 | cols = ["team", "MP", "W", "D", "L", "GF", "GA", "GD", "Pts"] 183 | 184 | # get league and season IDs 185 | seasons = self.read_seasons() 186 | # collect league tables 187 | mult_tables = [] 188 | for (lkey, skey), season in seasons.iterrows(): 189 | # read html page (league overview) 190 | filepath = self.data_dir / filemask.format(lkey, skey) 191 | url = urlmask.format(season.league_id, season.season_id) 192 | current_season = not self._is_complete(lkey, skey) 193 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 194 | season_data = json.load(reader) 195 | table_data = season_data["table"][0]["data"] 196 | if "tables" in table_data: 197 | if "stage" not in idx: 198 | idx.append("stage") 199 | groups_data = table_data["tables"] 200 | all_groups = [] 201 | for i in range(len(groups_data)): 202 | group_table = pd.json_normalize(groups_data[i]["table"]["all"]) 203 | group_table["stage"] = groups_data[i]["leagueName"] 204 | all_groups.append(group_table) 205 | df_table = pd.concat(all_groups, axis=0) 206 | else: 207 | df_table = pd.json_normalize(table_data["table"]["all"]) 208 | df_table[["GF", "GA"]] = df_table["scoresStr"].str.split("-", expand=True) 209 | df_table = df_table.rename( 210 | columns={ 211 | "name": "team", 212 | "played": "MP", 213 | "wins": "W", 214 | "draws": "D", 215 | "losses": "L", 216 | "goalConDiff": "GD", 217 | "pts": "Pts", 218 | } 219 | ) 220 | df_table["league"] = lkey 221 | df_table["season"] = skey 222 | 223 | # If league has a playoff, add final playoff standing as a column 224 | if "playoff" in season_data["tabs"]: 225 | if "playoff" not in cols: 226 | cols.append("playoff") 227 | df_table["playoff"] = None 228 | # Get cup game finalists (for leagues with playoffs) 229 | playoff_rounds = 
season_data["playoff"]["rounds"] 230 | for i in range(len(playoff_rounds)): 231 | stage_teams = [] 232 | for game in playoff_rounds[i]["matchups"]: 233 | if not bool(game): 234 | continue 235 | stage = game["stage"] 236 | stage_teams.append(game["homeTeamId"]) 237 | stage_teams.append(game["awayTeamId"]) 238 | df_table.loc[df_table["id"].isin(stage_teams), "playoff"] = stage 239 | if stage == "final": 240 | winner = game["winner"] 241 | df_table.loc[df_table["id"] == winner, "playoff"] = "cup_winner" 242 | mult_tables.append(df_table) 243 | return ( 244 | pd.concat(mult_tables, axis=0) 245 | .rename(columns={"Squad": "team"}) 246 | .replace({"team": TEAMNAME_REPLACEMENTS}) 247 | .set_index(idx) 248 | .sort_index()[cols] 249 | ) 250 | 251 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 252 | """Retrieve the game schedule for the selected leagues and seasons. 253 | 254 | Parameters 255 | ---------- 256 | force_cache : bool 257 | By default no cached data is used for the current season. 258 | If True, will force the use of cached data anyway. 
259 | 260 | Returns 261 | ------- 262 | pd.DataFrame 263 | """ 264 | filemask = "seasons/{}_{}.html" 265 | urlmask = FOTMOB_API + "leagues?id={}&season={}" 266 | 267 | cols = [ 268 | "round", 269 | "week", 270 | "date", 271 | "home_team", 272 | "away_team", 273 | "home_score", 274 | "away_score", 275 | "status", 276 | "game_id", 277 | "url", 278 | ] 279 | 280 | df_seasons = self.read_seasons() 281 | all_schedules = [] 282 | for (lkey, skey), season in df_seasons.iterrows(): 283 | filepath = self.data_dir / filemask.format(lkey, skey) 284 | url = urlmask.format(season.league_id, season.season_id) 285 | current_season = not self._is_complete(lkey, skey) 286 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 287 | season_data = json.load(reader) 288 | 289 | df = pd.json_normalize(season_data["matches"]["allMatches"]) 290 | df["league"] = lkey 291 | df["season"] = skey 292 | all_schedules.append(df) 293 | 294 | # Construct the output dataframe 295 | df = ( 296 | pd.concat(all_schedules) 297 | .rename( 298 | columns={ 299 | "roundName": "round", 300 | "round": "week", 301 | "home.name": "home_team", 302 | "away.name": "away_team", 303 | "status.reason.short": "status", 304 | "pageUrl": "url", 305 | "id": "game_id", 306 | } 307 | ) 308 | .replace( 309 | { 310 | "home_team": TEAMNAME_REPLACEMENTS, 311 | "away_team": TEAMNAME_REPLACEMENTS, 312 | } 313 | ) 314 | .assign(date=lambda x: pd.to_datetime(x["status.utcTime"], format="mixed")) 315 | ) 316 | df["game"] = df.apply(make_game_id, axis=1) 317 | df["url"] = "https://fotmob.com" + df["url"] 318 | df[["home_score", "away_score"]] = df["status.scoreStr"].str.split("-", expand=True) 319 | return df.set_index(["league", "season", "game"]).sort_index()[cols] 320 | 321 | def read_team_match_stats( 322 | self, 323 | stat_type: str = "Top stats", 324 | opponent_stats: bool = True, 325 | team: Optional[Union[str, list[str]]] = None, 326 | force_cache: bool = False, 327 | ) -> pd.DataFrame: 328 | 
"""Retrieve the match stats for the selected leagues and seasons. 329 | 330 | The following stat types are available: 331 | * 'Top stats' 332 | * 'Shots' 333 | * 'Expected goals (xG)' 334 | * 'Passes' 335 | * 'Defence' 336 | * 'Duels' 337 | * 'Discipline' 338 | 339 | Parameters 340 | ---------- 341 | stat_type : str 342 | Type of stats to retrieve. 343 | opponent_stats: bool 344 | If True, will retrieve opponent stats. 345 | team: str or list of str, optional 346 | Team(s) to retrieve. If None, will retrieve all teams. 347 | force_cache : bool 348 | By default no cached data is used to scrape the list of available 349 | games for the current season. If True, will force the use of 350 | cached data anyway. 351 | 352 | Raises 353 | ------ 354 | TypeError 355 | If ``stat_type`` is not valid. 356 | ValueError 357 | If no games with the given IDs were found for the selected seasons and leagues. 358 | 359 | Returns 360 | ------- 361 | pd.DataFrame 362 | """ 363 | filemask = "matches/{}_{}_{}.html" 364 | urlmask = FOTMOB_API + "matchDetails?matchId={}" 365 | 366 | # Retrieve games for which a match report is available 367 | df_matches = self.read_schedule(force_cache) 368 | df_complete = df_matches.loc[df_matches["status"].isin(["FT", "AET", "Pen"])] 369 | 370 | if team is not None: 371 | # get alternative names of the specified team(s) 372 | teams_to_check = add_standardized_team_name(team) 373 | 374 | # select requested teams 375 | iterator = df_complete.loc[ 376 | ( 377 | df_complete.home_team.isin(teams_to_check) 378 | | df_complete.away_team.isin(teams_to_check) 379 | ) 380 | ] 381 | if len(iterator) == 0: 382 | raise ValueError("No data found for the given teams in the selected seasons.") 383 | else: 384 | iterator = df_complete 385 | teams_to_check = iterator.home_team.tolist() + iterator.away_team.tolist() 386 | 387 | stats = [] 388 | for i, game in iterator.reset_index().iterrows(): 389 | lkey, skey, gkey = game["league"], game["season"], game["game"] 390 | # Get 
data for specific game 391 | url = urlmask.format(game.game_id) 392 | filepath = self.data_dir / filemask.format(lkey, skey, game.game_id) 393 | reader = self.get(url, filepath) 394 | logger.info( 395 | "[%s/%s] Retrieving game with id=%s", 396 | i + 1, 397 | len(iterator), 398 | game["game_id"], 399 | ) 400 | game_data = json.load(reader) 401 | 402 | # Get stats types 403 | all_stats = game_data["content"]["stats"]["Periods"]["All"]["stats"] 404 | try: 405 | selected_stats = next(stat for stat in all_stats if stat["title"] == stat_type) 406 | except StopIteration: 407 | raise ValueError(f"Invalid stat type: {stat_type}") 408 | 409 | df_raw_stats = pd.DataFrame(selected_stats["stats"]) 410 | game_teams = [game.home_team, game.away_team] 411 | for i, team in enumerate(game_teams): 412 | df_team_stats = df_raw_stats.copy() 413 | df_team_stats["stat"] = df_team_stats["stats"].apply(lambda x: x[i]) # noqa: B023 414 | df_team_stats["league"] = lkey 415 | df_team_stats["season"] = skey 416 | df_team_stats["game"] = gkey 417 | df_team_stats["team"] = team 418 | if not opponent_stats: 419 | df_team_stats = df_team_stats[df_team_stats.team.isin(teams_to_check)] 420 | df_team_stats.set_index(["league", "season", "game", "team"], inplace=True) 421 | df_team_stats = df_team_stats[df_team_stats["type"] != "title"] 422 | df_team_stats = df_team_stats.pivot(columns="title", values="stat").reset_index() 423 | df_team_stats.columns.name = None 424 | stats.append(df_team_stats) 425 | 426 | df = pd.concat(stats, axis=0) 427 | df = df.set_index(["league", "season", "game", "team"]).sort_index() 428 | # Split percentage values 429 | pct_cols = [col for col in df.columns if df[col].astype(str).str.contains("%").any()] 430 | for col in pct_cols: 431 | df[[col, col + " (%)"]] = df[col].str.split(expand=True) 432 | df[col + " (%)"] = df[col + " (%)"].str.extract(r"(\d+)").astype(float).div(100) 433 | return df 434 | 
-------------------------------------------------------------------------------- /soccerdata/match_history.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://www.football-data.co.uk/data.php.""" 2 | 3 | import itertools 4 | from pathlib import Path 5 | from typing import IO, Callable, Optional, Union 6 | 7 | import pandas as pd 8 | 9 | from ._common import BaseRequestsReader, make_game_id 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 11 | 12 | MATCH_HISTORY_DATA_DIR = DATA_DIR / "MatchHistory" 13 | MATCH_HISTORY_API = "https://www.football-data.co.uk" 14 | 15 | 16 | def _parse_csv(raw_data: IO[bytes], lkey: str, skey: str) -> pd.DataFrame: 17 | logger.info("Parsing league=%s season=%s", lkey, skey) 18 | if int(skey) >= 2425: 19 | # Since 2024-25, the CSV files are encoded in UTF-8-SIG 20 | df_games = pd.read_csv( 21 | raw_data, 22 | encoding="UTF-8-SIG", 23 | on_bad_lines="warn", 24 | ) 25 | else: 26 | df_games = pd.read_csv( 27 | raw_data, 28 | encoding="latin-1", 29 | on_bad_lines="warn", 30 | ) 31 | return df_games 32 | 33 | 34 | class MatchHistory(BaseRequestsReader): 35 | """Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php. 36 | 37 | Data will be downloaded as necessary and cached locally in 38 | ``~/soccerdata/data/MatchHistory``. 39 | 40 | Parameters 41 | ---------- 42 | leagues : string or iterable 43 | IDs of leagues to include. 44 | seasons : string, int or list 45 | Seasons to include. Supports multiple formats. 46 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 47 | proxy : 'tor' or or dict or list(dict) or callable, optional 48 | Use a proxy to hide your IP address. Valid options are: 49 | - "tor": Uses the Tor network. Tor should be running in 50 | the background on port 9050. 51 | - str: The address of the proxy server to use. 52 | - list(str): A list of proxies to choose from. 
A different proxy will 53 | be selected from this list after failed requests, allowing rotating 54 | proxies. 55 | - callable: A function that returns a valid proxy. This function will 56 | be called after failed requests, allowing rotating proxies. 57 | no_cache : bool 58 | If True, will not use cached data. 59 | no_store : bool 60 | If True, will not store downloaded data. 61 | data_dir : Path, optional 62 | Path to directory where data will be cached. 63 | """ 64 | 65 | def __init__( 66 | self, 67 | leagues: Optional[Union[str, list[str]]] = None, 68 | seasons: Optional[Union[str, int, list]] = None, 69 | proxy: Optional[Union[str, list[str], Callable[[], str]]] = None, 70 | no_cache: bool = NOCACHE, 71 | no_store: bool = NOSTORE, 72 | data_dir: Path = MATCH_HISTORY_DATA_DIR, 73 | ): 74 | super().__init__( 75 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 76 | ) 77 | self.seasons = seasons # type: ignore 78 | 79 | def read_games(self) -> pd.DataFrame: 80 | """Retrieve game history for the selected leagues and seasons. 
81 | 82 | Column names are explained here: http://www.football-data.co.uk/notes.txt 83 | 84 | Returns 85 | ------- 86 | pd.DataFrame 87 | """ 88 | urlmask = MATCH_HISTORY_API + "/mmz4281/{}/{}.csv" 89 | filemask = "{}_{}.csv" 90 | col_rename = { 91 | "Div": "league", 92 | "Date": "date", 93 | "Time": "time", 94 | "HomeTeam": "home_team", 95 | "AwayTeam": "away_team", 96 | "Referee": "referee", 97 | } 98 | 99 | df_list = [] 100 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 101 | filepath = self.data_dir / filemask.format(lkey, skey) 102 | url = urlmask.format(skey, lkey) 103 | current_season = not self._is_complete(lkey, skey) 104 | 105 | reader = self.get(url, filepath, no_cache=current_season) 106 | df_games = _parse_csv(reader, lkey, skey).assign(season=skey) 107 | 108 | if "Time" not in df_games.columns: 109 | df_games["Time"] = "12:00" 110 | df_games["Time"] = df_games["Time"].fillna("12:00") 111 | df_list.append(df_games) 112 | 113 | df = ( 114 | pd.concat(df_list, sort=False) 115 | .rename(columns=col_rename) 116 | .assign( 117 | date=lambda x: pd.to_datetime( 118 | x["date"] + " " + x["time"], format="mixed", dayfirst=True 119 | ) 120 | ) 121 | .drop("time", axis=1) 122 | .pipe(self._translate_league) 123 | .replace( 124 | { 125 | "home_team": TEAMNAME_REPLACEMENTS, 126 | "away_team": TEAMNAME_REPLACEMENTS, 127 | } 128 | ) 129 | .dropna(subset=["home_team", "away_team"]) 130 | ) 131 | 132 | df = df.loc[:, ~df.columns.str.contains("^Unnamed")] 133 | df["game"] = df.apply(make_game_id, axis=1) 134 | df.set_index(["league", "season", "game"], inplace=True) 135 | df.sort_index(inplace=True) 136 | return df 137 | -------------------------------------------------------------------------------- /soccerdata/sofascore.py: -------------------------------------------------------------------------------- 1 | """Scraper for https://www.sofascore.com/.""" 2 | 3 | import itertools 4 | import json 5 | from collections.abc import 
Iterable
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Optional, Union

import pandas as pd

from ._common import BaseRequestsReader, make_game_id
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS

SOFASCORE_DATADIR = DATA_DIR / "Sofascore"
SOFASCORE_API = "https://api.sofascore.com/api/v1/"


class Sofascore(BaseRequestsReader):
    """Provides pd.DataFrames from data available at http://www.sofascore.com.

    Data will be downloaded as necessary and cached locally in
    ``~/soccerdata/data/Sofascore``.

    Parameters
    ----------
    leagues : string or iterable, optional
        IDs of Leagues to include.
    seasons : string, int or list, optional
        Seasons to include. Supports multiple formats.
        Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]
    proxy : 'tor' or dict or list(dict) or callable, optional
        Use a proxy to hide your IP address. Valid options are:
            - "tor": Uses the Tor network. Tor should be running in
              the background on port 9050.
            - str: The address of the proxy server to use.
            - list(str): A list of proxies to choose from. A different proxy will
              be selected from this list after failed requests, allowing rotating
              proxies.
            - callable: A function that returns a valid proxy. This function will
              be called after failed requests, allowing rotating proxies.
    no_cache : bool
        If True, will not use cached data.
    no_store : bool
        If True, will not store downloaded data.
    data_dir : Path
        Path to directory where data will be cached.
    """

    def __init__(
        self,
        leagues: Optional[Union[str, list[str]]] = None,
        seasons: Optional[Union[str, int, Iterable[Union[str, int]]]] = None,
        proxy: Optional[Union[str, list[str], Callable[[], str]]] = None,
        no_cache: bool = NOCACHE,
        no_store: bool = NOSTORE,
        data_dir: Path = SOFASCORE_DATADIR,
    ):
        """Initialize the Sofascore reader."""
        super().__init__(
            leagues=leagues,
            proxy=proxy,
            no_cache=no_cache,
            no_store=no_store,
            data_dir=data_dir,
        )
        self.seasons = seasons  # type: ignore
        # Pre-create the on-disk cache layout used by the read_* methods below.
        if not self.no_store:
            (self.data_dir / "leagues").mkdir(parents=True, exist_ok=True)
            (self.data_dir / "seasons").mkdir(parents=True, exist_ok=True)
            (self.data_dir / "matches").mkdir(parents=True, exist_ok=True)

    def read_leagues(self) -> pd.DataFrame:
        """Retrieve the selected leagues from the datasource.

        Returns
        -------
        pd.DataFrame
        """
        url = SOFASCORE_API + "config/unique-tournaments/EN/football"
        filepath = self.data_dir / "leagues.json"
        reader = self.get(url, filepath)
        data = json.load(reader)
        leagues = []
        for k in data["uniqueTournaments"]:
            leagues.append(
                {
                    "league_id": k["id"],
                    "league": k["name"],
                }
            )
        # Canonical league names are "<region>-<name>"; derive the region from
        # the translated name, then filter on the user's selection.
        df = (
            pd.DataFrame(leagues)
            .pipe(self._translate_league)
            .assign(region=lambda x: x["league"].str.split("-").str[0])
            .set_index("league")
            .loc[self._selected_leagues.keys()]
            .sort_index()
        )
        return df[df.index.isin(self.leagues)]

    def read_seasons(self) -> pd.DataFrame:
        """Retrieve the selected seasons for the selected leagues.

        Returns
        -------
        pd.DataFrame
        """
        filemask = "leagues/{}.json"
        seasons = []
        df_leagues = self.read_leagues()
        for lkey, league in df_leagues.iterrows():
            url = SOFASCORE_API + "unique-tournament/{}/seasons"
            filepath = self.data_dir / filemask.format(lkey)
            reader = self.get(url.format(league.league_id), filepath)
            data = json.load(reader)["seasons"]
            for season in data:
                seasons.append(
                    {
                        "league": lkey,
                        "season": self._season_code.parse(season["year"]),
                        "league_id": league.league_id,
                        "season_id": season["id"],
                    }
                )
        df = pd.DataFrame(seasons).set_index(["league", "season"]).sort_index()

        # Keep only the (league, season) pairs the user selected.
        return df.loc[df.index.isin(list(itertools.product(self.leagues, self.seasons)))]

    def read_league_table(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the league table for the selected leagues.

        Parameters
        ----------
        force_cache : bool
            By default no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
        """
        # NOTE: despite the ".html" suffix, the cached payload is JSON.
        filemask = "seasons/{}_{}.html"
        urlmask = SOFASCORE_API + "unique-tournament/{}/season/{}/standings/total"

        idx = ["league", "season"]
        cols = ["team", "MP", "W", "D", "L", "GF", "GA", "GD", "Pts"]

        seasons = self.read_seasons()
        # collect league tables
        mult_tables = []
        for (lkey, skey), season in seasons.iterrows():
            filepath = self.data_dir / filemask.format(lkey, skey)
            url = urlmask.format(season.league_id, season.season_id)
            # Never serve the still-running season from cache, unless the
            # caller explicitly forces it.
            current_season = not self._is_complete(lkey, skey)
            reader = self.get(url, filepath, no_cache=current_season and not force_cache)
            season_data = json.load(reader)
            for row in season_data["standings"][0]["rows"]:
                mult_tables.append(
                    {
                        "league": lkey,
                        "season": skey,
                        "team": row["team"]["name"],
                        "MP": row["matches"],
                        "W": row["wins"],
                        "D": row["draws"],
                        "L": row["losses"],
                        "GF": row["scoresFor"],
                        "GA": row["scoresAgainst"],
                        "GD": row["scoresFor"] - row["scoresAgainst"],
                        "Pts": row["points"],
                    }
                )
        df = (
            pd.DataFrame(mult_tables)
            .set_index(idx)
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .sort_index()[cols]
        )
        return df

    def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the game schedule for the selected leagues and seasons.

        Parameters
        ----------
        force_cache : bool
            By default no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
        """
        urlmask1 = SOFASCORE_API + "unique-tournament/{}/season/{}/rounds"
        urlmask2 = SOFASCORE_API + "unique-tournament/{}/season/{}/events/round/{}"
        filemask1 = "matches/rounds_{}_{}.json"
        filemask2 = "matches/round_matches_{}_{}_{}.json"

        cols = [
            "round",
            "week",
            "date",
            "home_team",
            "away_team",
            "home_score",
            "away_score",
            "game_id",
        ]

        df_seasons = self.read_seasons()
        all_schedules = []
        for (lkey, skey), season in df_seasons.iterrows():
            filepath1 = self.data_dir / filemask1.format(lkey, skey)
            url1 = urlmask1.format(season["league_id"], season["season_id"])
            current_season = not self._is_complete(lkey, skey)
            reader1 = self.get(url1, filepath1, no_cache=current_season and not force_cache)
            season_data = json.load(reader1)
            rounds = season_data["rounds"]

            for round in rounds:  # noqa: A001
                filepath2 = self.data_dir / filemask2.format(lkey, skey, round["round"])
                url2 = urlmask2.format(season["league_id"], season["season_id"], round["round"])
                reader2 = self.get(url2, filepath2, no_cache=current_season and not force_cache)
                match_data = json.load(reader2)
                for _match in match_data["events"]:
                    # Only status codes 100 and 0 are kept; code 100 comes
                    # with a final score, code 0 does not (presumably
                    # "finished" and "not started" -- TODO confirm against
                    # the Sofascore API).  Everything else is skipped.
                    if _match["status"]["code"] == 100 or _match["status"]["code"] == 0:
                        if _match["status"]["code"] == 100:
                            home_score = int(_match["homeScore"]["current"])
                            away_score = int(_match["awayScore"]["current"])
                        else:
                            # No score available yet.
                            home_score = float("nan")  # type: ignore
                            away_score = float("nan")  # type: ignore

                        all_schedules.append(
                            {
                                "league": lkey,
                                "season": skey,
                                "round": round["round"],
                                "week": _match["roundInfo"]["round"],
                                "date": datetime.fromtimestamp(
                                    _match["startTimestamp"], tz=timezone.utc
                                ),
                                "home_team": _match["homeTeam"]["name"],
                                "away_team": _match["awayTeam"]["name"],
                                "home_score": home_score,
                                "away_score": away_score,
                                "game_id": _match["id"],
                            }
                        )

        df = pd.DataFrame(all_schedules).replace(
            {
                "home_team": TEAMNAME_REPLACEMENTS,
                "away_team": TEAMNAME_REPLACEMENTS,
            }
        )
        df["game"] = df.apply(make_game_id, axis=1)
        return df.set_index(["league", "season", "game"]).sort_index()[cols]
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
"""Test suite for the soccerdata package."""
-------------------------------------------------------------------------------- /tests/appdata/config/league_dict.json: --------------------------------------------------------------------------------
{}
-------------------------------------------------------------------------------- /tests/appdata/config/teamname_replacements.json: --------------------------------------------------------------------------------
{
    "Manchester City": ["Man City"],
    "Olympique Marseille": ["Marseille"],
    "Valencia CF": ["Valencia"],
    "FC Bayern Munich": ["FC Bayern München"]
}
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
"""Pytest fixtures for soccerdata package."""

import pytest

import soccerdata as sd


@pytest.fixture()
def espn_seriea() -> sd.ESPN:
    """Return a correctly initialized instance of ESPN filtered by league: Serie A."""
    return sd.ESPN("ITA-Serie A", "20-21")


@pytest.fixture()
def sofifa_bundesliga() -> sd.SoFIFA:
    """Return a correctly initialized instance of SoFIFA filtered by league: Bundesliga."""
    return sd.SoFIFA("GER-Bundesliga", versions=[230012])

20 | @pytest.fixture() 21 | def fbref_ligue1() -> sd.FBref: 22 | """Return a correctly initialized instance of FBref filtered by league: Ligue 1.""" 23 | return sd.FBref("FRA-Ligue 1", "20-21") 24 | 25 | 26 | @pytest.fixture() 27 | def fotmob_laliga(): 28 | """Return a correctly initialized instance of Fotmob filtered by league: La Liga.""" 29 | return sd.FotMob("ESP-La Liga", "20-21") 30 | 31 | 32 | @pytest.fixture() 33 | def elo() -> sd.ClubElo: 34 | """Return a correctly initialized ClubElo instance.""" 35 | return sd.ClubElo() 36 | 37 | 38 | @pytest.fixture() 39 | def match_epl_5y() -> sd.MatchHistory: 40 | """Return a MatchHistory instance for the last 5 years of the EPL.""" 41 | return sd.MatchHistory("ENG-Premier League", list(range(2019, 2025))) 42 | 43 | 44 | @pytest.fixture() 45 | def whoscored() -> sd.WhoScored: 46 | """Return a correctly initialized instance of WhoScored.""" 47 | return sd.WhoScored("ENG-Premier League", "20-21", headless=False) 48 | 49 | 50 | @pytest.fixture() 51 | def understat_epl_1516() -> sd.Understat: 52 | """Return a correctly initialized instance of Understat filtered by league: Premier League.""" 53 | return sd.Understat("ENG-Premier League", "15-16") 54 | 55 | 56 | @pytest.fixture() 57 | def understat_epl_9091() -> sd.Understat: 58 | """Return a correctly initialized instance of Understat filtered by league: Premier League.""" 59 | return sd.Understat("ENG-Premier League", "90-91") 60 | 61 | 62 | @pytest.fixture() 63 | def sofascore_epl_1516() -> sd.Sofascore: 64 | """Return a correctly initialized instance of Sofascore filtered by league: Premier League.""" 65 | return sd.Sofascore("ENG-Premier League", "15-16") 66 | -------------------------------------------------------------------------------- /tests/test_ClubElo.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ClubElo.""" 2 | 3 | import time 4 | from datetime import datetime, timedelta, timezone 5 | 
from pathlib import Path 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | from soccerdata import ClubElo 11 | 12 | 13 | def test_read_by_date(elo: ClubElo) -> None: 14 | """It should return a dataframe with the ELO ratings for all clubs at the specified date.""" 15 | assert isinstance(elo.read_by_date(), pd.DataFrame) 16 | assert isinstance(elo.read_by_date("2017-04-01"), pd.DataFrame) 17 | assert isinstance(elo.read_by_date(datetime(2017, 4, 1, tzinfo=timezone.utc)), pd.DataFrame) 18 | 19 | 20 | def test_read_by_date_bad_params(elo: ClubElo) -> None: 21 | """It should raise an error if the parameters are invalid.""" 22 | with pytest.raises(ValueError, match="time data '2017' does not match format '%Y-%m-%d'"): 23 | elo.read_by_date("2017") 24 | with pytest.raises( 25 | TypeError, match="'date' must be a datetime object or string like 'YYYY-MM-DD'" 26 | ): 27 | elo.read_by_date(1 / 4) # type: ignore 28 | 29 | 30 | def test_read_team_history(elo: ClubElo) -> None: 31 | """It should return a dataframe with the ELO history for the specified club.""" 32 | assert isinstance(elo.read_team_history("Feyenoord"), pd.DataFrame) 33 | assert isinstance(elo.read_team_history("Feyenoord", 2), pd.DataFrame) 34 | assert isinstance(elo.read_team_history("Feyenoord", timedelta(days=2)), pd.DataFrame) 35 | 36 | 37 | def test_read_team_history_max_age(elo: ClubElo) -> None: 38 | """It should not use cached data if it is older than max_age.""" 39 | max_age = timedelta(milliseconds=1) 40 | assert isinstance(elo.read_team_history("Feyenoord", max_age), pd.DataFrame) 41 | update_time = ( 42 | (Path(__file__).parent / "appdata" / "data" / "ClubElo" / "Feyenoord.csv").stat().st_mtime 43 | ) 44 | current_time = time.time() 45 | assert current_time - update_time < 5 46 | 47 | 48 | def test_read_team_history_replacement(elo: ClubElo) -> None: 49 | """It should use the replacement names from teamname_replacements.json.""" 50 | assert isinstance(elo.read_team_history("Manchester City"), 
pd.DataFrame) 51 | 52 | 53 | def test_read_team_history_bad_team(elo: ClubElo) -> None:
54 |     """It should raise an error if the team is not found."""
55 |     with pytest.raises(ValueError, match="No data found for team FC Knudde"):
56 |         elo.read_team_history("FC Knudde")
57 | 
58 | 
59 | def test_read_team_history_bad_params(elo: ClubElo) -> None:
60 |     """It should raise an error if the parameters are invalid."""
61 |     with pytest.raises(TypeError, match="'max_age' must be of type int or datetime.timedelta"):
62 |         elo.read_team_history("Feyenoord", max_age=datetime.now(tz=timezone.utc))  # type: ignore
63 | 
-------------------------------------------------------------------------------- /tests/test_ESPN.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ESPN.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from soccerdata.espn import ESPN 7 | 8 | 
9 | def test_read_schedule(espn_seriea: ESPN) -> None:
10 |     """It should return a dataframe with the schedule of the season."""
11 |     assert isinstance(espn_seriea.read_schedule(), pd.DataFrame)
12 | 
13 | 
14 | def test_read_matchsheet(espn_seriea: ESPN) -> None:
15 |     """It should return a dataframe with the matchsheet data."""
16 |     assert isinstance(espn_seriea.read_matchsheet(match_id=554204), pd.DataFrame)
17 | 
18 | 
19 | def test_read_matchsheet_bad_id(espn_seriea: ESPN) -> None:
20 |     """It should raise a ValueError if the selected game is not in the specified season."""
21 |     with pytest.raises(
22 |         ValueError,
23 |         match="No games with the given IDs found for the selected seasons and leagues.",
24 |     ):
25 |         assert isinstance(espn_seriea.read_matchsheet(match_id=123), pd.DataFrame)  # NOTE(review): this assert never executes — the call is expected to raise inside pytest.raises
26 | 
27 | 
28 | def test_read_lineups(espn_seriea: ESPN) -> None:
29 |     """It should return a dataframe with the lineups."""
30 |     assert isinstance(espn_seriea.read_lineup(match_id=554204), pd.DataFrame)
31 | 
32 | 
33 | def test_id_not_in_season(espn_seriea: ESPN) -> None: 
34 | """It should raise a ValueError if the selected game is not in the specified season.""" 35 | with pytest.raises( 36 | ValueError, 37 | match="No games with the given IDs found for the selected seasons and leagues.", 38 | ): 39 | assert isinstance(espn_seriea.read_lineup(match_id=123), pd.DataFrame) 40 | -------------------------------------------------------------------------------- /tests/test_FBref.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FBref.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as sd 7 | from soccerdata.fbref import FBref, _concat 8 | 9 | 10 | def test_available_leagues() -> None: 11 | assert sd.FBref.available_leagues() == [ 12 | "Big 5 European Leagues Combined", 13 | "ENG-Premier League", 14 | "ESP-La Liga", 15 | "FRA-Ligue 1", 16 | "GER-Bundesliga", 17 | "INT-European Championship", 18 | "INT-Women's World Cup", 19 | "INT-World Cup", 20 | "ITA-Serie A", 21 | ] 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "stat_type", 26 | [ 27 | "standard", 28 | "keeper", 29 | "keeper_adv", 30 | "shooting", 31 | "passing", 32 | "passing_types", 33 | "goal_shot_creation", 34 | "defense", 35 | "possession", 36 | "playing_time", 37 | "misc", 38 | ], 39 | ) 40 | def test_read_team_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 41 | assert isinstance(fbref_ligue1.read_team_season_stats(stat_type), pd.DataFrame) 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "stat_type", 46 | [ 47 | "schedule", 48 | "shooting", 49 | "keeper", 50 | "passing", 51 | "passing_types", 52 | "goal_shot_creation", 53 | "defense", 54 | "possession", 55 | "misc", 56 | ], 57 | ) 58 | def test_read_team_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: 59 | assert isinstance(fbref_ligue1.read_team_match_stats(stat_type), pd.DataFrame) 60 | 61 | 62 | def test_read_team_match_stats_alt_names(fbref_ligue1: FBref) -> None: 63 | # Test with FBref team name 64 | assert 
isinstance( 65 | fbref_ligue1.read_team_match_stats(stat_type="schedule", team="Olympique Marseille"), 66 | pd.DataFrame, 67 | ) 68 | # Test with standardized team name 69 | assert isinstance( 70 | fbref_ligue1.read_team_match_stats(stat_type="schedule", team="Marseille"), 71 | pd.DataFrame, 72 | ) 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "stat_type", 77 | [ 78 | "standard", 79 | "shooting", 80 | "passing", 81 | "passing_types", 82 | "goal_shot_creation", 83 | "defense", 84 | "possession", 85 | "playing_time", 86 | "misc", 87 | "keeper", 88 | "keeper_adv", 89 | ], 90 | ) 91 | def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 92 | assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) 93 | 94 | 95 | def test_read_schedule(fbref_ligue1: FBref) -> None: 96 | assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame) 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "stat_type", 101 | [ 102 | "summary", 103 | "keepers", 104 | "passing", 105 | "passing_types", 106 | "defense", 107 | "possession", 108 | "misc", 109 | ], 110 | ) 111 | def test_read_player_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: 112 | assert isinstance( 113 | fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame 114 | ) 115 | 116 | 117 | def test_read_events(fbref_ligue1: FBref) -> None: 118 | assert isinstance(fbref_ligue1.read_events(match_id="796787da"), pd.DataFrame) 119 | 120 | 121 | def test_read_events_yellow_for_manager() -> None: 122 | """When a yellow card given to the manager, there is no tag.""" 123 | fbref_laliga = sd.FBref("ESP-La Liga", "23-24") 124 | events = fbref_laliga.read_events(match_id="e8867e6b") 125 | yellow_cards = events[events["event_type"] == "yellow_card"] 126 | assert "Pepe Bordalás" in yellow_cards["player1"].tolist() 127 | 128 | 129 | def test_missing_events() -> None: 130 | fbref = sd.FBref("FRA-Ligue 1", "19-20") 131 | events = 
fbref.read_events(match_id="1d845950") 132 | assert len(events) == 0 133 | 134 | 135 | def test_read_shot_events(fbref_ligue1: FBref) -> None: 136 | assert isinstance(fbref_ligue1.read_shot_events(match_id="796787da"), pd.DataFrame) 137 | 138 | 139 | def test_read_lineup(fbref_ligue1: FBref) -> None: 140 | assert isinstance(fbref_ligue1.read_lineup(match_id="796787da"), pd.DataFrame) 141 | 142 | 143 | def test_concat() -> None: 144 | df1 = pd.DataFrame( 145 | columns=pd.MultiIndex.from_tuples( 146 | [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")] 147 | ) 148 | ) 149 | df2 = pd.DataFrame( 150 | columns=pd.MultiIndex.from_tuples( 151 | [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")] 152 | ) 153 | ) 154 | df3 = pd.DataFrame( 155 | columns=pd.MultiIndex.from_tuples( 156 | [("Unnamed: a", "player"), ("Goals", "Unnamed: b"), ("Performance", "Assists")] 157 | ) 158 | ) 159 | res = _concat([df1, df2, df3], key=["player"]) 160 | assert res.columns.equals( 161 | pd.MultiIndex.from_tuples( 162 | [("player", ""), ("Performance", "Goals"), ("Performance", "Assists")] 163 | ) 164 | ) 165 | res = _concat([df3, df1, df2], key=["player"]) 166 | assert res.columns.equals( 167 | pd.MultiIndex.from_tuples( 168 | [("player", ""), ("Performance", "Goals"), ("Performance", "Assists")] 169 | ) 170 | ) 171 | 172 | 173 | def test_concat_with_forfeited_game() -> None: 174 | fbref_seriea = sd.FBref(["ITA-Serie A"], 2021) 175 | df_1 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "34e95e35"]) 176 | df_2 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "a3e10e13"]) 177 | assert isinstance(df_1, pd.DataFrame) 178 | assert isinstance(df_2, pd.DataFrame) 179 | # Regardless of the order in which the matches are read, the result should be the same. 
180 | assert df_1.columns.equals(df_2.columns) 181 | 182 | 183 | def test_combine_big5() -> None: 184 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 185 | assert len(fbref_bigfive.read_leagues(split_up_big5=False)) == 1 186 | assert len(fbref_bigfive.read_seasons(split_up_big5=False)) == 1 187 | assert len(fbref_bigfive.read_leagues(split_up_big5=True)) == 5 188 | assert len(fbref_bigfive.read_seasons(split_up_big5=True)) == 5 189 | # by default, split_up_big5 should be False 190 | assert len(fbref_bigfive.read_leagues()) == 1 191 | assert len(fbref_bigfive.read_seasons()) == 1 192 | 193 | 194 | @pytest.mark.parametrize( 195 | "stat_type", 196 | [ 197 | "standard", 198 | "keeper", 199 | # "keeper_adv", disabled because of inconsistent data on FBref 200 | "shooting", 201 | "passing", 202 | "passing_types", 203 | "goal_shot_creation", 204 | "defense", 205 | "possession", 206 | "playing_time", 207 | "misc", 208 | ], 209 | ) 210 | def test_combine_big5_team_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 211 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 212 | ligue1 = fbref_ligue1.read_team_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 213 | bigfive = fbref_bigfive.read_team_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 214 | cols = _concat([ligue1, bigfive], key=["season"]).columns 215 | ligue1.columns = cols 216 | bigfive.columns = cols 217 | pd.testing.assert_frame_equal( 218 | ligue1, 219 | bigfive, 220 | ) 221 | 222 | 223 | @pytest.mark.parametrize( 224 | "stat_type", 225 | [ 226 | "standard", 227 | "shooting", 228 | "passing", 229 | "passing_types", 230 | "goal_shot_creation", 231 | "defense", 232 | "possession", 233 | "playing_time", 234 | "misc", 235 | "keeper", 236 | "keeper_adv", 237 | ], 238 | ) 239 | def test_combine_big5_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: 240 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 241 | ligue1 = 
fbref_ligue1.read_player_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 242 | bigfive = fbref_bigfive.read_player_season_stats(stat_type).loc["FRA-Ligue 1"].reset_index() 243 | cols = _concat([ligue1, bigfive], key=["season"]).columns 244 | ligue1.columns = cols 245 | bigfive.columns = cols 246 | pd.testing.assert_frame_equal( 247 | ligue1, 248 | bigfive, 249 | ) 250 | -------------------------------------------------------------------------------- /tests/test_FotMob.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FotMob.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # import soccerdata as sd 7 | from soccerdata.fotmob import FotMob 8 | 9 | # Unittests ------------------------------------------------------------------- 10 | 11 | 12 | @pytest.mark.fails_gha() 13 | def test_read_league_table(fotmob_laliga: FotMob) -> None: 14 | assert isinstance(fotmob_laliga.read_league_table(), pd.DataFrame) 15 | 16 | 17 | @pytest.mark.fails_gha() 18 | def test_read_schedule(fotmob_laliga: FotMob) -> None: 19 | assert isinstance(fotmob_laliga.read_schedule(), pd.DataFrame) 20 | 21 | 22 | @pytest.mark.fails_gha() 23 | @pytest.mark.parametrize( 24 | "stat_type", 25 | ["Top stats", "Shots", "Expected goals (xG)", "Passes", "Defence", "Duels", "Discipline"], 26 | ) 27 | def test_read_team_match_stats(fotmob_laliga: FotMob, stat_type: str) -> None: 28 | assert isinstance( 29 | fotmob_laliga.read_team_match_stats(stat_type, team="Valencia"), pd.DataFrame 30 | ) 31 | 32 | 33 | @pytest.mark.fails_gha() 34 | def test_read_team_match_stats_alt_names(fotmob_laliga: FotMob) -> None: 35 | # Test with Fotmob team name 36 | assert isinstance( 37 | fotmob_laliga.read_team_match_stats(stat_type="Top stats", team="Valencia"), pd.DataFrame 38 | ) 39 | # Test with standardized team name 40 | assert isinstance( 41 | fotmob_laliga.read_team_match_stats(stat_type="Top stats", team="Valencia CF"), 42 | 
pd.DataFrame, 43 | ) 44 | -------------------------------------------------------------------------------- /tests/test_Integration.py: -------------------------------------------------------------------------------- 1 | """Integration tests for soccerdata package.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # TODO: integration tests 9 | # Names of common leagues equal for all classes 10 | # Number of clubs equal for all common leagues over classes 11 | # Clubnames equal for all common leagues over classes 12 | # Number of games equal for all common leagues/seasons over classes 13 | # Scores per game equal for all common leagues over classes 14 | 15 | 16 | @pytest.mark.e2e() 17 | def test_mh_vs_elo(): 18 | """We should be able to retrieve the Elo history for all teams in these leagues.""" 19 | league_sel = [ 20 | "ENG-Premier League", 21 | "ESP-La Liga", 22 | "FRA-Ligue 1", 23 | "GER-Bundesliga", 24 | "ITA-Serie A", 25 | ] 26 | 27 | mh = foo.MatchHistory(leagues=league_sel, seasons="1819") 28 | mh_games = mh.read_games() 29 | 30 | elo = foo.ClubElo() 31 | elo_hist = pd.concat([elo.read_team_history(team) for team in set(mh_games["home_team"])]) 32 | 33 | assert set(mh_games["home_team"]) - set(elo_hist["team"]) == set() 34 | -------------------------------------------------------------------------------- /tests/test_MatchHistory.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.MatchHistory.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.match_history import MatchHistory 6 | 7 | 8 | def test_read_games(match_epl_5y: MatchHistory) -> None: 9 | """It should return a DataFrame with all games from the selected leagues and seasons.""" 10 | df = match_epl_5y.read_games() 11 | assert isinstance(df, pd.DataFrame) 12 | assert len(df.index.get_level_values("season").unique()) == 5 13 | assert len(df) > 0 14 | assert not any("" in c for c in 
df.columns) 15 | -------------------------------------------------------------------------------- /tests/test_SoFIFA.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.SoFIFA.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.sofifa import SoFIFA 6 | 7 | 8 | def test_read_players(sofifa_bundesliga: SoFIFA) -> None: 9 | """It should use the replacement names from teamname_replacements.json.""" 10 | assert isinstance(sofifa_bundesliga.read_players(team="FC Bayern München"), pd.DataFrame) 11 | 12 | 13 | def test_read_players_replacement(sofifa_bundesliga: SoFIFA) -> None: 14 | """It should use the replacement names from teamname_replacements.json.""" 15 | assert isinstance(sofifa_bundesliga.read_players(team="FC Bayern Munich"), pd.DataFrame) 16 | 17 | 18 | def test_read_team_ratings(sofifa_bundesliga: SoFIFA) -> None: 19 | """It should return a dataframe with the team ratings.""" 20 | assert isinstance(sofifa_bundesliga.read_team_ratings(), pd.DataFrame) 21 | 22 | 23 | def test_read_player_ratings(sofifa_bundesliga: SoFIFA) -> None: 24 | """It should return a dataframe with the player ratings.""" 25 | assert isinstance(sofifa_bundesliga.read_player_ratings(player=189596), pd.DataFrame) 26 | -------------------------------------------------------------------------------- /tests/test_Sofascore.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.Sofascore.""" 2 | 3 | import pandas as pd 4 | 5 | from soccerdata.sofascore import Sofascore 6 | 7 | 8 | def test_read_leagues(sofascore_epl_1516: Sofascore) -> None: 9 | leagues = sofascore_epl_1516.read_leagues() 10 | assert isinstance(leagues, pd.DataFrame) 11 | assert len(leagues) == 1 12 | 13 | 14 | def test_read_seasons(sofascore_epl_1516: Sofascore) -> None: 15 | seasons = sofascore_epl_1516.read_seasons() 16 | assert isinstance(seasons, pd.DataFrame) 17 | assert len(seasons) 
== 1 18 | 19 | 20 | def test_read_seasons_empty() -> None: 21 | sofascore_instance = Sofascore("ENG-Premier League", "90-91") 22 | seasons = sofascore_instance.read_seasons() 23 | assert isinstance(seasons, pd.DataFrame) 24 | assert len(seasons) == 0 25 | 26 | 27 | def test_read_schedule(sofascore_epl_1516: Sofascore) -> None: 28 | schedule = sofascore_epl_1516.read_schedule() 29 | assert isinstance(schedule, pd.DataFrame) 30 | assert len(schedule) == 380 31 | 32 | 33 | def test_read_league_table(sofascore_epl_1516: Sofascore) -> None: 34 | league_table = sofascore_epl_1516.read_league_table() 35 | assert isinstance(league_table, pd.DataFrame) 36 | assert len(league_table) == 20 37 | -------------------------------------------------------------------------------- /tests/test_Understat.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.Understat.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from soccerdata.understat import Understat 7 | 8 | 9 | def test_read_leagues(understat_epl_1516: Understat) -> None: 10 | leagues = understat_epl_1516.read_leagues() 11 | assert isinstance(leagues, pd.DataFrame) 12 | assert len(leagues) == 1 13 | 14 | 15 | def test_read_seasons(understat_epl_1516: Understat) -> None: 16 | seasons = understat_epl_1516.read_seasons() 17 | assert isinstance(seasons, pd.DataFrame) 18 | assert len(seasons) == 1 19 | 20 | 21 | def test_read_seasons_empty(understat_epl_9091: Understat) -> None: 22 | seasons = understat_epl_9091.read_seasons() 23 | assert isinstance(seasons, pd.DataFrame) 24 | assert len(seasons) == 0 25 | 26 | 27 | def test_read_schedule(understat_epl_1516: Understat) -> None: 28 | schedule = understat_epl_1516.read_schedule() 29 | assert isinstance(schedule, pd.DataFrame) 30 | assert len(schedule) == 380 31 | 32 | 33 | def test_read_team_match_stats(understat_epl_1516: Understat) -> None: 34 | team_match_stats = 
understat_epl_1516.read_team_match_stats() 35 | assert isinstance(team_match_stats, pd.DataFrame) 36 | assert len(team_match_stats) == 380 37 | 38 | 39 | def test_read_player_season_stats(understat_epl_1516: Understat) -> None: 40 | player_season_stats = understat_epl_1516.read_player_season_stats() 41 | assert isinstance(player_season_stats, pd.DataFrame) 42 | assert len(player_season_stats) == 550 43 | 44 | 45 | def test_read_player_match_stats(understat_epl_1516: Understat) -> None: 46 | player_match_stats = understat_epl_1516.read_player_match_stats() 47 | assert isinstance(player_match_stats, pd.DataFrame) 48 | 49 | 50 | def test_read_player_match_stats_new_columns(understat_epl_1516: Understat) -> None: 51 | player_match_stats = understat_epl_1516.read_player_match_stats() 52 | assert "assists" in player_match_stats.columns 53 | assert "key_passes" in player_match_stats.columns 54 | assert "yellow_cards" in player_match_stats.columns 55 | assert "red_cards" in player_match_stats.columns 56 | 57 | 58 | def test_read_shots(understat_epl_1516: Understat) -> None: 59 | shots_all = understat_epl_1516.read_shot_events() 60 | assert isinstance(shots_all, pd.DataFrame) 61 | assert len(shots_all) == 9_819 62 | shots_utd_bou = understat_epl_1516.read_shot_events(460) 63 | assert isinstance(shots_utd_bou, pd.DataFrame) 64 | assert len(shots_utd_bou) == 20 65 | with pytest.raises( 66 | ValueError, match="No matches found with the given IDs in the selected seasons." 
67 | ): 68 | understat_epl_1516.read_shot_events(42) 69 | -------------------------------------------------------------------------------- /tests/test_Whoscored.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.WhoScored.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | @pytest.mark.fails_gha() 10 | def test_whoscored_missing_players(whoscored): 11 | assert isinstance(whoscored.read_missing_players(1485184), pd.DataFrame) 12 | 13 | 14 | @pytest.mark.fails_gha() 15 | def test_whoscored_events(whoscored): 16 | assert isinstance(whoscored.read_events(1485184), pd.DataFrame) 17 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._common.""" 2 | 3 | import json 4 | from datetime import datetime, timezone 5 | 6 | import pandas as pd 7 | import pytest 8 | import time_machine 9 | 10 | import soccerdata 11 | from soccerdata._common import ( 12 | BaseRequestsReader, 13 | SeasonCode, 14 | add_alt_team_names, 15 | add_standardized_team_name, 16 | make_game_id, 17 | standardize_colnames, 18 | ) 19 | 20 | # _download_and_save 21 | 22 | 23 | def test_download_and_save_not_cached(tmp_path): 24 | reader = BaseRequestsReader() 25 | url = "http://api.clubelo.com/Barcelona" 26 | filepath = tmp_path / "Barcelona.csv" 27 | data = reader._download_and_save(url, filepath) 28 | assert isinstance(pd.read_csv(data), pd.DataFrame) 29 | 30 | 31 | def test_download_and_save_cached(tmp_path): 32 | reader = BaseRequestsReader() 33 | url = "http://api.clubelo.com/Barcelona" 34 | filepath = tmp_path / "Barcelona.csv" 35 | data = reader._download_and_save(url, filepath) 36 | data = reader._download_and_save(url, filepath) 37 | assert isinstance(pd.read_csv(data), 
pd.DataFrame) 38 | 39 | 40 | def test_download_and_save_no_cache(tmp_path): 41 | reader = BaseRequestsReader(no_cache=True) 42 | url = "http://api.clubelo.com/Barcelona" 43 | filepath = tmp_path / "Barcelona.csv" 44 | filepath.write_text("bogus") 45 | data = reader._download_and_save(url, filepath) 46 | assert len(pd.read_csv(data)) > 1 47 | 48 | 49 | def test_download_and_save_no_store_no_filepath(): 50 | reader = BaseRequestsReader(no_store=True) 51 | url = "http://api.clubelo.com/Barcelona" 52 | data = reader._download_and_save(url, filepath=None) 53 | assert isinstance(pd.read_csv(data), pd.DataFrame) 54 | 55 | 56 | def test_download_and_save_no_cache_filepath(tmp_path): 57 | reader = BaseRequestsReader(no_store=True) 58 | url = "http://api.clubelo.com/Barcelona" 59 | filepath = tmp_path / "Barcelona.csv" 60 | data = reader._download_and_save(url, filepath) 61 | assert isinstance(pd.read_csv(data), pd.DataFrame) 62 | assert not filepath.exists() 63 | 64 | 65 | def test_download_and_save_variable_no_store_no_filepath(): 66 | reader = BaseRequestsReader(no_store=True) 67 | url = "https://understat.com/" 68 | data = reader._download_and_save(url, filepath=None, var="statData") 69 | stats = json.load(data) 70 | assert isinstance(stats, dict) 71 | assert "statData" in stats 72 | 73 | 74 | # def test_download_and_save_requests_tor(tmp_path): 75 | # url = "https://check.torproject.org/api/ip" 76 | # reader = BaseRequestsReader(proxy=None) 77 | # ip_without_proxy = reader.get(url, tmp_path / "myip.txt") 78 | # ip_without_proxy = json.load(ip_without_proxy) 79 | # proxy_reader = BaseRequestsReader(proxy="tor") 80 | # ip_with_proxy = proxy_reader.get(url, tmp_path / "myproxyip.txt") 81 | # ip_with_proxy = json.load(ip_with_proxy) 82 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 83 | # assert ip_with_proxy["IsTor"] 84 | # 85 | # 86 | # def test_download_and_save_selenium_tor(tmp_path): 87 | # url = "https://check.torproject.org/api/ip" 88 | # reader = 
BaseSeleniumReader(proxy=None).get(url, tmp_path / "myip.txt") 89 | # ip_without_proxy = html.parse(reader).xpath("//pre")[0].text 90 | # ip_without_proxy = json.loads(ip_without_proxy) 91 | # proxy_reader = BaseSeleniumReader(proxy="tor").get(url, tmp_path / "myproxyip.txt") 92 | # ip_with_proxy = html.parse(proxy_reader).xpath("//pre")[0].text 93 | # ip_with_proxy = json.loads(ip_with_proxy) 94 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 95 | # assert ip_with_proxy["IsTor"] 96 | # 97 | 98 | # make_game_id 99 | 100 | 101 | def test_make_game_id(): 102 | s = pd.Series( 103 | { 104 | "date": datetime(1993, 7, 30, tzinfo=timezone.utc), 105 | "home_team": "Barcelona", 106 | "away_team": "Real Madrid", 107 | } 108 | ) 109 | game_id = make_game_id(s) 110 | assert game_id == "1993-07-30 Barcelona-Real Madrid" 111 | 112 | 113 | # add_alt_team_names 114 | 115 | 116 | def test_add_alt_team_names(): 117 | # "Valencia" is replaced by "Valencia CF" 118 | assert add_alt_team_names("Valencia CF") == {"Valencia", "Valencia CF"} 119 | # "Real Madrid" is not replaced 120 | assert add_alt_team_names("Real Madrid") == {"Real Madrid"} 121 | 122 | 123 | def test_add_standardize_team_name(): 124 | # "Valencia" is replaced by "Valencia CF" 125 | assert add_standardized_team_name("Valencia") == {"Valencia", "Valencia CF"} 126 | # "Real Madrid" is not replaced 127 | assert add_standardized_team_name("Real Madrid") == {"Real Madrid"} 128 | 129 | 130 | # standardize_colnames 131 | 132 | 133 | def test_standardize_colnames(): 134 | df = pd.DataFrame( 135 | columns=[ 136 | "First Test", 137 | "SecondTest", 138 | "thirdTest", 139 | "Fourthtest", 140 | "Fifth-test", 141 | "TestSix", 142 | ] 143 | ) 144 | df = standardize_colnames( 145 | df, cols=["First Test", "SecondTest", "thirdTest", "Fourthtest", "Fifth-test"] 146 | ) 147 | assert df.columns.tolist() == [ 148 | "first_test", 149 | "second_test", 150 | "third_test", 151 | "fourthtest", 152 | "fifth_test", 153 | "TestSix", 154 | ] 
155 | 156 | 157 | # is_complete 158 | 159 | 160 | def test_is_complete(): 161 | reader = BaseRequestsReader(no_store=True) 162 | with time_machine.travel(datetime(2020, 12, 25, 1, 24, tzinfo=timezone.utc)): 163 | assert reader._is_complete("ENG-Premier League", "1920") 164 | assert not reader._is_complete("ENG-Premier League", "2021") 165 | with time_machine.travel(datetime(2021, 2, 25, 1, 24, tzinfo=timezone.utc)): 166 | assert reader._is_complete("ENG-Premier League", "1920") 167 | assert not reader._is_complete("ENG-Premier League", "2021") 168 | with time_machine.travel(datetime(2021, 7, 1, 1, 24, tzinfo=timezone.utc)): 169 | assert reader._is_complete("ENG-Premier League", "1920") 170 | assert reader._is_complete("ENG-Premier League", "2021") 171 | assert not reader._is_complete("ENG-Premier League", "2122") 172 | 173 | 174 | def test_is_complete_default_value(mocker): 175 | mocker.patch.object(soccerdata._common, "LEAGUE_DICT", {"FAKE-Dummy League": {}}) 176 | reader = BaseRequestsReader(no_store=True) 177 | with time_machine.travel(datetime(2020, 12, 25, 1, 24, tzinfo=timezone.utc)): 178 | assert reader._is_complete("FAKE-Dummy League", "1920") 179 | 180 | 181 | def test_is_complete_undefined_league(mocker): # noqa: ARG001 182 | reader = BaseRequestsReader(no_store=True) 183 | with pytest.raises( 184 | ValueError, 185 | match="Invalid league 'FAKE-Dummy League'", 186 | ): 187 | reader._is_complete("FAKE-Dummy League", "1920") 188 | 189 | 190 | # Season codes 191 | def test_season_pattern1a(): 192 | assert SeasonCode.MULTI_YEAR.parse("9495") == "9495" 193 | assert SeasonCode.SINGLE_YEAR.parse("9495") == "1994" 194 | 195 | 196 | def test_season_pattern1a_warn(): 197 | with pytest.warns(UserWarning) as record: 198 | assert SeasonCode.MULTI_YEAR.parse("2021") == "2021" 199 | 200 | # check that only one warning was raised 201 | assert len(record) == 1 202 | # check that the message matches 203 | msg = 'Season id "2021" is ambiguous: interpreting as "20-21"' 204 | 
assert record[0].message.args[0] == msg # type: ignore 205 | 206 | 207 | def test_season_pattern1b(): 208 | my_season = check_post = "1998" 209 | assert SeasonCode.MULTI_YEAR.parse(my_season) == "9899" 210 | assert SeasonCode.SINGLE_YEAR.parse(my_season) == "1998" 211 | assert my_season == check_post 212 | 213 | 214 | def test_season_pattern1c(): 215 | assert SeasonCode.MULTI_YEAR.parse("1999") == "9900" 216 | assert SeasonCode.SINGLE_YEAR.parse("1999") == "1999" 217 | 218 | 219 | def test_season_pattern2(): 220 | assert SeasonCode.MULTI_YEAR.parse("11") == "1112" 221 | assert SeasonCode.SINGLE_YEAR.parse("11") == "2011" 222 | assert SeasonCode.MULTI_YEAR.parse("99") == "9900" 223 | assert SeasonCode.SINGLE_YEAR.parse("99") == "1999" 224 | 225 | 226 | def test_season_pattern3(): 227 | assert SeasonCode.MULTI_YEAR.parse("2011-2012") == "1112" 228 | assert SeasonCode.SINGLE_YEAR.parse("2011-2012") == "2011" 229 | assert SeasonCode.MULTI_YEAR.parse("1999-2000") == "9900" 230 | assert SeasonCode.SINGLE_YEAR.parse("1999-2000") == "1999" 231 | 232 | 233 | def test_season_pattern4(): 234 | assert SeasonCode.MULTI_YEAR.parse("2011-12") == "1112" 235 | assert SeasonCode.SINGLE_YEAR.parse("2011-12") == "2011" 236 | assert SeasonCode.MULTI_YEAR.parse("1999-00") == "9900" 237 | assert SeasonCode.SINGLE_YEAR.parse("1999-00") == "1999" 238 | 239 | 240 | def test_season_pattern5(): 241 | assert SeasonCode.MULTI_YEAR.parse("13-14") == "1314" 242 | assert SeasonCode.SINGLE_YEAR.parse("13-14") == "2013" 243 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._config.""" 2 | 3 | import json 4 | import logging 5 | from importlib import reload 6 | 7 | from soccerdata import _config as conf 8 | 9 | 10 | def test_env_soccerdata_dir(monkeypatch, tmp_path): 11 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 12 | 
reload(conf) 13 | assert tmp_path == conf.BASE_DIR 14 | 15 | 16 | def test_env_nocache(monkeypatch): 17 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "t") 18 | reload(conf) 19 | assert conf.NOCACHE is True 20 | 21 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "true") 22 | reload(conf) 23 | assert conf.NOCACHE is True 24 | 25 | monkeypatch.setenv("SOCCERDATA_NOCACHE", "f") 26 | reload(conf) 27 | assert conf.NOCACHE is False 28 | 29 | 30 | def test_env_nostore(monkeypatch): 31 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "t") 32 | reload(conf) 33 | assert conf.NOSTORE is True 34 | 35 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "true") 36 | reload(conf) 37 | assert conf.NOSTORE is True 38 | 39 | monkeypatch.setenv("SOCCERDATA_NOSTORE", "f") 40 | reload(conf) 41 | assert conf.NOSTORE is False 42 | 43 | 44 | def test_env_loglevel(monkeypatch): 45 | monkeypatch.setenv("SOCCERDATA_LOGLEVEL", "DEBUG") 46 | reload(conf) 47 | assert conf.logger.level == logging.DEBUG 48 | 49 | 50 | def test_read_teamnname_replacements(monkeypatch, tmp_path): 51 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 52 | # no teamname_replacements.json 53 | reload(conf) 54 | assert {} == conf.TEAMNAME_REPLACEMENTS 55 | fp = tmp_path / "config" / "teamname_replacements.json" 56 | with fp.open("w", encoding="utf8") as outfile: 57 | json.dump({"Celta de Vigo": ["Celta Vigo", "Celta"]}, outfile) 58 | # correctly parse teamname_replacements.json 59 | reload(conf) 60 | assert { 61 | "Celta Vigo": "Celta de Vigo", 62 | "Celta": "Celta de Vigo", 63 | } == conf.TEAMNAME_REPLACEMENTS 64 | 65 | 66 | def test_read_league_dict(monkeypatch, tmp_path): 67 | monkeypatch.setenv("SOCCERDATA_DIR", str(tmp_path)) 68 | # no league_dict.json 69 | reload(conf) 70 | nb_default = len(conf.LEAGUE_DICT) 71 | fp = tmp_path / "config" / "league_dict.json" 72 | with fp.open("w", encoding="utf8") as outfile: 73 | json.dump({"ABC-Fake": {"WhoScored": "Fake"}}, outfile) 74 | # correctly parse league_dict.json 75 | reload(conf) 76 | 
assert len(conf.LEAGUE_DICT) == nb_default + 1 77 | assert conf.LEAGUE_DICT["ABC-Fake"] == {"WhoScored": "Fake"} 78 | --------------------------------------------------------------------------------