├── docs ├── license.rst ├── contributing.rst ├── requirements.txt ├── _static │ ├── logo.png │ ├── logo2.png │ ├── favicon.ico │ └── default.css ├── reference │ ├── espn.rst │ ├── sofifa.rst │ ├── matchhistory.rst │ ├── clubelo.rst │ ├── whoscored.rst │ ├── fivethirtyeight.rst │ ├── fbref.rst │ └── index.rst ├── output.csv ├── conf.py ├── datasources │ ├── index.rst │ ├── ClubElo.ipynb │ └── SoFIFA.ipynb ├── index.rst └── usage.rst ├── tests ├── __init__.py ├── test_SoFIFA.py ├── test_MatchHistory.py ├── test_Whoscored.py ├── test_ESPN.py ├── test_Integration.py ├── conftest.py ├── test_ClubElo.py ├── test_config.py ├── test_FiveThirtyEight.py ├── test_FBref.py └── test_common.py ├── .github ├── renovate.json └── workflows │ ├── constraints.txt │ ├── release.yml │ └── ci.yml ├── .bumpversion.cfg ├── soccerdata ├── __init__.py ├── match_history.py ├── _config.py ├── clubelo.py ├── fivethirtyeight.py ├── sofifa.py ├── espn.py ├── _common.py └── fbref.py ├── .readthedocs.yml ├── Makefile ├── .gitignore ├── setup.cfg ├── LICENSE.rst ├── .pre-commit-config.yaml ├── pyproject.toml ├── README.rst ├── noxfile.py └── CONTRIBUTING.rst /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../LICENSE.rst 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the soccerdata package.""" 2 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | .. include:: ../CONTRIBUTING.rst 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2022.9.29 2 | sphinx==4.5.0 3 | nbsphinx==0.8.9 4 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/logo2.png -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":automergePatch"], 3 | "stabilityDays": 7 4 | } 5 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/favicon.ico -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | pip==21.3.1 2 | nox==2021.10.1 3 | nox-poetry==0.9.0 4 | poetry==1.1.12 5 | virtualenv==20.10.0 6 | -------------------------------------------------------------------------------- /docs/reference/espn.rst: 
-------------------------------------------------------------------------------- 1 | .. _api-espn: 2 | 3 | ESPN 4 | ===== 5 | 6 | .. autoclass:: soccerdata.ESPN 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/sofifa.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofifa: 2 | 3 | SoFIFA 4 | ======== 5 | 6 | .. autoclass:: soccerdata.SoFIFA 7 | :members: available_leagues, read_ratings 8 | -------------------------------------------------------------------------------- /docs/reference/matchhistory.rst: -------------------------------------------------------------------------------- 1 | .. _api-matchhistory: 2 | 3 | MatchHistory 4 | ============= 5 | 6 | .. autoclass:: soccerdata.MatchHistory 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/clubelo.rst: -------------------------------------------------------------------------------- 1 | .. _api-clubelo: 2 | 3 | Club Elo 4 | ======== 5 | 6 | .. autoclass:: soccerdata.ClubElo 7 | :inherited-members: available_leagues 8 | :members: read_by_date, read_team_history 9 | -------------------------------------------------------------------------------- /docs/reference/whoscored.rst: -------------------------------------------------------------------------------- 1 | .. _api-whoscored: 2 | 3 | WhoScored 4 | ========= 5 | 6 | .. autoclass:: soccerdata.WhoScored 7 | :members: available_leagues, read_schedule, read_missing_players, read_events 8 | -------------------------------------------------------------------------------- /docs/reference/fivethirtyeight.rst: -------------------------------------------------------------------------------- 1 | .. _api-fivethirtyeight: 2 | 3 | FiveThirtyEight 4 | =============== 5 | 6 | .. autoclass:: soccerdata.FiveThirtyEight 7 | :members: available_leagues, read_games, read_forecasts, read_clinches 8 | -------------------------------------------------------------------------------- /docs/reference/fbref.rst: -------------------------------------------------------------------------------- 1 | .. _api-fbref: 2 | 3 | FBref 4 | ===== 5 | 6 | .. 
autoclass:: soccerdata.FBref 7 | :members: available_leagues, read_team_season_stats, read_player_season_stats, 8 | read_schedule, read_player_match_stats, read_lineup, read_shot_events 9 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.1.0 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:soccerdata/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | -------------------------------------------------------------------------------- /tests/test_SoFIFA.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.SoFIFA.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | @pytest.mark.fails_gha 10 | def test_sofifa_ratings(sofifa_bundesliga): 11 | assert isinstance(sofifa_bundesliga.read_ratings(), pd.DataFrame) 12 | -------------------------------------------------------------------------------- /tests/test_MatchHistory.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.MatchHistory.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | # Reader 8 | def test_epl_2y(match_epl_2y): 9 | df = match_epl_2y.read_games() 10 | assert isinstance(df, pd.DataFrame) 11 | assert len(df.index.get_level_values("season").unique()) == 2 12 | -------------------------------------------------------------------------------- /tests/test_Whoscored.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.WhoScored.""" 2 | import pandas as pd 3 | import pytest 4 | 5 | # Unittests ------------------------------------------------------------------- 6 | 7 | 8 | @pytest.mark.fails_gha 9 | def test_whoscored_missing_players(whoscored): 10 | assert isinstance(whoscored.read_missing_players(1485184), pd.DataFrame) 11 | 12 | 13 | @pytest.mark.fails_gha 14 | def test_whoscored_events(whoscored): 15 | assert isinstance(whoscored.read_events(1485184), pd.DataFrame) 16 | -------------------------------------------------------------------------------- /soccerdata/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of tools to read and process soccer data from various sources.""" 2 | 3 | __version__ = '1.1.0' 4 | 5 | __all__ = [ 6 | 'FiveThirtyEight', 7 | 'ClubElo', 8 | 'MatchHistory', 9 | 'FBref', 10 | 'ESPN', 11 | 'WhoScored', 12 | 'SoFIFA', 13 | ] 14 | 15 | from .clubelo import ClubElo 16 | from .espn import ESPN 17 | from .fbref import FBref 18 | from .fivethirtyeight import FiveThirtyEight 19 | from .match_history import MatchHistory 20 | from .sofifa import SoFIFA 21 | from .whoscored import WhoScored 22 | -------------------------------------------------------------------------------- /docs/output.csv: -------------------------------------------------------------------------------- 1 | league,season,team,#Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG 2 | ENG-Premier 
League,2021,Arsenal,29,38.0,53,455,141,31.0,11.97,3.71,0.1,0.33,16.9,23,6,6,53.5,49.0,0.11,-0.5,-2.0 3 | ,,Aston Villa,24,38.0,52,518,179,34.6,13.63,4.71,0.09,0.26,16.5,15,5,6,52.9,48.5,0.1,-0.9,-1.5 4 | ,,Brighton,27,38.0,39,476,129,27.1,12.53,3.39,0.07,0.26,16.6,14,6,9,51.6,44.8,0.1,-12.6,-11.8 5 | ,,Burnley,25,38.0,32,383,125,32.6,10.08,3.29,0.08,0.23,16.6,15,3,3,39.9,37.6,0.1,-7.9,-8.6 6 | ,,Chelsea,27,38.0,56,553,194,35.1,14.55,5.11,0.09,0.25,16.3,16,8,10,64.0,56.4,0.1,-8.0,-8.4 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: all 14 | 15 | # Optionally set the version of Python and requirements required to build your docs 16 | python: 17 | version: 3.7 18 | install: 19 | - requirements: docs/requirements.txt 20 | - method: pip 21 | path: . 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init test lint pretty 2 | 3 | BIN = .venv/bin/ 4 | CODE = soccerdata 5 | PY = 3.9 6 | 7 | init: 8 | python3 -m venv .venv 9 | poetry install 10 | 11 | test: 12 | nox -rs tests-$(PY) -- $(args) 13 | 14 | mypy: 15 | nox -rs mypy-$(PY) -- $(args) 16 | 17 | lint: 18 | nox -rs pre-commit -- $(args) 19 | 20 | precommit_install: 21 | nox -rs pre-commit -- install 22 | 23 | bump_major: 24 | $(BIN)bumpversion major 25 | 26 | bump_minor: 27 | $(BIN)bumpversion minor 28 | 29 | bump_patch: 30 | $(BIN)bumpversion patch 31 | 32 | clean: 33 | find . -type f -name "*.py[co]" -delete 34 | find . 
-type d -name "__pycache__" -delete 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | logs 3 | config 4 | notebooks/data 5 | notebooks_priv 6 | 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | 35 | # Translations 36 | *.mo 37 | 38 | # Data 39 | .ipynb_checkpoints 40 | 41 | # Sphinx documentation 42 | docs/_build/ 43 | docs/modules/generated/ 44 | 45 | # Hidden files 46 | .* 47 | 48 | # ...except these 49 | !.gitignore 50 | !.travis.yml 51 | -------------------------------------------------------------------------------- /tests/test_ESPN.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ESPN.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | def test_espn_schedule(espn_seriea): 10 | assert isinstance(espn_seriea.read_schedule(), pd.DataFrame) 11 | 12 | 13 | def test_espn_matchsheet(espn_seriea): 14 | assert isinstance(espn_seriea.read_matchsheet(554204), pd.DataFrame) 15 | 16 | 17 | def test_espn_lineups(espn_seriea): 18 | assert isinstance(espn_seriea.read_lineup(554204), pd.DataFrame) 19 | 20 | 21 | def test_espn_id_not_in_season(espn_seriea): 22 | with pytest.raises(ValueError): 23 | assert isinstance(espn_seriea.read_lineup(123), pd.DataFrame) 24 | -------------------------------------------------------------------------------- /docs/_static/default.css: -------------------------------------------------------------------------------- 1 | .toctree-l1 a:active, 2 | .toctree-l1 a:hover { 3 | background-color: #676767; 4 | } 5 | 6 | .sidebar-logo { 7 | max-width: 100%; 8 | } 9 | 10 | .sidebar-drawer { 11 | width: calc(50% - 25em); 12 | min-width: 22em; 13 | } 14 | 15 | .sidebar-drawer .sidebar-container { 16 | width: 23em; 17 | } 18 | 19 | li.toctree-l2 { 20 | font-size: 80%; 21 | } 22 | 23 | @media (max-width: 67em) { 24 | .sidebar-drawer { 25 | width: 22em; 26 | left: -22em; 27 | } 28 | .sidebar-drawer .sidebar-container { 29 | width: 22em; 30 | } 31 | li.toctree-l2 { 32 | font-size: 75%; 33 | } 34 | } 35 | 36 | /* autosummary table text */ 37 | article .align-center, 38 | article .align-default { 39 | text-align: left; 40 | } 41 | -------------------------------------------------------------------------------- /docs/reference/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _api: 3 | 4 | .. currentmodule:: soccerdata 5 | 6 | API 7 | === 8 | 9 | .. list-table:: 10 | :widths: 30 70 11 | 12 | * - :ref:`Club Elo <api-clubelo>` 13 | - ClubElo reader. 14 | * - :ref:`ESPN <api-espn>` 15 | - ESPN reader. 16 | * - :ref:`FBref <api-fbref>` 17 | - FBref reader. 18 | * - :ref:`FiveThirtyEight <api-fivethirtyeight>` 19 | - FiveThirtyEight reader. 20 | * - :ref:`MatchHistory <api-matchhistory>` 21 | - Football-data.co.uk reader. 22 | * - :ref:`SoFIFA <api-sofifa>` 23 | - SoFIFA reader. 24 | * - :ref:`WhoScored <api-whoscored>` 25 | - WhoScored reader. 26 | 27 | ..
toctree:: 28 | :hidden: 29 | 30 | clubelo 31 | espn 32 | fbref 33 | fivethirtyeight 34 | matchhistory 35 | sofifa 36 | whoscored 37 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | project = "soccerdata" 3 | author = "Pieter Robberechts" 4 | copyright = f"2021, {author}" 5 | extensions = [ 6 | "sphinx.ext.autodoc", 7 | "sphinx.ext.napoleon", 8 | "nbsphinx", 9 | ] 10 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 11 | autodoc_typehints = "description" 12 | autodoc_member_order = "bysource" 13 | 14 | # -- Options for HTML output ------------------------------------------------- 15 | 16 | html_theme = "furo" 17 | html_logo = "_static/logo2.png" 18 | html_favicon = "_static/favicon.ico" 19 | html_theme_options = { 20 | "sidebar_hide_name": True, 21 | "light_css_variables": { 22 | "color-brand-primary": "#2F3C7E", 23 | "color-brand-content": "#2F3C7E", 24 | "color-sidebar-background": "#fdf3f4", 25 | # "color-api-name": "#7bb5b2", 26 | # "color-api-pre-name": "#7bb5b2", 27 | }, 28 | "dark_css_variables": { 29 | "color-brand-primary": "#7C4DFF", 30 | "color-brand-content": "#7C4DFF", 31 | }, 32 | } 33 | 34 | html_static_path = ["_static"] 35 | html_css_files = ["default.css"] 36 | -------------------------------------------------------------------------------- /tests/test_Integration.py: -------------------------------------------------------------------------------- 1 | """Integration tests for soccerdata package.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # TODO: integration tests 9 | # Names of common leagues equal for all classes 10 | # Number of clubs equal for all common leagues over classes 11 | # Clubnames equal for all common leagues over classes 12 | # Number of games equal for all common leagues/seasons over classes 13 | # Scores per game equal for all common leagues over classes 14 | 15 | 16 | @pytest.mark.e2e 17 | def test_five38_vs_elo(): 18 | """We should be able to retrieve the Elo history for all teams in these leagues.""" 19 | league_sel = [ 20 | 'ENG-Premier League', 21 | 'ESP-La Liga', 22 | 'FRA-Ligue 1', 23 | 'GER-Bundesliga', 24 | 'ITA-Serie A', 25 | ] 26 | 27 | five38 = foo.FiveThirtyEight(leagues=league_sel, seasons='1819') 28 | five38_games = five38.read_games() 29 | 30 | elo = foo.ClubElo() 31 | elo_hist = pd.concat([elo.read_team_history(team) for team in set(five38_games['home_team'])]) 32 | 33 | assert set(five38_games['home_team']) - set(elo_hist['team']) == set() 34 | -------------------------------------------------------------------------------- /docs/datasources/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _datasources: 3 | 4 | .. currentmodule:: soccerdata 5 | 6 | Data Sources 7 | ============ 8 | 9 | Currently the following data sources are supported: 10 | 11 | .. list-table:: 12 | :widths: 30 70 13 | 14 | * - `Club Elo <https://www.clubelo.com/>`_ 15 | - Team’s relative strengths as Elo ratings, for most European leagues. Recalculated after every round, includes history. 16 | * - `ESPN <https://www.espn.com/soccer/>`_ 17 | - Historical results, statistics and lineups. 18 | * - `FBref <https://www.fbref.com/en/>`_ 19 | - Historical results, lineups, and detailed aggregated statistics for teams and individual players based on StatsBomb data. 20 | * - `FiveThirtyEight <https://fivethirtyeight.com/soccer-predictions/>`_ 21 | - Team’s relative strengths as SPI ratings, predictions and results for the top European and American leagues. 22 | * - `MatchHistory <https://www.football-data.co.uk/>`_ 23 | - Historical results, betting odds and match statistics. Level of detail depends on league. 24 | * - `SoFIFA <https://sofifa.com/>`_ 25 | - Detailed ratings of all players' abilities from EA Sports FIFA. 26 | * - `WhoScored <https://www.whoscored.com/>`_ 27 | - Historical results, match preview data and detailed Opta event stream data for major leagues. 28 | 29 | .. toctree:: 30 | :hidden: 31 | 32 | ClubElo 33 | ESPN 34 | FBref 35 | FiveThirtyEight 36 | MatchHistory 37 | SoFIFA 38 | WhoScored 39 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | enable-extensions = G 3 | exclude = .git, .venv 4 | ignore = 5 | A003 ; 'id' is a python builtin, consider renaming the class attribute 6 | W503 ; line break before binary operator 7 | RST303 ; Unknown directive type "XXX". 8 | RST304 ; Unknown interpreted text role "XXX". 9 | DAR402 ; The docstring describes an exception not explicitly raised. 10 | per-file-ignores = 11 | tests/*:D103 12 | max-complexity = 10 13 | max-line-length = 100 14 | show-source = true 15 | application-import-names = soccerdata 16 | docstring-convention = numpy 17 | strictness = short 18 | docstring_style = numpy 19 | 20 | 21 | [pylint] 22 | good-names=i,j,k,e,x,_,pk,id 23 | max-args=5 24 | max-attributes=10 25 | max-bool-expr=5 26 | max-module-lines=200 27 | max-nested-blocks=2 28 | max-public-methods=5 29 | max-returns=5 30 | max-statements=20 31 | output-format = colorized 32 | 33 | disable= 34 | C0103, ; Constant name "api" doesn't conform to UPPER_CASE naming style (invalid-name) 35 | C0111, ; Missing module docstring (missing-docstring) 36 | C0330, ; Wrong hanging indentation before block (add 4 spaces) 37 | E0213, ; Method should have "self" as first argument (no-self-argument) - N805 for flake8 38 | R0201, ; Method could be a function (no-self-use) 39 | R0901, ; Too many ancestors (m/n) (too-many-ancestors) 40 | R0903, ; Too few public methods (m/n) (too-few-public-methods) 41 | 42 | ignored-classes= 43 | contextlib.closing, 44 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Apache License 2 | ============== 3 | 4 | Copyright (c) 2021 Pieter Robberechts 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License.
17 | 18 | This file incorporates code of the `footballdata`_ software package covered 19 | by the following copyright and permission notice: 20 | 21 | Copyright (c) 2017 skagr 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | .. _footballdata: https://github.com/skagr/footballdata 34 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest fixtures for soccerdata package.""" 2 | 3 | import pytest 4 | 5 | import soccerdata as foo 6 | 7 | 8 | @pytest.fixture 9 | def five38(): 10 | """Return a correctly initialized instance of FiveThirtyEight.""" 11 | return foo.FiveThirtyEight(seasons="20-21") 12 | 13 | 14 | @pytest.fixture 15 | def five38_laliga(): 16 | """Return a correctly initialized instance of FiveThirtyEight filtered by league: La Liga.""" 17 | return foo.FiveThirtyEight("ESP-La Liga", "20-21") 18 | 19 | 20 | @pytest.fixture 21 | def espn_seriea(): 22 | """Return a correctly initialized instance of ESPN filtered by league: Serie A.""" 23 | return foo.ESPN("ITA-Serie A", "20-21") 24 | 25 | 26 | @pytest.fixture 27 | def sofifa_bundesliga(): 28 | """Return a correctly initialized instance of SoFIFA filtered by league: Bundesliga.""" 29 | return foo.SoFIFA("GER-Bundesliga", "20-21") 30 | 31 | 32 | @pytest.fixture 33 | def fbref_ligue1(): 34 | """Return a correctly initialized instance of FBref filtered by league: Ligue 1.""" 35 | return foo.FBref("FRA-Ligue 1", "20-21") 36 | 37 | 38 | @pytest.fixture 39 | def elo(): 40 | """Return a correctly initialized ClubElo instance.""" 41 | return foo.ClubElo() 42 | 43 | 44 | @pytest.fixture 45 | def match_epl_2y(): 46 | """Return a MatchHistory instance for the last 2 years of the EPL.""" 47 | return foo.MatchHistory("ENG-Premier League", list(range(2018, 2020))) 48 | 49 | 50 | @pytest.fixture 51 | def whoscored(): 52 | """Return a correctly initialized instance of WhoScored.""" 53 | return foo.WhoScored("ENG-Premier League", "20-21") 54 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | entry: black 7 | language: system 8 | types: [python] 9 | require_serial: true 10 | - id: check-added-large-files 11 | name: Check for added large files 12 | entry: check-added-large-files 13 | language: system 14 | - id: check-toml 15 | name: Check Toml 16 | entry: check-toml 17 | language: system 18 | types: [toml] 19 | - id: check-yaml 20 | name: Check Yaml 21 | entry: check-yaml 22 | language: system 23 | types: [yaml] 24 | - id: end-of-file-fixer 25 | name: Fix End of Files 26 | entry: end-of-file-fixer 27 | language: system 28 | types: [text] 29 | stages: [commit, push, manual] 30 | - id: flake8 31 | name: flake8 32 | entry: flake8 33 | language: 
system 34 | types: [python] 35 | require_serial: true 36 | - id: pyupgrade 37 | name: pyupgrade 38 | description: Automatically upgrade syntax for newer versions. 39 | entry: pyupgrade 40 | language: system 41 | types: [python] 42 | args: [--py37-plus] 43 | - id: isort 44 | name: Reorder python imports 45 | entry: isort 46 | language: system 47 | types: [python] 48 | - id: trailing-whitespace 49 | name: Trim Trailing Whitespace 50 | entry: trailing-whitespace-fixer 51 | language: system 52 | types: [text] 53 | stages: [commit, push, manual] 54 | - repo: https://github.com/pre-commit/mirrors-prettier 55 | rev: v2.4.1 56 | hooks: 57 | - id: prettier 58 | -------------------------------------------------------------------------------- /tests/test_ClubElo.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ClubElo.""" 2 | import json 3 | from datetime import datetime, timedelta 4 | from importlib import reload 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from soccerdata import _config as conf 10 | from soccerdata import clubelo as foo 11 | 12 | # Unittests ------------------------------------------------------------------- 13 | # Happy flow 14 | 15 | 16 | def test_by_date(elo): 17 | assert isinstance(elo.read_by_date(), pd.DataFrame) 18 | assert isinstance(elo.read_by_date('2017-04-01'), pd.DataFrame) 19 | assert isinstance(elo.read_by_date(datetime(2017, 4, 1)), pd.DataFrame) 20 | 21 | 22 | def test_club_hist_age(elo): 23 | assert isinstance(elo.read_team_history('Feyenoord'), pd.DataFrame) 24 | assert isinstance(elo.read_team_history('Feyenoord', 2), pd.DataFrame) 25 | max_age = timedelta(milliseconds=1) 26 | assert isinstance(elo.read_team_history('Feyenoord', max_age), pd.DataFrame) 27 | 28 | 29 | def test_club_hist_replacement(monkeypatch, tmp_path): 30 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 31 | # no teamname_replacements.json 32 | reload(conf) 33 | assert conf.TEAMNAME_REPLACEMENTS == {} 34 | fp = tmp_path / "config" / "teamname_replacements.json" 35 | with open(fp, 'w', encoding='utf8') as outfile: 36 | json.dump({"Manchester City": ["Man City"]}, outfile) 37 | # correctly parse teamname_replacements.json 38 | reload(conf) 39 | reload(foo) 40 | elo = foo.ClubElo() 41 | assert isinstance(elo.read_team_history('Manchester City'), pd.DataFrame) 42 | 43 | 44 | # Bad calls 45 | 46 | 47 | def test_by_date_bad_params(elo): 48 | with pytest.raises(ValueError): 49 | elo.read_by_date('2017') 50 | with pytest.raises(AttributeError): 51 | elo.read_by_date(1 / 4) 52 | 53 | 54 | def test_club_hist_bad_params(elo): 55 | with pytest.raises(TypeError): 56 | elo.read_team_history() # missing argument 57 | with pytest.raises(ValueError): 58 | elo.read_team_history('FC Knudde') # no data for team 59 | with pytest.raises(TypeError): 60 | elo.read_team_history('Feyenoord', datetime.now()) # invalid max_age type 61 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v3.1.0 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4.3.0 21 | with: 22 | python-version: "3.8" 23 | 24 | - name: Upgrade pip 25 | 
run: | 26 | pip install --constraint=.github/workflows/constraints.txt pip 27 | pip --version 28 | 29 | - name: Install Poetry 30 | run: | 31 | pip install --constraint=.github/workflows/constraints.txt poetry 32 | poetry --version 33 | 34 | - name: Check if there is a parent commit 35 | id: check-parent-commit 36 | run: | 37 | echo "::set-output name=sha::$(git rev-parse --verify --quiet HEAD^)" 38 | 39 | - name: Detect and tag new version 40 | id: check-version 41 | if: steps.check-parent-commit.outputs.sha 42 | uses: salsify/action-detect-and-tag-new-version@v2.0.1 43 | with: 44 | version-command: | 45 | bash -o pipefail -c "poetry version | awk '{ print \$2 }'" 46 | 47 | - name: Bump version for developmental release 48 | if: "! steps.check-version.outputs.tag" 49 | run: | 50 | poetry version patch && 51 | version=$(poetry version | awk '{ print $2 }') && 52 | poetry version $version.dev.$(date +%s) 53 | 54 | - name: Build package 55 | run: | 56 | poetry build --ansi 57 | 58 | - name: Publish package on PyPI 59 | if: steps.check-version.outputs.tag 60 | uses: pypa/gh-action-pypi-publish@v1.5.1 61 | with: 62 | user: __token__ 63 | password: ${{ secrets.PYPI_TOKEN }} 64 | 65 | - name: Publish package on TestPyPI 66 | if: "! steps.check-version.outputs.tag" 67 | uses: pypa/gh-action-pypi-publish@v1.5.1 68 | with: 69 | user: __token__ 70 | password: ${{ secrets.TEST_PYPI_TOKEN }} 71 | repository_url: https://test.pypi.org/legacy/ 72 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._config.""" 2 | import json 3 | import logging 4 | from importlib import reload 5 | 6 | from soccerdata import _config as conf 7 | 8 | 9 | def test_env_soccerdata_dir(monkeypatch, tmp_path): 10 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 11 | reload(conf) 12 | assert conf.BASE_DIR == tmp_path 13 | 14 | 15 | def test_env_nocache(monkeypatch): 16 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 't') 17 | reload(conf) 18 | assert conf.NOCACHE is True 19 | 20 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 'true') 21 | reload(conf) 22 | assert conf.NOCACHE is True 23 | 24 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 'f') 25 | reload(conf) 26 | assert conf.NOCACHE is False 27 | 28 | 29 | def test_env_nostore(monkeypatch): 30 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 't') 31 | reload(conf) 32 | assert conf.NOSTORE is True 33 | 34 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 'true') 35 | reload(conf) 36 | assert conf.NOSTORE is True 37 | 38 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 'f') 39 | reload(conf) 40 | assert conf.NOSTORE is False 41 | 42 | 43 | def test_env_loglevel(monkeypatch): 44 | monkeypatch.setenv('SOCCERDATA_LOGLEVEL', 'DEBUG') 45 | reload(conf) 46 | assert conf.logger.level == logging.DEBUG 47 | 48 | 49 | def test_read_teamnname_replacements(monkeypatch, tmp_path): 50 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 51 | # no teamname_replacements.json 52 | reload(conf) 53 | assert conf.TEAMNAME_REPLACEMENTS == {} 54 | fp = tmp_path / "config" / "teamname_replacements.json" 55 | with open(fp, 'w', encoding='utf8') as outfile: 56 | json.dump({"Celta de Vigo": ["Celta Vigo", "Celta"]}, outfile) 57 | # correctly parse teamname_replacements.json 58 | reload(conf) 59 | assert conf.TEAMNAME_REPLACEMENTS == { 60 | "Celta Vigo": "Celta de Vigo", 61 | "Celta": "Celta de Vigo", 62 | } 63 | 64 | 65 | def test_read_league_dict(monkeypatch, 
tmp_path): 66 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 67 | # no league_dict.json 68 | reload(conf) 69 | nb_default = len(conf.LEAGUE_DICT) 70 | fp = tmp_path / "config" / "league_dict.json" 71 | with open(fp, 'w', encoding='utf8') as outfile: 72 | json.dump({"ABC-Fake": {"WhoScored": "Fake"}}, outfile) 73 | # correctly parse league_dict.json 74 | reload(conf) 75 | assert len(conf.LEAGUE_DICT) == nb_default + 1 76 | assert conf.LEAGUE_DICT['ABC-Fake'] == {'WhoScored': 'Fake'} 77 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to SoccerData 2 | ====================== 3 | 4 | 5 | SoccerData is a collection of wrappers over soccer data from `Club Elo`_, 6 | `ESPN`_, `FBref`_, `FiveThirtyEight`_, `Football-Data.co.uk`_, `SoFIFA`_ and 7 | `WhoScored`_. You get Pandas DataFrames with sensible, matching column names 8 | and identifiers across datasets. Data is downloaded when needed and cached 9 | locally. 10 | 11 | .. code:: python 12 | 13 | import soccerdata as sd 14 | 15 | # Create scraper class instance for the Premier League 16 | five38 = sd.FiveThirtyEight('ENG-Premier League', '1819') 17 | 18 | # Fetch dataframes 19 | games = five38.read_games() 20 | 21 | To learn how to install, configure and use SoccerData, see the 22 | :ref:`Quickstart guide <quickstart>`. For documentation on each of the 23 | supported data sources, see the :ref:`API reference <api>`. 24 | 25 | Other useful projects 26 | ---------------------- 27 | 28 | SoccerData is not the only tool of its kind. If SoccerData doesn’t quite fit 29 | your needs or you want to obtain data from other sources, we recommend looking 30 | at these tools: 31 | 32 | - `worldfootballR`_: an R package with scrapers for FBref, Transfermarkt and Understat. 33 | - `Tyrone Mings`_: a Python package to scrape data from Transfermarkt. 34 | - `understat`_: a Python package to scrape data from Understat. 35 | - `understatr`_: an R package to scrape data from Understat. 36 | - `ScraperFC`_: a Python package to scrape data from FBref, Understat, FiveThirtyEight and WhoScored. 37 | - `Scrape-FBref-data`_: a Python package to scrape StatsBomb data via FBref. 38 | 39 | 40 | .. toctree:: 41 | :hidden: 42 | :maxdepth: 1 43 | 44 | usage 45 | datasources/index 46 | reference/index 47 | contributing 48 | License <license> 49 | Changelog <https://github.com/probberechts/soccerdata/releases> 50 | 51 | .. _socceraction: https://socceraction.readthedocs.io/en/latest/modules/generated/socceraction.data.opta.OptaLoader.html#socceraction.data.opta.OptaLoader 52 | .. _Club Elo: https://www.clubelo.com/ 53 | .. _ESPN: https://www.espn.com/soccer/ 54 | .. _FBref: https://www.fbref.com/en/ 55 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 56 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 57 | .. _SoFIFA: https://sofifa.com/ 58 | .. _WhoScored: https://www.whoscored.com/ 59 | .. _worldfootballR: https://jaseziv.github.io/worldfootballR/index.html 60 | .. _Tyrone Mings: https://github.com/FCrSTATS/tyrone_mings 61 | .. _understat: https://github.com/amosbastian/understat 62 | .. _understatr: https://github.com/ewenme/understatr 63 | .. _ScraperFC: https://github.com/oseymour/ScraperFC 64 | .. _Scrape-FBref-data: https://github.com/parth1902/Scrape-FBref-data 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "soccerdata" 3 | version = "1.1.0" 4 | description = "A collection of wrappers over soccer data from various websites / APIs." 5 | authors = ["Pieter Robberechts"] 6 | license = "Apache-2.0" 7 | readme = 'README.rst' 8 | homepage = "https://github.com/probberechts/soccerdata" 9 | repository = "https://github.com/probberechts/soccerdata" 10 | keywords = ["soccer", "football", "soccer data", "web scraping", "soccer analytics"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: OS Independent" 15 | ] 16 | 17 | [tool.poetry.dependencies] 18 | python = ">=3.7,<4.0.0" 19 | pandas = "^1.0" 20 | requests = "^2.23" 21 | unicode = "^2.7" 22 | lxml = "^4.6" 23 | selenium = "^4.0.0" 24 | Unidecode = "^1.2.0" 25 | rich = "^12.0.0" 26 | pretty-errors = "^1.2.25" 27 | PySocks = "^1.7.1" 28 | html5lib = "^1.1" 29 | undetected-chromedriver = "^3.1.3" 30 | 31 | [tool.poetry.dev-dependencies] 32 | pytest = "^7.0.0" 33 | mypy = "^0.982" 34 | pylint = "^2.6.0" 35 | pytest-deadfixtures = "^2.2.1" 36 | unify = "^0.5" 37 | black = "^21.12b0" 38 | Sphinx = "^4.3.2" 39 | sphinx-autobuild = "^2021.3.14" 40 | furo = "^2022.0.0" 41 | coverage = {version = "^6.2", extras = ["toml"]} 42 | pre-commit = "^2.16.0" 43 | flake8 = "^4.0.1" 44 | flake8-bugbear = "^22.0.0" 45 | flake8-docstrings = "^1.6.0" 46 | flake8-rst-docstrings = "^0.2.5" 47 | pep8-naming = "^0.13.0" 48 | darglint = "^1.8.1" 49 | pre-commit-hooks = "^4.1.0" 50 | Pygments = "^2.10.0" 51 | time-machine = "^2.5.0" 52 | pytest-mock = "^3.6.1" 53 | bumpversion = "^0.6.0" 54 | nbsphinx = "^0.8.8" 55 | 56 | [tool.isort] 57 | profile = "black" 58 | src_paths = ["soccerdata", "tests"] 59 | balanced_wrapping = true 60 | default_section = "THIRDPARTY" 61 | include_trailing_comma = true 62 | known_first_party = ["soccerdata", "tests"] 63 | line_length = 79 64 | multi_line_output = 3 65 | 66 | [tool.black] 67 | line-length = 99 68 | target-version = ['py38'] 69 | skip-string-normalization = 1 70 | include = '\.pyi?$' 71 | 72 | [tool.coverage.paths] 73 | source = ["soccerdata", "*/site-packages"] 74 | 75 | [tool.coverage.run] 76 | branch = true 77 | source = ["soccerdata"] 78 | 79 | [tool.coverage.report] 80 | show_missing = true 81 | ignore_errors = true 82 | 83 | [tool.mypy] 84 | ignore_missing_imports = true 85 | disallow_untyped_defs = true 86 | disallow_incomplete_defs = true 87 | no_implicit_optional = true 88 | check_untyped_defs = true 89 | show_error_codes = true 90 | warn_unused_ignores = true 91 | 92 | [[tool.mypy.overrides]] 93 | module = ["tests.*"] 94 | disallow_untyped_defs = false 95 | 96 | [build-system] 97 | requires = ["poetry>=0.12"] 98 | build-backend = "poetry.masonry.api" 99 | -------------------------------------------------------------------------------- /tests/test_FiveThirtyEight.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FiveThirtyEight.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # Unittests ------------------------------------------------------------------- 9 | # Happy flow 10 | 11 | 12 | def test_five38_league_ids(five38_laliga): 13 | assert
isinstance(five38_laliga._selected_leagues, dict) 14 | 15 | 16 | def test_five38_leagues(five38_laliga): 17 | assert isinstance(five38_laliga.read_leagues(), pd.DataFrame) 18 | 19 | 20 | def test_five38_games(five38_laliga): 21 | assert isinstance(five38_laliga.read_games(), pd.DataFrame) 22 | 23 | 24 | def test_five38_forecasts(five38_laliga): 25 | assert isinstance(five38_laliga.read_forecasts(), pd.DataFrame) 26 | 27 | 28 | def test_five38_clinches(five38_laliga): 29 | assert isinstance(five38_laliga.read_clinches(), pd.DataFrame) 30 | 31 | 32 | def test_five38_league_ids_ll(five38_laliga): 33 | assert isinstance(five38_laliga._selected_leagues, dict) 34 | 35 | 36 | def test_five38_leagues_ll(five38_laliga): 37 | assert isinstance(five38_laliga.read_leagues(), pd.DataFrame) 38 | 39 | 40 | def test_five38_games_ll(five38_laliga): 41 | assert isinstance(five38_laliga.read_games(), pd.DataFrame) 42 | 43 | 44 | def test_five38_forecasts_ll(five38_laliga): 45 | assert isinstance(five38_laliga.read_forecasts(), pd.DataFrame) 46 | 47 | 48 | def test_five38_clinches_ll(five38_laliga): 49 | assert isinstance(five38_laliga.read_clinches(), pd.DataFrame) 50 | 51 | 52 | def test_five38_laliga(five38_laliga): 53 | df = five38_laliga.read_leagues() 54 | assert len(df) == 1 55 | assert df.loc['ESP-La Liga', 'long_name'] == 'La Liga' 56 | 57 | 58 | def test_league_counts(five38): 59 | assert len(five38._selected_leagues) == len(five38.read_leagues()) 60 | assert len(five38._selected_leagues) == len( 61 | five38.read_games().reset_index()['league'].unique() 62 | ) 63 | assert len(five38._selected_leagues) == len( 64 | five38.read_forecasts().reset_index()['league'].unique() 65 | ) 66 | 67 | 68 | def test_league_matches_games(five38): 69 | assert set(five38.read_games().reset_index().league) == set( 70 | five38.read_leagues().reset_index().league 71 | ) 72 | 73 | 74 | def test_league_matches_forecasts(five38): 75 | assert set(five38.read_forecasts().reset_index().league) == set( 76 | five38.read_leagues().reset_index().league 77 | ) 78 | 79 | 80 | def test_league_matches_clinches(five38): 81 | assert set(five38.read_clinches().reset_index().league) == set( 82 | five38.read_leagues().reset_index().league 83 | ) 84 | 85 | 86 | # Bad inits 87 | 88 | 89 | def test_five38_league_value_error(): 90 | with pytest.raises(ValueError): 91 | foo.FiveThirtyEight('xxx') 92 | 93 | 94 | def test_five38_league_type_error(): 95 | with pytest.raises(TypeError): 96 | foo.FiveThirtyEight(1) # type: ignore 97 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/probberechts/soccerdata/master/docs/_static/logo2.png 2 | :align: center 3 | :alt: SoccerData 4 | :width: 600px 5 | 6 | .. badges-begin 7 | 8 | |PyPI| |Python Version| |License| |Read the Docs| |Tests| |Codecov| |pre-commit| |Black| 9 | 10 | .. |PyPI| image:: https://img.shields.io/pypi/v/soccerdata.svg 11 | :target: https://pypi.org/project/soccerdata/ 12 | :alt: PyPI 13 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/soccerdata 14 | :target: https://pypi.org/project/soccerdata 15 | :alt: Python Version 16 | .. |License| image:: https://img.shields.io/pypi/l/soccerdata.svg 17 | :target: https://opensource.org/licenses/Apache-2.0 18 | :alt: License 19 | .. 
|Read the Docs| image:: https://img.shields.io/readthedocs/soccerdata/latest.svg?label=Read%20the%20Docs 20 | :target: https://soccerdata.readthedocs.io/ 21 | :alt: Read the documentation at https://soccerdata.readthedocs.io/ 22 | .. |Tests| image:: https://github.com/probberechts/soccerdata/workflows/CI/badge.svg 23 | :target: https://github.com/probberechts/soccerdata/actions?workflow=CI 24 | :alt: Tests 25 | .. |Codecov| image:: https://codecov.io/gh/probberechts/soccerdata/branch/master/graph/badge.svg 26 | :target: https://app.codecov.io/gh/probberechts/soccerdata 27 | :alt: Codecov 28 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 29 | :target: https://github.com/pre-commit/pre-commit 30 | :alt: pre-commit 31 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 32 | :target: https://github.com/psf/black 33 | :alt: Black 34 | 35 | .. badges-end 36 | 37 | SoccerData is a collection of wrappers over soccer data from `Club Elo`_, 38 | `ESPN`_, `FBref`_, `FiveThirtyEight`_, `Football-Data.co.uk`_, `SoFIFA`_ and 39 | `WhoScored`_. You get Pandas DataFrames with sensible, matching column names 40 | and identifiers across datasets. Data is downloaded when needed and cached 41 | locally. 42 | 43 | .. code:: python 44 | 45 | import soccerdata as sd 46 | 47 | # Create scraper class instance for the Premier League 48 | five38 = sd.FiveThirtyEight('ENG-Premier League', '1819') 49 | 50 | # Fetch dataframes 51 | games = five38.read_games() 52 | 53 | To learn how to install, configure and use SoccerData, see the 54 | `Quickstart guide `__. For documentation on each of the 55 | supported data sources, see the `example notebooks `__ and `API reference `__. 56 | 57 | .. _Club Elo: https://www.clubelo.com/ 58 | .. _ESPN: https://www.espn.com/soccer/ 59 | .. _FBref: https://www.fbref.com/en/ 60 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 61 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 62 | .. _SoFIFA: https://sofifa.com/ 63 | .. _WhoScored: https://www.whoscored.com/ 64 | 65 | **Disclaimer:** As soccerdata relies on web scraping, any changes to the 66 | scraped websites will break the package. Hence, do not expect that all code 67 | will work all the time. If you spot any bugs, then please `fork it and start 68 | a pull request `__. 
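The other scrapers follow the same read pattern. A minimal sketch mirroring the bundled
test suite (the exact read methods each source supports are listed in the API reference):

.. code:: python

    import soccerdata as sd

    # Match results and betting odds from Football-Data.co.uk
    hist = sd.MatchHistory('ENG-Premier League', seasons='18-19')
    games = hist.read_games()

    # Full Elo rating history of a single club from Club Elo
    elo = sd.ClubElo()
    feyenoord = elo.read_team_history('Feyenoord')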
69 | -------------------------------------------------------------------------------- /tests/test_FBref.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FBref.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as sd 7 | from soccerdata.fbref import _concat 8 | 9 | # Unittests ------------------------------------------------------------------- 10 | # Happy flow 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "stat_type", 15 | [ 16 | "standard", 17 | "keeper", 18 | "keeper_adv", 19 | "shooting", 20 | "passing", 21 | "passing_types", 22 | "goal_shot_creation", 23 | "defense", 24 | "possession", 25 | "playing_time", 26 | "misc", 27 | ], 28 | ) 29 | def test_read_team_season_stats(fbref_ligue1, stat_type): 30 | assert isinstance(fbref_ligue1.read_team_season_stats(stat_type), pd.DataFrame) 31 | 32 | 33 | @pytest.mark.parametrize( 34 | "stat_type", 35 | [ 36 | "standard", 37 | "shooting", 38 | "passing", 39 | "passing_types", 40 | "goal_shot_creation", 41 | "defense", 42 | "possession", 43 | "playing_time", 44 | "misc", 45 | "keeper", 46 | "keeper_adv", 47 | ], 48 | ) 49 | def test_read_player_season_stats(fbref_ligue1, stat_type): 50 | assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) 51 | 52 | 53 | def test_read_schedule(fbref_ligue1): 54 | assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "stat_type", 59 | [ 60 | "summary", 61 | "keepers", 62 | "passing", 63 | "passing_types", 64 | "defense", 65 | "possession", 66 | "misc", 67 | ], 68 | ) 69 | def test_read_player_match_stats(fbref_ligue1, stat_type): 70 | assert isinstance( 71 | fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame 72 | ) 73 | 74 | 75 | def test_read_shot_events(fbref_ligue1): 76 | assert isinstance(fbref_ligue1.read_shot_events(match_id="796787da"), pd.DataFrame) 77 | 78 | 79 | def test_read_lineup(fbref_ligue1): 80 | assert isinstance(fbref_ligue1.read_lineup(match_id="796787da"), pd.DataFrame) 81 | 82 | 83 | def test_combine_big5(): 84 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 85 | assert len(fbref_bigfive.read_leagues()) == 1 86 | assert len(fbref_bigfive.read_seasons()) == 1 87 | 88 | 89 | @pytest.mark.parametrize( 90 | "stat_type", 91 | [ 92 | "standard", 93 | "keeper", 94 | "keeper_adv", 95 | "shooting", 96 | "passing", 97 | "passing_types", 98 | "goal_shot_creation", 99 | "defense", 100 | "possession", 101 | "playing_time", 102 | "misc", 103 | ], 104 | ) 105 | def test_combine_big5_team_season_stats(fbref_ligue1, stat_type): 106 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 107 | ligue1 = fbref_ligue1.read_team_season_stats(stat_type).loc["FRA-Ligue 1"] 108 | bigfive = fbref_bigfive.read_team_season_stats(stat_type).loc["FRA-Ligue 1"] 109 | cols = _concat([ligue1, bigfive]).columns 110 | ligue1.columns = cols 111 | bigfive.columns = cols 112 | pd.testing.assert_frame_equal( 113 | ligue1, 114 | bigfive, 115 | ) 116 | 117 | 118 | @pytest.mark.parametrize( 119 | "stat_type", 120 | [ 121 | "standard", 122 | "shooting", 123 | "passing", 124 | "passing_types", 125 | "goal_shot_creation", 126 | "defense", 127 | "possession", 128 | "playing_time", 129 | "misc", 130 | "keeper", 131 | "keeper_adv", 132 | ], 133 | ) 134 | def test_combine_big5_player_season_stats(fbref_ligue1, stat_type): 135 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 136 
| ligue1 = fbref_ligue1.read_player_season_stats(stat_type).loc["FRA-Ligue 1"] 137 | bigfive = fbref_bigfive.read_player_season_stats(stat_type).loc["FRA-Ligue 1"] 138 | cols = _concat([ligue1, bigfive]).columns 139 | ligue1.columns = cols 140 | bigfive.columns = cols 141 | pd.testing.assert_frame_equal( 142 | ligue1, 143 | bigfive, 144 | ) 145 | -------------------------------------------------------------------------------- /soccerdata/match_history.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://www.football-data.co.uk/data.php.""" 2 | import itertools 3 | from pathlib import Path 4 | from typing import Callable, Dict, List, Optional, Union 5 | 6 | import pandas as pd 7 | 8 | from ._common import BaseRequestsReader, make_game_id 9 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 10 | 11 | MATCH_HISTORY_DATA_DIR = DATA_DIR / 'MatchHistory' 12 | MATCH_HISTORY_API = 'https://www.football-data.co.uk' 13 | 14 | 15 | class MatchHistory(BaseRequestsReader): 16 | """Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php. 17 | 18 | Data will be downloaded as necessary and cached locally in 19 | ``~/soccerdata/data/MatchHistory``. 20 | 21 | Parameters 22 | ---------- 23 | leagues : string or iterable 24 | IDs of leagues to include. 25 | seasons : string, int or list 26 | Seasons to include. Supports multiple formats. 27 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 28 | proxy : 'tor' or dict or list(dict) or callable, optional 29 | Use a proxy to hide your IP address. Valid options are: 30 | - "tor": Uses the Tor network. Tor should be running in 31 | the background on port 9050. 32 | - dict: A dictionary with the proxy to use. The dict should be 33 | a mapping of supported protocols to proxy addresses. For example:: 34 | 35 | { 36 | 'http': 'http://10.10.1.10:3128', 37 | 'https': 'http://10.10.1.10:1080', 38 | } 39 | 40 | - list(dict): A list of proxies to choose from. A different proxy will 41 | be selected from this list after failed requests, allowing rotating 42 | proxies. 43 | - callable: A function that returns a valid proxy. This function will 44 | be called after failed requests, allowing rotating proxies. 45 | no_cache : bool 46 | If True, will not use cached data. 47 | no_store : bool 48 | If True, will not store downloaded data. 49 | data_dir : Path, optional 50 | Path to directory where data will be cached. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | leagues: Optional[Union[str, List[str]]] = None, 56 | seasons: Optional[Union[str, int, List]] = None, 57 | proxy: Optional[ 58 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 59 | ] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = MATCH_HISTORY_DATA_DIR, 63 | ): 64 | super().__init__( 65 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 66 | ) 67 | self.seasons = seasons # type: ignore 68 | 69 | def read_games(self) -> pd.DataFrame: 70 | """Retrieve game history for the selected leagues and seasons. 
71 | 72 | Column names are explained here: http://www.football-data.co.uk/notes.txt 73 | 74 | Returns 75 | ------- 76 | pd.DataFrame 77 | """ 78 | urlmask = MATCH_HISTORY_API + '/mmz4281/{}/{}.csv' 79 | filemask = '{}_{}.csv' 80 | col_rename = { 81 | 'Div': 'league', 82 | 'Date': 'date', 83 | 'Time': 'time', 84 | 'HomeTeam': 'home_team', 85 | 'AwayTeam': 'away_team', 86 | 'Referee': 'referee', 87 | } 88 | 89 | df_list = [] 90 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 91 | filepath = self.data_dir / filemask.format(lkey, skey) 92 | url = urlmask.format(skey, lkey) 93 | current_season = not self._is_complete(lkey, skey) 94 | reader = self.get(url, filepath, no_cache=current_season) 95 | 96 | df_list.append( 97 | pd.read_csv( 98 | reader, 99 | encoding='ISO-8859-1', 100 | ).assign(season=skey) 101 | ) 102 | 103 | df = ( 104 | pd.concat(df_list, sort=False) 105 | .rename(columns=col_rename) 106 | .assign(date=lambda x: pd.to_datetime(x["date"] + ' ' + x['time'])) 107 | .drop("time", axis=1) 108 | .pipe(self._translate_league) 109 | .replace( 110 | { 111 | 'home_team': TEAMNAME_REPLACEMENTS, 112 | 'away_team': TEAMNAME_REPLACEMENTS, 113 | } 114 | ) 115 | .dropna(subset=['home_team', 'away_team']) 116 | ) 117 | 118 | df['game'] = df.apply(make_game_id, axis=1) 119 | df.set_index(['league', 'season', 'game'], inplace=True) 120 | df.sort_index(inplace=True) 121 | return df 122 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | tests: 9 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - { python: "3.9", os: "ubuntu-latest", session: "pre-commit" } 16 | - { python: "3.9", os: "ubuntu-latest", session: "mypy" } 17 | - { python: "3.9", os: "ubuntu-latest", session: "tests" } 18 | - { python: "3.8", os: "ubuntu-latest", session: "tests" } 19 | - { python: "3.7", os: "ubuntu-latest", session: "tests" } 20 | - { python: "3.9", os: "windows-latest", session: "tests" } 21 | - { python: "3.9", os: "macos-latest", session: "tests" } 22 | - { python: "3.9", os: "ubuntu-latest", session: "docs-build" } 23 | 24 | env: 25 | NOXSESSION: ${{ matrix.session }} 26 | FORCE_COLOR: "1" 27 | PRE_COMMIT_COLOR: "always" 28 | 29 | steps: 30 | - name: Check out the repository 31 | uses: actions/checkout@v3.1.0 32 | 33 | - name: Set up Python ${{ matrix.python }} 34 | uses: actions/setup-python@v4.3.0 35 | with: 36 | python-version: ${{ matrix.python }} 37 | 38 | - name: Upgrade pip 39 | run: | 40 | pip install --constraint=.github/workflows/constraints.txt pip 41 | pip --version 42 | 43 | - name: Upgrade pip in virtual environments 44 | shell: python 45 | run: | 46 | import os 47 | import pip 48 | 49 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 50 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 51 | 52 | - name: Install Poetry 53 | run: | 54 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry 55 | poetry --version 56 | 57 | - name: Install Nox 58 | run: | 59 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox 60 | pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry 61 | nox --version 62 | 63 | - name: Compute pre-commit cache key 64 | if: matrix.session 
== 'pre-commit' 65 | id: pre-commit-cache 66 | shell: python 67 | run: | 68 | import hashlib 69 | import sys 70 | 71 | python = "py{}.{}".format(*sys.version_info[:2]) 72 | payload = sys.version.encode() + sys.executable.encode() 73 | digest = hashlib.sha256(payload).hexdigest() 74 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 75 | 76 | print("::set-output name=result::{}".format(result)) 77 | 78 | - name: Restore pre-commit cache 79 | uses: actions/cache@v3.0.10 80 | if: matrix.session == 'pre-commit' 81 | with: 82 | path: ~/.cache/pre-commit 83 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 84 | restore-keys: | 85 | ${{ steps.pre-commit-cache.outputs.result }}- 86 | 87 | - name: Install pandoc 88 | if: matrix.session == 'docs-build' 89 | run: sudo apt-get install -y pandoc 90 | 91 | - name: Run Nox 92 | run: | 93 | nox --force-color --python=${{ matrix.python }} 94 | 95 | - name: Upload coverage data 96 | if: always() && matrix.session == 'tests' 97 | uses: actions/upload-artifact@v3.1.0 98 | with: 99 | name: coverage-data 100 | path: ".coverage.*" 101 | 102 | - name: Upload documentation 103 | if: matrix.session == 'docs-build' 104 | uses: actions/upload-artifact@v3.1.0 105 | with: 106 | name: docs 107 | path: docs/_build 108 | 109 | coverage: 110 | runs-on: ubuntu-latest 111 | needs: tests 112 | steps: 113 | - name: Check out the repository 114 | uses: actions/checkout@v3.1.0 115 | 116 | - name: Set up Python 117 | uses: actions/setup-python@v4.3.0 118 | with: 119 | python-version: "3.9" 120 | 121 | - name: Upgrade pip 122 | run: | 123 | pip install --constraint=.github/workflows/constraints.txt pip 124 | pip --version 125 | 126 | - name: Install Poetry 127 | run: | 128 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry 129 | poetry --version 130 | 131 | - name: Install Nox 132 | run: | 133 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox 134 | pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry 135 | nox --version 136 | 137 | - name: Download coverage data 138 | uses: actions/download-artifact@v3.0.0 139 | with: 140 | name: coverage-data 141 | 142 | - name: Combine coverage data and display human readable report 143 | run: | 144 | nox --force-color --session=coverage 145 | 146 | - name: Create coverage report 147 | run: | 148 | nox --force-color --session=coverage -- xml 149 | 150 | - name: Upload coverage report 151 | uses: codecov/codecov-action@v3.1.1 152 | -------------------------------------------------------------------------------- /soccerdata/_config.py: -------------------------------------------------------------------------------- 1 | """Configurations.""" 2 | 3 | import json 4 | import logging 5 | import logging.config 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | import pretty_errors # NOQA: F401 (imported but unused) 11 | from rich.logging import RichHandler 12 | 13 | # Configuration 14 | NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", 'False').lower() in ('true', '1', 't') 15 | NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", 'False').lower() in ('true', '1', 't') 16 | LOGLEVEL = os.environ.get('SOCCERDATA_LOGLEVEL', 'INFO').upper() 17 | 18 | # Directories 19 | BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata")) 20 | LOGS_DIR = Path(BASE_DIR, "logs") 21 | DATA_DIR = Path(BASE_DIR, "data") 22 | CONFIG_DIR = Path(BASE_DIR, "config") 23 | 24 | # Create dirs 25 | 
LOGS_DIR.mkdir(parents=True, exist_ok=True) 26 | DATA_DIR.mkdir(parents=True, exist_ok=True) 27 | CONFIG_DIR.mkdir(parents=True, exist_ok=True) 28 | 29 | # Logger 30 | logging_config = { 31 | "version": 1, 32 | "disable_existing_loggers": False, 33 | "formatters": { 34 | "minimal": {"format": "%(message)s"}, 35 | "detailed": { 36 | "format": "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n" # noqa: E501 37 | }, 38 | }, 39 | "handlers": { 40 | "console": { 41 | "class": "logging.StreamHandler", 42 | "stream": sys.stdout, 43 | "formatter": "minimal", 44 | "level": logging.DEBUG, 45 | }, 46 | "info": { 47 | "class": "logging.handlers.RotatingFileHandler", 48 | "filename": Path(LOGS_DIR, "info.log"), 49 | "maxBytes": 10485760, # 10 MB 50 | "backupCount": 10, 51 | "formatter": "detailed", 52 | "level": logging.INFO, 53 | }, 54 | "error": { 55 | "class": "logging.handlers.RotatingFileHandler", 56 | "filename": Path(LOGS_DIR, "error.log"), 57 | "maxBytes": 10485760, # 10 MB 58 | "backupCount": 10, 59 | "formatter": "detailed", 60 | "level": logging.ERROR, 61 | }, 62 | }, 63 | "loggers": { 64 | "root": { 65 | "handlers": ["console", "info", "error"], 66 | "level": LOGLEVEL, 67 | "propagate": True, 68 | }, 69 | }, 70 | } 71 | logging.config.dictConfig(logging_config) 72 | logger = logging.getLogger("root") 73 | logger.handlers[0] = RichHandler(markup=True) 74 | 75 | # Team name replacements 76 | TEAMNAME_REPLACEMENTS = {} 77 | _f_custom_teamname_replacements = CONFIG_DIR / "teamname_replacements.json" 78 | if _f_custom_teamname_replacements.is_file(): 79 | with open(_f_custom_teamname_replacements, encoding='utf8') as json_file: 80 | for team, to_replace_list in json.load(json_file).items(): 81 | for to_replace in to_replace_list: 82 | TEAMNAME_REPLACEMENTS[to_replace] = team 83 | logger.info("Custom team name replacements loaded from %s.", _f_custom_teamname_replacements) 84 | else: 85 | logger.info( 86 | "No custom team name replacements found. You can configure these in %s.", 87 | _f_custom_teamname_replacements, 88 | ) 89 | 90 | 91 | # League dict 92 | LEAGUE_DICT = { 93 | "ENG-Premier League": { 94 | "ClubElo": "ENG_1", 95 | "MatchHistory": "E0", 96 | "FiveThirtyEight": "premier-league", 97 | "FBref": "Premier League", 98 | "ESPN": "eng.1", 99 | "SoFIFA": "English Premier League (1)", 100 | "WhoScored": "England - Premier League", 101 | "season_start": "Aug", 102 | "season_end": "May", 103 | }, 104 | "ESP-La Liga": { 105 | "ClubElo": "ESP_1", 106 | "MatchHistory": "SP1", 107 | "FiveThirtyEight": "la-liga", 108 | "FBref": "La Liga", 109 | "ESPN": "esp.1", 110 | "SoFIFA": "Spain Primera Division (1)", 111 | "WhoScored": "Spain - LaLiga", 112 | "season_start": "Aug", 113 | "season_end": "May", 114 | }, 115 | "ITA-Serie A": { 116 | "ClubElo": "ITA_1", 117 | "MatchHistory": "I1", 118 | "FiveThirtyEight": "serie-a", 119 | "FBref": "Serie A", 120 | "ESPN": "ita.1", 121 | "SoFIFA": " Italian Serie A (1)", 122 | "WhoScored": "Italy - Serie A", 123 | "season_start": "Aug", 124 | "season_end": "May", 125 | }, 126 | "GER-Bundesliga": { 127 | "ClubElo": "GER_1", 128 | "MatchHistory": "D1", 129 | "FiveThirtyEight": "bundesliga", 130 | "FBref": "Fußball-Bundesliga", 131 | "ESPN": "ger.1", 132 | "SoFIFA": "German 1.
Bundesliga (1)", 133 | "WhoScored": "Germany - Bundesliga", 134 | "season_start": "Aug", 135 | "season_end": "May", 136 | }, 137 | "FRA-Ligue 1": { 138 | "ClubElo": "FRA_1", 139 | "MatchHistory": "F1", 140 | "FiveThirtyEight": "ligue-1", 141 | "FBref": "Ligue 1", 142 | "ESPN": "fra.1", 143 | "SoFIFA": "French Ligue 1 (1)", 144 | "WhoScored": "France - Ligue 1", 145 | "season_start": "Aug", 146 | "season_end": "May", 147 | }, 148 | } 149 | _f_custom_league_dict = CONFIG_DIR / "league_dict.json" 150 | if _f_custom_league_dict.is_file(): 151 | with open(_f_custom_league_dict, encoding='utf8') as json_file: 152 | LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)} 153 | logger.info("Custom league dict loaded from %s.", _f_custom_league_dict) 154 | else: 155 | logger.info( 156 | "No custom league dict found. You can configure additional leagues in %s.", 157 | _f_custom_league_dict, 158 | ) 159 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | import os 3 | import shlex 4 | import shutil 5 | import sys 6 | from pathlib import Path 7 | from textwrap import dedent 8 | 9 | import nox 10 | 11 | try: 12 | from nox_poetry import Session, session 13 | except ImportError: 14 | message = f"""\ 15 | Nox failed to import the 'nox-poetry' package. 16 | 17 | Please install it using the following command: 18 | 19 | {sys.executable} -m pip install nox-poetry""" 20 | raise SystemExit(dedent(message)) from None 21 | 22 | 23 | package = "soccerdata" 24 | python_versions = ["3.9", "3.8", "3.7"] 25 | nox.needs_version = ">= 2021.6.6" 26 | nox.options.sessions = ( 27 | "pre-commit", 28 | "mypy", 29 | "tests", 30 | "docs-build", 31 | ) 32 | 33 | 34 | def activate_virtualenv_in_precommit_hooks(session: Session) -> None: 35 | """Activate virtualenv in hooks installed by pre-commit. 36 | 37 | This function patches git hooks installed by pre-commit to activate the 38 | session's virtual environment. This allows pre-commit to locate hooks in 39 | that environment when invoked from git. 40 | 41 | Parameters 42 | ---------- 43 | session : Session 44 | The Session object. 45 | """ 46 | assert session.bin is not None # noqa: S101 47 | 48 | # Only patch hooks containing a reference to this session's bindir. Support 49 | # quoting rules for Python and bash, but strip the outermost quotes so we 50 | # can detect paths within the bindir, like /python. 
51 | bindirs = [ 52 | bindir[1:-1] if bindir[0] in "'\"" else bindir 53 | for bindir in (repr(session.bin), shlex.quote(session.bin)) 54 | ] 55 | 56 | virtualenv = session.env.get("VIRTUAL_ENV") 57 | if virtualenv is None: 58 | return 59 | 60 | headers = { 61 | # pre-commit < 2.16.0 62 | "python": f"""\ 63 | import os 64 | os.environ["VIRTUAL_ENV"] = {virtualenv!r} 65 | os.environ["PATH"] = os.pathsep.join(( 66 | {session.bin!r}, 67 | os.environ.get("PATH", ""), 68 | )) 69 | """, 70 | # pre-commit >= 2.16.0 71 | "bash": f"""\ 72 | VIRTUAL_ENV={shlex.quote(virtualenv)} 73 | PATH={shlex.quote(session.bin)}"{os.pathsep}$PATH" 74 | """, 75 | } 76 | 77 | hookdir = Path(".git") / "hooks" 78 | if not hookdir.is_dir(): 79 | return 80 | 81 | for hook in hookdir.iterdir(): 82 | if hook.name.endswith(".sample") or not hook.is_file(): 83 | continue 84 | 85 | if not hook.read_bytes().startswith(b"#!"): 86 | continue 87 | 88 | text = hook.read_text() 89 | 90 | if not any( 91 | Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text 92 | for bindir in bindirs 93 | ): 94 | continue 95 | 96 | lines = text.splitlines() 97 | 98 | for executable, header in headers.items(): 99 | if executable in lines[0].lower(): 100 | lines.insert(1, dedent(header)) 101 | hook.write_text("\n".join(lines)) 102 | break 103 | 104 | 105 | @session(name="pre-commit", python=python_versions[0]) 106 | def precommit(session: Session) -> None: 107 | """Lint using pre-commit.""" 108 | args = session.posargs or ["run", "--all-files", "--show-diff-on-failure"] 109 | session.install( 110 | "black", 111 | "darglint", 112 | "flake8", 113 | "flake8-bugbear", 114 | "flake8-docstrings", 115 | "flake8-rst-docstrings", 116 | "pep8-naming", 117 | "pre-commit", 118 | "pre-commit-hooks", 119 | "pyupgrade", 120 | "isort", 121 | ) 122 | session.run("pre-commit", *args) 123 | if args and args[0] == "install": 124 | activate_virtualenv_in_precommit_hooks(session) 125 | 126 | 127 | @session(python=python_versions) 128 | def mypy(session: Session) -> None: 129 | """Type-check using mypy.""" 130 | args = session.posargs or ["soccerdata", "tests", "docs/conf.py"] 131 | session.install(".") 132 | session.install("mypy", "pytest") 133 | session.run("mypy", "--install-types", "--non-interactive", *args) 134 | if not session.posargs: 135 | session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py") 136 | 137 | 138 | @session(python=python_versions) 139 | def tests(session: Session) -> None: 140 | """Run the test suite.""" 141 | args = session.posargs or ["-m", "not e2e and not fails_gha"] 142 | session.install(".") 143 | session.install("coverage[toml]", "pytest", "pytest-mock", "time-machine", "pygments") 144 | try: 145 | session.run("coverage", "run", "--parallel", "-m", "pytest", *args) 146 | finally: 147 | if session.interactive: 148 | session.notify("coverage", posargs=[]) 149 | 150 | 151 | @session(python=python_versions[0]) 152 | def coverage(session: Session) -> None: 153 | """Produce the coverage report.""" 154 | args = session.posargs or ["report"] 155 | 156 | session.install("coverage[toml]") 157 | 158 | if not session.posargs and any(Path().glob(".coverage.*")): 159 | session.run("coverage", "combine") 160 | 161 | session.run("coverage", *args) 162 | 163 | 164 | @session(name="docs-build", python=python_versions[0]) 165 | def docs_build(session: Session) -> None: 166 | """Build the documentation.""" 167 | args = session.posargs or ["docs", "docs/_build"] 168 | if not session.posargs and "FORCE_COLOR" in 
os.environ: 169 | args.insert(0, "--color") 170 | 171 | session.install(".") 172 | session.install("sphinx", "sphinx-click", "furo", "nbsphinx", "ipython") 173 | 174 | build_dir = Path("docs", "_build") 175 | if build_dir.exists(): 176 | shutil.rmtree(build_dir) 177 | 178 | session.run("sphinx-build", *args, env={'SOCCERDATA_DIR': '~/soccerdata'}) 179 | 180 | 181 | @session(python=python_versions[0]) 182 | def docs(session: Session) -> None: 183 | """Build and serve the documentation with live reloading on file changes.""" 184 | args = session.posargs or ["--host=0.0.0.0", "docs", "docs/_build"] 185 | session.install(".") 186 | session.install("sphinx", "sphinx-autobuild", "furo", "nbsphinx", "ipython") 187 | 188 | build_dir = Path("docs", "_build") 189 | if build_dir.exists(): 190 | shutil.rmtree(build_dir) 191 | 192 | session.run("sphinx-autobuild", *args, env={'SOCCERDATA_DIR': '~/soccerdata'}) 193 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Contributor Guide 3 | ================= 4 | 5 | This document lays out guidelines and advice for contributing to this project. 6 | If you're thinking of contributing, please start by reading this document and 7 | getting a feel for how contributing to this project works. If you have any 8 | questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer. 9 | 10 | .. _Pieter Robberechts: https://people.cs.kuleuven.be/~pieter.robberechts/ 11 | 12 | The guide is split into sections based on the type of contribution you're 13 | thinking of making. 14 | 15 | 16 | .. _bug-reports: 17 | 18 | Bug Reports 19 | ----------- 20 | 21 | Bug reports are hugely important! Before you raise one, though, please check 22 | through the `GitHub issues`_, **both open and closed**, to confirm that the bug 23 | hasn't been reported before. 24 | 25 | When filing an issue, make sure to answer these questions: 26 | 27 | - Which Python version are you using? 28 | - Which version of soccerdata are you using? 29 | - What did you do? 30 | - What did you expect to see? 31 | - What did you see instead? 32 | 33 | The best way to get your bug fixed is to provide a test case, 34 | and/or steps to reproduce the issue. 35 | 36 | .. _GitHub issues: https://github.com/probberechts/soccerdata/issues 37 | 38 | 39 | Feature Requests 40 | ---------------- 41 | 42 | If you believe there is a feature missing, feel free to raise a feature 43 | request on the `Issue Tracker`_. 44 | 45 | .. _Issue tracker: https://github.com/probberechts/soccerdata/issues 46 | 47 | 48 | Documentation Contributions 49 | --------------------------- 50 | 51 | Documentation improvements are always welcome! The documentation files live in 52 | the ``docs/`` directory of the codebase. They're written in 53 | `reStructuredText`_, and use `Sphinx`_ to generate the full suite of 54 | documentation. 55 | 56 | You do not have to set up a development environment to make small changes to 57 | the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes. 58 | 59 | When contributing documentation, please do your best to follow the style of the 60 | documentation files. This means a soft-limit of 79 characters wide in your text 61 | files and a semi-formal, yet friendly and approachable, prose style. 62 | 63 | When presenting Python code, use single-quoted strings (``'hello'`` instead of 64 | ``"hello"``). 65 | 66 | ..
_reStructuredText: http://docutils.sourceforge.net/rst.html 67 | .. _Sphinx: http://sphinx-doc.org/index.html 68 | .. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files 69 | 70 | 71 | Code Contributions 72 | ------------------ 73 | 74 | If you intend to contribute code, do not feel the need to sit on your 75 | contribution until it is perfectly polished and complete. It helps everyone 76 | involved for you to seek feedback as early as you possibly can. Submitting an 77 | early, unfinished version of your contribution for feedback can save you from 78 | putting a lot of work into a contribution that is not suitable for the 79 | project. 80 | 81 | Setting up your development environment 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | 84 | You need Python 3.7.1+ and the following tools: 85 | 86 | - Poetry_ 87 | - Nox_ 88 | - nox-poetry_ 89 | 90 | Install the package with development requirements: 91 | 92 | .. code:: console 93 | 94 | $ poetry install 95 | 96 | You can now run an interactive Python session. 97 | 98 | .. code:: console 99 | 100 | $ poetry run python 101 | 102 | .. _Poetry: https://python-poetry.org/ 103 | .. _Nox: https://nox.thea.codes/ 104 | .. _nox-poetry: https://nox-poetry.readthedocs.io/ 105 | 106 | Steps for submitting code 107 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | When contributing code, you'll want to follow this checklist: 110 | 111 | 1. Fork the repository on GitHub. 112 | 2. Run the tests to confirm they all pass on your system. If they don't, you'll 113 | need to investigate why they fail. If you're unable to diagnose this 114 | yourself, raise it as a bug report. 115 | 3. Write tests that demonstrate your bug or feature. Ensure that they fail. 116 | 4. Make your change. 117 | 5. Run the entire test suite again, confirming that all tests pass *including 118 | the ones you just added*. 119 | 6. Make sure your code follows the code style discussed below. 120 | 7. Send a GitHub Pull Request to the main repository's ``master`` branch. 121 | GitHub Pull Requests are the expected method of code collaboration on this 122 | project. 123 | 124 | Testing the project 125 | ~~~~~~~~~~~~~~~~~~~ 126 | 127 | Run the full test suite: 128 | 129 | .. code:: console 130 | 131 | $ nox 132 | 133 | List the available Nox sessions: 134 | 135 | .. code:: console 136 | 137 | $ nox --list-sessions 138 | 139 | You can also run a specific Nox session. 140 | For example, invoke the unit test suite like this: 141 | 142 | .. code:: console 143 | 144 | $ nox --session=tests 145 | 146 | Unit tests are located in the ``tests`` directory, 147 | and are written using the pytest_ testing framework. 148 | 149 | .. _pytest: https://pytest.readthedocs.io/ 150 | 151 | Code style 152 | ~~~~~~~~~~~ 153 | 154 | The soccerdata codebase uses the `PEP 8`_ code style. In addition, we have 155 | a few guidelines: 156 | 157 | - Line-length can exceed 79 characters, to 100, when convenient. 158 | - Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient. 159 | - Always use single-quoted strings (e.g. ``'#soccer'``), unless a single-quote occurs within the string. 160 | 161 | To ensure all code conforms to this format, you can format the code using the 162 | pre-commit hooks. 163 | 164 | .. code:: console 165 | 166 | $ nox --session=pre-commit 167 | 168 | Docstrings are to follow the `numpydoc guidelines`_. 169 | 170 | .. _PEP 8: https://pep8.org/ 171 | ..
_black: https://black.readthedocs.io/en/stable/ 172 | .. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html 173 | 174 | Submitting changes 175 | ~~~~~~~~~~~~~~~~~~ 176 | 177 | Open a `pull request`_ to submit changes to this project. 178 | 179 | Your pull request needs to meet the following guidelines for acceptance: 180 | 181 | - The Nox test suite must pass without errors and warnings. 182 | - Include unit tests. 183 | - If your changes add functionality, update the documentation accordingly. 184 | 185 | Feel free to submit early, though. We can always iterate on this. 186 | 187 | To run linting and code formatting checks before committing your change, you 188 | can install pre-commit as a Git hook by running the following command: 189 | 190 | .. code:: console 191 | 192 | $ nox --session=pre-commit -- install 193 | 194 | It is recommended to open an issue before starting work on anything. 195 | 196 | .. _pull request: https://github.com/probberechts/soccerdata/pulls 197 | .. github-only 198 | -------------------------------------------------------------------------------- /soccerdata/clubelo.py: -------------------------------------------------------------------------------- 1 | """Scraper for api.clubelo.com.""" 2 | import re 3 | from datetime import datetime, timedelta 4 | from pathlib import Path 5 | from typing import Callable, Dict, List, Optional, Union 6 | 7 | import pandas as pd 8 | from unidecode import unidecode 9 | 10 | from ._common import BaseRequestsReader, standardize_colnames 11 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 12 | 13 | CLUB_ELO_DATADIR = DATA_DIR / "ClubElo" 14 | CLUB_ELO_API = "http://api.clubelo.com" 15 | 16 | 17 | class ClubElo(BaseRequestsReader): 18 | """Provides pd.DataFrames from CSV API at http://api.clubelo.com. 19 | 20 | Data will be downloaded as necessary and cached locally in 21 | ``~/soccerdata/data/ClubElo``. 22 | 23 | Since the source does not provide league names, this class will not filter 24 | by league. League names will be inserted from the other sources where 25 | available. Leagues that are only covered by clubelo.com will have NaN 26 | values. 27 | 28 | Parameters 29 | ---------- 30 | proxy : 'tor' or dict or list(dict) or callable, optional 31 | Use a proxy to hide your IP address. Valid options are: 32 | - "tor": Uses the Tor network. Tor should be running in 33 | the background on port 9050. 34 | - dict: A dictionary with the proxy to use. The dict should be 35 | a mapping of supported protocols to proxy addresses. For example:: 36 | 37 | { 38 | 'http': 'http://10.10.1.10:3128', 39 | 'https': 'http://10.10.1.10:1080', 40 | } 41 | 42 | - list(dict): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 
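
    Examples
    --------
    A minimal sketch; both readers are documented below and hit the live
    API, so actual values will differ from run to run:

    >>> from soccerdata import ClubElo
    >>> elo = ClubElo()
    >>> elo.read_by_date('2022-04-22')  # ratings for all teams on that date
    >>> elo.read_team_history('Barcelona')  # full rating history for one club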
53 | """ 54 | 55 | def __init__( 56 | self, 57 | proxy: Optional[ 58 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 59 | ] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = CLUB_ELO_DATADIR, 63 | ): 64 | """Initialize a new ClubElo reader.""" 65 | super().__init__(no_cache=no_cache, no_store=no_store, data_dir=data_dir) 66 | 67 | def read_by_date(self, date: Optional[Union[str, datetime]] = None) -> pd.DataFrame: 68 | """Retrieve ELO scores for all teams at specified date. 69 | 70 | Elo scores are available as early as 1939. Values before 1960 should 71 | be considered provisional. 72 | 73 | Parameters 74 | ---------- 75 | date : datetime object or string like 'YYYY-MM-DD' 76 | Date for which to retrieve ELO scores. If no date is specified, 77 | get today's scores. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | """ 83 | if not date: 84 | date = datetime.today() 85 | elif isinstance(date, str): 86 | date = datetime.strptime(date, "%Y-%m-%d") 87 | else: 88 | pass # Assume datetime object 89 | 90 | datestring = date.strftime("%Y-%m-%d") 91 | filepath = self.data_dir / f"{datestring}.csv" 92 | url = f"{CLUB_ELO_API}/{datestring}" 93 | 94 | data = self.get(url, filepath) 95 | 96 | df = ( 97 | pd.read_csv( 98 | data, parse_dates=["From", "To"], infer_datetime_format=True, dayfirst=False 99 | ) 100 | .pipe(standardize_colnames) 101 | .rename(columns={"club": "team"}) 102 | .replace({"team": TEAMNAME_REPLACEMENTS}) 103 | .replace("None", float("nan")) 104 | .assign(rank=lambda x: x["rank"].astype("float")) 105 | .assign(league=lambda x: x["country"] + "_" + x["level"].astype(str)) 106 | .pipe(self._translate_league) 107 | .reset_index(drop=True) 108 | .set_index("team") 109 | ) 110 | return df 111 | 112 | def read_team_history( 113 | self, team: str, max_age: Union[int, timedelta] = 1 114 | ) -> Optional[pd.DataFrame]: 115 | """Retrieve full ELO history for one club. 116 | 117 | For the exact spelling of a club's name, check the result 118 | of :func:`~soccerdata.ClubElo.read_by_date` or 119 | `clubelo.com `__. You can also use 120 | alternative team names specified in `teamname_replacements.json`. 121 | Values before 1960 should be considered provisional. 122 | 123 | Parameters 124 | ---------- 125 | team : str 126 | The club's name 127 | max_age : int for age in days, or timedelta object 128 | The max. age of locally cached file before re-download. 129 | 130 | Raises 131 | ------ 132 | TypeError 133 | If max_age is not an integer or timedelta object. 134 | ValueError 135 | If no ratings for the given team are available. 
136 | 137 | Returns 138 | ------- 139 | pd.DataFrame 140 | """ 141 | teams_to_check = [k for k, v in TEAMNAME_REPLACEMENTS.items() if v == team] 142 | teams_to_check.append(team) 143 | 144 | for i, _ in enumerate(teams_to_check): 145 | teams_to_check[i] = unidecode(teams_to_check[i]) 146 | teams_to_check[i] = re.sub(r"[\s']", "", teams_to_check[i]) 147 | 148 | for _team in teams_to_check: 149 | filepath = self.data_dir / f"{_team}.csv" 150 | url = f"{CLUB_ELO_API}/{_team}" 151 | data = self.get(url, filepath, max_age) 152 | 153 | df = ( 154 | pd.read_csv( 155 | data, 156 | parse_dates=["From", "To"], 157 | infer_datetime_format=True, 158 | dayfirst=False, 159 | ) 160 | .pipe(standardize_colnames) 161 | .rename(columns={"club": "team"}) 162 | .replace("None", float("nan")) 163 | .assign(rank=lambda x: x["rank"].astype("float")) 164 | .set_index("from") 165 | .sort_index() 166 | ) 167 | 168 | if len(df) > 0: 169 | # clubelo.com returns a CSV with just a header for nonexistent club 170 | df.replace({"team": TEAMNAME_REPLACEMENTS}, inplace=True) 171 | return df 172 | 173 | raise ValueError(f"No data found for team {team}") 174 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._common.""" 2 | 3 | import datetime 4 | 5 | import pandas as pd 6 | import pytest 7 | import time_machine 8 | 9 | import soccerdata 10 | from soccerdata._common import ( 11 | BaseRequestsReader, 12 | make_game_id, 13 | season_code, 14 | standardize_colnames, 15 | ) 16 | 17 | # _download_and_save 18 | 19 | 20 | def test_download_and_save_not_cached(tmp_path): 21 | reader = BaseRequestsReader() 22 | url = "http://api.clubelo.com/Barcelona" 23 | filepath = tmp_path / "Barcelona.csv" 24 | data = reader._download_and_save(url, filepath) 25 | assert isinstance(pd.read_csv(data), pd.DataFrame) 26 | 27 | 28 | def test_download_and_save_cached(tmp_path): 29 | reader = BaseRequestsReader() 30 | url = "http://api.clubelo.com/Barcelona" 31 | filepath = tmp_path / "Barcelona.csv" 32 | data = reader._download_and_save(url, filepath) 33 | data = reader._download_and_save(url, filepath) 34 | assert isinstance(pd.read_csv(data), pd.DataFrame) 35 | 36 | 37 | def test_download_and_save_no_cache(tmp_path): 38 | reader = BaseRequestsReader(no_cache=True) 39 | url = "http://api.clubelo.com/Barcelona" 40 | filepath = tmp_path / "Barcelona.csv" 41 | filepath.write_text("bogus") 42 | data = reader._download_and_save(url, filepath) 43 | assert len(pd.read_csv(data)) > 1 44 | 45 | 46 | def test_download_and_save_no_store_no_filepath(): 47 | reader = BaseRequestsReader(no_store=True) 48 | url = "http://api.clubelo.com/Barcelona" 49 | data = reader._download_and_save(url, filepath=None) 50 | assert isinstance(pd.read_csv(data), pd.DataFrame) 51 | 52 | 53 | def test_download_and_save_no_cache_filepath(tmp_path): 54 | reader = BaseRequestsReader(no_store=True) 55 | url = "http://api.clubelo.com/Barcelona" 56 | filepath = tmp_path / "Barcelona.csv" 57 | data = reader._download_and_save(url, filepath) 58 | assert isinstance(pd.read_csv(data), pd.DataFrame) 59 | assert not filepath.exists() 60 | 61 | 62 | # def test_download_and_save_requests_tor(tmp_path): 63 | # url = "https://check.torproject.org/api/ip" 64 | # reader = BaseRequestsReader(proxy=None) 65 | # ip_without_proxy = reader.get(url, tmp_path / "myip.txt") 66 | # ip_without_proxy = json.load(ip_without_proxy) 67 | # proxy_reader 
= BaseRequestsReader(proxy="tor") 68 | # ip_with_proxy = proxy_reader.get(url, tmp_path / "myproxyip.txt") 69 | # ip_with_proxy = json.load(ip_with_proxy) 70 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 71 | # assert ip_with_proxy["IsTor"] 72 | # 73 | # 74 | # def test_download_and_save_selenium_tor(tmp_path): 75 | # url = "https://check.torproject.org/api/ip" 76 | # reader = BaseSeleniumReader(proxy=None).get(url, tmp_path / "myip.txt") 77 | # ip_without_proxy = html.parse(reader).xpath("//pre")[0].text 78 | # ip_without_proxy = json.loads(ip_without_proxy) 79 | # proxy_reader = BaseSeleniumReader(proxy="tor").get(url, tmp_path / "myproxyip.txt") 80 | # ip_with_proxy = html.parse(proxy_reader).xpath("//pre")[0].text 81 | # ip_with_proxy = json.loads(ip_with_proxy) 82 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 83 | # assert ip_with_proxy["IsTor"] 84 | # 85 | 86 | # make_game_id 87 | 88 | 89 | def test_make_game_id(): 90 | s = pd.Series( 91 | { 92 | "date": datetime.datetime(1993, 7, 30), 93 | "home_team": "Barcelona", 94 | "away_team": "Real Madrid", 95 | } 96 | ) 97 | game_id = make_game_id(s) 98 | assert game_id == "1993-07-30 Barcelona-Real Madrid" 99 | 100 | 101 | # standardize_colnames 102 | 103 | 104 | def test_standardize_colnames(): 105 | df = pd.DataFrame( 106 | columns=[ 107 | "First Test", 108 | "SecondTest", 109 | "thirdTest", 110 | "Fourthtest", 111 | "Fifth-test", 112 | "TestSix", 113 | ] 114 | ) 115 | df = standardize_colnames( 116 | df, cols=["First Test", "SecondTest", "thirdTest", "Fourthtest", "Fifth-test"] 117 | ) 118 | assert df.columns.tolist() == [ 119 | "first_test", 120 | "second_test", 121 | "third_test", 122 | "fourthtest", 123 | "fifth_test", 124 | "TestSix", 125 | ] 126 | 127 | 128 | # is_complete 129 | 130 | 131 | def test_is_complete(): 132 | reader = BaseRequestsReader(no_store=True) 133 | with time_machine.travel(datetime.datetime(2020, 12, 25, 1, 24)): 134 | assert reader._is_complete("ENG-Premier League", "1920") 135 | assert not reader._is_complete("ENG-Premier League", "2021") 136 | with time_machine.travel(datetime.datetime(2021, 2, 25, 1, 24)): 137 | assert reader._is_complete("ENG-Premier League", "1920") 138 | assert not reader._is_complete("ENG-Premier League", "2021") 139 | with time_machine.travel(datetime.datetime(2021, 7, 1, 1, 24)): 140 | assert reader._is_complete("ENG-Premier League", "1920") 141 | assert reader._is_complete("ENG-Premier League", "2021") 142 | assert not reader._is_complete("ENG-Premier League", "2122") 143 | 144 | 145 | def test_is_complete_default_value(mocker): 146 | mocker.patch.object(soccerdata._common, "LEAGUE_DICT", {"FAKE-Dummy League": {}}) 147 | reader = BaseRequestsReader(no_store=True) 148 | with time_machine.travel(datetime.datetime(2020, 12, 25, 1, 24)): 149 | assert reader._is_complete("FAKE-Dummy League", "1920") 150 | 151 | 152 | def test_is_complete_undefined_league(mocker): 153 | reader = BaseRequestsReader(no_store=True) 154 | with pytest.raises(ValueError): 155 | reader._is_complete("FAKE-Dummy League", "1920") 156 | 157 | 158 | # Season codes 159 | def test_season_pattern1a(): 160 | assert season_code("9495") == "9495" 161 | 162 | 163 | def test_season_pattern1a_warn(): 164 | with pytest.warns(UserWarning) as record: 165 | assert season_code("2021") == "2021" 166 | 167 | # check that only one warning was raised 168 | assert len(record) == 1 169 | # check that the message matches 170 | msg = 'Season id "2021" is ambiguous: interpreting as "20-21"' 171 | assert 
record[0].message.args[0] == msg # type: ignore 172 | 173 | 174 | def test_season_pattern1b(): 175 | my_season = check_post = "1998" 176 | assert season_code(my_season) == "9899" 177 | assert my_season == check_post 178 | 179 | 180 | def test_season_pattern1c(): 181 | assert season_code("1999") == "9900" 182 | 183 | 184 | def test_season_pattern2(): 185 | assert season_code("11") == "1112" 186 | assert season_code("99") == "9900" 187 | 188 | 189 | def test_season_pattern3(): 190 | assert season_code("2011-2012") == "1112" 191 | assert season_code("1999-2000") == "9900" 192 | 193 | 194 | def test_season_pattern4(): 195 | assert season_code("2011-12") == "1112" 196 | assert season_code("1999-00") == "9900" 197 | 198 | 199 | def test_season_pattern5(): 200 | assert season_code("13-14") == "1314" 201 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Usage 4 | ===== 5 | 6 | This tutorial will walk you through installing, configuring, and using 7 | SoccerData. 8 | 9 | 10 | Installation 11 | ------------ 12 | 13 | SoccerData can be easily installed via `pip <https://pip.pypa.io/>`__: 14 | 15 | .. code:: bash 16 | 17 | python3 -m pip install soccerdata 18 | 19 | 20 | Global configuration 21 | --------------------- 22 | 23 | Several settings can be configured globally using the following environment variables: 24 | 25 | ``SOCCERDATA_DIR`` 26 | The directory where the downloaded data is cached and where logs are 27 | stored. By default, all data is stored to ``~/soccerdata`` on Linux / Mac 28 | OS and ``C:\Users\yourusername\soccerdata`` on Windows. 29 | ``SOCCERDATA_NOCACHE`` 30 | If set to "true", no cached data is returned. Note that no-cache does not 31 | mean "don't cache". All downloaded data is still cached and overwrites 32 | existing caches. If the sense of "don't cache" that you want is actually 33 | "don't store", then ``SOCCERDATA_NOSTORE`` is the option to use. By default, 34 | data is retrieved from the cache. 35 | ``SOCCERDATA_NOSTORE`` 36 | If set to "true", no data is stored. By default, data is cached. 37 | ``SOCCERDATA_LOGLEVEL`` 38 | The level of logging to use. By default, this is set to "INFO". 39 | 40 | Example: 41 | 42 | .. code-block:: bash 43 | 44 | # bash 45 | export SOCCERDATA_DIR="$HOME/soccerdata" 46 | export SOCCERDATA_NOCACHE="False" 47 | export SOCCERDATA_NOSTORE="False" 48 | export SOCCERDATA_LOGLEVEL="INFO" 49 | 50 | Scraping data 51 | ------------- 52 | 53 | Each of the supported data sources has its corresponding class for fetching 54 | data with a uniform API. For example, the :class:`~soccerdata.FBref` class is 55 | used to fetch data from `fbref.com <https://fbref.com/>`__. 56 | 57 | .. code:: python 58 | 59 | import soccerdata as sd 60 | 61 | # Create scraper class instance 62 | fbref = sd.FBref() 63 | 64 | This will create a ``soccerdata/FBref/`` folder in your home directory in 65 | which all scraped data will be cached and where logs will be saved. If you 66 | prefer to store the data in a different folder or disable caching, you can 67 | configure this using environment variables (see above) or by setting the 68 | ``data_dir``, ``no_cache`` and ``no_store`` parameters which are supported by 69 | each scraper class. 70 | 71 | ..
code:: python 72 | 73 | # Create scraper class instance with custom caching behavior 74 | fbref = sd.FBref(data_dir="/tmp", no_cache=True, no_store=True) 75 | 76 | Once you have a scraper class instance, you can use it to fetch data. See the 77 | :ref:`API reference <api>` for the full list of options available for each scraper. For 78 | example, to fetch aggregated shooting stats for all teams: 79 | 80 | .. code:: python 81 | 82 | # Create dataframes 83 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 84 | 85 | 86 | The data is always returned as a convenient Pandas DataFrame. 87 | 88 | .. csv-table:: 89 | :file: output.csv 90 | :header-rows: 1 91 | 92 | Not all data sources provide data for all leagues. The leagues available for 93 | each source can be listed with the :meth:`~soccerdata.FBref.available_leagues` 94 | class method. 95 | 96 | .. code:: python 97 | 98 | sd.FBref.available_leagues() 99 | >>> ['ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A'] 100 | 101 | 102 | By default, the data for all available leagues and the 10 most recent seasons will 103 | be downloaded. In most cases, you would want to limit the data to a specific 104 | league and / or seasons. This can be done by passing a list of leagues and 105 | seasons to the constructor of the scraper class. For example: 106 | 107 | .. code:: python 108 | 109 | # Create scraper class instance filtering on specific leagues and seasons 110 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['1718', '1819']) 111 | 112 | 113 | See the examples and :ref:`API reference <api>` for detailed instructions for 114 | each of the available data sources. 115 | 116 | Additional setup for WhoScored 117 | ------------------------------ 118 | 119 | WhoScored implements strong protection against scraping using Incapsula. To 120 | circumvent this, this scraper uses Selenium with ChromeDriver to 121 | emulate a real user. Before using this scraper, you will have to `install 122 | Chrome`_. A Selenium driver matching your Chrome version will be downloaded 123 | automatically when you run the scraper. 124 | 125 | Even with this setup, it is likely that your IP address will get blocked 126 | eventually. Therefore, it is recommended to set up a SOCKS5 proxy with Tor. 127 | Check out the `installation guide`_ on the Tor website for installation 128 | instructions. After installing Tor, make sure to start it up before scraping. 129 | This can easily be done by running the ``tor`` command from your terminal (in 130 | a separate window); Tor will start up and run on “localhost:9050” by default. 131 | Once Tor is running, you can enable the proxy by setting ``proxy='tor'``. 132 | 133 | .. code:: python 134 | 135 | ws = sd.WhoScored(proxy='tor') 136 | 137 | The code snippet above assumes you have a Tor proxy running on 138 | "localhost:9050". Many distributions indeed default to having a SOCKS proxy 139 | listening on port 9050, but some may not. In particular, the Tor Browser 140 | Bundle defaults to listening on port 9150. You can specify a custom host and 141 | port as 142 | 143 | .. code:: python 144 | 145 | ws = sd.WhoScored(proxy={ 146 | "http": "socks5://127.0.0.1:9150", 147 | "https": "socks5://127.0.0.1:9150", 148 | }) 149 | 150 | 151 | .. _installation guide: https://community.torproject.org/onion-services/setup/install/ 152 | ..
_install Chrome: https://www.google.com/chrome/ 153 | 154 | 155 | Adding additional leagues 156 | ------------------------- 157 | 158 | The top-5 European leagues are fully supported. If you want to add more 159 | leagues, you can configure these in ``SOCCERDATA_DIR/config/league_dict.json``. 160 | This file should contain a mapping between a generic name for the league and 161 | the identifier used internally by each data source that you want to support. 162 | For example, for the Dutch Eredivisie this would be: 163 | 164 | .. code-block:: json 165 | 166 | { 167 | "NED-Eredivisie": { 168 | "ClubElo": "NED_1", 169 | "MatchHistory": "N1", 170 | "SoFIFA": "Holland Eredivisie (1)", 171 | "FBref": "Dutch Eredivisie", 172 | "ESPN": "ned.1", 173 | "FiveThirtyEight": "eredivisie", 174 | "WhoScored": "Netherlands - Eredivisie", 175 | "season_start": "Aug", 176 | "season_end": "May" 177 | } 178 | } 179 | 180 | The ``season_end`` and ``season_start`` fields are optional. These should be the 181 | months in which the last game and the first game of a season are played, 182 | respectively. If they are not provided, June is used as the last month of the 183 | season and July as the first one. 184 | 185 | Note that the provided scrapers might give some errors for the leagues you add 186 | yourself. This is because the same data is not always available for all seasons. 187 | 188 | 189 | Uniform team names 190 | ------------------ 191 | 192 | Each data source uses a different set of team names, which makes it difficult 193 | to combine data from multiple sources. To mitigate this, SoccerData allows 194 | translating the team names to uniform names. This is done by providing 195 | a ``SOCCERDATA_DIR/config/teamname_replacements.json`` file. This file should contain a 196 | mapping between a generic name for each team and the team name used by each 197 | data source that you want to support. The example below will map "Tottenham 198 | Hotspur", "Tottenham Hotspur FC" and "Spurs" to "Tottenham" in all scraped 199 | data. 200 | 201 | .. code-block:: json 202 | 203 | { 204 | "Tottenham": ["Tottenham Hotspur", "Tottenham Hotspur FC", "Spurs"] 205 | } 206 | 207 | Next steps 208 | ---------- 209 | Look at you! You’re now basically an expert at SoccerData! ✨ 210 | 211 | From this point you can: 212 | 213 | - Look at the example notebooks for each :ref:`Data source <datasources>`. 214 | - Take a deep dive into the :ref:`API <api>`. 215 | - Give us feedback or contribute, see :ref:`Contributing <contributing>`. 216 | 217 | Have fun! 🎉 218 | -------------------------------------------------------------------------------- /soccerdata/fivethirtyeight.py: -------------------------------------------------------------------------------- 1 | """Scraper for https://projects.fivethirtyeight.com/soccer-predictions.""" 2 | import itertools 3 | import json 4 | from pathlib import Path 5 | from typing import Callable, Dict, List, Optional, Union 6 | 7 | import pandas as pd 8 | 9 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 11 | 12 | FIVETHIRTYEIGHT_DATA_DIR = DATA_DIR / "FiveThirtyEight" 13 | FIVETHIRTYEIGHT_API = "https://projects.fivethirtyeight.com/soccer-predictions" 14 | 15 | 16 | class FiveThirtyEight(BaseRequestsReader): 17 | """Provides pd.DataFrames from fivethirtyeight's "Club Soccer Predictions" project. 18 | 19 | Data will be downloaded as necessary and cached locally in 20 | ``~/soccerdata/data/FiveThirtyEight``.
21 | 22 | Original project and background info: 23 | https://projects.fivethirtyeight.com/soccer-predictions/ and 24 | https://fivethirtyeight.com/features/how-our-club-soccer-projections-work/ 25 | 26 | 27 | Parameters 28 | ---------- 29 | leagues : string or iterable, optional 30 | IDs of Leagues to include. 31 | seasons : string, int or list, optional 32 | Seasons to include. Supports multiple formats. 33 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 34 | proxy : 'tor' or dict or list(dict) or callable, optional 35 | Use a proxy to hide your IP address. Valid options are: 36 | - "tor": Uses the Tor network. Tor should be running in 37 | the background on port 9050. 38 | - dict: A dictionary with the proxy to use. The dict should be 39 | a mapping of supported protocols to proxy addresses. For example:: 40 | 41 | { 42 | 'http': 'http://10.10.1.10:3128', 43 | 'https': 'http://10.10.1.10:1080', 44 | } 45 | 46 | - list(dict): A list of proxies to choose from. A different proxy will 47 | be selected from this list after failed requests, allowing rotating 48 | proxies. 49 | - callable: A function that returns a valid proxy. This function will 50 | be called after failed requests, allowing rotating proxies. 51 | no_cache : bool 52 | If True, will not use cached data. 53 | no_store : bool 54 | If True, will not store downloaded data. 55 | data_dir : Path 56 | Path to directory where data will be cached. 57 | """ 58 | 59 | def __init__( 60 | self, 61 | leagues: Optional[Union[str, List[str]]] = None, 62 | seasons: Optional[Union[str, int, List]] = None, 63 | proxy: Optional[ 64 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 65 | ] = None, 66 | no_cache: bool = NOCACHE, 67 | no_store: bool = NOSTORE, 68 | data_dir: Path = FIVETHIRTYEIGHT_DATA_DIR, 69 | ): 70 | """Initialize a new FiveThirtyEight reader.""" 71 | super().__init__( 72 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 73 | ) 74 | self.seasons = seasons # type: ignore 75 | self._data = {} 76 | 77 | url = f"{FIVETHIRTYEIGHT_API}/data.json" 78 | filepath = self.data_dir / "latest.json" 79 | reader = self.get(url, filepath) 80 | 81 | for k, v in json.load(reader).items(): 82 | self._data[k] = v 83 | 84 | def read_leagues(self) -> pd.DataFrame: 85 | """Retrieve the selected leagues from the datasource. 86 | 87 | Returns 88 | ------- 89 | pd.DataFrame 90 | """ 91 | df = ( 92 | pd.DataFrame.from_dict(self._data["leagues"]) 93 | .rename(columns={"slug": "league", "id": "league_id"}) 94 | .pipe(self._translate_league) 95 | .pipe(standardize_colnames) 96 | .drop(columns=["overview_column", "custom_template", "skip_cols"]) 97 | .set_index("league") 98 | .loc[self._selected_leagues.keys()] 99 | .sort_index() 100 | ) 101 | return df 102 | 103 | def read_games(self) -> pd.DataFrame: 104 | """Retrieve all games for the selected leagues. 
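
        One JSON file per selected league and season is downloaded from the
        forecasts endpoint and cached. A minimal sketch, assuming the league
        and season formats documented on the class:

        >>> fte = FiveThirtyEight(leagues='ENG-Premier League', seasons='20-21')
        >>> games = fte.read_games()  # indexed by (league, season, game)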
105 | 106 | Returns 107 | ------- 108 | pd.DataFrame 109 | """ 110 | col_rename = { 111 | "adj_score1": "adj_score_home", 112 | "adj_score2": "adj_score_away", 113 | "chances1": "chances_home", 114 | "chances2": "chances_away", 115 | "datetime": "date", 116 | "moves1": "moves_home", 117 | "moves2": "moves_away", 118 | "prob1": "prob_home", 119 | "prob2": "prob_away", 120 | "probtie": "prob_tie", 121 | "score1": "score_home", 122 | "score2": "score_away", 123 | "team1": "home_team", 124 | "team1_code": "home_code", 125 | "team1_id": "home_id", 126 | "team1_sdr_id": "home_sdr_id", 127 | "team2": "away_team", 128 | "team2_code": "away_code", 129 | "team2_id": "away_id", 130 | "team2_sdr_id": "away_sdr_id", 131 | } 132 | 133 | filemask = "matches_{}_{}.csv" 134 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_matches.json" 135 | data = [] 136 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 137 | filepath = self.data_dir / filemask.format(lkey, skey) 138 | url = urlmask.format(skey[:2], lkey) 139 | reader = self.get(url, filepath) 140 | data.extend([{"league": lkey, "season": skey, **d} for d in json.load(reader)]) 141 | 142 | df = ( 143 | pd.DataFrame.from_dict(data) 144 | .rename(columns=col_rename) 145 | .assign(date=lambda x: pd.to_datetime(x["date"])) 146 | .replace( 147 | { 148 | "home_team": TEAMNAME_REPLACEMENTS, 149 | "away_team": TEAMNAME_REPLACEMENTS, 150 | } 151 | ) 152 | .drop("id", axis=1) 153 | .drop("league_id", axis=1) 154 | .replace("None", float("nan")) 155 | .pipe(self._translate_league) 156 | ) 157 | 158 | df = df[~df.date.isna()] 159 | df["game"] = df.apply(make_game_id, axis=1) 160 | df.set_index(["league", "season", "game"], inplace=True) 161 | df.sort_index(inplace=True) 162 | return df 163 | 164 | def read_forecasts(self) -> pd.DataFrame: 165 | """Retrieve the forecasted results for the selected leagues. 166 | 167 | Returns 168 | ------- 169 | pd.DataFrame 170 | """ 171 | filemask = "forecasts_{}_{}.csv" 172 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_forecast.json" 173 | data = [] 174 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 175 | filepath = self.data_dir / filemask.format(lkey, skey) 176 | url = urlmask.format(skey[:2], lkey) 177 | reader = self.get(url, filepath) 178 | 179 | forecasts = json.load(reader) 180 | for f in forecasts["forecasts"]: 181 | for t in f["teams"]: 182 | data.append( 183 | { 184 | "league": lkey, 185 | "season": skey, 186 | "last_updated": f["last_updated"], 187 | **t, 188 | } 189 | ) 190 | df = ( 191 | pd.DataFrame.from_dict(data) 192 | .rename(columns={"name": "team"}) 193 | .replace({"team": TEAMNAME_REPLACEMENTS}) 194 | .replace("None", float("nan")) 195 | .pipe(self._translate_league) 196 | .set_index(["league", "season", "last_updated", "team"]) 197 | .sort_index() 198 | ) 199 | return df 200 | 201 | def read_clinches(self) -> pd.DataFrame: 202 | """Retrieve clinches for the selected leagues. 
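
        A clinch marks the date on which an outcome (e.g. winning the league)
        became certain according to the forecast data. A minimal sketch:

        >>> fte = FiveThirtyEight(leagues='ENG-Premier League', seasons='20-21')
        >>> clinches = fte.read_clinches()  # indexed by (league, season, date)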
203 | 204 | Returns 205 | ------- 206 | pd.DataFrame 207 | """ 208 | filemask = "clinches_{}_{}.csv" 209 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_clinches.json" 210 | data = [] 211 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 212 | filepath = self.data_dir / filemask.format(lkey, skey) 213 | url = urlmask.format(skey[:2], lkey) 214 | reader = self.get(url, filepath) 215 | data.extend([{"league": lkey, "season": skey, **c} for c in json.load(reader)]) 216 | 217 | teams = ( 218 | self.read_games()[["home_team", "home_id"]] 219 | .drop_duplicates() 220 | .rename(columns={"home_team": "team", "home_id": "team_id"}) 221 | ) 222 | df = ( 223 | pd.DataFrame.from_dict(data) 224 | .assign(date=lambda x: pd.to_datetime(x["dt"])) 225 | .merge(teams, on="team_id", how="left") 226 | .replace({"team": TEAMNAME_REPLACEMENTS}) 227 | .drop("dt", axis=1) 228 | .drop("league_id", axis=1) 229 | .drop("team_id", axis=1) 230 | .pipe(self._translate_league) 231 | .set_index(["league", "season", "date"]) 232 | .sort_index() 233 | ) 234 | return df 235 | -------------------------------------------------------------------------------- /soccerdata/sofifa.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://sofifa.com.""" 2 | import re 3 | from pathlib import Path 4 | from typing import Callable, Dict, List, Optional, Union 5 | 6 | import pandas as pd 7 | from lxml import html 8 | 9 | from ._common import BaseRequestsReader, standardize_colnames 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 11 | 12 | SO_FIFA_DATADIR = DATA_DIR / "SoFIFA" 13 | SO_FIFA_API = "https://sofifa.com" 14 | 15 | 16 | class SoFIFA(BaseRequestsReader): 17 | """Provides pd.DataFrames from data at http://sofifa.com. 18 | 19 | Data will be downloaded as necessary and cached locally in 20 | ``~/soccerdata/data/SoFIFA``. 21 | 22 | Parameters 23 | ---------- 24 | leagues : string or iterable, optional 25 | IDs of leagues to include. 26 | seasons : string, int or list, optional 27 | Seasons to include. Supports multiple formats. 28 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 29 | proxy : 'tor' or dict or list(dict) or callable, optional 30 | Use a proxy to hide your IP address. Valid options are: 31 | - "tor": Uses the Tor network. Tor should be running in 32 | the background on port 9050. 33 | - dict: A dictionary with the proxy to use. The dict should be 34 | a mapping of supported protocols to proxy addresses. For example:: 35 | 36 | { 37 | 'http': 'http://10.10.1.10:3128', 38 | 'https': 'http://10.10.1.10:1080', 39 | } 40 | 41 | - list(dict): A list of proxies to choose from. A different proxy will 42 | be selected from this list after failed requests, allowing rotating 43 | proxies. 44 | - callable: A function that returns a valid proxy. This function will 45 | be called after failed requests, allowing rotating proxies. 46 | no_cache : bool 47 | If True, will not use cached data. 48 | no_store : bool 49 | If True, will not store downloaded data. 50 | data_dir : Path 51 | Path to directory where data will be cached. 
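
    Examples
    --------
    A minimal sketch; ``read_ratings`` fetches one page per player, so keep
    the league/season selection small:

    >>> import soccerdata as sd
    >>> sofifa = sd.SoFIFA(leagues='ENG-Premier League', seasons='20-21')
    >>> ratings = sofifa.read_ratings()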
52 | """ 53 | 54 | def __init__( 55 | self, 56 | leagues: Optional[Union[str, List[str]]] = None, 57 | seasons: Optional[Union[str, int, List]] = None, 58 | proxy: Optional[ 59 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 60 | ] = None, 61 | no_cache: bool = NOCACHE, 62 | no_store: bool = NOSTORE, 63 | data_dir: Path = SO_FIFA_DATADIR, 64 | ): 65 | """Initialize SoFIFA reader.""" 66 | super().__init__( 67 | leagues=leagues, 68 | proxy=proxy, 69 | no_cache=no_cache, 70 | no_store=no_store, 71 | data_dir=data_dir, 72 | ) 73 | self.rate_limit = 2 74 | self.seasons = seasons # type: ignore 75 | 76 | def read_leagues(self) -> pd.DataFrame: 77 | """Retrieve selected leagues from the datasource. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | """ 83 | # read html page (overview) 84 | filepath = self.data_dir / "leagues.html" 85 | reader = self.get(SO_FIFA_API, filepath) 86 | 87 | # extract league links 88 | leagues = [] 89 | tree = html.parse(reader) 90 | for node in tree.xpath("//select[@id='choices-lg']/optgroup/option"): 91 | leagues.append( 92 | { 93 | "league_id": int(node.get("value")), 94 | "league": node.text, 95 | } 96 | ) 97 | df = pd.DataFrame(leagues).pipe(self._translate_league).set_index("league").sort_index() 98 | return df[df.index.isin(self._selected_leagues.keys())] 99 | 100 | def read_teams(self) -> pd.DataFrame: 101 | """Retrieve teams from the datasource for the selected leagues. 102 | 103 | Returns 104 | ------- 105 | pd.DataFrame 106 | """ 107 | # build url 108 | urlmask = SO_FIFA_API + "/teams?lg={}&v={}" 109 | filemask = "teams_{}_{}.html" 110 | 111 | # get league IDs 112 | leagues = self.read_leagues() 113 | 114 | # collect teams 115 | teams = [] 116 | for lkey, _ in self._selected_leagues.items(): 117 | league_id = leagues.at[lkey, "league_id"] 118 | for skey in self.seasons: 119 | # read html page (league overview) 120 | season_id = skey[:2] 121 | filepath = self.data_dir / filemask.format(lkey, skey) 122 | url = urlmask.format(league_id, season_id) 123 | reader = self.get(url, filepath) 124 | 125 | # extract team links 126 | tree = html.parse(reader) 127 | pat_team = re.compile(r"\/team\/(\d+)\/[\w-]+\/") 128 | for node in tree.xpath("//a[contains(@href,'/team/')]"): 129 | # extract team IDs from links 130 | teams.append( 131 | { 132 | "team_id": int( 133 | re.search(pat_team, node.get("href")).group(1) # type: ignore 134 | ), 135 | "team": node.xpath(".//div")[0].text, 136 | "league": lkey, 137 | "season": skey, 138 | } 139 | ) 140 | 141 | # return data frame 142 | df = ( 143 | pd.DataFrame(teams) 144 | .replace({"team": TEAMNAME_REPLACEMENTS}) 145 | .set_index(["league", "season", "team"]) 146 | .sort_index() 147 | ) 148 | return df 149 | 150 | def read_players(self) -> pd.DataFrame: 151 | """Retrieve players from the datasource for the selected leagues. 
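
        Player IDs and names are parsed from the team pages returned by
        :meth:`read_teams`, so one request is made per team. A minimal sketch:

        >>> sofifa = SoFIFA(leagues='ENG-Premier League', seasons='20-21')
        >>> players = sofifa.read_players()  # indexed by (league, season, team, player)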
152 | 153 | Returns 154 | ------- 155 | pd.DataFrame 156 | """ 157 | # build url 158 | urlmask = SO_FIFA_API + "/team/{}?v={}" 159 | filemask = str(self.data_dir / "players_{}_{}.html") 160 | 161 | # get team IDs 162 | teams = self.read_teams().reset_index() 163 | 164 | # collect players 165 | players = [] 166 | for _, team in teams.iterrows(): 167 | season_id = team.season[:2] 168 | team_name = team.team 169 | # read html page (team overview) 170 | filepath = self.data_dir / filemask.format(team_name, season_id) 171 | url = urlmask.format(team["team_id"], season_id) 172 | reader = self.get(url, filepath) 173 | 174 | # extract player links 175 | tree = html.parse(reader) 176 | pat_player = re.compile(r"\/player\/(\d+)\/[\w-]+\/") 177 | for node in tree.xpath("//a[contains(@href,'/player/') and @title]"): 178 | # extract player IDs from links 179 | # extract player names from links 180 | players.append( 181 | { 182 | "player_id": int( 183 | re.search(pat_player, node.get("href")).group(1) # type: ignore 184 | ), 185 | "player": node.get("title"), 186 | "team": team_name, 187 | "league": team.league, 188 | "season": team.season, 189 | } 190 | ) 191 | 192 | # return data frame 193 | df = pd.DataFrame(players).set_index(["league", "season", "team", "player"]).sort_index() 194 | return df 195 | 196 | def read_ratings(self) -> pd.DataFrame: 197 | """Retrieve ratings from the datasource for the selected leagues. 198 | 199 | Returns 200 | ------- 201 | pd.DataFrame 202 | """ 203 | # build url 204 | urlmask = SO_FIFA_API + "/player/{}?v={}" 205 | filemask = "player_{}_{}.html" 206 | 207 | # get player IDs 208 | players = self.read_players().reset_index() 209 | 210 | # prepare empty data frame 211 | ratings = [] 212 | 213 | # define labels to use for score extraction from player profile pages 214 | score_labels = [ 215 | "Overall Rating", 216 | "Potential", 217 | "Crossing", 218 | "Finishing", 219 | "Heading Accuracy", 220 | "Short Passing", 221 | "Volleys", 222 | "Dribbling", 223 | "Curve", 224 | "FK Accuracy", 225 | "Long Passing", 226 | "Ball Control", 227 | "Acceleration", 228 | "Sprint Speed", 229 | "Agility", 230 | "Reactions", 231 | "Balance", 232 | "Shot Power", 233 | "Jumping", 234 | "Stamina", 235 | "Strength", 236 | "Long Shots", 237 | "Aggression", 238 | "Interceptions", 239 | "Positioning", 240 | "Vision", 241 | "Penalties", 242 | "Composure", 243 | "Marking", 244 | "Standing Tackle", 245 | "Sliding Tackle", 246 | "GK Diving", 247 | "GK Handling", 248 | "GK Kicking", 249 | "GK Positioning", 250 | "GK Reflexes", 251 | ] 252 | 253 | for _, player in players.iterrows(): 254 | # read html page (player overview) 255 | player_name = player.player 256 | filepath = self.data_dir / filemask.format(player_name, player.season) 257 | url = urlmask.format(player["player_id"], player.season[:2]) 258 | reader = self.get(url, filepath) 259 | 260 | # extract scores one-by-one 261 | tree = html.parse(reader) 262 | scores = { 263 | "player": player_name, 264 | "league": player.league, 265 | "season": player.season, 266 | } 267 | for s in score_labels: 268 | nodes = tree.xpath( 269 | "(//li[not(self::script)] | //div)" 270 | f"[.//text()[contains(.,'{s}')]]" 271 | "/span[contains(@class, 'tag')]" 272 | ) 273 | # for multiple matches, only accept first match 274 | if len(nodes) >= 1: 275 | scores[s] = nodes[0].text.strip() 276 | # if there's no match, put NA 277 | else: 278 | scores[s] = None 279 | ratings.append(scores) 280 | # return data frame 281 | df = ( 282 | pd.DataFrame(ratings) 283 | 
.pipe(standardize_colnames) 284 | .set_index(["league", "season", "player"]) 285 | .sort_index() 286 | ) 287 | return df 288 | -------------------------------------------------------------------------------- /docs/datasources/ClubElo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 16 | "env: SOCCERDATA_NOCACHE=True\n", 17 | "env: SOCCERDATA_NOSTORE=True\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "%env SOCCERDATA_NOCACHE=True\n", 24 | "%env SOCCERDATA_NOSTORE=True" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "2454afe6", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import soccerdata as sd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "b5784f2d", 40 | "metadata": {}, 41 | "source": [ 42 | "# ClubElo" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": "8dab5be9", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Provides pd.DataFrames from CSV API at http://api.clubelo.com.\n", 56 | "\n", 57 | " Data will be downloaded as necessary and cached locally in\n", 58 | " ``~/soccerdata/data/ClubElo``.\n", 59 | "\n", 60 | " Since the source does not provide league names, this class will not filter\n", 61 | " by league. League names will be inserted from the other sources where\n", 62 | " available. Leagues that are only covered by clubelo.com will have NaN\n", 63 | " values.\n", 64 | "\n", 65 | " Parameters\n", 66 | " ----------\n", 67 | " proxy : 'tor' or dict or list(dict) or callable, optional\n", 68 | " Use a proxy to hide your IP address. Valid options are:\n", 69 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 70 | " the background on port 9050.\n", 71 | " - dict: A dictionary with the proxy to use. The dict should be\n", 72 | " a mapping of supported protocols to proxy addresses. For example::\n", 73 | "\n", 74 | " {\n", 75 | " 'http': 'http://10.10.1.10:3128',\n", 76 | " 'https': 'http://10.10.1.10:1080',\n", 77 | " }\n", 78 | "\n", 79 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 80 | " be selected from this list after failed requests, allowing rotating\n", 81 | " proxies.\n", 82 | " - callable: A function that returns a valid proxy. This function will\n", 83 | " be called after failed requests, allowing rotating proxies.\n", 84 | " no_cache : bool\n", 85 | " If True, will not use cached data.\n", 86 | " no_store : bool\n", 87 | " If True, will not store downloaded data.\n", 88 | " data_dir : Path\n", 89 | " Path to directory where data will be cached.\n", 90 | " \n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "elo = sd.ClubElo()\n", 96 | "print(elo.__doc__)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "3a4c2916", 102 | "metadata": {}, 103 | "source": [ 104 | "## ELO scores for all teams at specified date" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "745be31a", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
rankcountrylevelelofromtoleague
team
Liverpool1.0ENG12047.0838622022-04-202022-04-24ENG-Premier League
Man City2.0ENG12037.0599372022-04-212022-04-23ENG-Premier League
Bayern3.0GER11984.7753912022-04-182022-04-23GER-Bundesliga
Real Madrid4.0ESP11969.5843512022-04-212022-04-26ESP-La Liga
Chelsea5.0ENG11921.1014402022-04-212022-04-24ENG-Premier League
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " rank country level elo from to \\\n", 210 | "team \n", 211 | "Liverpool 1.0 ENG 1 2047.083862 2022-04-20 2022-04-24 \n", 212 | "Man City 2.0 ENG 1 2037.059937 2022-04-21 2022-04-23 \n", 213 | "Bayern 3.0 GER 1 1984.775391 2022-04-18 2022-04-23 \n", 214 | "Real Madrid 4.0 ESP 1 1969.584351 2022-04-21 2022-04-26 \n", 215 | "Chelsea 5.0 ENG 1 1921.101440 2022-04-21 2022-04-24 \n", 216 | "\n", 217 | " league \n", 218 | "team \n", 219 | "Liverpool ENG-Premier League \n", 220 | "Man City ENG-Premier League \n", 221 | "Bayern GER-Bundesliga \n", 222 | "Real Madrid ESP-La Liga \n", 223 | "Chelsea ENG-Premier League " 224 | ] 225 | }, 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "current_elo = elo.read_by_date()\n", 233 | "current_elo.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "246ca661", 239 | "metadata": {}, 240 | "source": [ 241 | "## Full ELO history for one club" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "1c87e14a", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
rankteamcountryleveleloto
from
1939-10-22NaNBarcelonaESP11636.7045901939-12-03
1939-12-04NaNBarcelonaESP11626.1021731939-12-10
1939-12-11NaNBarcelonaESP11636.7282711939-12-17
1939-12-18NaNBarcelonaESP11646.9516601939-12-24
1939-12-25NaNBarcelonaESP11637.4243161939-12-31
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " rank team country level elo to\n", 340 | "from \n", 341 | "1939-10-22 NaN Barcelona ESP 1 1636.704590 1939-12-03\n", 342 | "1939-12-04 NaN Barcelona ESP 1 1626.102173 1939-12-10\n", 343 | "1939-12-11 NaN Barcelona ESP 1 1636.728271 1939-12-17\n", 344 | "1939-12-18 NaN Barcelona ESP 1 1646.951660 1939-12-24\n", 345 | "1939-12-25 NaN Barcelona ESP 1 1637.424316 1939-12-31" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "barca_elo = elo.read_team_history(\"Barcelona\")\n", 355 | "barca_elo.head()" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "soccerdata", 362 | "language": "python", 363 | "name": "soccerdata" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.9.6" 376 | }, 377 | "toc": { 378 | "base_numbering": 1, 379 | "nav_menu": {}, 380 | "number_sections": true, 381 | "sideBar": true, 382 | "skip_h1_title": false, 383 | "title_cell": "Table of Contents", 384 | "title_sidebar": "Contents", 385 | "toc_cell": false, 386 | "toc_position": {}, 387 | "toc_section_display": true, 388 | "toc_window_display": true 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 5 393 | } 394 | -------------------------------------------------------------------------------- /soccerdata/espn.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://site.api.espn.com/apis/site/v2/sports/soccer.""" 2 | import datetime 3 | import itertools 4 | import json 5 | import re 6 | from pathlib import Path 7 | from typing import Callable, Dict, List, Optional, Union 8 | 9 | import pandas as pd 10 | import requests 11 | 12 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 14 | 15 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/summary?event=513466 16 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/scoreboard?dates=20180901 17 | 18 | ESPN_DATADIR = DATA_DIR / 'ESPN' 19 | ESPN_API = 'http://site.api.espn.com/apis/site/v2/sports/soccer' 20 | 21 | 22 | class ESPN(BaseRequestsReader): 23 | """Provides pd.DataFrames from JSON api available at http://site.api.espn.com. 24 | 25 | Data will be downloaded as necessary and cached locally in 26 | ``~/soccerdata/data/ESPN``. 27 | 28 | Parameters 29 | ---------- 30 | leagues : string or iterable, optional 31 | IDs of leagues to include. 32 | 33 | seasons : string, int or list, optional 34 | Seasons to include. Supports multiple formats. 35 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 36 | proxy : 'tor' or dict or list(dict) or callable, optional 37 | Use a proxy to hide your IP address. Valid options are: 38 | - "tor": Uses the Tor network. Tor should be running in 39 | the background on port 9050. 40 | - dict: A dictionary with the proxy to use. The dict should be 41 | a mapping of supported protocols to proxy addresses. For example:: 42 | 43 | { 44 | 'http': 'http://10.10.1.10:3128', 45 | 'https': 'http://10.10.1.10:1080', 46 | } 47 | 48 | - list(dict): A list of proxies to choose from. 
A different proxy will 49 | be selected from this list after failed requests, allowing rotating 50 | proxies. 51 | - callable: A function that returns a valid proxy. This function will 52 | be called after failed requests, allowing rotating proxies. 53 | no_cache : bool 54 | If True, will not use cached data. 55 | no_store : bool 56 | If True, will not store downloaded data. 57 | data_dir : Path 58 | Path to directory where data will be cached. 59 | """ 60 | 61 | def __init__( 62 | self, 63 | leagues: Optional[Union[str, List[str]]] = None, 64 | seasons: Optional[Union[str, int, List]] = None, 65 | proxy: Optional[ 66 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 67 | ] = None, 68 | no_cache: bool = NOCACHE, 69 | no_store: bool = NOSTORE, 70 | data_dir: Path = ESPN_DATADIR, 71 | ): 72 | """Initialize a new ESPN reader.""" 73 | super().__init__( 74 | leagues=leagues, 75 | proxy=proxy, 76 | no_cache=no_cache, 77 | no_store=no_store, 78 | data_dir=data_dir, 79 | ) 80 | self.seasons = seasons # type: ignore 81 | 82 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 83 | """Retrieve the game schedule for the selected leagues and seasons. 84 | 85 | Parameters 86 | ---------- 87 | force_cache : bool 88 | By default no cached data is used for the current season. 89 | If True, will force the use of cached data anyway. 90 | 91 | Returns 92 | ------- 93 | pd.DataFrame 94 | """ 95 | urlmask = ESPN_API + '/{}/scoreboard?dates={}' 96 | filemask = 'Schedule_{}_{}.json' 97 | 98 | df_list = [] 99 | # Get match days 100 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 101 | if int(skey[:2]) > int(str(datetime.datetime.now().year + 1)[-2:]): 102 | start_date = ''.join(['19', skey[:2], '07', '01']) 103 | else: 104 | start_date = ''.join(['20', skey[:2], '07', '01']) 105 | 106 | url = urlmask.format(lkey, start_date) 107 | resp = requests.get(url=url) 108 | data = resp.json() 109 | 110 | match_dates = [ 111 | datetime.datetime.strptime(d, '%Y-%m-%dT%H:%MZ').strftime('%Y%m%d') 112 | for d in data['leagues'][0]['calendar'] 113 | ] 114 | for date in match_dates: 115 | url = urlmask.format(lkey, date) 116 | filepath = self.data_dir / filemask.format(lkey, date) 117 | current_season = not self._is_complete(lkey, skey) 118 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 119 | 120 | data = json.load(reader) 121 | df_list.extend( 122 | [ 123 | { 124 | 'league': lkey, 125 | 'season': skey, 126 | 'date': e['date'], 127 | 'home_team': e['competitions'][0]['competitors'][0]['team']['name'], 128 | 'away_team': e['competitions'][0]['competitors'][1]['team']['name'], 129 | 'game_id': int(e['id']), 130 | 'league_id': lkey, 131 | } 132 | for e in data['events'] 133 | ] 134 | ) 135 | df = ( 136 | pd.DataFrame(df_list) 137 | .pipe(self._translate_league) 138 | .replace({'home_team': TEAMNAME_REPLACEMENTS, 'away_team': TEAMNAME_REPLACEMENTS}) 139 | .assign(date=lambda x: pd.to_datetime(x['date'])) 140 | .dropna(subset=['home_team', 'away_team', 'date']) 141 | .assign(game=lambda df: df.apply(make_game_id, axis=1)) 142 | .set_index(['league', 'season', 'game']) 143 | .sort_index() 144 | ) 145 | 146 | return df 147 | 148 | def read_matchsheet(self, match_id: Optional[Union[int, List[int]]] = None) -> pd.DataFrame: 149 | """Retrieve match sheets for the selected leagues and seasons. 
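A minimal usage sketch (the league and season values below are illustrative)::

            espn = ESPN(leagues="ENG-Premier League", seasons="2018-19")
            sheets = espn.read_matchsheet()  # or pass match_id=... to fetch a single game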
150 | 151 | Parameters 152 | ---------- 153 | match_id : int or list of int, optional 154 | Retrieve the match sheet for a specific game. 155 | 156 | Raises 157 | ------ 158 | ValueError 159 | If no games with the given IDs were found for the selected seasons and leagues. 160 | 161 | Returns 162 | ------- 163 | pd.DataFrame. 164 | """ 165 | urlmask = ESPN_API + '/{}/summary?event={}' 166 | filemask = 'Summary_{}.json' 167 | 168 | df_schedule = self.read_schedule().reset_index() 169 | if match_id is not None: 170 | iterator = df_schedule[ 171 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 172 | ] 173 | if len(iterator) == 0: 174 | raise ValueError( 175 | 'No games with the given IDs found for the selected seasons and leagues.' 176 | ) 177 | else: 178 | iterator = df_schedule 179 | 180 | df_list = [] 181 | for i, match in iterator.iterrows(): 182 | url = urlmask.format(match['league_id'], match['game_id']) 183 | filepath = self.data_dir / filemask.format(match['game_id']) 184 | reader = self.get(url, filepath) 185 | 186 | data = json.load(reader) 187 | for i in range(2): 188 | match_sheet = { 189 | 'game': match['game'], 190 | 'league': match['league'], 191 | 'season': match['season'], 192 | 'team': data['boxscore']['form'][i]['team']['displayName'], 193 | 'is_home': (i == 0), 194 | 'venue': data['gameInfo']['venue']['fullName'] 195 | if 'venue' in data['gameInfo'] 196 | else None, 197 | 'attendance': data['gameInfo']['attendance'], 198 | 'capacity': data['gameInfo']['venue']['capacity'] 199 | if 'venue' in data['gameInfo'] 200 | else None, 201 | 'roster': data['rosters'][i]['roster'], 202 | } 203 | if 'statistics' in data['boxscore']['teams'][i]: 204 | for stat in data['boxscore']['teams'][i]['statistics']: 205 | match_sheet[stat['name']] = stat['displayValue'] 206 | df_list.append(match_sheet) 207 | df = ( 208 | pd.DataFrame(df_list) 209 | .replace({'team': TEAMNAME_REPLACEMENTS}) 210 | .pipe(standardize_colnames) 211 | .set_index(['league', 'season', 'game', 'team']) 212 | .sort_index() 213 | ) 214 | return df 215 | 216 | def read_lineup( # noqa: C901 217 | self, match_id: Optional[Union[int, List[int]]] = None 218 | ) -> pd.DataFrame: 219 | """Retrieve lineups for the selected leagues and seasons. 220 | 221 | Parameters 222 | ---------- 223 | match_id : int or list of int, optional 224 | Retrieve the lineup for a specific game. 225 | 226 | Raises 227 | ------ 228 | ValueError 229 | If no games with the given IDs were found for the selected seasons and leagues. 230 | 231 | Returns 232 | ------- 233 | pd.DataFrame. 234 | """ 235 | urlmask = ESPN_API + '/{}/summary?event={}' 236 | filemask = 'Summary_{}.json' 237 | 238 | df_schedule = self.read_schedule().reset_index() 239 | if match_id is not None: 240 | iterator = df_schedule[ 241 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 242 | ] 243 | if len(iterator) == 0: 244 | raise ValueError( 245 | 'No games with the given IDs found for the selected seasons and leagues.' 
246 | ) 247 | else: 248 | iterator = df_schedule 249 | 250 | df_list = [] 251 | for _, match in iterator.iterrows(): 252 | url = urlmask.format(match['league_id'], match['game_id']) 253 | filepath = self.data_dir / filemask.format(match['game_id']) 254 | reader = self.get(url, filepath) 255 | 256 | data = json.load(reader) 257 | for i in range(2): 258 | for p in data['rosters'][i]['roster']: 259 | match_sheet = { 260 | 'game': match['game'], 261 | 'league': match['league'], 262 | 'season': match['season'], 263 | 'team': data['boxscore']['form'][i]['team']['displayName'], 264 | 'is_home': (i == 0), 265 | 'player': p['athlete']['displayName'], 266 | 'position': p['position']['name'] if 'position' in p else None, 267 | 'formation_place': p['formationPlace'] if 'formationPlace' in p else None, 268 | } 269 | 270 | if p['starter']: 271 | match_sheet['sub_in'] = 'start' 272 | elif p['subbedIn']: 273 | ii = [i for i, x in enumerate(p['plays']) if x['substitution']][0] 274 | match_sheet['sub_in'] = sum( 275 | map( 276 | int, 277 | re.findall( 278 | r'(\d{1,3})', 279 | p['plays'][ii]['clock']['displayValue'], 280 | ), 281 | ) 282 | ) 283 | else: 284 | match_sheet['sub_in'] = None 285 | 286 | if (p['starter'] or p['subbedIn']) and not p['subbedOut']: 287 | match_sheet['sub_out'] = 'end' 288 | elif p['subbedOut']: 289 | j = 0 if not p['subbedIn'] else 1 290 | ii = [i for i, x in enumerate(p['plays']) if x['substitution']][j] 291 | match_sheet['sub_out'] = sum( 292 | map( 293 | int, 294 | re.findall( 295 | r'(\d{1,3})', 296 | p['plays'][ii]['clock']['displayValue'], 297 | ), 298 | ) 299 | ) 300 | else: 301 | match_sheet['sub_out'] = None 302 | 303 | if 'stats' in p: 304 | for stat in p['stats']: 305 | match_sheet[stat['name']] = stat['value'] 306 | 307 | df_list.append(match_sheet) 308 | df = ( 309 | pd.DataFrame(df_list) 310 | .replace({'team': TEAMNAME_REPLACEMENTS}) 311 | .pipe(standardize_colnames) 312 | .set_index(['league', 'season', 'game', 'team', 'player']) 313 | .sort_index() 314 | ) 315 | return df 316 | -------------------------------------------------------------------------------- /soccerdata/_common.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import pprint 4 | import random 5 | import re 6 | import time 7 | import warnings 8 | from abc import ABC, abstractmethod 9 | from datetime import date, datetime, timedelta 10 | from pathlib import Path 11 | from typing import IO, Callable, Dict, Iterable, List, Optional, Union 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import requests 16 | import undetected_chromedriver as uc 17 | from dateutil.relativedelta import relativedelta 18 | from selenium.common.exceptions import WebDriverException 19 | 20 | from ._config import DATA_DIR, LEAGUE_DICT, logger 21 | 22 | 23 | class BaseReader(ABC): 24 | """Base class for data readers. 25 | 26 | Parameters 27 | ---------- 28 | leagues : str or list of str, optional 29 | The leagues to read. If None, all available leagues are read. 30 | proxy : 'tor' or dict or list(dict) or callable, optional 31 | Use a proxy to hide your IP address. Valid options are: 32 | - "tor": Uses the Tor network. Tor should be running in 33 | the background on port 9050. 34 | - dict: A dictionary with the proxy to use. The dict should be 35 | a mapping of supported protocols to proxy addresses.
For example:: 36 | 37 | { 38 | 'http': 'http://10.10.1.10:3128', 39 | 'https': 'http://10.10.1.10:1080', 40 | } 41 | 42 | - list(dict): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | leagues: Optional[Union[str, List[str]]] = None, 58 | proxy: Optional[ 59 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 60 | ] = None, 61 | no_cache: bool = False, 62 | no_store: bool = False, 63 | data_dir: Path = DATA_DIR, 64 | ): 65 | """Create a new data reader.""" 66 | if isinstance(proxy, str) and proxy.lower() == "tor": 67 | self.proxy = lambda: { 68 | "http": "socks5://127.0.0.1:9050", 69 | "https": "socks5://127.0.0.1:9050", 70 | } 71 | elif isinstance(proxy, dict): 72 | self.proxy = lambda: proxy # type: ignore 73 | elif isinstance(proxy, list): 74 | self.proxy = lambda: random.choice(proxy) # type: ignore 75 | elif callable(proxy): 76 | self.proxy = proxy 77 | else: 78 | self.proxy = lambda: {} 79 | 80 | self._selected_leagues = leagues # type: ignore 81 | self.no_cache = no_cache 82 | self.no_store = no_store 83 | self.data_dir = data_dir 84 | self.rate_limit = 0 85 | self.max_delay = 0 86 | if self.no_store: 87 | logger.info("Caching is disabled") 88 | else: 89 | logger.info("Saving cached data to %s", self.data_dir) 90 | self.data_dir.mkdir(parents=True, exist_ok=True) 91 | 92 | def get( 93 | self, 94 | url: str, 95 | filepath: Optional[Path] = None, 96 | max_age: Optional[Union[int, timedelta]] = None, 97 | no_cache: bool = False, 98 | var: Optional[str] = None, 99 | ) -> IO[bytes]: 100 | """Load data from `url`. 101 | 102 | By default, the source of `url` is downloaded and saved to `filepath`. 103 | If `filepath` exists, the `url` is not visited and the cached data is 104 | returned. 105 | 106 | Parameters 107 | ---------- 108 | url : str 109 | URL to download. 110 | filepath : Path, optional 111 | Path to save downloaded file. If None, downloaded data is not cached. 112 | max_age : int for age in days, or timedelta object 113 | The max. age of locally cached file before re-download. 114 | no_cache : bool 115 | If True, will not use cached data. Overrides the class property. 116 | var : str, optional 117 | Return a javascript variable instead of the page source. 118 | 119 | Raises 120 | ------ 121 | TypeError 122 | If max_age is not an integer or timedelta object. 123 | 124 | Returns 125 | ------- 126 | io.BufferedIOBase 127 | File-like object of downloaded data. 128 | """ 129 | is_cached = self._is_cached(filepath, max_age) 130 | if no_cache or self.no_cache or not is_cached: 131 | logger.debug("Scraping %s", url) 132 | return self._download_and_save(url, filepath, var) 133 | logger.debug("Retrieving %s from cache", url) 134 | assert filepath is not None 135 | return filepath.open(mode="rb") 136 | 137 | def _is_cached( 138 | self, 139 | filepath: Optional[Path] = None, 140 | max_age: Optional[Union[int, timedelta]] = None, 141 | ) -> bool: 142 | """Check if `filepath` contains valid cached data. 
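For instance, a file cached more than a week ago could be treated as stale (a sketch; ``reader`` stands for any concrete reader instance)::

            reader._is_cached(filepath, max_age=7)  # equivalently max_age=timedelta(days=7)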
143 | 144 | Parameters 145 | ---------- 146 | filepath : Path, optional 147 | Path where file should be cached. If None, return False. 148 | max_age : int for age in days, or timedelta object 149 | The max. age of locally cached file. 150 | 151 | Raises 152 | ------ 153 | TypeError 154 | If max_age is not an integer or timedelta object. 155 | 156 | Returns 157 | ------- 158 | bool 159 | True in case of a cache hit, otherwise False. 160 | """ 161 | # Validate inputs 162 | if max_age is not None: 163 | if isinstance(max_age, int): 164 | _max_age = timedelta(days=max_age) 165 | elif isinstance(max_age, timedelta): 166 | _max_age = max_age 167 | else: 168 | raise TypeError("max_age must be of type int or datetime.timedelta") 169 | else: 170 | _max_age = None 171 | 172 | cache_invalid = False 173 | # Check if cached file is too old 174 | if _max_age is not None and filepath is not None and filepath.exists(): 175 | last_modified = datetime.fromtimestamp(filepath.stat().st_mtime) 176 | now = datetime.now() 177 | if (now - last_modified) > _max_age: 178 | cache_invalid = True 179 | 180 | return not cache_invalid and filepath is not None and filepath.exists() 181 | 182 | @abstractmethod 183 | def _download_and_save( 184 | self, 185 | url: str, 186 | filepath: Optional[Path] = None, 187 | var: Optional[str] = None, 188 | ) -> IO[bytes]: 189 | """Download data at `url` to `filepath`. 190 | 191 | Parameters 192 | ---------- 193 | url : str 194 | URL to download. 195 | filepath : Path, optional 196 | Path to save downloaded file. If None, downloaded data is not cached. 197 | var : str, optional 198 | Return a javascript variable instead of the page source. 199 | 200 | Returns 201 | ------- 202 | io.BufferedIOBase 203 | File-like object of downloaded data. 204 | """ 205 | 206 | @classmethod 207 | def available_leagues(cls) -> List[str]: 208 | """Return a list of league IDs available for this source.""" 209 | return sorted(cls._all_leagues().keys()) 210 | 211 | @classmethod 212 | def _all_leagues(cls) -> Dict[str, str]: 213 | """Return a dict mapping all canonical league IDs to source league IDs.""" 214 | if not hasattr(cls, "_all_leagues_dict"): 215 | cls._all_leagues_dict = { # type: ignore 216 | k: v[cls.__name__] for k, v in LEAGUE_DICT.items() if cls.__name__ in v 217 | } 218 | return cls._all_leagues_dict # type: ignore 219 | 220 | @classmethod 221 | def _translate_league(cls, df: pd.DataFrame, col: str = "league") -> pd.DataFrame: 222 | """Map source league ID to canonical ID.""" 223 | flip = {v: k for k, v in cls._all_leagues().items()} 224 | mask = ~df[col].isin(flip) 225 | df.loc[mask, col] = np.nan 226 | df[col] = df[col].replace(flip) 227 | return df 228 | 229 | @property 230 | def _selected_leagues(self) -> Dict[str, str]: 231 | """Return a dict mapping selected canonical league IDs to source league IDs.""" 232 | return self._leagues_dict 233 | 234 | @_selected_leagues.setter 235 | def _selected_leagues(self, ids: Optional[Union[str, List[str]]] = None) -> None: 236 | if ids is None: 237 | self._leagues_dict = self._all_leagues() 238 | else: 239 | if len(ids) == 0: 240 | raise ValueError("Empty iterable not allowed for 'leagues'") 241 | if isinstance(ids, str): 242 | ids = [ids] 243 | tmp_league_dict = {} 244 | for i in ids: 245 | if i not in self._all_leagues(): 246 | raise ValueError( 247 | f""" 248 | Invalid league '{i}'. 
Valid leagues are: 249 | { pprint.pformat(self.available_leagues()) } 250 | """ 251 | ) 252 | tmp_league_dict[i] = self._all_leagues()[i] 253 | self._leagues_dict = tmp_league_dict 254 | 255 | def _is_complete(self, league: str, season: str) -> bool: 256 | """Check if a season is complete.""" 257 | if league in LEAGUE_DICT: 258 | league_dict = LEAGUE_DICT[league] 259 | else: 260 | flip = {v: k for k, v in self._all_leagues().items()} 261 | if league in flip: 262 | league_dict = LEAGUE_DICT[flip[league]] 263 | else: 264 | raise ValueError(f"Invalid league '{league}'") 265 | if "season_end" not in league_dict: 266 | season_ends = date(datetime.strptime(season[-2:], "%y").year, 7, 1) 267 | else: 268 | season_ends = ( 269 | date( 270 | datetime.strptime(season[-2:], "%y").year, 271 | datetime.strptime(league_dict["season_end"], "%b").month, 272 | 1, 273 | ) 274 | + relativedelta(months=1) 275 | ) 276 | return date.today() >= season_ends 277 | 278 | @property 279 | def leagues(self) -> List[str]: 280 | """Return a list of selected leagues.""" 281 | return list(self._leagues_dict.keys()) 282 | 283 | @property 284 | def seasons(self) -> List[str]: 285 | """Return a list of selected seasons.""" 286 | return self._season_ids 287 | 288 | @seasons.setter 289 | def seasons(self, seasons: Optional[Union[str, int, Iterable[Union[str, int]]]]) -> None: 290 | if seasons is None: 291 | logger.info("No seasons provided. Will retrieve data for the last 5 seasons.") 292 | year = datetime.today().year 293 | seasons = range(year, year - 6, -1) 294 | if isinstance(seasons, str) or isinstance(seasons, int): 295 | seasons = [seasons] 296 | self._season_ids = [season_code(s) for s in seasons] 297 | 298 | 299 | class BaseRequestsReader(BaseReader): 300 | """Base class for readers that use the Python requests module.""" 301 | 302 | def __init__( 303 | self, 304 | leagues: Optional[Union[str, List[str]]] = None, 305 | proxy: Optional[ 306 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 307 | ] = None, 308 | no_cache: bool = False, 309 | no_store: bool = False, 310 | data_dir: Path = DATA_DIR, 311 | ): 312 | """Initialize the reader.""" 313 | super().__init__( 314 | no_cache=no_cache, 315 | no_store=no_store, 316 | leagues=leagues, 317 | proxy=proxy, 318 | data_dir=data_dir, 319 | ) 320 | 321 | self._session = self._init_session() 322 | 323 | def _init_session(self) -> requests.Session: 324 | session = requests.Session() 325 | session.proxies.update(self.proxy()) 326 | return session 327 | 328 | def _download_and_save( 329 | self, 330 | url: str, 331 | filepath: Optional[Path] = None, 332 | var: Optional[str] = None, 333 | ) -> IO[bytes]: 334 | """Download file at url to filepath. Overwrites if filepath exists.""" 335 | for i in range(5): 336 | try: 337 | response = self._session.get(url, stream=True) 338 | time.sleep(self.rate_limit + random.random() * self.max_delay) 339 | response.raise_for_status() 340 | if not self.no_store and filepath is not None: 341 | with filepath.open(mode="wb") as fh: 342 | fh.write(response.content) 343 | return io.BytesIO(response.content) 344 | except Exception: 345 | logger.exception( 346 | "Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1 347 | ) 348 | self._session = self._init_session() 349 | continue 350 | 351 | raise ConnectionError("Could not download %s." 
% url) 352 | 353 | 354 | class BaseSeleniumReader(BaseReader): 355 | """Base class for readers that use Selenium.""" 356 | 357 | def __init__( 358 | self, 359 | leagues: Optional[Union[str, List[str]]] = None, 360 | proxy: Optional[ 361 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 362 | ] = None, 363 | no_cache: bool = False, 364 | no_store: bool = False, 365 | data_dir: Path = DATA_DIR, 366 | path_to_browser: Optional[Path] = None, 367 | headless: bool = True, 368 | ): 369 | """Initialize the reader.""" 370 | super().__init__( 371 | no_cache=no_cache, 372 | no_store=no_store, 373 | leagues=leagues, 374 | proxy=proxy, 375 | data_dir=data_dir, 376 | ) 377 | self.path_to_browser = path_to_browser 378 | self.headless = headless 379 | 380 | try: 381 | self._driver = self._init_webdriver() 382 | except WebDriverException as e: 383 | logger.error( 384 | """ 385 | The ChromeDriver was unable to initiate/spawn a new 386 | WebBrowser. You will not be able to scrape new data. 387 | %s 388 | """, 389 | e, 390 | ) 391 | 392 | def _init_webdriver(self) -> "uc.Chrome": 393 | """Start the Selenium driver.""" 394 | # Quit existing driver 395 | if hasattr(self, "_driver"): 396 | self._driver.quit() 397 | # Start a new driver 398 | chrome_options = uc.ChromeOptions() 399 | if self.headless: 400 | chrome_options.add_argument("--headless") 401 | if self.path_to_browser is not None: 402 | chrome_options.add_argument("--binary-location=" + str(self.path_to_browser)) 403 | proxy = self.proxy() 404 | if len(proxy): 405 | proxy_str = ";".join(f"{prot}={url}" for prot, url in proxy.items()) 406 | resolver_rules = "MAP * ~NOTFOUND , EXCLUDE 127.0.0.1" 407 | chrome_options.add_argument("--proxy-server=" + proxy_str) 408 | chrome_options.add_argument("--host-resolver-rules=" + resolver_rules) 409 | return uc.Chrome(options=chrome_options) 410 | 411 | def _download_and_save( # noqa: C901 412 | self, 413 | url: str, 414 | filepath: Optional[Path] = None, 415 | var: Optional[str] = None, 416 | ) -> IO[bytes]: 417 | """Download file at url to filepath. Overwrites if filepath exists.""" 418 | for i in range(5): 419 | try: 420 | self._driver.get(url) 421 | time.sleep(self.rate_limit + random.random() * self.max_delay) 422 | if "Incapsula incident ID" in self._driver.page_source: 423 | raise WebDriverException( 424 | "Your IP is blocked. Use tor or a proxy to continue scraping." 425 | ) 426 | if var is None: 427 | response = self._driver.execute_script( 428 | "return document.body.innerHTML;" 429 | ).encode("utf-8") 430 | else: 431 | response = json.dumps(self._driver.execute_script("return " + var)).encode( 432 | "utf-8" 433 | ) 434 | if not self.no_store and filepath is not None: 435 | filepath.parent.mkdir(parents=True, exist_ok=True) 436 | with filepath.open(mode="wb") as fh: 437 | fh.write(response) 438 | return io.BytesIO(response) 439 | except Exception: 440 | logger.exception( 441 | "Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1 442 | ) 443 | self._driver = self._init_webdriver() 444 | continue 445 | 446 | raise ConnectionError("Could not download %s." 
% url) 447 | 448 | 449 | def season_code(season: Union[str, int]) -> str: # noqa: C901 450 | """Convert a string or int to a season code like '1718'.""" 451 | season = str(season) 452 | pat1 = re.compile(r"^[0-9]{4}$") # 1994 | 9495 453 | pat2 = re.compile(r"^[0-9]{2}$") # 94 454 | pat3 = re.compile(r"^[0-9]{4}-[0-9]{4}$") # 1994-1995 455 | pat4 = re.compile(r"^[0-9]{4}/[0-9]{4}$") # 1994/1995 456 | pat5 = re.compile(r"^[0-9]{4}-[0-9]{2}$") # 1994-95 457 | pat6 = re.compile(r"^[0-9]{2}-[0-9]{2}$") # 94-95 458 | 459 | if re.match(pat1, season): 460 | if int(season[2:]) == int(season[:2]) + 1: 461 | if season == "1920" or season == "2021": 462 | msg = 'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format( 463 | season, season[:2], season[-2:] 464 | ) 465 | warnings.warn(msg) 466 | return season # 9495 467 | elif season[2:] == "99": 468 | return "".join([season[2:], "00"]) # 1999 469 | else: 470 | return "".join([season[-2:], f"{int(season[-2:]) + 1:02d}"]) # 1994 471 | elif re.match(pat2, season): 472 | if season == "99": 473 | return "".join([season, "00"]) # 99 474 | else: 475 | return "".join([season, f"{int(season) + 1:02d}"]) # 94 476 | elif re.match(pat3, season): 477 | return "".join([season[2:4], season[-2:]]) # 1994-1995 478 | elif re.match(pat4, season): 479 | return "".join([season[2:4], season[-2:]]) # 1994/1995 480 | elif re.match(pat5, season): 481 | return "".join([season[2:4], season[-2:]]) # 1994-95 482 | elif re.match(pat6, season): 483 | return "".join([season[:2], season[-2:]]) # 94-95 484 | else: 485 | return season 486 | 487 | 488 | def make_game_id(row: pd.Series) -> str: 489 | """Return a game id based on date, home and away team.""" 490 | if pd.isnull(row["date"]): 491 | game_id = "{}-{}".format( 492 | row["home_team"], 493 | row["away_team"], 494 | ) 495 | else: 496 | game_id = "{} {}-{}".format( 497 | row["date"].strftime("%Y-%m-%d"), 498 | row["home_team"], 499 | row["away_team"], 500 | ) 501 | return game_id 502 | 503 | 504 | def standardize_colnames(df: pd.DataFrame, cols: Optional[List[str]] = None) -> pd.DataFrame: 505 | """Convert DataFrame column names to snake case.""" 506 | 507 | def to_snake(name: str) -> str: 508 | name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) 509 | name = re.sub("__([A-Z])", r"_\1", name) 510 | name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name) 511 | return name.lower().replace("-", "_").replace(" ", "") 512 | 513 | if cols is None: 514 | cols = list(df.columns) 515 | 516 | return df.rename(columns={c: to_snake(c) for c in cols}) 517 | 518 | 519 | def get_proxy() -> Dict[str, str]: 520 | """Return a public proxy.""" 521 | # list of free proxy apis 522 | # protocols: http, https, socks4 and socks5 523 | list_of_proxy_content = [ 524 | "https://proxylist.geonode.com/api/proxy-list?sort_by=lastChecked&sort_type=desc", 525 | ] 526 | 527 | # extracting json data from this list of proxies 528 | full_proxy_list = [] 529 | for proxy_url in list_of_proxy_content: 530 | proxy_json = json.loads(requests.get(proxy_url).text)["data"] 531 | full_proxy_list.extend(proxy_json) 532 | 533 | if not full_proxy_list: 534 | logger.info("There are currently no proxies available. Exiting...") 535 | return {} 536 | else: 537 | logger.info(f"Found {len(full_proxy_list)} proxy servers. 
Checking...\n") 538 | 539 | # creating proxy dict 540 | final_proxy_list = [] 541 | for proxy in full_proxy_list: 542 | protocol = proxy["protocols"][0] 543 | ip_ = proxy["ip"] 544 | port = proxy["port"] 545 | 546 | proxy = { 547 | "https": protocol + "://" + ip_ + ":" + port, 548 | "http": protocol + "://" + ip_ + ":" + port, 549 | } 550 | 551 | final_proxy_list.append(proxy) 552 | 553 | # trying proxy 554 | for proxy in final_proxy_list: 555 | if check_proxy(proxy): 556 | return proxy 557 | 558 | logger.info("There are currently no proxies available. Exiting...") 559 | return {} 560 | 561 | 562 | def check_proxy(proxy: dict) -> bool: 563 | """Check if proxy is working.""" 564 | try: 565 | r0 = requests.get("https://ipinfo.io/json", proxies=proxy, timeout=15) 566 | return r0.status_code == 200 567 | except Exception as error: 568 | logger.error(f"BAD PROXY: Reason: {str(error)}\n") 569 | return False 570 | -------------------------------------------------------------------------------- /docs/datasources/SoFIFA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd\n", 13 | "pd.set_option('display.max_columns', None)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "0f792a6b", 20 | "metadata": { 21 | "nbsphinx": "hidden" 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 29 | "env: SOCCERDATA_NOCACHE=True\n", 30 | "env: SOCCERDATA_NOSTORE=True\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 36 | "%env SOCCERDATA_NOCACHE=True\n", 37 | "%env SOCCERDATA_NOSTORE=True" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "2454afe6", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import soccerdata as sd" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "b5784f2d", 53 | "metadata": {}, 54 | "source": [ 55 | "# SoFIFA" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "8dab5be9", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/soccerdata/_common.py:466: UserWarning: Season id \"2021\" is ambiguous: interpreting as \"20-21\"\n", 69 | " warnings.warn(msg)\n" 70 | ] 71 | }, 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Provides pd.DataFrames from data at http://sofifa.com.\n", 77 | "\n", 78 | " Data will be downloaded as necessary and cached locally in\n", 79 | " ``~/soccerdata/data/SoFIFA``.\n", 80 | "\n", 81 | " Parameters\n", 82 | " ----------\n", 83 | " leagues : string or iterable, optional\n", 84 | " IDs of leagues to include.\n", 85 | " seasons : string, int or list, optional\n", 86 | " Seasons to include. Supports multiple formats.\n", 87 | " Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]\n", 88 | " proxy : 'tor' or or dict or list(dict) or callable, optional\n", 89 | " Use a proxy to hide your IP address. Valid options are:\n", 90 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 91 | " the background on port 9050.\n", 92 | " - dict: A dictionary with the proxy to use. 
The dict should be\n", 93 | " a mapping of supported protocols to proxy addresses. For example::\n", 94 | "\n", 95 | " {\n", 96 | " 'http': 'http://10.10.1.10:3128',\n", 97 | " 'https': 'http://10.10.1.10:1080',\n", 98 | " }\n", 99 | "\n", 100 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 101 | " be selected from this list after failed requests, allowing rotating\n", 102 | " proxies.\n", 103 | " - callable: A function that returns a valid proxy. This function will\n", 104 | " be called after failed requests, allowing rotating proxies.\n", 105 | " no_cache : bool\n", 106 | " If True, will not use cached data.\n", 107 | " no_store : bool\n", 108 | " If True, will not store downloaded data.\n", 109 | " data_dir : Path\n", 110 | " Path to directory where data will be cached.\n", 111 | " \n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "sofifa = sd.SoFIFA(leagues=\"ENG-Premier League\", seasons=2021)\n", 117 | "print(sofifa.__doc__)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "3a4c2916", 123 | "metadata": {}, 124 | "source": [ 125 | "## EA Sports FIFA player ratings" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "id": "745be31a", 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/html": [ 137 | "
\n", 138 | "\n", 151 | "\n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 
423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | "
overall_ratingpotentialcrossingfinishingheading_accuracyshort_passingvolleysdribblingcurvefk_accuracylong_passingball_controlaccelerationsprint_speedagilityreactionsbalanceshot_powerjumpingstaminastrengthlong_shotsaggressioninterceptionspositioningvisionpenaltiescomposuremarkingstanding_tacklesliding_tacklegk_divinggk_handlinggk_kickinggk_positioninggk_reflexes
leagueseasonplayer
ENG-Premier League2021Aaron CresswellBest Overall RatingNone8354687748768076697772677378817186756166737760685973None78791479912
Aaron LennonBest Overall RatingNone7458267268796653647280778173936460685357555072726273None504314771611
Aaron RamsdaleBest Overall RatingNone1514155416181716543547505380496366355915352418642565None16158277847884
Abdoulaye DoucouréBest Overall RatingNone6975698168794442778167766579688473908277788177755477None80761512121514
Adam WebsterBest Overall RatingNone5030787427682725707364746374573777697732757626532477None777510814712
\n", 436 | "
" 437 | ], 438 | "text/plain": [ 439 | " overall_rating potential \\\n", 440 | "league season player \n", 441 | "ENG-Premier League 2021 Aaron Cresswell Best Overall Rating None \n", 442 | " Aaron Lennon Best Overall Rating None \n", 443 | " Aaron Ramsdale Best Overall Rating None \n", 444 | " Abdoulaye Doucouré Best Overall Rating None \n", 445 | " Adam Webster Best Overall Rating None \n", 446 | "\n", 447 | " crossing finishing \\\n", 448 | "league season player \n", 449 | "ENG-Premier League 2021 Aaron Cresswell 83 54 \n", 450 | " Aaron Lennon 74 58 \n", 451 | " Aaron Ramsdale 15 14 \n", 452 | " Abdoulaye Doucouré 69 75 \n", 453 | " Adam Webster 50 30 \n", 454 | "\n", 455 | " heading_accuracy short_passing \\\n", 456 | "league season player \n", 457 | "ENG-Premier League 2021 Aaron Cresswell 68 77 \n", 458 | " Aaron Lennon 26 72 \n", 459 | " Aaron Ramsdale 15 54 \n", 460 | " Abdoulaye Doucouré 69 81 \n", 461 | " Adam Webster 78 74 \n", 462 | "\n", 463 | " volleys dribbling curve \\\n", 464 | "league season player \n", 465 | "ENG-Premier League 2021 Aaron Cresswell 48 76 80 \n", 466 | " Aaron Lennon 68 79 66 \n", 467 | " Aaron Ramsdale 16 18 17 \n", 468 | " Abdoulaye Doucouré 68 79 44 \n", 469 | " Adam Webster 27 68 27 \n", 470 | "\n", 471 | " fk_accuracy long_passing \\\n", 472 | "league season player \n", 473 | "ENG-Premier League 2021 Aaron Cresswell 76 69 \n", 474 | " Aaron Lennon 53 64 \n", 475 | " Aaron Ramsdale 16 54 \n", 476 | " Abdoulaye Doucouré 42 77 \n", 477 | " Adam Webster 25 70 \n", 478 | "\n", 479 | " ball_control acceleration \\\n", 480 | "league season player \n", 481 | "ENG-Premier League 2021 Aaron Cresswell 77 72 \n", 482 | " Aaron Lennon 72 80 \n", 483 | " Aaron Ramsdale 35 47 \n", 484 | " Abdoulaye Doucouré 81 67 \n", 485 | " Adam Webster 73 64 \n", 486 | "\n", 487 | " sprint_speed agility reactions \\\n", 488 | "league season player \n", 489 | "ENG-Premier League 2021 Aaron Cresswell 67 73 78 \n", 490 | " Aaron Lennon 77 81 73 \n", 491 | " Aaron Ramsdale 50 53 80 \n", 492 | " Abdoulaye Doucouré 76 65 79 \n", 493 | " Adam Webster 74 63 74 \n", 494 | "\n", 495 | " balance shot_power jumping \\\n", 496 | "league season player \n", 497 | "ENG-Premier League 2021 Aaron Cresswell 81 71 86 \n", 498 | " Aaron Lennon 93 64 60 \n", 499 | " Aaron Ramsdale 49 63 66 \n", 500 | " Abdoulaye Doucouré 68 84 73 \n", 501 | " Adam Webster 57 37 77 \n", 502 | "\n", 503 | " stamina strength long_shots \\\n", 504 | "league season player \n", 505 | "ENG-Premier League 2021 Aaron Cresswell 75 61 66 \n", 506 | " Aaron Lennon 68 53 57 \n", 507 | " Aaron Ramsdale 35 59 15 \n", 508 | " Abdoulaye Doucouré 90 82 77 \n", 509 | " Adam Webster 69 77 32 \n", 510 | "\n", 511 | " aggression interceptions \\\n", 512 | "league season player \n", 513 | "ENG-Premier League 2021 Aaron Cresswell 73 77 \n", 514 | " Aaron Lennon 55 50 \n", 515 | " Aaron Ramsdale 35 24 \n", 516 | " Abdoulaye Doucouré 78 81 \n", 517 | " Adam Webster 75 76 \n", 518 | "\n", 519 | " positioning vision penalties \\\n", 520 | "league season player \n", 521 | "ENG-Premier League 2021 Aaron Cresswell 60 68 59 \n", 522 | " Aaron Lennon 72 72 62 \n", 523 | " Aaron Ramsdale 18 64 25 \n", 524 | " Abdoulaye Doucouré 77 75 54 \n", 525 | " Adam Webster 26 53 24 \n", 526 | "\n", 527 | " composure marking \\\n", 528 | "league season player \n", 529 | "ENG-Premier League 2021 Aaron Cresswell 73 None \n", 530 | " Aaron Lennon 73 None \n", 531 | " Aaron Ramsdale 65 None \n", 532 | " Abdoulaye Doucouré 77 None \n", 533 | " Adam Webster 77 None 
\n", 534 | "\n", 535 | " standing_tackle sliding_tackle \\\n", 536 | "league season player \n", 537 | "ENG-Premier League 2021 Aaron Cresswell 78 79 \n", 538 | " Aaron Lennon 50 43 \n", 539 | " Aaron Ramsdale 16 15 \n", 540 | " Abdoulaye Doucouré 80 76 \n", 541 | " Adam Webster 77 75 \n", 542 | "\n", 543 | " gk_diving gk_handling gk_kicking \\\n", 544 | "league season player \n", 545 | "ENG-Premier League 2021 Aaron Cresswell 14 7 9 \n", 546 | " Aaron Lennon 14 7 7 \n", 547 | " Aaron Ramsdale 82 77 84 \n", 548 | " Abdoulaye Doucouré 15 12 12 \n", 549 | " Adam Webster 10 8 14 \n", 550 | "\n", 551 | " gk_positioning gk_reflexes \n", 552 | "league season player \n", 553 | "ENG-Premier League 2021 Aaron Cresswell 9 12 \n", 554 | " Aaron Lennon 16 11 \n", 555 | " Aaron Ramsdale 78 84 \n", 556 | " Abdoulaye Doucouré 15 14 \n", 557 | " Adam Webster 7 12 " 558 | ] 559 | }, 560 | "execution_count": 5, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "ratings = sofifa.read_ratings()\n", 567 | "ratings.head()" 568 | ] 569 | } 570 | ], 571 | "metadata": { 572 | "kernelspec": { 573 | "display_name": "soccerdata", 574 | "language": "python", 575 | "name": "soccerdata" 576 | }, 577 | "language_info": { 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "file_extension": ".py", 583 | "mimetype": "text/x-python", 584 | "name": "python", 585 | "nbconvert_exporter": "python", 586 | "pygments_lexer": "ipython3", 587 | "version": "3.9.6" 588 | }, 589 | "toc": { 590 | "base_numbering": 1, 591 | "nav_menu": {}, 592 | "number_sections": true, 593 | "sideBar": true, 594 | "skip_h1_title": false, 595 | "title_cell": "Table of Contents", 596 | "title_sidebar": "Contents", 597 | "toc_cell": false, 598 | "toc_position": {}, 599 | "toc_section_display": true, 600 | "toc_window_display": true 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 5 605 | } 606 | -------------------------------------------------------------------------------- /soccerdata/fbref.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://fbref.com.""" 2 | import itertools 3 | import warnings 4 | from functools import reduce 5 | from pathlib import Path 6 | from typing import Callable, Dict, List, Optional, Union 7 | 8 | import pandas as pd 9 | from lxml import etree, html 10 | 11 | from ._common import ( 12 | BaseRequestsReader, 13 | make_game_id, 14 | season_code, 15 | standardize_colnames, 16 | ) 17 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 18 | 19 | FBREF_DATADIR = DATA_DIR / "FBref" 20 | FBREF_API = "https://fbref.com" 21 | 22 | BIG_FIVE_DICT = { 23 | "it Serie A": "ITA-Serie A", 24 | "fr Ligue 1": "FRA-Ligue 1", 25 | "es La Liga": "ESP-La Liga", 26 | "eng Premier League": "ENG-Premier League", 27 | "de Bundesliga": "GER-Bundesliga", 28 | } 29 | 30 | 31 | class FBref(BaseRequestsReader): 32 | """Provides pd.DataFrames from data at http://fbref.com. 33 | 34 | Data will be downloaded as necessary and cached locally in 35 | ``~/soccerdata/data/FBref``. 36 | 37 | Parameters 38 | ---------- 39 | leagues : string or iterable, optional 40 | IDs of leagues to include. For efficiently reading data from the Top-5 41 | European leagues, use "Big 5 European Leagues Combined". 42 | seasons : string, int or list, optional 43 | Seasons to include. Supports multiple formats. 
44 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 45 | proxy : 'tor' or dict or list(dict) or callable, optional 46 | Use a proxy to hide your IP address. Valid options are: 47 | - "tor": Uses the Tor network. Tor should be running in 48 | the background on port 9050. 49 | - dict: A dictionary with the proxy to use. The dict should be 50 | a mapping of supported protocols to proxy addresses. For example:: 51 | 52 | { 53 | 'http': 'http://10.10.1.10:3128', 54 | 'https': 'http://10.10.1.10:1080', 55 | } 56 | 57 | - list(dict): A list of proxies to choose from. A different proxy will 58 | be selected from this list after failed requests, allowing rotating 59 | proxies. 60 | - callable: A function that returns a valid proxy. This function will 61 | be called after failed requests, allowing rotating proxies. 62 | no_cache : bool 63 | If True, will not use cached data. 64 | no_store : bool 65 | If True, will not store downloaded data. 66 | data_dir : Path 67 | Path to directory where data will be cached. 68 | """ 69 | 70 | def __init__( 71 | self, 72 | leagues: Optional[Union[str, List[str]]] = None, 73 | seasons: Optional[Union[str, int, List]] = None, 74 | proxy: Optional[ 75 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 76 | ] = None, 77 | no_cache: bool = NOCACHE, 78 | no_store: bool = NOSTORE, 79 | data_dir: Path = FBREF_DATADIR, 80 | ): 81 | """Initialize FBref reader.""" 82 | super().__init__( 83 | leagues=leagues, 84 | proxy=proxy, 85 | no_cache=no_cache, 86 | no_store=no_store, 87 | data_dir=data_dir, 88 | ) 89 | self.rate_limit = 3 90 | self.seasons = seasons # type: ignore 91 | # check if all top 5 leagues are selected 92 | selected_leagues = set(self._leagues_dict.keys()) 93 | if set(BIG_FIVE_DICT.values()).issubset(selected_leagues): 94 | warnings.warn( 95 | "You are trying to scrape data for all of the Big 5 European leagues. " 96 | "This can be done more efficiently by setting " 97 | "leagues='Big 5 European Leagues Combined'." 98 | ) 99 | 100 | @property 101 | def leagues(self) -> List[str]: 102 | """Return a list of selected leagues.""" 103 | selected_leagues = set(self._leagues_dict.keys()) 104 | if "Big 5 European Leagues Combined" in selected_leagues: 105 | selected_leagues -= set(BIG_FIVE_DICT.values()) 106 | return list(selected_leagues) 107 | 108 | @classmethod 109 | def _all_leagues(cls) -> Dict[str, str]: 110 | """Return a dict mapping all canonical league IDs to source league IDs.""" 111 | res = super()._all_leagues() 112 | res.update({"Big 5 European Leagues Combined": "Big 5 European Leagues Combined"}) 113 | return res 114 | 115 | def read_leagues(self) -> pd.DataFrame: 116 | """Retrieve selected leagues from the datasource. 
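For example (the league ID below is illustrative)::

            fbref = FBref(leagues="ENG-Premier League")
            fbref.read_leagues()  # one row per selected league, indexed by league ID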
117 | 118 | Returns 119 | ------- 120 | pd.DataFrame 121 | """ 122 | url = f"{FBREF_API}/en/comps/" 123 | filepath = self.data_dir / "leagues.html" 124 | reader = self.get(url, filepath) 125 | 126 | # extract league links 127 | leagues = [] 128 | tree = html.parse(reader) 129 | for table in tree.xpath("//table[contains(@id, 'comps')]"): 130 | df_table = pd.read_html(etree.tostring(table, method="html"))[0] 131 | df_table["url"] = table.xpath(".//th[@data-stat='league_name']/a/@href") 132 | leagues.append(df_table) 133 | df = ( 134 | pd.concat(leagues) 135 | .pipe(standardize_colnames) 136 | .rename(columns={"competition_name": "league"}) 137 | .pipe(self._translate_league) 138 | .drop_duplicates(subset="league") 139 | .set_index("league") 140 | .sort_index() 141 | ) 142 | df["country"] = df["country"].apply( 143 | lambda x: x.split(" ")[1] if isinstance(x, str) else None 144 | ) 145 | return df[df.index.isin(self.leagues)] 146 | 147 | def read_seasons(self) -> pd.DataFrame: 148 | """Retrieve the selected seasons for the selected leagues. 149 | 150 | Returns 151 | ------- 152 | pd.DataFrame 153 | """ 154 | filemask = "seasons_{}.html" 155 | df_leagues = self.read_leagues() 156 | 157 | seasons = [] 158 | for lkey, league in df_leagues.iterrows(): 159 | url = FBREF_API + league.url 160 | filepath = self.data_dir / filemask.format(lkey) 161 | reader = self.get(url, filepath) 162 | 163 | # extract season links 164 | tree = html.parse(reader) 165 | df_table = pd.read_html(etree.tostring(tree), attrs={"id": "seasons"})[0] 166 | df_table["url"] = tree.xpath( 167 | "//table[@id='seasons']//th[@data-stat='year_id']/a/@href" 168 | ) 169 | seasons.append(df_table) 170 | 171 | df = pd.concat(seasons).pipe(standardize_colnames) 172 | # A competition name field is not included in the Big 5 European Leagues Combined 173 | if "competition_name" in df.columns: 174 | df = df.rename(columns={"competition_name": "league"}).pipe(self._translate_league) 175 | else: 176 | df["league"] = "Big 5 European Leagues Combined" 177 | df["season"] = df["season"].apply(lambda x: season_code(x)) 178 | df = df.set_index(["league", "season"]).sort_index() 179 | return df.loc[df.index.isin(itertools.product(self.leagues, self.seasons))] 180 | 181 | def read_team_season_stats( 182 | self, stat_type: str = "standard", opponent_stats: bool = False 183 | ) -> pd.DataFrame: 184 | """Retrieve teams from the datasource for the selected leagues. 185 | 186 | The following stat types are available: 187 | * 'standard' 188 | * 'keeper' 189 | * 'keeper_adv' 190 | * 'shooting' 191 | * 'passing' 192 | * 'passing_types' 193 | * 'goal_shot_creation' 194 | * 'defense' 195 | * 'possession' 196 | * 'playing_time' 197 | * 'misc' 198 | 199 | Parameters 200 | ---------- 201 | stat_type : str 202 | Type of stats to retrieve. 203 | opponent_stats : bool 204 | If True, will retrieve opponent stats. 205 | 206 | Raises 207 | ------ 208 | TypeError 209 | If ``stat_type`` is not valid.
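For example, shooting stats conceded by each team could be retrieved as follows (a sketch; league and season are illustrative)::

            fbref = FBref(leagues="ENG-Premier League", seasons="21-22")
            df = fbref.read_team_season_stats(stat_type="shooting", opponent_stats=True)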
210 | 211 | Returns 212 | ------- 213 | pd.DataFrame 214 | """ 215 | team_stats = [ 216 | "standard", 217 | "keeper", 218 | "keeper_adv", 219 | "shooting", 220 | "passing", 221 | "passing_types", 222 | "goal_shot_creation", 223 | "defense", 224 | "possession", 225 | "playing_time", 226 | "misc", 227 | ] 228 | 229 | filemask = "teams_{}_{}_{}.html" 230 | 231 | if stat_type not in team_stats: 232 | raise TypeError(f"Invalid argument: stat_type should be in {team_stats}") 233 | 234 | if stat_type == "standard": 235 | page = "stats" 236 | elif stat_type == "keeper": 237 | page = "keepers" 238 | elif stat_type == "keeper_adv": 239 | page = "keepersadv" 240 | elif stat_type == "goal_shot_creation": 241 | page = "gca" 242 | stat_type = "gca" 243 | elif stat_type == "playing_time": 244 | page = "playingtime" 245 | else: 246 | page = stat_type 247 | 248 | if opponent_stats: 249 | stat_type += "_against" 250 | else: 251 | stat_type += "_for" 252 | 253 | # get league IDs 254 | seasons = self.read_seasons() 255 | 256 | # collect teams 257 | teams = [] 258 | for (lkey, skey), season in seasons.iterrows(): 259 | big_five = lkey == "Big 5 European Leagues Combined" 260 | # read html page (league overview) 261 | filepath = self.data_dir / filemask.format( 262 | lkey, skey, stat_type if big_five else "all" 263 | ) 264 | url = ( 265 | FBREF_API 266 | + "/".join(season.url.split("/")[:-1]) 267 | + (f"/{page}/squads/" if big_five else "/") 268 | + season.url.split("/")[-1] 269 | ) 270 | reader = self.get(url, filepath) 271 | 272 | # extract team links 273 | tree = html.parse(reader) 274 | if big_five: 275 | df_table = pd.read_html( 276 | etree.tostring(tree), attrs={"id": f"stats_teams_{stat_type}"} 277 | )[0] 278 | df_table["url"] = tree.xpath( 279 | f"//table[@id='stats_teams_{stat_type}']//td[@data-stat='team']/a/@href" 280 | ) 281 | df_table["league"] = ( 282 | df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT) 283 | ) 284 | df_table["season"] = skey 285 | df_table = df_table.drop("Rk", axis=1, level=1).drop("Comp", axis=1, level=1) 286 | else: 287 | df_table = pd.read_html( 288 | etree.tostring(tree), attrs={"id": f"stats_squads_{stat_type}"} 289 | )[0] 290 | df_table["url"] = tree.xpath( 291 | f"//table[@id='stats_squads_{stat_type}']//th[@data-stat='team']/a/@href" 292 | ) 293 | df_table["league"] = lkey 294 | df_table["season"] = skey 295 | teams.append(df_table) 296 | 297 | # return data frame 298 | df = ( 299 | _concat(teams) 300 | .rename(columns={"Squad": "team"}) 301 | .replace({"team": TEAMNAME_REPLACEMENTS}) 302 | .set_index(["league", "season", "team"]) 303 | .sort_index() 304 | ) 305 | return df 306 | 307 | def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: 308 | """Retrieve players from the datasource for the selected leagues. 309 | 310 | The following stat types are available: 311 | * 'standard' 312 | * 'shooting' 313 | * 'passing' 314 | * 'passing_types' 315 | * 'goal_shot_creation' 316 | * 'defense' 317 | * 'possession' 318 | * 'playing_time' 319 | * 'misc' 320 | * 'keeper' 321 | * 'keeper_adv' 322 | 323 | Parameters 324 | ---------- 325 | stat_type :str 326 | Type of stats to retrieve. 327 | 328 | Raises 329 | ------ 330 | TypeError 331 | If ``stat_type`` is not valid. 
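For example, per-player passing stats could be retrieved as follows (a sketch; league and season are illustrative)::

            fbref = FBref(leagues="Big 5 European Leagues Combined", seasons="2020-21")
            df = fbref.read_player_season_stats(stat_type="passing")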
        """
        player_stats = [
            "standard",
            "keeper",
            "keeper_adv",
            "shooting",
            "passing",
            "passing_types",
            "goal_shot_creation",
            "defense",
            "possession",
            "playing_time",
            "misc",
        ]

        filemask = "players_{}_{}_{}.html"

        if stat_type not in player_stats:
            raise TypeError(f"Invalid argument: stat_type should be in {player_stats}")

        if stat_type == "standard":
            page = "stats"
        elif stat_type == "goal_shot_creation":
            page = "gca"
            stat_type = "gca"
        elif stat_type == "playing_time":
            page = "playingtime"
        elif stat_type == "keeper":
            page = "keepers"
        elif stat_type == "keeper_adv":
            page = "keepersadv"
        else:
            page = stat_type

        # get league IDs
        seasons = self.read_seasons()

        # collect players
        players = []
        for (lkey, skey), season in seasons.iterrows():
            big_five = lkey == "Big 5 European Leagues Combined"
            filepath = self.data_dir / filemask.format(lkey, skey, stat_type)
            url = (
                FBREF_API
                + "/".join(season.url.split("/")[:-1])
                + f"/{page}"
                + ("/players/" if big_five else "/")
                + season.url.split("/")[-1]
            )
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            if big_five:
                df_table = pd.read_html(etree.tostring(tree))[0]
                df_table[("Unnamed: league", "league")] = (
                    df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT)
                )
                df_table[("Unnamed: season", "season")] = skey
                df_table = df_table.drop("Comp", axis=1, level=1)
            else:
                el = tree.xpath(f"//comment()[contains(.,'div_stats_{stat_type}')]")
                df_table = pd.read_html(el[0].text, attrs={"id": f"stats_{stat_type}"})[0]
                df_table[("Unnamed: league", "league")] = lkey
                df_table[("Unnamed: season", "season")] = skey
            players.append(df_table)

        # return dataframe
        df = _concat(players)
        df = df[df.Player != "Player"]
        df = (
            df.drop("Matches", axis=1, level=0)
            .drop("Rk", axis=1, level=0)
            .rename(columns={"Player": "player", "Squad": "team"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .set_index(["league", "season", "team", "player"])
            .sort_index()
        )
        df["Nation"] = df["Nation"].apply(
            lambda x: x.split(" ")[1] if isinstance(x, str) and " " in x else None
        )
        return df

    def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the game schedule for the selected leagues and seasons.

        Parameters
        ----------
        force_cache : bool
            By default, no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
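
        Examples
        --------
        A hedged sketch; note that the returned frame keeps ``game_id`` as a
        column, which the per-match readers below can consume:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> schedule = fbref.read_schedule()  # doctest: +SKIP
        >>> schedule = fbref.read_schedule(force_cache=True)  # doctest: +SKIP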
        """
        # get league IDs
        seasons = self.read_seasons()

        # collect teams
        schedule = []
        for (lkey, skey), season in seasons.iterrows():
            # read html page (league overview)
            url_stats = FBREF_API + season.url
            filepath_stats = self.data_dir / f"teams_{lkey}_{skey}.html"
            reader = self.get(url_stats, filepath_stats)
            tree = html.parse(reader)

            url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get("href")
            filepath_fixtures = self.data_dir / f"schedule_{lkey}_{skey}.html"
            current_season = not self._is_complete(lkey, skey)
            reader = self.get(
                url_fixtures, filepath_fixtures, no_cache=current_season and not force_cache
            )
            tree = html.parse(reader)
            table = tree.xpath("//table[contains(@id, 'sched')]")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["Match Report"] = [
                mlink.xpath("./a/@href")[0]
                if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
                else None
                for mlink in table.xpath(".//td[@data-stat='match_report']")
            ]
            df_table["league"] = lkey
            df_table["season"] = skey
            df_table = df_table.dropna(how="all")
            schedule.append(df_table)
        df = (
            pd.concat(schedule)
            .rename(
                columns={
                    "Wk": "week",
                    "Home": "home_team",
                    "Away": "away_team",
                    "xG": "home_xg",
                    "xG.1": "away_xg",
                }
            )
            .replace(
                {
                    "home_team": TEAMNAME_REPLACEMENTS,
                    "away_team": TEAMNAME_REPLACEMENTS,
                }
            )
            .pipe(standardize_colnames)
        )
        df["date"] = pd.to_datetime(df["date"]).ffill()
        df["game"] = df.apply(make_game_id, axis=1)
        df.loc[~df.match_report.isna(), "game_id"] = (
            df.loc[~df.match_report.isna(), "match_report"].str.split("/").str[3]
        )
        df = df.set_index(["league", "season", "game"]).sort_index()
        return df

    def _parse_teams(self, tree: etree.ElementTree) -> List[Dict]:
        """Parse the teams from a match summary page.

        Parameters
        ----------
        tree : etree.ElementTree
            The match summary page.

        Returns
        -------
        list of dict
        """
        team_nodes = tree.xpath("//div[@class='scorebox']//strong/a")[:2]
        teams = []
        for team in team_nodes:
            teams.append({"id": team.get("href").split("/")[3], "name": team.text.strip()})
        return teams

    def read_lineup(
        self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
    ) -> pd.DataFrame:
        """Retrieve lineups for the selected leagues and seasons.

        Parameters
        ----------
        match_id : str or list of str, optional
            Retrieve the lineup for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.

        Returns
        -------
        pd.DataFrame
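
        Examples
        --------
        A hedged sketch that takes game IDs from the schedule instead of
        hard-coding them; the reader configuration is a placeholder:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> lineups = fbref.read_lineup(match_id=game_id)  # doctest: +SKIP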
        """
        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        lineups = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            teams = self._parse_teams(tree)
            tables = tree.xpath("//div[@class='lineup']")
            for j, table in enumerate(tables):
                df_table = pd.read_html(etree.tostring(table))[0]
                df_table.columns = ["jersey_number", "player"]
                df_table["team"] = teams[j]["name"]
                if "Bench" in df_table.jersey_number.values:
                    bench_idx = df_table.index[df_table.jersey_number == "Bench"][0]
                    df_table.loc[:bench_idx, "is_starter"] = True
                    df_table.loc[bench_idx:, "is_starter"] = False
                    # drop the "Bench" separator row
                    df_table = df_table.drop(bench_idx)
                df_table["game"] = game["game"]
                df_table["league"] = game["league"]
                df_table["season"] = game["season"]
                lineups.append(df_table)
        df = pd.concat(lineups).set_index(["league", "season", "game", "team", "player"])
        # TODO: sub in, sub out, position
        return df

    def read_player_match_stats(
        self,
        stat_type: str = "summary",
        match_id: Optional[Union[str, List[str]]] = None,
        force_cache: bool = False,
    ) -> pd.DataFrame:
        """Retrieve per-player match stats for the selected leagues and seasons.

        The following stat types are available:

        * 'summary'
        * 'keepers'
        * 'passing'
        * 'passing_types'
        * 'defense'
        * 'possession'
        * 'misc'

        Parameters
        ----------
        stat_type : str
            Type of stats to retrieve.
        match_id : str or list of str, optional
            Retrieve the stats for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.
        TypeError
            If ``stat_type`` is not valid.

        Returns
        -------
        pd.DataFrame
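
        Examples
        --------
        A hedged sketch; ``game_id`` is obtained from the schedule as in the
        ``read_lineup`` example rather than hard-coded:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> fbref.read_player_match_stats(stat_type="passing", match_id=game_id)  # doctest: +SKIP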
        """
        match_stats = [
            "summary",
            "keepers",
            "passing",
            "passing_types",
            "defense",
            "possession",
            "misc",
        ]

        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        if stat_type not in match_stats:
            raise TypeError(f"Invalid argument: stat_type should be in {match_stats}")

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        stats = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            (home_team, away_team) = self._parse_teams(tree)
            if stat_type == "keepers":
                id_format = "keeper_stats_{}"
            else:
                id_format = "stats_{}_" + stat_type
            table = tree.xpath("//table[@id='" + id_format.format(home_team["id"]) + "']")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["team"] = home_team["name"]
            df_table["game"] = game["game"]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game_id"] = game["game_id"]
            stats.append(df_table)
            table = tree.xpath("//table[@id='" + id_format.format(away_team["id"]) + "']")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["team"] = away_team["name"]
            df_table["game"] = game["game"]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game_id"] = game["game_id"]
            stats.append(df_table)

        df = _concat(stats)
        df = df[~df.Player.str.contains(r"^\d+\sPlayers$")]
        df = (
            df.rename(columns={"Player": "player"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .set_index(["league", "season", "game", "team", "player"])
            .sort_index()
        )
        return df

    def read_shot_events(
        self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
    ) -> pd.DataFrame:
        """Retrieve shooting and shot creation event data for the selected seasons or matches.

        The data returned includes who took the shot, when, with which body
        part and from how far away. Additionally, the player who created the
        chance and the action preceding that are included in the data.

        Parameters
        ----------
        match_id : str or list of str, optional
            Retrieve the shot events for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.

        Returns
        -------
        pd.DataFrame
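
        Examples
        --------
        A hedged sketch; omitting ``match_id`` retrieves shots for every game
        in the selected seasons:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> fbref.read_shot_events(match_id=game_id)  # doctest: +SKIP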
        """
        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        shots = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            df_table = pd.read_html(etree.tostring(tree), attrs={"id": "shots_all"})[0]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game"] = game["game"]
            shots.append(df_table)

        df = (
            _concat(shots)
            .rename(columns={"Squad": "team"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .pipe(
                standardize_colnames,
                cols=["Outcome", "Minute", "Distance", "Player", "Body Part", "Notes", "Event"],
            )
            .set_index(["league", "season", "game", "team", "player"])
            .sort_index()
            .dropna(how="all")
        )
        return df


def _concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Merge matching tables scraped from different pages.

    The level-0 headers are not consistent across seasons and leagues, so
    this function tries to determine uniform column names.

    Parameters
    ----------
    dfs : list(pd.DataFrame)
        Input dataframes.

    Returns
    -------
    pd.DataFrame
        Concatenated dataframe with uniform column names.
    """
    # Look for the most complete level 0 columns
    all_columns = []
    for df in dfs:
        columns = pd.DataFrame(df.columns.tolist())
        # Move missing columns to level 0
        columns.replace({"": None}, inplace=True)
        mask = pd.isnull(columns[1])
        columns.loc[mask, [0, 1]] = columns.loc[mask, [1, 0]].values
        # Rename unnamed columns
        mask = columns[0].str.startswith("Unnamed:").fillna(False)
        columns.loc[mask, 0] = None
        all_columns.append(columns)
    columns = reduce(lambda left, right: left.combine_first(right), all_columns)

    # Move the remaining missing columns back to level 1 and replace them with an empty string
    mask = pd.isnull(columns[0])
    columns.loc[mask, [0, 1]] = columns.loc[mask, [1, 0]].values
    columns.loc[mask, 1] = ""

    for df in dfs:
        df.columns = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

    return pd.concat(dfs)
--------------------------------------------------------------------------------