├── docs ├── license.rst ├── contributing.rst ├── requirements.txt ├── _static │ ├── logo.png │ ├── logo2.png │ ├── favicon.ico │ └── default.css ├── reference │ ├── espn.rst │ ├── sofifa.rst │ ├── matchhistory.rst │ ├── clubelo.rst │ ├── whoscored.rst │ ├── fivethirtyeight.rst │ ├── fbref.rst │ └── index.rst ├── output.csv ├── conf.py ├── datasources │ ├── index.rst │ ├── ClubElo.ipynb │ └── SoFIFA.ipynb ├── index.rst └── usage.rst ├── tests ├── __init__.py ├── test_SoFIFA.py ├── test_MatchHistory.py ├── test_Whoscored.py ├── test_ESPN.py ├── test_Integration.py ├── conftest.py ├── test_ClubElo.py ├── test_config.py ├── test_FiveThirtyEight.py ├── test_FBref.py └── test_common.py ├── .github ├── renovate.json └── workflows │ ├── constraints.txt │ ├── release.yml │ └── ci.yml ├── .bumpversion.cfg ├── soccerdata ├── __init__.py ├── match_history.py ├── _config.py ├── clubelo.py ├── fivethirtyeight.py ├── sofifa.py ├── espn.py ├── _common.py └── fbref.py ├── .readthedocs.yml ├── Makefile ├── .gitignore ├── setup.cfg ├── LICENSE.rst ├── .pre-commit-config.yaml ├── pyproject.toml ├── README.rst ├── noxfile.py └── CONTRIBUTING.rst /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../LICENSE.rst 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the soccerdata package.""" 2 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | .. include:: ../CONTRIBUTING.rst 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2022.9.29 2 | sphinx==4.5.0 3 | nbsphinx==0.8.9 4 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/logo2.png -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base", ":automergePatch"], 3 | "stabilityDays": 7 4 | } 5 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewRowlinson/soccerdata/master/docs/_static/favicon.ico -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | pip==21.3.1 2 | nox==2021.10.1 3 | nox-poetry==0.9.0 4 | poetry==1.1.12 5 | virtualenv==20.10.0 6 | -------------------------------------------------------------------------------- /docs/reference/espn.rst: 
-------------------------------------------------------------------------------- 1 | .. _api-espn: 2 | 3 | ESPN 4 | ===== 5 | 6 | .. autoclass:: soccerdata.ESPN 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/sofifa.rst: -------------------------------------------------------------------------------- 1 | .. _api-sofifa: 2 | 3 | SoFIFA 4 | ======== 5 | 6 | .. autoclass:: soccerdata.SoFIFA 7 | :members: available_leagues, read_ratings 8 | -------------------------------------------------------------------------------- /docs/reference/matchhistory.rst: -------------------------------------------------------------------------------- 1 | .. _api-matchhistory: 2 | 3 | MatchHistory 4 | ============= 5 | 6 | .. autoclass:: soccerdata.MatchHistory 7 | :inherited-members: 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/reference/clubelo.rst: -------------------------------------------------------------------------------- 1 | .. _api-clubelo: 2 | 3 | Club Elo 4 | ======== 5 | 6 | .. autoclass:: soccerdata.ClubElo 7 | :inherited-members: available_leagues 8 | :members: read_by_date, read_team_history 9 | -------------------------------------------------------------------------------- /docs/reference/whoscored.rst: -------------------------------------------------------------------------------- 1 | .. _api-whoscored: 2 | 3 | WhoScored 4 | ========= 5 | 6 | .. autoclass:: soccerdata.WhoScored 7 | :members: available_leagues, read_schedule, read_missing_players, read_events 8 | -------------------------------------------------------------------------------- /docs/reference/fivethirtyeight.rst: -------------------------------------------------------------------------------- 1 | .. _api-fivethirtyeight: 2 | 3 | FiveThirtyEight 4 | =============== 5 | 6 | .. autoclass:: soccerdata.FiveThirtyEight 7 | :members: available_leagues, read_games, read_forecasts, read_clinches 8 | -------------------------------------------------------------------------------- /docs/reference/fbref.rst: -------------------------------------------------------------------------------- 1 | .. _api-fbref: 2 | 3 | FBref 4 | ===== 5 | 6 | .. 
autoclass:: soccerdata.FBref 7 | :members: available_leagues, read_team_season_stats, read_player_season_stats, 8 | read_schedule, read_player_match_stats, read_lineup, read_shot_events 9 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.1.0 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:soccerdata/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | -------------------------------------------------------------------------------- /tests/test_SoFIFA.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.SoFIFA.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | @pytest.mark.fails_gha 10 | def test_sofifa_ratings(sofifa_bundesliga): 11 | assert isinstance(sofifa_bundesliga.read_ratings(), pd.DataFrame) 12 | -------------------------------------------------------------------------------- /tests/test_MatchHistory.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.MatchHistory.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | # Reader 8 | def test_epl_2y(match_epl_2y): 9 | df = match_epl_2y.read_games() 10 | assert isinstance(df, pd.DataFrame) 11 | assert len(df.index.get_level_values("season").unique()) == 2 12 | -------------------------------------------------------------------------------- /tests/test_Whoscored.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.WhoScored.""" 2 | import pandas as pd 3 | import pytest 4 | 5 | # Unittests ------------------------------------------------------------------- 6 | 7 | 8 | @pytest.mark.fails_gha 9 | def test_whoscored_missing_players(whoscored): 10 | assert isinstance(whoscored.read_missing_players(1485184), pd.DataFrame) 11 | 12 | 13 | @pytest.mark.fails_gha 14 | def test_whoscored_events(whoscored): 15 | assert isinstance(whoscored.read_events(1485184), pd.DataFrame) 16 | -------------------------------------------------------------------------------- /soccerdata/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of tools to read and process soccer data from various sources.""" 2 | 3 | __version__ = '1.1.0' 4 | 5 | __all__ = [ 6 | 'FiveThirtyEight', 7 | 'ClubElo', 8 | 'MatchHistory', 9 | 'FBref', 10 | 'ESPN', 11 | 'WhoScored', 12 | 'SoFIFA', 13 | ] 14 | 15 | from .clubelo import ClubElo 16 | from .espn import ESPN 17 | from .fbref import FBref 18 | from .fivethirtyeight import FiveThirtyEight 19 | from .match_history import MatchHistory 20 | from .sofifa import SoFIFA 21 | from .whoscored import WhoScored 22 | -------------------------------------------------------------------------------- /docs/output.csv: -------------------------------------------------------------------------------- 1 | league,season,team,#Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG 2 | ENG-Premier 
League,2021,Arsenal,29,38.0,53,455,141,31.0,11.97,3.71,0.1,0.33,16.9,23,6,6,53.5,49.0,0.11,-0.5,-2.0 3 | ,,Aston Villa,24,38.0,52,518,179,34.6,13.63,4.71,0.09,0.26,16.5,15,5,6,52.9,48.5,0.1,-0.9,-1.5 4 | ,,Brighton,27,38.0,39,476,129,27.1,12.53,3.39,0.07,0.26,16.6,14,6,9,51.6,44.8,0.1,-12.6,-11.8 5 | ,,Burnley,25,38.0,32,383,125,32.6,10.08,3.29,0.08,0.23,16.6,15,3,3,39.9,37.6,0.1,-7.9,-8.6 6 | ,,Chelsea,27,38.0,56,553,194,35.1,14.55,5.11,0.09,0.25,16.3,16,8,10,64.0,56.4,0.1,-8.0,-8.4 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: all 14 | 15 | # Optionally set the version of Python and requirements required to build your docs 16 | python: 17 | version: 3.7 18 | install: 19 | - requirements: docs/requirements.txt 20 | - method: pip 21 | path: . 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init test lint pretty 2 | 3 | BIN = .venv/bin/ 4 | CODE = soccerdata 5 | PY = 3.9 6 | 7 | init: 8 | python3 -m venv .venv 9 | poetry install 10 | 11 | test: 12 | nox -rs tests-$(PY) -- $(args) 13 | 14 | mypy: 15 | nox -rs mypy-$(PY) -- $(args) 16 | 17 | lint: 18 | nox -rs pre-commit -- $(args) 19 | 20 | precommit_install: 21 | nox -rs pre-commit -- install 22 | 23 | bump_major: 24 | $(BIN)bumpversion major 25 | 26 | bump_minor: 27 | $(BIN)bumpversion minor 28 | 29 | bump_patch: 30 | $(BIN)bumpversion patch 31 | 32 | clean: 33 | find . -type f -name "*.py[co]" -delete 34 | find . 
-type d -name "__pycache__" -delete 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | logs 3 | config 4 | notebooks/data 5 | notebooks_priv 6 | 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | 35 | # Translations 36 | *.mo 37 | 38 | # Data 39 | .ipynb_checkpoints 40 | 41 | # Sphinx documentation 42 | docs/_build/ 43 | docs/modules/generated/ 44 | 45 | # Hidden files 46 | .* 47 | 48 | # ...except these 49 | !.gitignore 50 | !.travis.yml 51 | -------------------------------------------------------------------------------- /tests/test_ESPN.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ESPN.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | # Unittests ------------------------------------------------------------------- 7 | 8 | 9 | def test_espn_schedule(espn_seriea): 10 | assert isinstance(espn_seriea.read_schedule(), pd.DataFrame) 11 | 12 | 13 | def test_espn_matchsheet(espn_seriea): 14 | assert isinstance(espn_seriea.read_matchsheet(554204), pd.DataFrame) 15 | 16 | 17 | def test_espn_lineups(espn_seriea): 18 | assert isinstance(espn_seriea.read_lineup(554204), pd.DataFrame) 19 | 20 | 21 | def test_espn_id_not_in_season(espn_seriea): 22 | with pytest.raises(ValueError): 23 | assert isinstance(espn_seriea.read_lineup(123), pd.DataFrame) 24 | -------------------------------------------------------------------------------- /docs/_static/default.css: -------------------------------------------------------------------------------- 1 | .toctree-l1 a:active, 2 | .toctree-l1 a:hover { 3 | background-color: #676767; 4 | } 5 | 6 | .sidebar-logo { 7 | max-width: 100%; 8 | } 9 | 10 | .sidebar-drawer { 11 | width: calc(50% - 25em); 12 | min-width: 22em; 13 | } 14 | 15 | .sidebar-drawer .sidebar-container { 16 | width: 23em; 17 | } 18 | 19 | li.toctree-l2 { 20 | font-size: 80%; 21 | } 22 | 23 | @media (max-width: 67em) { 24 | .sidebar-drawer { 25 | width: 22em; 26 | left: -22em; 27 | } 28 | .sidebar-drawer .sidebar-container { 29 | width: 22em; 30 | } 31 | li.toctree-l2 { 32 | font-size: 75%; 33 | } 34 | } 35 | 36 | /* autosummary table text */ 37 | article .align-center, 38 | article .align-default { 39 | text-align: left; 40 | } 41 | -------------------------------------------------------------------------------- /docs/reference/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _api: 3 | 4 | .. currentmodule:: soccerdata 5 | 6 | API 7 | === 8 | 9 | .. list-table:: 10 | :widths: 30 70 11 | 12 | * - :ref:`Club Elo <api-clubelo>` 13 | - ClubElo reader. 14 | * - :ref:`ESPN <api-espn>` 15 | - ESPN reader. 16 | * - :ref:`FBref <api-fbref>` 17 | - FBref reader. 18 | * - :ref:`FiveThirtyEight <api-fivethirtyeight>` 19 | - FiveThirtyEight reader. 20 | * - :ref:`MatchHistory <api-matchhistory>` 21 | - Football-data.co.uk reader. 22 | * - :ref:`SoFIFA <api-sofifa>` 23 | - SoFIFA reader. 24 | * - :ref:`WhoScored <api-whoscored>` 25 | - WhoScored reader. 26 | 27 | ..
toctree:: 28 | :hidden: 29 | 30 | clubelo 31 | espn 32 | fbref 33 | fivethirtyeight 34 | matchhistory 35 | sofifa 36 | whoscored 37 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | project = "soccerdata" 3 | author = "Pieter Robberechts" 4 | copyright = f"2021, {author}" 5 | extensions = [ 6 | "sphinx.ext.autodoc", 7 | "sphinx.ext.napoleon", 8 | "nbsphinx", 9 | ] 10 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 11 | autodoc_typehints = "description" 12 | autodoc_member_order = "bysource" 13 | 14 | # -- Options for HTML output ------------------------------------------------- 15 | 16 | html_theme = "furo" 17 | html_logo = "_static/logo2.png" 18 | html_favicon = "_static/favicon.ico" 19 | html_theme_options = { 20 | "sidebar_hide_name": True, 21 | "light_css_variables": { 22 | "color-brand-primary": "#2F3C7E", 23 | "color-brand-content": "#2F3C7E", 24 | "color-sidebar-background": "#fdf3f4", 25 | # "color-api-name": "#7bb5b2", 26 | # "color-api-pre-name": "#7bb5b2", 27 | }, 28 | "dark_css_variables": { 29 | "color-brand-primary": "#7C4DFF", 30 | "color-brand-content": "#7C4DFF", 31 | }, 32 | } 33 | 34 | html_static_path = ["_static"] 35 | html_css_files = ["default.css"] 36 | -------------------------------------------------------------------------------- /tests/test_Integration.py: -------------------------------------------------------------------------------- 1 | """Integration tests for soccerdata package.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # TODO: integration tests 9 | # Names of common leagues equal for all classes 10 | # Number of clubs equal for all common leagues over classes 11 | # Clubnames equal for all common leagues over classes 12 | # Number of games equal for all common leagues/seasons over classes 13 | # Scores per game equal for all common leagues over classes 14 | 15 | 16 | @pytest.mark.e2e 17 | def test_five38_vs_elo(): 18 | """We should be able to retrieve the Elo history for all teams in these leagues.""" 19 | league_sel = [ 20 | 'ENG-Premier League', 21 | 'ESP-La Liga', 22 | 'FRA-Ligue 1', 23 | 'GER-Bundesliga', 24 | 'ITA-Serie A', 25 | ] 26 | 27 | five38 = foo.FiveThirtyEight(leagues=league_sel, seasons='1819') 28 | five38_games = five38.read_games() 29 | 30 | elo = foo.ClubElo() 31 | elo_hist = pd.concat([elo.read_team_history(team) for team in set(five38_games['home_team'])]) 32 | 33 | assert set(five38_games['home_team']) - set(elo_hist['team']) == set() 34 | -------------------------------------------------------------------------------- /docs/datasources/index.rst: -------------------------------------------------------------------------------- 1 | .. soccerdata package index documentation toctree 2 | .. _datasources: 3 | 4 | .. currentmodule:: soccerdata 5 | 6 | Data Sources 7 | ============ 8 | 9 | Currently the following data sources are supported: 10 | 11 | .. list-table:: 12 | :widths: 30 70 13 | 14 | * - `Club Elo <https://www.clubelo.com/>`_ 15 | - Team’s relative strengths as Elo ratings, for most European leagues. Recalculated after every round, includes history. 16 | * - `ESPN <https://www.espn.com/soccer/>`_ 17 | - Historical results, statistics and lineups. 18 | * - `FBref <https://www.fbref.com/en/>`_ 19 | - Historical results, lineups, and detailed aggregated statistics for teams and individual players based on StatsBomb data. 20 | * - `FiveThirtyEight <https://fivethirtyeight.com/soccer-predictions/>`_ 21 | - Team’s relative strengths as SPI ratings, predictions and results for the top European and American leagues. 22 | * - `MatchHistory <https://www.football-data.co.uk/>`_ 23 | - Historical results, betting odds and match statistics. Level of detail depends on league. 24 | * - `SoFIFA <https://sofifa.com/>`_ 25 | - Detailed ratings of all players' abilities from EA Sports FIFA. 26 | * - `WhoScored <https://www.whoscored.com/>`_ 27 | - Historical results, match preview data and detailed Opta event stream data for major leagues. 28 | 29 | .. toctree:: 30 | :hidden: 31 | 32 | ClubElo 33 | ESPN 34 | FBref 35 | FiveThirtyEight 36 | MatchHistory 37 | SoFIFA 38 | WhoScored 39 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | enable-extensions = G 3 | exclude = .git, .venv 4 | ignore = 5 | A003 ; 'id' is a python builtin, consider renaming the class attribute 6 | W503 ; line break before binary operator 7 | RST303 ; Unknown directive type "XXX". 8 | RST304 ; Unknown interpreted text role "XXX". 9 | DAR402 ; The docstring describes an exception not explicitly raised. 10 | per-file-ignores = 11 | tests/*:D103 12 | max-complexity = 10 13 | max-line-length = 100 14 | show-source = true 15 | application-import-names = soccerdata 16 | docstring-convention = numpy 17 | strictness = short 18 | docstring_style = numpy 19 | 20 | 21 | [pylint] 22 | good-names=i,j,k,e,x,_,pk,id 23 | max-args=5 24 | max-attributes=10 25 | max-bool-expr=5 26 | max-module-lines=200 27 | max-nested-blocks=2 28 | max-public-methods=5 29 | max-returns=5 30 | max-statements=20 31 | output-format = colorized 32 | 33 | disable= 34 | C0103, ; Constant name "api" doesn't conform to UPPER_CASE naming style (invalid-name) 35 | C0111, ; Missing module docstring (missing-docstring) 36 | C0330, ; Wrong hanging indentation before block (add 4 spaces) 37 | E0213, ; Method should have "self" as first argument (no-self-argument) - N805 for flake8 38 | R0201, ; Method could be a function (no-self-use) 39 | R0901, ; Too many ancestors (m/n) (too-many-ancestors) 40 | R0903, ; Too few public methods (m/n) (too-few-public-methods) 41 | 42 | ignored-classes= 43 | contextlib.closing, 44 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Apache License 2 | ============== 3 | 4 | Copyright (c) 2021 Pieter Robberechts 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License.
17 | 18 | This file incorporates code of the `footballdata`_ software package covered 19 | by the following copyright and permission notice: 20 | 21 | Copyright (c) 2017 skagr 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | .. _footballdata: https://github.com/skagr/footballdata 34 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest fixtures for soccerdata package.""" 2 | 3 | import pytest 4 | 5 | import soccerdata as foo 6 | 7 | 8 | @pytest.fixture 9 | def five38(): 10 | """Return a correctly initialized instance of FiveThirtyEight.""" 11 | return foo.FiveThirtyEight(seasons="20-21") 12 | 13 | 14 | @pytest.fixture 15 | def five38_laliga(): 16 | """Return a correctly initialized instance of FiveThirtyEight filtered by league: La Liga.""" 17 | return foo.FiveThirtyEight("ESP-La Liga", "20-21") 18 | 19 | 20 | @pytest.fixture 21 | def espn_seriea(): 22 | """Return a correctly initialized instance of ESPN filtered by league: Serie A.""" 23 | return foo.ESPN("ITA-Serie A", "20-21") 24 | 25 | 26 | @pytest.fixture 27 | def sofifa_bundesliga(): 28 | """Return a correctly initialized instance of SoFIFA filtered by league: Bundesliga.""" 29 | return foo.SoFIFA("GER-Bundesliga", "20-21") 30 | 31 | 32 | @pytest.fixture 33 | def fbref_ligue1(): 34 | """Return a correctly initialized instance of FBref filtered by league: Ligue 1.""" 35 | return foo.FBref("FRA-Ligue 1", "20-21") 36 | 37 | 38 | @pytest.fixture 39 | def elo(): 40 | """Return a correctly initialized ClubElo instance.""" 41 | return foo.ClubElo() 42 | 43 | 44 | @pytest.fixture 45 | def match_epl_2y(): 46 | """Return a MatchHistory instance for the last 2 years of the EPL.""" 47 | return foo.MatchHistory("ENG-Premier League", list(range(2018, 2020))) 48 | 49 | 50 | @pytest.fixture 51 | def whoscored(): 52 | """Return a correctly initialized instance of WhoScored.""" 53 | return foo.WhoScored("ENG-Premier League", "20-21") 54 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | entry: black 7 | language: system 8 | types: [python] 9 | require_serial: true 10 | - id: check-added-large-files 11 | name: Check for added large files 12 | entry: check-added-large-files 13 | language: system 14 | - id: check-toml 15 | name: Check Toml 16 | entry: check-toml 17 | language: system 18 | types: [toml] 19 | - id: check-yaml 20 | name: Check Yaml 21 | entry: check-yaml 22 | language: system 23 | types: [yaml] 24 | - id: end-of-file-fixer 25 | name: Fix End of Files 26 | entry: end-of-file-fixer 27 | language: system 28 | types: [text] 29 | stages: [commit, push, manual] 30 | - id: flake8 31 | name: flake8 32 | entry: flake8 33 | language: 
system 34 | types: [python] 35 | require_serial: true 36 | - id: pyupgrade 37 | name: pyupgrade 38 | description: Automatically upgrade syntax for newer versions. 39 | entry: pyupgrade 40 | language: system 41 | types: [python] 42 | args: [--py37-plus] 43 | - id: isort 44 | name: Reorder python imports 45 | entry: isort 46 | language: system 47 | types: [python] 48 | - id: trailing-whitespace 49 | name: Trim Trailing Whitespace 50 | entry: trailing-whitespace-fixer 51 | language: system 52 | types: [text] 53 | stages: [commit, push, manual] 54 | - repo: https://github.com/pre-commit/mirrors-prettier 55 | rev: v2.4.1 56 | hooks: 57 | - id: prettier 58 | -------------------------------------------------------------------------------- /tests/test_ClubElo.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.ClubElo.""" 2 | import json 3 | from datetime import datetime, timedelta 4 | from importlib import reload 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from soccerdata import _config as conf 10 | from soccerdata import clubelo as foo 11 | 12 | # Unittests ------------------------------------------------------------------- 13 | # Happy flow 14 | 15 | 16 | def test_by_date(elo): 17 | assert isinstance(elo.read_by_date(), pd.DataFrame) 18 | assert isinstance(elo.read_by_date('2017-04-01'), pd.DataFrame) 19 | assert isinstance(elo.read_by_date(datetime(2017, 4, 1)), pd.DataFrame) 20 | 21 | 22 | def test_club_hist_age(elo): 23 | assert isinstance(elo.read_team_history('Feyenoord'), pd.DataFrame) 24 | assert isinstance(elo.read_team_history('Feyenoord', 2), pd.DataFrame) 25 | max_age = timedelta(milliseconds=1) 26 | assert isinstance(elo.read_team_history('Feyenoord', max_age), pd.DataFrame) 27 | 28 | 29 | def test_club_hist_replacement(monkeypatch, tmp_path): 30 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 31 | # no teamname_replacements.json 32 | reload(conf) 33 | assert conf.TEAMNAME_REPLACEMENTS == {} 34 | fp = tmp_path / "config" / "teamname_replacements.json" 35 | with open(fp, 'w', encoding='utf8') as outfile: 36 | json.dump({"Manchester City": ["Man City"]}, outfile) 37 | # correctly parse teamname_replacements.json 38 | reload(conf) 39 | reload(foo) 40 | elo = foo.ClubElo() 41 | assert isinstance(elo.read_team_history('Manchester City'), pd.DataFrame) 42 | 43 | 44 | # Bad calls 45 | 46 | 47 | def test_by_date_bad_params(elo): 48 | with pytest.raises(ValueError): 49 | elo.read_by_date('2017') 50 | with pytest.raises(AttributeError): 51 | elo.read_by_date(1 / 4) 52 | 53 | 54 | def test_club_hist_bad_params(elo): 55 | with pytest.raises(TypeError): 56 | elo.read_team_history() # missing argument 57 | with pytest.raises(ValueError): 58 | elo.read_team_history('FC Knudde') # no data for team 59 | with pytest.raises(TypeError): 60 | elo.read_team_history('Feyenoord', datetime.now()) # invalid max_age type 61 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v3.1.0 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4.3.0 21 | with: 22 | python-version: "3.8" 23 | 24 | - name: Upgrade pip 25 | 
run: | 26 | pip install --constraint=.github/workflows/constraints.txt pip 27 | pip --version 28 | 29 | - name: Install Poetry 30 | run: | 31 | pip install --constraint=.github/workflows/constraints.txt poetry 32 | poetry --version 33 | 34 | - name: Check if there is a parent commit 35 | id: check-parent-commit 36 | run: | 37 | echo "::set-output name=sha::$(git rev-parse --verify --quiet HEAD^)" 38 | 39 | - name: Detect and tag new version 40 | id: check-version 41 | if: steps.check-parent-commit.outputs.sha 42 | uses: salsify/action-detect-and-tag-new-version@v2.0.1 43 | with: 44 | version-command: | 45 | bash -o pipefail -c "poetry version | awk '{ print \$2 }'" 46 | 47 | - name: Bump version for developmental release 48 | if: "! steps.check-version.outputs.tag" 49 | run: | 50 | poetry version patch && 51 | version=$(poetry version | awk '{ print $2 }') && 52 | poetry version $version.dev.$(date +%s) 53 | 54 | - name: Build package 55 | run: | 56 | poetry build --ansi 57 | 58 | - name: Publish package on PyPI 59 | if: steps.check-version.outputs.tag 60 | uses: pypa/gh-action-pypi-publish@v1.5.1 61 | with: 62 | user: __token__ 63 | password: ${{ secrets.PYPI_TOKEN }} 64 | 65 | - name: Publish package on TestPyPI 66 | if: "! steps.check-version.outputs.tag" 67 | uses: pypa/gh-action-pypi-publish@v1.5.1 68 | with: 69 | user: __token__ 70 | password: ${{ secrets.TEST_PYPI_TOKEN }} 71 | repository_url: https://test.pypi.org/legacy/ 72 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._config.""" 2 | import json 3 | import logging 4 | from importlib import reload 5 | 6 | from soccerdata import _config as conf 7 | 8 | 9 | def test_env_soccerdata_dir(monkeypatch, tmp_path): 10 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 11 | reload(conf) 12 | assert conf.BASE_DIR == tmp_path 13 | 14 | 15 | def test_env_nocache(monkeypatch): 16 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 't') 17 | reload(conf) 18 | assert conf.NOCACHE is True 19 | 20 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 'true') 21 | reload(conf) 22 | assert conf.NOCACHE is True 23 | 24 | monkeypatch.setenv('SOCCERDATA_NOCACHE', 'f') 25 | reload(conf) 26 | assert conf.NOCACHE is False 27 | 28 | 29 | def test_env_nostore(monkeypatch): 30 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 't') 31 | reload(conf) 32 | assert conf.NOSTORE is True 33 | 34 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 'true') 35 | reload(conf) 36 | assert conf.NOSTORE is True 37 | 38 | monkeypatch.setenv('SOCCERDATA_NOSTORE', 'f') 39 | reload(conf) 40 | assert conf.NOSTORE is False 41 | 42 | 43 | def test_env_loglevel(monkeypatch): 44 | monkeypatch.setenv('SOCCERDATA_LOGLEVEL', 'DEBUG') 45 | reload(conf) 46 | assert conf.logger.level == logging.DEBUG 47 | 48 | 49 | def test_read_teamnname_replacements(monkeypatch, tmp_path): 50 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 51 | # no teamname_replacements.json 52 | reload(conf) 53 | assert conf.TEAMNAME_REPLACEMENTS == {} 54 | fp = tmp_path / "config" / "teamname_replacements.json" 55 | with open(fp, 'w', encoding='utf8') as outfile: 56 | json.dump({"Celta de Vigo": ["Celta Vigo", "Celta"]}, outfile) 57 | # correctly parse teamname_replacements.json 58 | reload(conf) 59 | assert conf.TEAMNAME_REPLACEMENTS == { 60 | "Celta Vigo": "Celta de Vigo", 61 | "Celta": "Celta de Vigo", 62 | } 63 | 64 | 65 | def test_read_league_dict(monkeypatch, 
tmp_path): 66 | monkeypatch.setenv('SOCCERDATA_DIR', str(tmp_path)) 67 | # no league_dict.json 68 | reload(conf) 69 | nb_default = len(conf.LEAGUE_DICT) 70 | fp = tmp_path / "config" / "league_dict.json" 71 | with open(fp, 'w', encoding='utf8') as outfile: 72 | json.dump({"ABC-Fake": {"WhoScored": "Fake"}}, outfile) 73 | # correctly parse league_dict.json 74 | reload(conf) 75 | assert len(conf.LEAGUE_DICT) == nb_default + 1 76 | assert conf.LEAGUE_DICT['ABC-Fake'] == {'WhoScored': 'Fake'} 77 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to SoccerData 2 | ====================== 3 | 4 | 5 | SoccerData is a collection of wrappers over soccer data from `Club Elo`_, 6 | `ESPN`_, `FBref`_, `FiveThirtyEight`_, `Football-Data.co.uk`_, `SoFIFA`_ and 7 | `WhoScored`_. You get Pandas DataFrames with sensible, matching column names 8 | and identifiers across datasets. Data is downloaded when needed and cached 9 | locally. 10 | 11 | .. code:: python 12 | 13 | import soccerdata as sd 14 | 15 | # Create scraper class instance for the Premier League 16 | five38 = sd.FiveThirtyEight('ENG-Premier League', '1819') 17 | 18 | # Fetch dataframes 19 | games = five38.read_games() 20 | 21 | To learn how to install, configure and use SoccerData, see the 22 | :ref:`Quickstart guide <quickstart>`. For documentation on each of the 23 | supported data sources, see the :ref:`API reference <api>`. 24 | 25 | Other useful projects 26 | ---------------------- 27 | 28 | SoccerData is not the only tool of its kind. If SoccerData doesn’t quite fit 29 | your needs or you want to obtain data from other sources, we recommend looking 30 | at these tools: 31 | 32 | - `worldfootballR`_: an R package with scrapers for FBref, Transfermarkt and Understat. 33 | - `Tyrone Mings`_: a Python package to scrape data from Transfermarkt. 34 | - `understat`_: a Python package to scrape data from Understat. 35 | - `understatr`_: an R package to scrape data from Understat. 36 | - `ScraperFC`_: a Python package to scrape data from FBref, Understat, FiveThirtyEight and WhoScored. 37 | - `Scrape-FBref-data`_: a Python package to scrape StatsBomb data via FBref. 38 | 39 | 40 | .. toctree:: 41 | :hidden: 42 | :maxdepth: 1 43 | 44 | usage 45 | datasources/index 46 | reference/index 47 | contributing 48 | License <license> 49 | Changelog <https://github.com/probberechts/soccerdata/releases> 50 | 51 | .. _socceraction: https://socceraction.readthedocs.io/en/latest/modules/generated/socceraction.data.opta.OptaLoader.html#socceraction.data.opta.OptaLoader 52 | .. _Club Elo: https://www.clubelo.com/ 53 | .. _ESPN: https://www.espn.com/soccer/ 54 | .. _FBref: https://www.fbref.com/en/ 55 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 56 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 57 | .. _SoFIFA: https://sofifa.com/ 58 | .. _WhoScored: https://www.whoscored.com/ 59 | .. _worldfootballR: https://jaseziv.github.io/worldfootballR/index.html 60 | .. _Tyrone Mings: https://github.com/FCrSTATS/tyrone_mings 61 | .. _understat: https://github.com/amosbastian/understat 62 | .. _understatr: https://github.com/ewenme/understatr 63 | .. _ScraperFC: https://github.com/oseymour/ScraperFC 64 | .. _Scrape-FBref-data: https://github.com/parth1902/Scrape-FBref-data 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "soccerdata" 3 | version = "1.1.0" 4 | description = "A collection of wrappers over soccer data from various websites / APIs." 5 | authors = ["Pieter Robberechts"] 6 | license = "Apache-2.0" 7 | readme = 'README.rst' 8 | homepage = "https://github.com/probberechts/soccerdata" 9 | repository = "https://github.com/probberechts/soccerdata" 10 | keywords = ["soccer", "football", "soccer data", "web scraping", "soccer analytics"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Operating System :: OS Independent" 15 | ] 16 | 17 | [tool.poetry.dependencies] 18 | python = ">=3.7,<4.0.0" 19 | pandas = "^1.0" 20 | requests = "^2.23" 21 | unicode = "^2.7" 22 | lxml = "^4.6" 23 | selenium = "^4.0.0" 24 | Unidecode = "^1.2.0" 25 | rich = "^12.0.0" 26 | pretty-errors = "^1.2.25" 27 | PySocks = "^1.7.1" 28 | html5lib = "^1.1" 29 | undetected-chromedriver = "^3.1.3" 30 | 31 | [tool.poetry.dev-dependencies] 32 | pytest = "^7.0.0" 33 | mypy = "^0.982" 34 | pylint = "^2.6.0" 35 | pytest-deadfixtures = "^2.2.1" 36 | unify = "^0.5" 37 | black = "^21.12b0" 38 | Sphinx = "^4.3.2" 39 | sphinx-autobuild = "^2021.3.14" 40 | furo = "^2022.0.0" 41 | coverage = {version = "^6.2", extras = ["toml"]} 42 | pre-commit = "^2.16.0" 43 | flake8 = "^4.0.1" 44 | flake8-bugbear = "^22.0.0" 45 | flake8-docstrings = "^1.6.0" 46 | flake8-rst-docstrings = "^0.2.5" 47 | pep8-naming = "^0.13.0" 48 | darglint = "^1.8.1" 49 | pre-commit-hooks = "^4.1.0" 50 | Pygments = "^2.10.0" 51 | time-machine = "^2.5.0" 52 | pytest-mock = "^3.6.1" 53 | bumpversion = "^0.6.0" 54 | nbsphinx = "^0.8.8" 55 | 56 | [tool.isort] 57 | profile = "black" 58 | src_paths = ["soccerdata", "tests"] 59 | balanced_wrapping = true 60 | default_section = "THIRDPARTY" 61 | include_trailing_comma = true 62 | known_first_party = ["soccerdata", "tests"] 63 | line_length = 79 64 | multi_line_output = 3 65 | 66 | [tool.black] 67 | line-length = 99 68 | target-version = ['py38'] 69 | skip-string-normalization = 1 70 | include = '\.pyi?$' 71 | 72 | [tool.coverage.paths] 73 | source = ["soccerdata", "*/site-packages"] 74 | 75 | [tool.coverage.run] 76 | branch = true 77 | source = ["soccerdata"] 78 | 79 | [tool.coverage.report] 80 | show_missing = true 81 | ignore_errors = true 82 | 83 | [tool.mypy] 84 | ignore_missing_imports = true 85 | disallow_untyped_defs = true 86 | disallow_incomplete_defs = true 87 | no_implicit_optional = true 88 | check_untyped_defs = true 89 | show_error_codes = true 90 | warn_unused_ignores = true 91 | 92 | [[tool.mypy.overrides]] 93 | module = ["tests.*"] 94 | disallow_untyped_defs = false 95 | 96 | [build-system] 97 | requires = ["poetry>=0.12"] 98 | build-backend = "poetry.masonry.api" 99 | -------------------------------------------------------------------------------- /tests/test_FiveThirtyEight.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FiveThirtyEight.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as foo 7 | 8 | # Unittests ------------------------------------------------------------------- 9 | # Happy flow 10 | 11 | 12 | def test_five38_league_ids(five38_laliga): 13 | assert
isinstance(five38_laliga._selected_leagues, dict) 14 | 15 | 16 | def test_five38_leagues(five38_laliga): 17 | assert isinstance(five38_laliga.read_leagues(), pd.DataFrame) 18 | 19 | 20 | def test_five38_games(five38_laliga): 21 | assert isinstance(five38_laliga.read_games(), pd.DataFrame) 22 | 23 | 24 | def test_five38_forecasts(five38_laliga): 25 | assert isinstance(five38_laliga.read_forecasts(), pd.DataFrame) 26 | 27 | 28 | def test_five38_clinches(five38_laliga): 29 | assert isinstance(five38_laliga.read_clinches(), pd.DataFrame) 30 | 31 | 32 | def test_five38_league_ids_ll(five38_laliga): 33 | assert isinstance(five38_laliga._selected_leagues, dict) 34 | 35 | 36 | def test_five38_leagues_ll(five38_laliga): 37 | assert isinstance(five38_laliga.read_leagues(), pd.DataFrame) 38 | 39 | 40 | def test_five38_games_ll(five38_laliga): 41 | assert isinstance(five38_laliga.read_games(), pd.DataFrame) 42 | 43 | 44 | def test_five38_forecasts_ll(five38_laliga): 45 | assert isinstance(five38_laliga.read_forecasts(), pd.DataFrame) 46 | 47 | 48 | def test_five38_clinches_ll(five38_laliga): 49 | assert isinstance(five38_laliga.read_clinches(), pd.DataFrame) 50 | 51 | 52 | def test_five38_laliga(five38_laliga): 53 | df = five38_laliga.read_leagues() 54 | assert len(df) == 1 55 | assert df.loc['ESP-La Liga', 'long_name'] == 'La Liga' 56 | 57 | 58 | def test_league_counts(five38): 59 | assert len(five38._selected_leagues) == len(five38.read_leagues()) 60 | assert len(five38._selected_leagues) == len( 61 | five38.read_games().reset_index()['league'].unique() 62 | ) 63 | assert len(five38._selected_leagues) == len( 64 | five38.read_forecasts().reset_index()['league'].unique() 65 | ) 66 | 67 | 68 | def test_league_matches_games(five38): 69 | assert set(five38.read_games().reset_index().league) == set( 70 | five38.read_leagues().reset_index().league 71 | ) 72 | 73 | 74 | def test_league_matches_forecasts(five38): 75 | assert set(five38.read_forecasts().reset_index().league) == set( 76 | five38.read_leagues().reset_index().league 77 | ) 78 | 79 | 80 | def test_league_matches_clinches(five38): 81 | assert set(five38.read_clinches().reset_index().league) == set( 82 | five38.read_leagues().reset_index().league 83 | ) 84 | 85 | 86 | # Bad inits 87 | 88 | 89 | def test_five38_league_value_error(): 90 | with pytest.raises(ValueError): 91 | foo.FiveThirtyEight('xxx') 92 | 93 | 94 | def test_five38_league_type_error(): 95 | with pytest.raises(TypeError): 96 | foo.FiveThirtyEight(1) # type: ignore 97 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/probberechts/soccerdata/master/docs/_static/logo2.png 2 | :align: center 3 | :alt: SoccerData 4 | :width: 600px 5 | 6 | .. badges-begin 7 | 8 | |PyPI| |Python Version| |License| |Read the Docs| |Tests| |Codecov| |pre-commit| |Black| 9 | 10 | .. |PyPI| image:: https://img.shields.io/pypi/v/soccerdata.svg 11 | :target: https://pypi.org/project/soccerdata/ 12 | :alt: PyPI 13 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/soccerdata 14 | :target: https://pypi.org/project/soccerdata 15 | :alt: Python Version 16 | .. |License| image:: https://img.shields.io/pypi/l/soccerdata.svg 17 | :target: https://opensource.org/licenses/Apache-2.0 18 | :alt: License 19 | .. 
|Read the Docs| image:: https://img.shields.io/readthedocs/soccerdata/latest.svg?label=Read%20the%20Docs 20 | :target: https://soccerdata.readthedocs.io/ 21 | :alt: Read the documentation at https://soccerdata.readthedocs.io/ 22 | .. |Tests| image:: https://github.com/probberechts/soccerdata/workflows/CI/badge.svg 23 | :target: https://github.com/probberechts/soccerdata/actions?workflow=CI 24 | :alt: Tests 25 | .. |Codecov| image:: https://codecov.io/gh/probberechts/soccerdata/branch/master/graph/badge.svg 26 | :target: https://app.codecov.io/gh/probberechts/soccerdata 27 | :alt: Codecov 28 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 29 | :target: https://github.com/pre-commit/pre-commit 30 | :alt: pre-commit 31 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 32 | :target: https://github.com/psf/black 33 | :alt: Black 34 | 35 | .. badges-end 36 | 37 | SoccerData is a collection of wrappers over soccer data from `Club Elo`_, 38 | `ESPN`_, `FBref`_, `FiveThirtyEight`_, `Football-Data.co.uk`_, `SoFIFA`_ and 39 | `WhoScored`_. You get Pandas DataFrames with sensible, matching column names 40 | and identifiers across datasets. Data is downloaded when needed and cached 41 | locally. 42 | 43 | .. code:: python 44 | 45 | import soccerdata as sd 46 | 47 | # Create scraper class instance for the Premier League 48 | five38 = sd.FiveThirtyEight('ENG-Premier League', '1819') 49 | 50 | # Fetch dataframes 51 | games = five38.read_games() 52 | 53 | To learn how to install, configure and use SoccerData, see the 54 | `Quickstart guide `__. For documentation on each of the 55 | supported data sources, see the `example notebooks `__ and `API reference `__. 56 | 57 | .. _Club Elo: https://www.clubelo.com/ 58 | .. _ESPN: https://www.espn.com/soccer/ 59 | .. _FBref: https://www.fbref.com/en/ 60 | .. _FiveThirtyEight: https://fivethirtyeight.com/soccer-predictions/ 61 | .. _Football-Data.co.uk: https://www.football-data.co.uk/ 62 | .. _SoFIFA: https://sofifa.com/ 63 | .. _WhoScored: https://www.whoscored.com/ 64 | 65 | **Disclaimer:** As soccerdata relies on web scraping, any changes to the 66 | scraped websites will break the package. Hence, do not expect that all code 67 | will work all the time. If you spot any bugs, then please `fork it and start 68 | a pull request `__. 
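The other scrapers follow the same read pattern. A minimal sketch mirroring the bundled
test suite (the exact read methods each source supports are listed in the API reference):

.. code:: python

    import soccerdata as sd

    # Match results and betting odds from Football-Data.co.uk
    hist = sd.MatchHistory('ENG-Premier League', seasons='18-19')
    games = hist.read_games()

    # Full Elo rating history of a single club from Club Elo
    elo = sd.ClubElo()
    feyenoord = elo.read_team_history('Feyenoord')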
69 | -------------------------------------------------------------------------------- /tests/test_FBref.py: -------------------------------------------------------------------------------- 1 | """Unittests for class soccerdata.FBref.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import soccerdata as sd 7 | from soccerdata.fbref import _concat 8 | 9 | # Unittests ------------------------------------------------------------------- 10 | # Happy flow 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "stat_type", 15 | [ 16 | "standard", 17 | "keeper", 18 | "keeper_adv", 19 | "shooting", 20 | "passing", 21 | "passing_types", 22 | "goal_shot_creation", 23 | "defense", 24 | "possession", 25 | "playing_time", 26 | "misc", 27 | ], 28 | ) 29 | def test_read_team_season_stats(fbref_ligue1, stat_type): 30 | assert isinstance(fbref_ligue1.read_team_season_stats(stat_type), pd.DataFrame) 31 | 32 | 33 | @pytest.mark.parametrize( 34 | "stat_type", 35 | [ 36 | "standard", 37 | "shooting", 38 | "passing", 39 | "passing_types", 40 | "goal_shot_creation", 41 | "defense", 42 | "possession", 43 | "playing_time", 44 | "misc", 45 | "keeper", 46 | "keeper_adv", 47 | ], 48 | ) 49 | def test_read_player_season_stats(fbref_ligue1, stat_type): 50 | assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) 51 | 52 | 53 | def test_read_schedule(fbref_ligue1): 54 | assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "stat_type", 59 | [ 60 | "summary", 61 | "keepers", 62 | "passing", 63 | "passing_types", 64 | "defense", 65 | "possession", 66 | "misc", 67 | ], 68 | ) 69 | def test_read_player_match_stats(fbref_ligue1, stat_type): 70 | assert isinstance( 71 | fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame 72 | ) 73 | 74 | 75 | def test_read_shot_events(fbref_ligue1): 76 | assert isinstance(fbref_ligue1.read_shot_events(match_id="796787da"), pd.DataFrame) 77 | 78 | 79 | def test_read_lineup(fbref_ligue1): 80 | assert isinstance(fbref_ligue1.read_lineup(match_id="796787da"), pd.DataFrame) 81 | 82 | 83 | def test_combine_big5(): 84 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 85 | assert len(fbref_bigfive.read_leagues()) == 1 86 | assert len(fbref_bigfive.read_seasons()) == 1 87 | 88 | 89 | @pytest.mark.parametrize( 90 | "stat_type", 91 | [ 92 | "standard", 93 | "keeper", 94 | "keeper_adv", 95 | "shooting", 96 | "passing", 97 | "passing_types", 98 | "goal_shot_creation", 99 | "defense", 100 | "possession", 101 | "playing_time", 102 | "misc", 103 | ], 104 | ) 105 | def test_combine_big5_team_season_stats(fbref_ligue1, stat_type): 106 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 107 | ligue1 = fbref_ligue1.read_team_season_stats(stat_type).loc["FRA-Ligue 1"] 108 | bigfive = fbref_bigfive.read_team_season_stats(stat_type).loc["FRA-Ligue 1"] 109 | cols = _concat([ligue1, bigfive]).columns 110 | ligue1.columns = cols 111 | bigfive.columns = cols 112 | pd.testing.assert_frame_equal( 113 | ligue1, 114 | bigfive, 115 | ) 116 | 117 | 118 | @pytest.mark.parametrize( 119 | "stat_type", 120 | [ 121 | "standard", 122 | "shooting", 123 | "passing", 124 | "passing_types", 125 | "goal_shot_creation", 126 | "defense", 127 | "possession", 128 | "playing_time", 129 | "misc", 130 | "keeper", 131 | "keeper_adv", 132 | ], 133 | ) 134 | def test_combine_big5_player_season_stats(fbref_ligue1, stat_type): 135 | fbref_bigfive = sd.FBref(["Big 5 European Leagues Combined"], 2021) 136 
| ligue1 = fbref_ligue1.read_player_season_stats(stat_type).loc["FRA-Ligue 1"] 137 | bigfive = fbref_bigfive.read_player_season_stats(stat_type).loc["FRA-Ligue 1"] 138 | cols = _concat([ligue1, bigfive]).columns 139 | ligue1.columns = cols 140 | bigfive.columns = cols 141 | pd.testing.assert_frame_equal( 142 | ligue1, 143 | bigfive, 144 | ) 145 | -------------------------------------------------------------------------------- /soccerdata/match_history.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://www.football-data.co.uk/data.php.""" 2 | import itertools 3 | from pathlib import Path 4 | from typing import Callable, Dict, List, Optional, Union 5 | 6 | import pandas as pd 7 | 8 | from ._common import BaseRequestsReader, make_game_id 9 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 10 | 11 | MATCH_HISTORY_DATA_DIR = DATA_DIR / 'MatchHistory' 12 | MATCH_HISTORY_API = 'https://www.football-data.co.uk' 13 | 14 | 15 | class MatchHistory(BaseRequestsReader): 16 | """Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php. 17 | 18 | Data will be downloaded as necessary and cached locally in 19 | ``~/soccerdata/data/MatchHistory``. 20 | 21 | Parameters 22 | ---------- 23 | leagues : string or iterable 24 | IDs of leagues to include. 25 | seasons : string, int or list 26 | Seasons to include. Supports multiple formats. 27 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 28 | proxy : 'tor' or dict or list(dict) or callable, optional 29 | Use a proxy to hide your IP address. Valid options are: 30 | - "tor": Uses the Tor network. Tor should be running in 31 | the background on port 9050. 32 | - dict: A dictionary with the proxy to use. The dict should be 33 | a mapping of supported protocols to proxy addresses. For example:: 34 | 35 | { 36 | 'http': 'http://10.10.1.10:3128', 37 | 'https': 'http://10.10.1.10:1080', 38 | } 39 | 40 | - list(dict): A list of proxies to choose from. A different proxy will 41 | be selected from this list after failed requests, allowing rotating 42 | proxies. 43 | - callable: A function that returns a valid proxy. This function will 44 | be called after failed requests, allowing rotating proxies. 45 | no_cache : bool 46 | If True, will not use cached data. 47 | no_store : bool 48 | If True, will not store downloaded data. 49 | data_dir : Path, optional 50 | Path to directory where data will be cached. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | leagues: Optional[Union[str, List[str]]] = None, 56 | seasons: Optional[Union[str, int, List]] = None, 57 | proxy: Optional[ 58 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 59 | ] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = MATCH_HISTORY_DATA_DIR, 63 | ): 64 | super().__init__( 65 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 66 | ) 67 | self.seasons = seasons # type: ignore 68 | 69 | def read_games(self) -> pd.DataFrame: 70 | """Retrieve game history for the selected leagues and seasons. 
71 | 72 | Column names are explained here: http://www.football-data.co.uk/notes.txt 73 | 74 | Returns 75 | ------- 76 | pd.DataFrame 77 | """ 78 | urlmask = MATCH_HISTORY_API + '/mmz4281/{}/{}.csv' 79 | filemask = '{}_{}.csv' 80 | col_rename = { 81 | 'Div': 'league', 82 | 'Date': 'date', 83 | 'Time': 'time', 84 | 'HomeTeam': 'home_team', 85 | 'AwayTeam': 'away_team', 86 | 'Referee': 'referee', 87 | } 88 | 89 | df_list = [] 90 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 91 | filepath = self.data_dir / filemask.format(lkey, skey) 92 | url = urlmask.format(skey, lkey) 93 | current_season = not self._is_complete(lkey, skey) 94 | reader = self.get(url, filepath, no_cache=current_season) 95 | 96 | df_list.append( 97 | pd.read_csv( 98 | reader, 99 | encoding='ISO-8859-1', 100 | ).assign(season=skey) 101 | ) 102 | 103 | df = ( 104 | pd.concat(df_list, sort=False) 105 | .rename(columns=col_rename) 106 | .assign(date=lambda x: pd.to_datetime(x["date"] + ' ' + x['time'])) 107 | .drop("time", axis=1) 108 | .pipe(self._translate_league) 109 | .replace( 110 | { 111 | 'home_team': TEAMNAME_REPLACEMENTS, 112 | 'away_team': TEAMNAME_REPLACEMENTS, 113 | } 114 | ) 115 | .dropna(subset=['home_team', 'away_team']) 116 | ) 117 | 118 | df['game'] = df.apply(make_game_id, axis=1) 119 | df.set_index(['league', 'season', 'game'], inplace=True) 120 | df.sort_index(inplace=True) 121 | return df 122 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | tests: 9 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - { python: "3.9", os: "ubuntu-latest", session: "pre-commit" } 16 | - { python: "3.9", os: "ubuntu-latest", session: "mypy" } 17 | - { python: "3.9", os: "ubuntu-latest", session: "tests" } 18 | - { python: "3.8", os: "ubuntu-latest", session: "tests" } 19 | - { python: "3.7", os: "ubuntu-latest", session: "tests" } 20 | - { python: "3.9", os: "windows-latest", session: "tests" } 21 | - { python: "3.9", os: "macos-latest", session: "tests" } 22 | - { python: "3.9", os: "ubuntu-latest", session: "docs-build" } 23 | 24 | env: 25 | NOXSESSION: ${{ matrix.session }} 26 | FORCE_COLOR: "1" 27 | PRE_COMMIT_COLOR: "always" 28 | 29 | steps: 30 | - name: Check out the repository 31 | uses: actions/checkout@v3.1.0 32 | 33 | - name: Set up Python ${{ matrix.python }} 34 | uses: actions/setup-python@v4.3.0 35 | with: 36 | python-version: ${{ matrix.python }} 37 | 38 | - name: Upgrade pip 39 | run: | 40 | pip install --constraint=.github/workflows/constraints.txt pip 41 | pip --version 42 | 43 | - name: Upgrade pip in virtual environments 44 | shell: python 45 | run: | 46 | import os 47 | import pip 48 | 49 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 50 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 51 | 52 | - name: Install Poetry 53 | run: | 54 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry 55 | poetry --version 56 | 57 | - name: Install Nox 58 | run: | 59 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox 60 | pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry 61 | nox --version 62 | 63 | - name: Compute pre-commit cache key 64 | if: matrix.session 
== 'pre-commit' 65 | id: pre-commit-cache 66 | shell: python 67 | run: | 68 | import hashlib 69 | import sys 70 | 71 | python = "py{}.{}".format(*sys.version_info[:2]) 72 | payload = sys.version.encode() + sys.executable.encode() 73 | digest = hashlib.sha256(payload).hexdigest() 74 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 75 | 76 | print("::set-output name=result::{}".format(result)) 77 | 78 | - name: Restore pre-commit cache 79 | uses: actions/cache@v3.0.10 80 | if: matrix.session == 'pre-commit' 81 | with: 82 | path: ~/.cache/pre-commit 83 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 84 | restore-keys: | 85 | ${{ steps.pre-commit-cache.outputs.result }}- 86 | 87 | - name: Install pandoc 88 | if: matrix.session == 'docs-build' 89 | run: sudo apt-get install -y pandoc 90 | 91 | - name: Run Nox 92 | run: | 93 | nox --force-color --python=${{ matrix.python }} 94 | 95 | - name: Upload coverage data 96 | if: always() && matrix.session == 'tests' 97 | uses: actions/upload-artifact@v3.1.0 98 | with: 99 | name: coverage-data 100 | path: ".coverage.*" 101 | 102 | - name: Upload documentation 103 | if: matrix.session == 'docs-build' 104 | uses: actions/upload-artifact@v3.1.0 105 | with: 106 | name: docs 107 | path: docs/_build 108 | 109 | coverage: 110 | runs-on: ubuntu-latest 111 | needs: tests 112 | steps: 113 | - name: Check out the repository 114 | uses: actions/checkout@v3.1.0 115 | 116 | - name: Set up Python 117 | uses: actions/setup-python@v4.3.0 118 | with: 119 | python-version: "3.9" 120 | 121 | - name: Upgrade pip 122 | run: | 123 | pip install --constraint=.github/workflows/constraints.txt pip 124 | pip --version 125 | 126 | - name: Install Poetry 127 | run: | 128 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry 129 | poetry --version 130 | 131 | - name: Install Nox 132 | run: | 133 | pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox 134 | pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry 135 | nox --version 136 | 137 | - name: Download coverage data 138 | uses: actions/download-artifact@v3.0.0 139 | with: 140 | name: coverage-data 141 | 142 | - name: Combine coverage data and display human readable report 143 | run: | 144 | nox --force-color --session=coverage 145 | 146 | - name: Create coverage report 147 | run: | 148 | nox --force-color --session=coverage -- xml 149 | 150 | - name: Upload coverage report 151 | uses: codecov/codecov-action@v3.1.1 152 | -------------------------------------------------------------------------------- /soccerdata/_config.py: -------------------------------------------------------------------------------- 1 | """Configurations.""" 2 | 3 | import json 4 | import logging 5 | import logging.config 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | import pretty_errors # NOQA: F401 (imported but unused) 11 | from rich.logging import RichHandler 12 | 13 | # Configuration 14 | NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", 'False').lower() in ('true', '1', 't') 15 | NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", 'False').lower() in ('true', '1', 't') 16 | LOGLEVEL = os.environ.get('SOCCERDATA_LOGLEVEL', 'INFO').upper() 17 | 18 | # Directories 19 | BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata")) 20 | LOGS_DIR = Path(BASE_DIR, "logs") 21 | DATA_DIR = Path(BASE_DIR, "data") 22 | CONFIG_DIR = Path(BASE_DIR, "config") 23 | 24 | # Create dirs 25 | 
LOGS_DIR.mkdir(parents=True, exist_ok=True) 26 | DATA_DIR.mkdir(parents=True, exist_ok=True) 27 | CONFIG_DIR.mkdir(parents=True, exist_ok=True) 28 | 29 | # Logger 30 | logging_config = { 31 | "version": 1, 32 | "disable_existing_loggers": False, 33 | "formatters": { 34 | "minimal": {"format": "%(message)s"}, 35 | "detailed": { 36 | "format": "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n" # noqa: E501 37 | }, 38 | }, 39 | "handlers": { 40 | "console": { 41 | "class": "logging.StreamHandler", 42 | "stream": sys.stdout, 43 | "formatter": "minimal", 44 | "level": logging.DEBUG, 45 | }, 46 | "info": { 47 | "class": "logging.handlers.RotatingFileHandler", 48 | "filename": Path(LOGS_DIR, "info.log"), 49 | "maxBytes": 10485760, # 10 MB 50 | "backupCount": 10, 51 | "formatter": "detailed", 52 | "level": logging.INFO, 53 | }, 54 | "error": { 55 | "class": "logging.handlers.RotatingFileHandler", 56 | "filename": Path(LOGS_DIR, "error.log"), 57 | "maxBytes": 10485760, # 10 MB 58 | "backupCount": 10, 59 | "formatter": "detailed", 60 | "level": logging.ERROR, 61 | }, 62 | }, 63 | "loggers": { 64 | "root": { 65 | "handlers": ["console", "info", "error"], 66 | "level": LOGLEVEL, 67 | "propagate": True, 68 | }, 69 | }, 70 | } 71 | logging.config.dictConfig(logging_config) 72 | logger = logging.getLogger("root") 73 | logger.handlers[0] = RichHandler(markup=True) 74 | 75 | # Team name replacements 76 | TEAMNAME_REPLACEMENTS = {} 77 | _f_custom_teamname_replacements = CONFIG_DIR / "teamname_replacements.json" 78 | if _f_custom_teamname_replacements.is_file(): 79 | with open(_f_custom_teamname_replacements, encoding='utf8') as json_file: 80 | for team, to_replace_list in json.load(json_file).items(): 81 | for to_replace in to_replace_list: 82 | TEAMNAME_REPLACEMENTS[to_replace] = team 83 | logger.info("Custom team name replacements loaded from %s.", _f_custom_teamname_replacements) 84 | else: 85 | logger.info( 86 | "No custom team name replacements found. You can configure these in %s.", 87 | _f_custom_teamname_replacements, 88 | ) 89 | 90 | 91 | # League dict 92 | LEAGUE_DICT = { 93 | "ENG-Premier League": { 94 | "ClubElo": "ENG_1", 95 | "MatchHistory": "E0", 96 | "FiveThirtyEight": "premier-league", 97 | "FBref": "Premier League", 98 | "ESPN": "eng.1", 99 | "SoFIFA": "English Premier League (1)", 100 | "WhoScored": "England - Premier League", 101 | "season_start": "Aug", 102 | "season_end": "May", 103 | }, 104 | "ESP-La Liga": { 105 | "ClubElo": "ESP_1", 106 | "MatchHistory": "SP1", 107 | "FiveThirtyEight": "la-liga", 108 | "FBref": "La Liga", 109 | "ESPN": "esp.1", 110 | "SoFIFA": "Spain Primera Division (1)", 111 | "WhoScored": "Spain - LaLiga", 112 | "season_start": "Aug", 113 | "season_end": "May", 114 | }, 115 | "ITA-Serie A": { 116 | "ClubElo": "ITA_1", 117 | "MatchHistory": "I1", 118 | "FiveThirtyEight": "serie-a", 119 | "FBref": "Serie A", 120 | "ESPN": "ita.1", 121 | "SoFIFA": " Italian Serie A (1)", 122 | "WhoScored": "Italy - Serie A", 123 | "season_start": "Aug", 124 | "season_end": "May", 125 | }, 126 | "GER-Bundesliga": { 127 | "ClubElo": "GER_1", 128 | "MatchHistory": "D1", 129 | "FiveThirtyEight": "bundesliga", 130 | "FBref": "Fußball-Bundesliga", 131 | "ESPN": "ger.1", 132 | "SoFIFA": "German 1.
Bundesliga (1)", 133 | "WhoScored": "Germany - Bundesliga", 134 | "season_start": "Aug", 135 | "season_end": "May", 136 | }, 137 | "FRA-Ligue 1": { 138 | "ClubElo": "FRA_1", 139 | "MatchHistory": "F1", 140 | "FiveThirtyEight": "ligue-1", 141 | "FBref": "Ligue 1", 142 | "ESPN": "fra.1", 143 | "SoFIFA": "French Ligue 1 (1)", 144 | "WhoScored": "France - Ligue 1", 145 | "season_start": "Aug", 146 | "season_end": "May", 147 | }, 148 | } 149 | _f_custom_league_dict = CONFIG_DIR / "league_dict.json" 150 | if _f_custom_league_dict.is_file(): 151 | with open(_f_custom_league_dict, encoding='utf8') as json_file: 152 | LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)} 153 | logger.info("Custom league dict loaded from %s.", _f_custom_league_dict) 154 | else: 155 | logger.info( 156 | "No custom league dict found. You can configure additional leagues in %s.", 157 | _f_custom_league_dict, 158 | ) 159 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | import os 3 | import shlex 4 | import shutil 5 | import sys 6 | from pathlib import Path 7 | from textwrap import dedent 8 | 9 | import nox 10 | 11 | try: 12 | from nox_poetry import Session, session 13 | except ImportError: 14 | message = f"""\ 15 | Nox failed to import the 'nox-poetry' package. 16 | 17 | Please install it using the following command: 18 | 19 | {sys.executable} -m pip install nox-poetry""" 20 | raise SystemExit(dedent(message)) from None 21 | 22 | 23 | package = "soccerdata" 24 | python_versions = ["3.9", "3.8", "3.7"] 25 | nox.needs_version = ">= 2021.6.6" 26 | nox.options.sessions = ( 27 | "pre-commit", 28 | "mypy", 29 | "tests", 30 | "docs-build", 31 | ) 32 | 33 | 34 | def activate_virtualenv_in_precommit_hooks(session: Session) -> None: 35 | """Activate virtualenv in hooks installed by pre-commit. 36 | 37 | This function patches git hooks installed by pre-commit to activate the 38 | session's virtual environment. This allows pre-commit to locate hooks in 39 | that environment when invoked from git. 40 | 41 | Parameters 42 | ---------- 43 | session : Session 44 | The Session object. 45 | """ 46 | assert session.bin is not None # noqa: S101 47 | 48 | # Only patch hooks containing a reference to this session's bindir. Support 49 | # quoting rules for Python and bash, but strip the outermost quotes so we 50 | # can detect paths within the bindir, like /python. 
51 | bindirs = [ 52 | bindir[1:-1] if bindir[0] in "'\"" else bindir 53 | for bindir in (repr(session.bin), shlex.quote(session.bin)) 54 | ] 55 | 56 | virtualenv = session.env.get("VIRTUAL_ENV") 57 | if virtualenv is None: 58 | return 59 | 60 | headers = { 61 | # pre-commit < 2.16.0 62 | "python": f"""\ 63 | import os 64 | os.environ["VIRTUAL_ENV"] = {virtualenv!r} 65 | os.environ["PATH"] = os.pathsep.join(( 66 | {session.bin!r}, 67 | os.environ.get("PATH", ""), 68 | )) 69 | """, 70 | # pre-commit >= 2.16.0 71 | "bash": f"""\ 72 | VIRTUAL_ENV={shlex.quote(virtualenv)} 73 | PATH={shlex.quote(session.bin)}"{os.pathsep}$PATH" 74 | """, 75 | } 76 | 77 | hookdir = Path(".git") / "hooks" 78 | if not hookdir.is_dir(): 79 | return 80 | 81 | for hook in hookdir.iterdir(): 82 | if hook.name.endswith(".sample") or not hook.is_file(): 83 | continue 84 | 85 | if not hook.read_bytes().startswith(b"#!"): 86 | continue 87 | 88 | text = hook.read_text() 89 | 90 | if not any( 91 | Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text 92 | for bindir in bindirs 93 | ): 94 | continue 95 | 96 | lines = text.splitlines() 97 | 98 | for executable, header in headers.items(): 99 | if executable in lines[0].lower(): 100 | lines.insert(1, dedent(header)) 101 | hook.write_text("\n".join(lines)) 102 | break 103 | 104 | 105 | @session(name="pre-commit", python=python_versions[0]) 106 | def precommit(session: Session) -> None: 107 | """Lint using pre-commit.""" 108 | args = session.posargs or ["run", "--all-files", "--show-diff-on-failure"] 109 | session.install( 110 | "black", 111 | "darglint", 112 | "flake8", 113 | "flake8-bugbear", 114 | "flake8-docstrings", 115 | "flake8-rst-docstrings", 116 | "pep8-naming", 117 | "pre-commit", 118 | "pre-commit-hooks", 119 | "pyupgrade", 120 | "isort", 121 | ) 122 | session.run("pre-commit", *args) 123 | if args and args[0] == "install": 124 | activate_virtualenv_in_precommit_hooks(session) 125 | 126 | 127 | @session(python=python_versions) 128 | def mypy(session: Session) -> None: 129 | """Type-check using mypy.""" 130 | args = session.posargs or ["soccerdata", "tests", "docs/conf.py"] 131 | session.install(".") 132 | session.install("mypy", "pytest") 133 | session.run("mypy", "--install-types", "--non-interactive", *args) 134 | if not session.posargs: 135 | session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py") 136 | 137 | 138 | @session(python=python_versions) 139 | def tests(session: Session) -> None: 140 | """Run the test suite.""" 141 | args = session.posargs or ["-m", "not e2e and not fails_gha"] 142 | session.install(".") 143 | session.install("coverage[toml]", "pytest", "pytest-mock", "time-machine", "pygments") 144 | try: 145 | session.run("coverage", "run", "--parallel", "-m", "pytest", *args) 146 | finally: 147 | if session.interactive: 148 | session.notify("coverage", posargs=[]) 149 | 150 | 151 | @session(python=python_versions[0]) 152 | def coverage(session: Session) -> None: 153 | """Produce the coverage report.""" 154 | args = session.posargs or ["report"] 155 | 156 | session.install("coverage[toml]") 157 | 158 | if not session.posargs and any(Path().glob(".coverage.*")): 159 | session.run("coverage", "combine") 160 | 161 | session.run("coverage", *args) 162 | 163 | 164 | @session(name="docs-build", python=python_versions[0]) 165 | def docs_build(session: Session) -> None: 166 | """Build the documentation.""" 167 | args = session.posargs or ["docs", "docs/_build"] 168 | if not session.posargs and "FORCE_COLOR" in 
os.environ: 169 | args.insert(0, "--color") 170 | 171 | session.install(".") 172 | session.install("sphinx", "sphinx-click", "furo", "nbsphinx", "ipython") 173 | 174 | build_dir = Path("docs", "_build") 175 | if build_dir.exists(): 176 | shutil.rmtree(build_dir) 177 | 178 | session.run("sphinx-build", *args, env={'SOCCERDATA_DIR': '~/soccerdata'}) 179 | 180 | 181 | @session(python=python_versions[0]) 182 | def docs(session: Session) -> None: 183 | """Build and serve the documentation with live reloading on file changes.""" 184 | args = session.posargs or ["--host=0.0.0.0", "docs", "docs/_build"] 185 | session.install(".") 186 | session.install("sphinx", "sphinx-autobuild", "furo", "nbsphinx", "ipython") 187 | 188 | build_dir = Path("docs", "_build") 189 | if build_dir.exists(): 190 | shutil.rmtree(build_dir) 191 | 192 | session.run("sphinx-autobuild", *args, env={'SOCCERDATA_DIR': '~/soccerdata'}) 193 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Contributor Guide 3 | ================= 4 | 5 | This document lays out guidelines and advice for contributing to this project. 6 | If you're thinking of contributing, please start by reading this document and 7 | getting a feel for how contributing to this project works. If you have any 8 | questions, feel free to reach out to `Pieter Robberechts`_, the primary maintainer. 9 | 10 | .. _Pieter Robberechts: https://people.cs.kuleuven.be/~pieter.robberechts/ 11 | 12 | The guide is split into sections based on the type of contribution you're 13 | thinking of making. 14 | 15 | 16 | .. _bug-reports: 17 | 18 | Bug Reports 19 | ----------- 20 | 21 | Bug reports are hugely important! Before you raise one, though, please check 22 | through the `GitHub issues`_, **both open and closed**, to confirm that the bug 23 | hasn't been reported before. 24 | 25 | When filing an issue, make sure to answer these questions: 26 | 27 | - Which Python version are you using? 28 | - Which version of soccerdata are you using? 29 | - What did you do? 30 | - What did you expect to see? 31 | - What did you see instead? 32 | 33 | The best way to get your bug fixed is to provide a test case, 34 | and/or steps to reproduce the issue. 35 | 36 | .. _GitHub issues: https://github.com/probberechts/soccerdata/issues 37 | 38 | 39 | Feature Requests 40 | ---------------- 41 | 42 | If you believe there is a feature missing, feel free to raise a feature 43 | request on the `Issue Tracker`_. 44 | 45 | .. _Issue tracker: https://github.com/probberechts/soccerdata/issues 46 | 47 | 48 | Documentation Contributions 49 | --------------------------- 50 | 51 | Documentation improvements are always welcome! The documentation files live in 52 | the ``docs/`` directory of the codebase. They're written in 53 | `reStructuredText`_, and use `Sphinx`_ to generate the full suite of 54 | documentation. 55 | 56 | You do not have to set up a development environment to make small changes to 57 | the docs. Instead, you can `edit files directly on GitHub`_ and suggest changes. 58 | 59 | When contributing documentation, please do your best to follow the style of the 60 | documentation files. This means a soft-limit of 79 characters wide in your text 61 | files and a semi-formal, yet friendly and approachable, prose style. 62 | 63 | When presenting Python code, use single-quoted strings (``'hello'`` instead of 64 | ``"hello"``). 65 | 66 | ..
_reStructuredText: http://docutils.sourceforge.net/rst.html 67 | .. _Sphinx: http://sphinx-doc.org/index.html 68 | .. _edit files directly on GitHub: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files 69 | 70 | 71 | Code Contributions 72 | ------------------ 73 | 74 | If you intend to contribute code, do not feel the need to sit on your 75 | contribution until it is perfectly polished and complete. It helps everyone 76 | involved for you to seek feedback as early as you possibly can. Submitting an 77 | early, unfinished version of your contribution for feedback can save you from 78 | putting a lot of work into a contribution that is not suitable for the 79 | project. 80 | 81 | Setting up your development environment 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | 84 | You need Python 3.7.1+ and the following tools: 85 | 86 | - Poetry_ 87 | - Nox_ 88 | - nox-poetry_ 89 | 90 | Install the package with development requirements: 91 | 92 | .. code:: console 93 | 94 | $ poetry install 95 | 96 | You can now run an interactive Python session. 97 | 98 | .. code:: console 99 | 100 | $ poetry run python 101 | 102 | .. _Poetry: https://python-poetry.org/ 103 | .. _Nox: https://nox.thea.codes/ 104 | .. _nox-poetry: https://nox-poetry.readthedocs.io/ 105 | 106 | Steps for submitting code 107 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | When contributing code, you'll want to follow this checklist: 110 | 111 | 1. Fork the repository on GitHub. 112 | 2. Run the tests to confirm they all pass on your system. If they don't, you'll 113 | need to investigate why they fail. If you're unable to diagnose this 114 | yourself, raise it as a bug report. 115 | 3. Write tests that demonstrate your bug or feature. Ensure that they fail. 116 | 4. Make your change. 117 | 5. Run the entire test suite again, confirming that all tests pass *including 118 | the ones you just added*. 119 | 6. Make sure your code follows the code style discussed below. 120 | 7. Send a GitHub Pull Request to the main repository's ``master`` branch. 121 | GitHub Pull Requests are the expected method of code collaboration on this 122 | project. 123 | 124 | Testing the project 125 | ~~~~~~~~~~~~~~~~~~~ 126 | 127 | Run the full test suite: 128 | 129 | .. code:: console 130 | 131 | $ nox 132 | 133 | List the available Nox sessions: 134 | 135 | .. code:: console 136 | 137 | $ nox --list-sessions 138 | 139 | You can also run a specific Nox session. 140 | For example, invoke the unit test suite like this: 141 | 142 | .. code:: console 143 | 144 | $ nox --session=tests 145 | 146 | Unit tests are located in the ``tests`` directory, 147 | and are written using the pytest_ testing framework. 148 | 149 | .. _pytest: https://pytest.readthedocs.io/ 150 | 151 | Code style 152 | ~~~~~~~~~~~ 153 | 154 | The soccerdata codebase uses the `PEP 8`_ code style. In addition, we have 155 | a few guidelines: 156 | 157 | - Line-length can exceed 79 characters, to 100, when convenient. 158 | - Line-length can exceed 100 characters, when doing otherwise would be *terribly* inconvenient. 159 | - Always use single-quoted strings (e.g. ``'#soccer'``), unless a single-quote occurs within the string. 160 | 161 | To ensure all code conforms to this format, you can format the code using the 162 | pre-commit hooks. 163 | 164 | .. code:: console 165 | 166 | $ nox --session=pre-commit 167 | 168 | Docstrings are to follow the `numpydoc guidelines`_. 169 | 170 | .. _PEP 8: https://pep8.org/ 171 | ..
_black: https://black.readthedocs.io/en/stable/ 172 | .. _numpydoc guidelines: https://numpydoc.readthedocs.io/en/latest/format.html 173 | 174 | Submitting changes 175 | ~~~~~~~~~~~~~~~~~~ 176 | 177 | Open a `pull request`_ to submit changes to this project. 178 | 179 | Your pull request needs to meet the following guidelines for acceptance: 180 | 181 | - The Nox test suite must pass without errors and warnings. 182 | - Include unit tests. 183 | - If your changes add functionality, update the documentation accordingly. 184 | 185 | Feel free to submit early, though. We can always iterate on this. 186 | 187 | To run linting and code formatting checks before committing your change, you 188 | can install pre-commit as a Git hook by running the following command: 189 | 190 | .. code:: console 191 | 192 | $ nox --session=pre-commit -- install 193 | 194 | It is recommended to open an issue before starting work on anything. 195 | 196 | .. _pull request: https://github.com/probberechts/soccerdata/pulls 197 | .. github-only 198 | -------------------------------------------------------------------------------- /soccerdata/clubelo.py: -------------------------------------------------------------------------------- 1 | """Scraper for api.clubelo.com.""" 2 | import re 3 | from datetime import datetime, timedelta 4 | from pathlib import Path 5 | from typing import Callable, Dict, List, Optional, Union 6 | 7 | import pandas as pd 8 | from unidecode import unidecode 9 | 10 | from ._common import BaseRequestsReader, standardize_colnames 11 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 12 | 13 | CLUB_ELO_DATADIR = DATA_DIR / "ClubElo" 14 | CLUB_ELO_API = "http://api.clubelo.com" 15 | 16 | 17 | class ClubElo(BaseRequestsReader): 18 | """Provides pd.DataFrames from CSV API at http://api.clubelo.com. 19 | 20 | Data will be downloaded as necessary and cached locally in 21 | ``~/soccerdata/data/ClubElo``. 22 | 23 | Since the source does not provide league names, this class will not filter 24 | by league. League names will be inserted from the other sources where 25 | available. Leagues that are only covered by clubelo.com will have NaN 26 | values. 27 | 28 | Parameters 29 | ---------- 30 | proxy : 'tor' or dict or list(dict) or callable, optional 31 | Use a proxy to hide your IP address. Valid options are: 32 | - "tor": Uses the Tor network. Tor should be running in 33 | the background on port 9050. 34 | - dict: A dictionary with the proxy to use. The dict should be 35 | a mapping of supported protocols to proxy addresses. For example:: 36 | 37 | { 38 | 'http': 'http://10.10.1.10:3128', 39 | 'https': 'http://10.10.1.10:1080', 40 | } 41 | 42 | - list(dict): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 
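
    Examples
    --------
    A minimal sketch; both readers are documented below and hit the live
    API, so actual values will differ from run to run:

    >>> from soccerdata import ClubElo
    >>> elo = ClubElo()
    >>> elo.read_by_date('2022-04-22')  # ratings for all teams on that date
    >>> elo.read_team_history('Barcelona')  # full rating history for one club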
53 | """ 54 | 55 | def __init__( 56 | self, 57 | proxy: Optional[ 58 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 59 | ] = None, 60 | no_cache: bool = NOCACHE, 61 | no_store: bool = NOSTORE, 62 | data_dir: Path = CLUB_ELO_DATADIR, 63 | ): 64 | """Initialize a new ClubElo reader.""" 65 | super().__init__(no_cache=no_cache, no_store=no_store, data_dir=data_dir) 66 | 67 | def read_by_date(self, date: Optional[Union[str, datetime]] = None) -> pd.DataFrame: 68 | """Retrieve ELO scores for all teams at specified date. 69 | 70 | Elo scores are available as early as 1939. Values before 1960 should 71 | be considered provisional. 72 | 73 | Parameters 74 | ---------- 75 | date : datetime object or string like 'YYYY-MM-DD' 76 | Date for which to retrieve ELO scores. If no date is specified, 77 | get today's scores. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | """ 83 | if not date: 84 | date = datetime.today() 85 | elif isinstance(date, str): 86 | date = datetime.strptime(date, "%Y-%m-%d") 87 | else: 88 | pass # Assume datetime object 89 | 90 | datestring = date.strftime("%Y-%m-%d") 91 | filepath = self.data_dir / f"{datestring}.csv" 92 | url = f"{CLUB_ELO_API}/{datestring}" 93 | 94 | data = self.get(url, filepath) 95 | 96 | df = ( 97 | pd.read_csv( 98 | data, parse_dates=["From", "To"], infer_datetime_format=True, dayfirst=False 99 | ) 100 | .pipe(standardize_colnames) 101 | .rename(columns={"club": "team"}) 102 | .replace({"team": TEAMNAME_REPLACEMENTS}) 103 | .replace("None", float("nan")) 104 | .assign(rank=lambda x: x["rank"].astype("float")) 105 | .assign(league=lambda x: x["country"] + "_" + x["level"].astype(str)) 106 | .pipe(self._translate_league) 107 | .reset_index(drop=True) 108 | .set_index("team") 109 | ) 110 | return df 111 | 112 | def read_team_history( 113 | self, team: str, max_age: Union[int, timedelta] = 1 114 | ) -> Optional[pd.DataFrame]: 115 | """Retrieve full ELO history for one club. 116 | 117 | For the exact spelling of a club's name, check the result 118 | of :func:`~soccerdata.ClubElo.read_by_date` or 119 | `clubelo.com `__. You can also use 120 | alternative team names specified in `teamname_replacements.json`. 121 | Values before 1960 should be considered provisional. 122 | 123 | Parameters 124 | ---------- 125 | team : str 126 | The club's name 127 | max_age : int for age in days, or timedelta object 128 | The max. age of locally cached file before re-download. 129 | 130 | Raises 131 | ------ 132 | TypeError 133 | If max_age is not an integer or timedelta object. 134 | ValueError 135 | If no ratings for the given team are available. 
136 | 137 | Returns 138 | ------- 139 | pd.DataFrame 140 | """ 141 | teams_to_check = [k for k, v in TEAMNAME_REPLACEMENTS.items() if v == team] 142 | teams_to_check.append(team) 143 | 144 | for i, _ in enumerate(teams_to_check): 145 | teams_to_check[i] = unidecode(teams_to_check[i]) 146 | teams_to_check[i] = re.sub(r"[\s']", "", teams_to_check[i]) 147 | 148 | for _team in teams_to_check: 149 | filepath = self.data_dir / f"{_team}.csv" 150 | url = f"{CLUB_ELO_API}/{_team}" 151 | data = self.get(url, filepath, max_age) 152 | 153 | df = ( 154 | pd.read_csv( 155 | data, 156 | parse_dates=["From", "To"], 157 | infer_datetime_format=True, 158 | dayfirst=False, 159 | ) 160 | .pipe(standardize_colnames) 161 | .rename(columns={"club": "team"}) 162 | .replace("None", float("nan")) 163 | .assign(rank=lambda x: x["rank"].astype("float")) 164 | .set_index("from") 165 | .sort_index() 166 | ) 167 | 168 | if len(df) > 0: 169 | # clubelo.com returns a CSV with just a header for nonexistent club 170 | df.replace({"team": TEAMNAME_REPLACEMENTS}, inplace=True) 171 | return df 172 | 173 | raise ValueError(f"No data found for team {team}") 174 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Unittests for soccerdata._common.""" 2 | 3 | import datetime 4 | 5 | import pandas as pd 6 | import pytest 7 | import time_machine 8 | 9 | import soccerdata 10 | from soccerdata._common import ( 11 | BaseRequestsReader, 12 | make_game_id, 13 | season_code, 14 | standardize_colnames, 15 | ) 16 | 17 | # _download_and_save 18 | 19 | 20 | def test_download_and_save_not_cached(tmp_path): 21 | reader = BaseRequestsReader() 22 | url = "http://api.clubelo.com/Barcelona" 23 | filepath = tmp_path / "Barcelona.csv" 24 | data = reader._download_and_save(url, filepath) 25 | assert isinstance(pd.read_csv(data), pd.DataFrame) 26 | 27 | 28 | def test_download_and_save_cached(tmp_path): 29 | reader = BaseRequestsReader() 30 | url = "http://api.clubelo.com/Barcelona" 31 | filepath = tmp_path / "Barcelona.csv" 32 | data = reader._download_and_save(url, filepath) 33 | data = reader._download_and_save(url, filepath) 34 | assert isinstance(pd.read_csv(data), pd.DataFrame) 35 | 36 | 37 | def test_download_and_save_no_cache(tmp_path): 38 | reader = BaseRequestsReader(no_cache=True) 39 | url = "http://api.clubelo.com/Barcelona" 40 | filepath = tmp_path / "Barcelona.csv" 41 | filepath.write_text("bogus") 42 | data = reader._download_and_save(url, filepath) 43 | assert len(pd.read_csv(data)) > 1 44 | 45 | 46 | def test_download_and_save_no_store_no_filepath(): 47 | reader = BaseRequestsReader(no_store=True) 48 | url = "http://api.clubelo.com/Barcelona" 49 | data = reader._download_and_save(url, filepath=None) 50 | assert isinstance(pd.read_csv(data), pd.DataFrame) 51 | 52 | 53 | def test_download_and_save_no_cache_filepath(tmp_path): 54 | reader = BaseRequestsReader(no_store=True) 55 | url = "http://api.clubelo.com/Barcelona" 56 | filepath = tmp_path / "Barcelona.csv" 57 | data = reader._download_and_save(url, filepath) 58 | assert isinstance(pd.read_csv(data), pd.DataFrame) 59 | assert not filepath.exists() 60 | 61 | 62 | # def test_download_and_save_requests_tor(tmp_path): 63 | # url = "https://check.torproject.org/api/ip" 64 | # reader = BaseRequestsReader(proxy=None) 65 | # ip_without_proxy = reader.get(url, tmp_path / "myip.txt") 66 | # ip_without_proxy = json.load(ip_without_proxy) 67 | # proxy_reader 
= BaseRequestsReader(proxy="tor") 68 | # ip_with_proxy = proxy_reader.get(url, tmp_path / "myproxyip.txt") 69 | # ip_with_proxy = json.load(ip_with_proxy) 70 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 71 | # assert ip_with_proxy["IsTor"] 72 | # 73 | # 74 | # def test_download_and_save_selenium_tor(tmp_path): 75 | # url = "https://check.torproject.org/api/ip" 76 | # reader = BaseSeleniumReader(proxy=None).get(url, tmp_path / "myip.txt") 77 | # ip_without_proxy = html.parse(reader).xpath("//pre")[0].text 78 | # ip_without_proxy = json.loads(ip_without_proxy) 79 | # proxy_reader = BaseSeleniumReader(proxy="tor").get(url, tmp_path / "myproxyip.txt") 80 | # ip_with_proxy = html.parse(proxy_reader).xpath("//pre")[0].text 81 | # ip_with_proxy = json.loads(ip_with_proxy) 82 | # assert ip_without_proxy["IP"] != ip_with_proxy["IP"] 83 | # assert ip_with_proxy["IsTor"] 84 | # 85 | 86 | # make_game_id 87 | 88 | 89 | def test_make_game_id(): 90 | s = pd.Series( 91 | { 92 | "date": datetime.datetime(1993, 7, 30), 93 | "home_team": "Barcelona", 94 | "away_team": "Real Madrid", 95 | } 96 | ) 97 | game_id = make_game_id(s) 98 | assert game_id == "1993-07-30 Barcelona-Real Madrid" 99 | 100 | 101 | # standardize_colnames 102 | 103 | 104 | def test_standardize_colnames(): 105 | df = pd.DataFrame( 106 | columns=[ 107 | "First Test", 108 | "SecondTest", 109 | "thirdTest", 110 | "Fourthtest", 111 | "Fifth-test", 112 | "TestSix", 113 | ] 114 | ) 115 | df = standardize_colnames( 116 | df, cols=["First Test", "SecondTest", "thirdTest", "Fourthtest", "Fifth-test"] 117 | ) 118 | assert df.columns.tolist() == [ 119 | "first_test", 120 | "second_test", 121 | "third_test", 122 | "fourthtest", 123 | "fifth_test", 124 | "TestSix", 125 | ] 126 | 127 | 128 | # is_complete 129 | 130 | 131 | def test_is_complete(): 132 | reader = BaseRequestsReader(no_store=True) 133 | with time_machine.travel(datetime.datetime(2020, 12, 25, 1, 24)): 134 | assert reader._is_complete("ENG-Premier League", "1920") 135 | assert not reader._is_complete("ENG-Premier League", "2021") 136 | with time_machine.travel(datetime.datetime(2021, 2, 25, 1, 24)): 137 | assert reader._is_complete("ENG-Premier League", "1920") 138 | assert not reader._is_complete("ENG-Premier League", "2021") 139 | with time_machine.travel(datetime.datetime(2021, 7, 1, 1, 24)): 140 | assert reader._is_complete("ENG-Premier League", "1920") 141 | assert reader._is_complete("ENG-Premier League", "2021") 142 | assert not reader._is_complete("ENG-Premier League", "2122") 143 | 144 | 145 | def test_is_complete_default_value(mocker): 146 | mocker.patch.object(soccerdata._common, "LEAGUE_DICT", {"FAKE-Dummy League": {}}) 147 | reader = BaseRequestsReader(no_store=True) 148 | with time_machine.travel(datetime.datetime(2020, 12, 25, 1, 24)): 149 | assert reader._is_complete("FAKE-Dummy League", "1920") 150 | 151 | 152 | def test_is_complete_undefined_league(mocker): 153 | reader = BaseRequestsReader(no_store=True) 154 | with pytest.raises(ValueError): 155 | reader._is_complete("FAKE-Dummy League", "1920") 156 | 157 | 158 | # Season codes 159 | def test_season_pattern1a(): 160 | assert season_code("9495") == "9495" 161 | 162 | 163 | def test_season_pattern1a_warn(): 164 | with pytest.warns(UserWarning) as record: 165 | assert season_code("2021") == "2021" 166 | 167 | # check that only one warning was raised 168 | assert len(record) == 1 169 | # check that the message matches 170 | msg = 'Season id "2021" is ambiguous: interpreting as "20-21"' 171 | assert 
record[0].message.args[0] == msg # type: ignore 172 | 173 | 174 | def test_season_pattern1b(): 175 | my_season = check_post = "1998" 176 | assert season_code(my_season) == "9899" 177 | assert my_season == check_post 178 | 179 | 180 | def test_season_pattern1c(): 181 | assert season_code("1999") == "9900" 182 | 183 | 184 | def test_season_pattern2(): 185 | assert season_code("11") == "1112" 186 | assert season_code("99") == "9900" 187 | 188 | 189 | def test_season_pattern3(): 190 | assert season_code("2011-2012") == "1112" 191 | assert season_code("1999-2000") == "9900" 192 | 193 | 194 | def test_season_pattern4(): 195 | assert season_code("2011-12") == "1112" 196 | assert season_code("1999-00") == "9900" 197 | 198 | 199 | def test_season_pattern5(): 200 | assert season_code("13-14") == "1314" 201 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Usage 4 | ===== 5 | 6 | This tutorial will walk you through installing, configuring, and using 7 | SoccerData. 8 | 9 | 10 | Installation 11 | ------------ 12 | 13 | SoccerData can be easily installed via `pip <https://pip.pypa.io/>`__: 14 | 15 | .. code:: bash 16 | 17 | python3 -m pip install soccerdata 18 | 19 | 20 | Global configuration 21 | --------------------- 22 | 23 | Several settings can be configured globally using the following environment variables: 24 | 25 | ``SOCCERDATA_DIR`` 26 | The directory where the downloaded data is cached and where logs are 27 | stored. By default, all data is stored to ``~/soccerdata`` on Linux / Mac 28 | OS and ``C:\Users\yourusername\soccerdata`` on Windows. 29 | ``SOCCERDATA_NOCACHE`` 30 | If set to "true", no cached data is returned. Note that no-cache does not 31 | mean "don't cache". All downloaded data is still cached and overwrites 32 | existing caches. If the sense of "don't cache" that you want is actually 33 | "don't store", then ``SOCCERDATA_NOSTORE`` is the option to use. By default, 34 | data is retrieved from the cache. 35 | ``SOCCERDATA_NOSTORE`` 36 | If set to "true", no data is stored. By default, data is cached. 37 | ``SOCCERDATA_LOGLEVEL`` 38 | The level of logging to use. By default, this is set to "INFO". 39 | 40 | Example: 41 | 42 | .. code-block:: bash 43 | 44 | # bash 45 | export SOCCERDATA_DIR="$HOME/soccerdata" 46 | export SOCCERDATA_NOCACHE="False" 47 | export SOCCERDATA_NOSTORE="False" 48 | export SOCCERDATA_LOGLEVEL="INFO" 49 | 50 | Scraping data 51 | ------------- 52 | 53 | Each of the supported data sources has its corresponding class for fetching 54 | data with a uniform API. For example, the :class:`~soccerdata.FBref` class is 55 | used to fetch data from `fbref.com <https://fbref.com/>`__. 56 | 57 | .. code:: python 58 | 59 | import soccerdata as sd 60 | 61 | # Create scraper class instance 62 | fbref = sd.FBref() 63 | 64 | This will create a ``soccerdata/FBref/`` folder in your home directory in 65 | which all scraped data will be cached and where logs will be saved. If you 66 | prefer to store the data in a different folder or disable caching, you can 67 | configure this using environment variables (see above) or by setting the 68 | ``data_dir``, ``no_cache`` and ``no_store`` parameters which are supported by 69 | each scraper class. 70 | 71 | ..
code:: python 72 | 73 | # Create scraper class instance with custom caching behavior 74 | fbref = sd.FBref(data_dir="/tmp", no_cache=True, no_store=True) 75 | 76 | Once you have a scraper class instance, you can use it to fetch data. See the 77 | :ref:`API reference <api>` for the full list of options available for each scraper. For 78 | example, to fetch aggregated shooting stats for all teams: 79 | 80 | .. code:: python 81 | 82 | # Create dataframes 83 | season_stats = fbref.read_team_season_stats(stat_type='shooting') 84 | 85 | 86 | The data is always returned as a convenient Pandas DataFrame. 87 | 88 | .. csv-table:: 89 | :file: output.csv 90 | :header-rows: 1 91 | 92 | Not all data sources provide data for all leagues. The leagues available for 93 | each source can be listed with the :meth:`~soccerdata.FBref.available_leagues` 94 | class method. 95 | 96 | .. code:: python 97 | 98 | sd.FBref.available_leagues() 99 | >>> ['ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A'] 100 | 101 | 102 | By default, the data for all available leagues and the 10 most recent seasons will 103 | be downloaded. In most cases, you would want to limit the data to a specific 104 | league and / or seasons. This can be done by passing a list of leagues and 105 | seasons to the constructor of the scraper class. For example: 106 | 107 | .. code:: python 108 | 109 | # Create scraper class instance filtering on specific leagues and seasons 110 | fbref = sd.FBref(leagues=['ENG-Premier League'], seasons=['1718', '1819']) 111 | 112 | 113 | See the examples and :ref:`API reference <api>` for detailed instructions for 114 | each of the available data sources. 115 | 116 | Additional setup for WhoScored 117 | ------------------------------ 118 | 119 | WhoScored implements strong protection against scraping using Incapsula. To 120 | circumvent this, this scraper uses Selenium with ChromeDriver to 121 | emulate a real user. Before using this scraper, you will have to `install 122 | Chrome`_. A Selenium driver matching your Chrome version will be downloaded 123 | automatically when you run the scraper. 124 | 125 | Even with this setup, it is likely that your IP address will get blocked 126 | eventually. Therefore, it is recommended to set up a SOCKS5 proxy with Tor. 127 | Check out the `installation guide`_ on the Tor website for installation 128 | instructions. After installing Tor, make sure to start it up before scraping. 129 | This can easily be done by running the ``tor`` command from your terminal (in 130 | a separate window); Tor will start up and run on “localhost:9050” by default. 131 | Once Tor is running, you can enable the proxy by setting ``proxy='tor'``. 132 | 133 | .. code:: python 134 | 135 | ws = sd.WhoScored(proxy='tor') 136 | 137 | The code snippet above assumes you have a Tor proxy running on 138 | "localhost:9050". Many distributions indeed default to having a SOCKS proxy 139 | listening on port 9050, but some may not. In particular, the Tor Browser 140 | Bundle defaults to listening on port 9150. You can specify a custom host and 141 | port as 142 | 143 | .. code:: python 144 | 145 | ws = sd.WhoScored(proxy={ 146 | "http": "socks5://127.0.0.1:9150", 147 | "https": "socks5://127.0.0.1:9150", 148 | }) 149 | 150 | 151 | .. _installation guide: https://community.torproject.org/onion-services/setup/install/ 152 | ..
_install Chrome: https://www.google.com/chrome/ 153 | 154 | 155 | Adding additional leagues 156 | ------------------------- 157 | 158 | The top-5 European leagues are fully supported. If you want to add more 159 | leagues, you can configure these in ``SOCCERDATA_DIR/config/league_dict.json``. 160 | This file should contain a mapping between a generic name for the league and 161 | the identifier used internally by each data source that you want to support. 162 | For example, for the Dutch Eredivisie this would be: 163 | 164 | .. code-block:: json 165 | 166 | { 167 | "NED-Eredivisie": { 168 | "ClubElo": "NED_1", 169 | "MatchHistory": "N1", 170 | "SoFIFA": "Holland Eredivisie (1)", 171 | "FBref": "Dutch Eredivisie", 172 | "ESPN": "ned.1", 173 | "FiveThirtyEight": "eredivisie", 174 | "WhoScored": "Netherlands - Eredivisie", 175 | "season_start": "Aug", 176 | "season_end": "May" 177 | } 178 | } 179 | 180 | The ``season_end`` and ``season_start`` fields are optional. These should be the 181 | months in which the last game and the first game of a season are played, 182 | respectively. If they are not provided, June is used as the last month of the 183 | season and July as the first one. 184 | 185 | Note that the provided scrapers might give some errors for the leagues you add 186 | yourself. This is because the same data is not always available for all seasons. 187 | 188 | 189 | Uniform team names 190 | ------------------ 191 | 192 | Each data source uses a different set of team names, which makes it difficult 193 | to combine data from multiple sources. To mitigate this, SoccerData allows 194 | translating the team names to uniform names. This is done by providing 195 | a ``SOCCERDATA_DIR/config/teamname_replacements.json`` file. This file should contain a 196 | mapping between a generic name for each team and the team name used by each 197 | data source that you want to support. The example below will map "Tottenham 198 | Hotspur", "Tottenham Hotspur FC" and "Spurs" to "Tottenham" in all scraped 199 | data. 200 | 201 | .. code-block:: json 202 | 203 | { 204 | "Tottenham": ["Tottenham Hotspur", "Tottenham Hotspur FC", "Spurs"] 205 | } 206 | 207 | Next steps 208 | ---------- 209 | Look at you! You’re now basically an expert at SoccerData! ✨ 210 | 211 | From this point you can: 212 | 213 | - Look at the example notebooks for each :ref:`Data source <datasources>`. 214 | - Take a deep dive into the :ref:`API <api>`. 215 | - Give us feedback or contribute, see :ref:`Contributing <contributing>`. 216 | 217 | Have fun! 🎉 218 | -------------------------------------------------------------------------------- /soccerdata/fivethirtyeight.py: -------------------------------------------------------------------------------- 1 | """Scraper for https://projects.fivethirtyeight.com/soccer-predictions.""" 2 | import itertools 3 | import json 4 | from pathlib import Path 5 | from typing import Callable, Dict, List, Optional, Union 6 | 7 | import pandas as pd 8 | 9 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 11 | 12 | FIVETHIRTYEIGHT_DATA_DIR = DATA_DIR / "FiveThirtyEight" 13 | FIVETHIRTYEIGHT_API = "https://projects.fivethirtyeight.com/soccer-predictions" 14 | 15 | 16 | class FiveThirtyEight(BaseRequestsReader): 17 | """Provides pd.DataFrames from fivethirtyeight's "Club Soccer Predictions" project. 18 | 19 | Data will be downloaded as necessary and cached locally in 20 | ``~/soccerdata/data/FiveThirtyEight``.
21 | 22 | Original project and background info: 23 | https://projects.fivethirtyeight.com/soccer-predictions/ and 24 | https://fivethirtyeight.com/features/how-our-club-soccer-projections-work/ 25 | 26 | 27 | Parameters 28 | ---------- 29 | leagues : string or iterable, optional 30 | IDs of Leagues to include. 31 | seasons : string, int or list, optional 32 | Seasons to include. Supports multiple formats. 33 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 34 | proxy : 'tor' or dict or list(dict) or callable, optional 35 | Use a proxy to hide your IP address. Valid options are: 36 | - "tor": Uses the Tor network. Tor should be running in 37 | the background on port 9050. 38 | - dict: A dictionary with the proxy to use. The dict should be 39 | a mapping of supported protocols to proxy addresses. For example:: 40 | 41 | { 42 | 'http': 'http://10.10.1.10:3128', 43 | 'https': 'http://10.10.1.10:1080', 44 | } 45 | 46 | - list(dict): A list of proxies to choose from. A different proxy will 47 | be selected from this list after failed requests, allowing rotating 48 | proxies. 49 | - callable: A function that returns a valid proxy. This function will 50 | be called after failed requests, allowing rotating proxies. 51 | no_cache : bool 52 | If True, will not use cached data. 53 | no_store : bool 54 | If True, will not store downloaded data. 55 | data_dir : Path 56 | Path to directory where data will be cached. 57 | """ 58 | 59 | def __init__( 60 | self, 61 | leagues: Optional[Union[str, List[str]]] = None, 62 | seasons: Optional[Union[str, int, List]] = None, 63 | proxy: Optional[ 64 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 65 | ] = None, 66 | no_cache: bool = NOCACHE, 67 | no_store: bool = NOSTORE, 68 | data_dir: Path = FIVETHIRTYEIGHT_DATA_DIR, 69 | ): 70 | """Initialize a new FiveThirtyEight reader.""" 71 | super().__init__( 72 | leagues=leagues, proxy=proxy, no_cache=no_cache, no_store=no_store, data_dir=data_dir 73 | ) 74 | self.seasons = seasons # type: ignore 75 | self._data = {} 76 | 77 | url = f"{FIVETHIRTYEIGHT_API}/data.json" 78 | filepath = self.data_dir / "latest.json" 79 | reader = self.get(url, filepath) 80 | 81 | for k, v in json.load(reader).items(): 82 | self._data[k] = v 83 | 84 | def read_leagues(self) -> pd.DataFrame: 85 | """Retrieve the selected leagues from the datasource. 86 | 87 | Returns 88 | ------- 89 | pd.DataFrame 90 | """ 91 | df = ( 92 | pd.DataFrame.from_dict(self._data["leagues"]) 93 | .rename(columns={"slug": "league", "id": "league_id"}) 94 | .pipe(self._translate_league) 95 | .pipe(standardize_colnames) 96 | .drop(columns=["overview_column", "custom_template", "skip_cols"]) 97 | .set_index("league") 98 | .loc[self._selected_leagues.keys()] 99 | .sort_index() 100 | ) 101 | return df 102 | 103 | def read_games(self) -> pd.DataFrame: 104 | """Retrieve all games for the selected leagues. 
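
        One JSON file per selected league and season is downloaded from the
        forecasts endpoint and cached. A minimal sketch, assuming the league
        and season formats documented on the class:

        >>> fte = FiveThirtyEight(leagues='ENG-Premier League', seasons='20-21')
        >>> games = fte.read_games()  # indexed by (league, season, game)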
105 | 106 | Returns 107 | ------- 108 | pd.DataFrame 109 | """ 110 | col_rename = { 111 | "adj_score1": "adj_score_home", 112 | "adj_score2": "adj_score_away", 113 | "chances1": "chances_home", 114 | "chances2": "chances_away", 115 | "datetime": "date", 116 | "moves1": "moves_home", 117 | "moves2": "moves_away", 118 | "prob1": "prob_home", 119 | "prob2": "prob_away", 120 | "probtie": "prob_tie", 121 | "score1": "score_home", 122 | "score2": "score_away", 123 | "team1": "home_team", 124 | "team1_code": "home_code", 125 | "team1_id": "home_id", 126 | "team1_sdr_id": "home_sdr_id", 127 | "team2": "away_team", 128 | "team2_code": "away_code", 129 | "team2_id": "away_id", 130 | "team2_sdr_id": "away_sdr_id", 131 | } 132 | 133 | filemask = "matches_{}_{}.csv" 134 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_matches.json" 135 | data = [] 136 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 137 | filepath = self.data_dir / filemask.format(lkey, skey) 138 | url = urlmask.format(skey[:2], lkey) 139 | reader = self.get(url, filepath) 140 | data.extend([{"league": lkey, "season": skey, **d} for d in json.load(reader)]) 141 | 142 | df = ( 143 | pd.DataFrame.from_dict(data) 144 | .rename(columns=col_rename) 145 | .assign(date=lambda x: pd.to_datetime(x["date"])) 146 | .replace( 147 | { 148 | "home_team": TEAMNAME_REPLACEMENTS, 149 | "away_team": TEAMNAME_REPLACEMENTS, 150 | } 151 | ) 152 | .drop("id", axis=1) 153 | .drop("league_id", axis=1) 154 | .replace("None", float("nan")) 155 | .pipe(self._translate_league) 156 | ) 157 | 158 | df = df[~df.date.isna()] 159 | df["game"] = df.apply(make_game_id, axis=1) 160 | df.set_index(["league", "season", "game"], inplace=True) 161 | df.sort_index(inplace=True) 162 | return df 163 | 164 | def read_forecasts(self) -> pd.DataFrame: 165 | """Retrieve the forecasted results for the selected leagues. 166 | 167 | Returns 168 | ------- 169 | pd.DataFrame 170 | """ 171 | filemask = "forecasts_{}_{}.csv" 172 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_forecast.json" 173 | data = [] 174 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 175 | filepath = self.data_dir / filemask.format(lkey, skey) 176 | url = urlmask.format(skey[:2], lkey) 177 | reader = self.get(url, filepath) 178 | 179 | forecasts = json.load(reader) 180 | for f in forecasts["forecasts"]: 181 | for t in f["teams"]: 182 | data.append( 183 | { 184 | "league": lkey, 185 | "season": skey, 186 | "last_updated": f["last_updated"], 187 | **t, 188 | } 189 | ) 190 | df = ( 191 | pd.DataFrame.from_dict(data) 192 | .rename(columns={"name": "team"}) 193 | .replace({"team": TEAMNAME_REPLACEMENTS}) 194 | .replace("None", float("nan")) 195 | .pipe(self._translate_league) 196 | .set_index(["league", "season", "last_updated", "team"]) 197 | .sort_index() 198 | ) 199 | return df 200 | 201 | def read_clinches(self) -> pd.DataFrame: 202 | """Retrieve clinches for the selected leagues. 
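
        A clinch marks the date on which an outcome (e.g. winning the league)
        became certain according to the forecast data. A minimal sketch:

        >>> fte = FiveThirtyEight(leagues='ENG-Premier League', seasons='20-21')
        >>> clinches = fte.read_clinches()  # indexed by (league, season, date)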
203 | 204 | Returns 205 | ------- 206 | pd.DataFrame 207 | """ 208 | filemask = "clinches_{}_{}.csv" 209 | urlmask = FIVETHIRTYEIGHT_API + "/forecasts/20{}_{}_clinches.json" 210 | data = [] 211 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 212 | filepath = self.data_dir / filemask.format(lkey, skey) 213 | url = urlmask.format(skey[:2], lkey) 214 | reader = self.get(url, filepath) 215 | data.extend([{"league": lkey, "season": skey, **c} for c in json.load(reader)]) 216 | 217 | teams = ( 218 | self.read_games()[["home_team", "home_id"]] 219 | .drop_duplicates() 220 | .rename(columns={"home_team": "team", "home_id": "team_id"}) 221 | ) 222 | df = ( 223 | pd.DataFrame.from_dict(data) 224 | .assign(date=lambda x: pd.to_datetime(x["dt"])) 225 | .merge(teams, on="team_id", how="left") 226 | .replace({"team": TEAMNAME_REPLACEMENTS}) 227 | .drop("dt", axis=1) 228 | .drop("league_id", axis=1) 229 | .drop("team_id", axis=1) 230 | .pipe(self._translate_league) 231 | .set_index(["league", "season", "date"]) 232 | .sort_index() 233 | ) 234 | return df 235 | -------------------------------------------------------------------------------- /soccerdata/sofifa.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://sofifa.com.""" 2 | import re 3 | from pathlib import Path 4 | from typing import Callable, Dict, List, Optional, Union 5 | 6 | import pandas as pd 7 | from lxml import html 8 | 9 | from ._common import BaseRequestsReader, standardize_colnames 10 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 11 | 12 | SO_FIFA_DATADIR = DATA_DIR / "SoFIFA" 13 | SO_FIFA_API = "https://sofifa.com" 14 | 15 | 16 | class SoFIFA(BaseRequestsReader): 17 | """Provides pd.DataFrames from data at http://sofifa.com. 18 | 19 | Data will be downloaded as necessary and cached locally in 20 | ``~/soccerdata/data/SoFIFA``. 21 | 22 | Parameters 23 | ---------- 24 | leagues : string or iterable, optional 25 | IDs of leagues to include. 26 | seasons : string, int or list, optional 27 | Seasons to include. Supports multiple formats. 28 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 29 | proxy : 'tor' or dict or list(dict) or callable, optional 30 | Use a proxy to hide your IP address. Valid options are: 31 | - "tor": Uses the Tor network. Tor should be running in 32 | the background on port 9050. 33 | - dict: A dictionary with the proxy to use. The dict should be 34 | a mapping of supported protocols to proxy addresses. For example:: 35 | 36 | { 37 | 'http': 'http://10.10.1.10:3128', 38 | 'https': 'http://10.10.1.10:1080', 39 | } 40 | 41 | - list(dict): A list of proxies to choose from. A different proxy will 42 | be selected from this list after failed requests, allowing rotating 43 | proxies. 44 | - callable: A function that returns a valid proxy. This function will 45 | be called after failed requests, allowing rotating proxies. 46 | no_cache : bool 47 | If True, will not use cached data. 48 | no_store : bool 49 | If True, will not store downloaded data. 50 | data_dir : Path 51 | Path to directory where data will be cached. 
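
    Examples
    --------
    A minimal sketch; ``read_ratings`` fetches one page per player, so keep
    the league/season selection small:

    >>> import soccerdata as sd
    >>> sofifa = sd.SoFIFA(leagues='ENG-Premier League', seasons='20-21')
    >>> ratings = sofifa.read_ratings()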
52 | """ 53 | 54 | def __init__( 55 | self, 56 | leagues: Optional[Union[str, List[str]]] = None, 57 | seasons: Optional[Union[str, int, List]] = None, 58 | proxy: Optional[ 59 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 60 | ] = None, 61 | no_cache: bool = NOCACHE, 62 | no_store: bool = NOSTORE, 63 | data_dir: Path = SO_FIFA_DATADIR, 64 | ): 65 | """Initialize SoFIFA reader.""" 66 | super().__init__( 67 | leagues=leagues, 68 | proxy=proxy, 69 | no_cache=no_cache, 70 | no_store=no_store, 71 | data_dir=data_dir, 72 | ) 73 | self.rate_limit = 2 74 | self.seasons = seasons # type: ignore 75 | 76 | def read_leagues(self) -> pd.DataFrame: 77 | """Retrieve selected leagues from the datasource. 78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | """ 83 | # read html page (overview) 84 | filepath = self.data_dir / "leagues.html" 85 | reader = self.get(SO_FIFA_API, filepath) 86 | 87 | # extract league links 88 | leagues = [] 89 | tree = html.parse(reader) 90 | for node in tree.xpath("//select[@id='choices-lg']/optgroup/option"): 91 | leagues.append( 92 | { 93 | "league_id": int(node.get("value")), 94 | "league": node.text, 95 | } 96 | ) 97 | df = pd.DataFrame(leagues).pipe(self._translate_league).set_index("league").sort_index() 98 | return df[df.index.isin(self._selected_leagues.keys())] 99 | 100 | def read_teams(self) -> pd.DataFrame: 101 | """Retrieve teams from the datasource for the selected leagues. 102 | 103 | Returns 104 | ------- 105 | pd.DataFrame 106 | """ 107 | # build url 108 | urlmask = SO_FIFA_API + "/teams?lg={}&v={}" 109 | filemask = "teams_{}_{}.html" 110 | 111 | # get league IDs 112 | leagues = self.read_leagues() 113 | 114 | # collect teams 115 | teams = [] 116 | for lkey, _ in self._selected_leagues.items(): 117 | league_id = leagues.at[lkey, "league_id"] 118 | for skey in self.seasons: 119 | # read html page (league overview) 120 | season_id = skey[:2] 121 | filepath = self.data_dir / filemask.format(lkey, skey) 122 | url = urlmask.format(league_id, season_id) 123 | reader = self.get(url, filepath) 124 | 125 | # extract team links 126 | tree = html.parse(reader) 127 | pat_team = re.compile(r"\/team\/(\d+)\/[\w-]+\/") 128 | for node in tree.xpath("//a[contains(@href,'/team/')]"): 129 | # extract team IDs from links 130 | teams.append( 131 | { 132 | "team_id": int( 133 | re.search(pat_team, node.get("href")).group(1) # type: ignore 134 | ), 135 | "team": node.xpath(".//div")[0].text, 136 | "league": lkey, 137 | "season": skey, 138 | } 139 | ) 140 | 141 | # return data frame 142 | df = ( 143 | pd.DataFrame(teams) 144 | .replace({"team": TEAMNAME_REPLACEMENTS}) 145 | .set_index(["league", "season", "team"]) 146 | .sort_index() 147 | ) 148 | return df 149 | 150 | def read_players(self) -> pd.DataFrame: 151 | """Retrieve players from the datasource for the selected leagues. 
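
        Player IDs and names are parsed from the team pages returned by
        :meth:`read_teams`, so one request is made per team. A minimal sketch:

        >>> sofifa = SoFIFA(leagues='ENG-Premier League', seasons='20-21')
        >>> players = sofifa.read_players()  # indexed by (league, season, team, player)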
152 | 153 | Returns 154 | ------- 155 | pd.DataFrame 156 | """ 157 | # build url 158 | urlmask = SO_FIFA_API + "/team/{}?v={}" 159 | filemask = str(self.data_dir / "players_{}_{}.html") 160 | 161 | # get team IDs 162 | teams = self.read_teams().reset_index() 163 | 164 | # collect players 165 | players = [] 166 | for _, team in teams.iterrows(): 167 | season_id = team.season[:2] 168 | team_name = team.team 169 | # read html page (team overview) 170 | filepath = self.data_dir / filemask.format(team_name, season_id) 171 | url = urlmask.format(team["team_id"], season_id) 172 | reader = self.get(url, filepath) 173 | 174 | # extract player links 175 | tree = html.parse(reader) 176 | pat_player = re.compile(r"\/player\/(\d+)\/[\w-]+\/") 177 | for node in tree.xpath("//a[contains(@href,'/player/') and @title]"): 178 | # extract player IDs from links 179 | # extract player names from links 180 | players.append( 181 | { 182 | "player_id": int( 183 | re.search(pat_player, node.get("href")).group(1) # type: ignore 184 | ), 185 | "player": node.get("title"), 186 | "team": team_name, 187 | "league": team.league, 188 | "season": team.season, 189 | } 190 | ) 191 | 192 | # return data frame 193 | df = pd.DataFrame(players).set_index(["league", "season", "team", "player"]).sort_index() 194 | return df 195 | 196 | def read_ratings(self) -> pd.DataFrame: 197 | """Retrieve ratings from the datasource for the selected leagues. 198 | 199 | Returns 200 | ------- 201 | pd.DataFrame 202 | """ 203 | # build url 204 | urlmask = SO_FIFA_API + "/player/{}?v={}" 205 | filemask = "player_{}_{}.html" 206 | 207 | # get player IDs 208 | players = self.read_players().reset_index() 209 | 210 | # prepare empty data frame 211 | ratings = [] 212 | 213 | # define labels to use for score extraction from player profile pages 214 | score_labels = [ 215 | "Overall Rating", 216 | "Potential", 217 | "Crossing", 218 | "Finishing", 219 | "Heading Accuracy", 220 | "Short Passing", 221 | "Volleys", 222 | "Dribbling", 223 | "Curve", 224 | "FK Accuracy", 225 | "Long Passing", 226 | "Ball Control", 227 | "Acceleration", 228 | "Sprint Speed", 229 | "Agility", 230 | "Reactions", 231 | "Balance", 232 | "Shot Power", 233 | "Jumping", 234 | "Stamina", 235 | "Strength", 236 | "Long Shots", 237 | "Aggression", 238 | "Interceptions", 239 | "Positioning", 240 | "Vision", 241 | "Penalties", 242 | "Composure", 243 | "Marking", 244 | "Standing Tackle", 245 | "Sliding Tackle", 246 | "GK Diving", 247 | "GK Handling", 248 | "GK Kicking", 249 | "GK Positioning", 250 | "GK Reflexes", 251 | ] 252 | 253 | for _, player in players.iterrows(): 254 | # read html page (player overview) 255 | player_name = player.player 256 | filepath = self.data_dir / filemask.format(player_name, player.season) 257 | url = urlmask.format(player["player_id"], player.season[:2]) 258 | reader = self.get(url, filepath) 259 | 260 | # extract scores one-by-one 261 | tree = html.parse(reader) 262 | scores = { 263 | "player": player_name, 264 | "league": player.league, 265 | "season": player.season, 266 | } 267 | for s in score_labels: 268 | nodes = tree.xpath( 269 | "(//li[not(self::script)] | //div)" 270 | f"[.//text()[contains(.,'{s}')]]" 271 | "/span[contains(@class, 'tag')]" 272 | ) 273 | # for multiple matches, only accept first match 274 | if len(nodes) >= 1: 275 | scores[s] = nodes[0].text.strip() 276 | # if there's no match, put NA 277 | else: 278 | scores[s] = None 279 | ratings.append(scores) 280 | # return data frame 281 | df = ( 282 | pd.DataFrame(ratings) 283 | 
.pipe(standardize_colnames) 284 | .set_index(["league", "season", "player"]) 285 | .sort_index() 286 | ) 287 | return df 288 | -------------------------------------------------------------------------------- /docs/datasources/ClubElo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 16 | "env: SOCCERDATA_NOCACHE=True\n", 17 | "env: SOCCERDATA_NOSTORE=True\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 23 | "%env SOCCERDATA_NOCACHE=True\n", 24 | "%env SOCCERDATA_NOSTORE=True" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "2454afe6", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import soccerdata as sd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "b5784f2d", 40 | "metadata": {}, 41 | "source": [ 42 | "# ClubElo" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": "8dab5be9", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Provides pd.DataFrames from CSV API at http://api.clubelo.com.\n", 56 | "\n", 57 | " Data will be downloaded as necessary and cached locally in\n", 58 | " ``~/soccerdata/data/ClubElo``.\n", 59 | "\n", 60 | " Since the source does not provide league names, this class will not filter\n", 61 | " by league. League names will be inserted from the other sources where\n", 62 | " available. Leagues that are only covered by clubelo.com will have NaN\n", 63 | " values.\n", 64 | "\n", 65 | " Parameters\n", 66 | " ----------\n", 67 | " proxy : 'tor' or dict or list(dict) or callable, optional\n", 68 | " Use a proxy to hide your IP address. Valid options are:\n", 69 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 70 | " the background on port 9050.\n", 71 | " - dict: A dictionary with the proxy to use. The dict should be\n", 72 | " a mapping of supported protocols to proxy addresses. For example::\n", 73 | "\n", 74 | " {\n", 75 | " 'http': 'http://10.10.1.10:3128',\n", 76 | " 'https': 'http://10.10.1.10:1080',\n", 77 | " }\n", 78 | "\n", 79 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 80 | " be selected from this list after failed requests, allowing rotating\n", 81 | " proxies.\n", 82 | " - callable: A function that returns a valid proxy. This function will\n", 83 | " be called after failed requests, allowing rotating proxies.\n", 84 | " no_cache : bool\n", 85 | " If True, will not use cached data.\n", 86 | " no_store : bool\n", 87 | " If True, will not store downloaded data.\n", 88 | " data_dir : Path\n", 89 | " Path to directory where data will be cached.\n", 90 | " \n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "elo = sd.ClubElo()\n", 96 | "print(elo.__doc__)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "3a4c2916", 102 | "metadata": {}, 103 | "source": [ 104 | "## ELO scores for all teams at specified date" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "745be31a", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
rankcountrylevelelofromtoleague
team
Liverpool1.0ENG12047.0838622022-04-202022-04-24ENG-Premier League
Man City2.0ENG12037.0599372022-04-212022-04-23ENG-Premier League
Bayern3.0GER11984.7753912022-04-182022-04-23GER-Bundesliga
Real Madrid4.0ESP11969.5843512022-04-212022-04-26ESP-La Liga
Chelsea5.0ENG11921.1014402022-04-212022-04-24ENG-Premier League
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " rank country level elo from to \\\n", 210 | "team \n", 211 | "Liverpool 1.0 ENG 1 2047.083862 2022-04-20 2022-04-24 \n", 212 | "Man City 2.0 ENG 1 2037.059937 2022-04-21 2022-04-23 \n", 213 | "Bayern 3.0 GER 1 1984.775391 2022-04-18 2022-04-23 \n", 214 | "Real Madrid 4.0 ESP 1 1969.584351 2022-04-21 2022-04-26 \n", 215 | "Chelsea 5.0 ENG 1 1921.101440 2022-04-21 2022-04-24 \n", 216 | "\n", 217 | " league \n", 218 | "team \n", 219 | "Liverpool ENG-Premier League \n", 220 | "Man City ENG-Premier League \n", 221 | "Bayern GER-Bundesliga \n", 222 | "Real Madrid ESP-La Liga \n", 223 | "Chelsea ENG-Premier League " 224 | ] 225 | }, 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "current_elo = elo.read_by_date()\n", 233 | "current_elo.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "246ca661", 239 | "metadata": {}, 240 | "source": [ 241 | "## Full ELO history for one club" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "1c87e14a", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
rankteamcountryleveleloto
from
1939-10-22NaNBarcelonaESP11636.7045901939-12-03
1939-12-04NaNBarcelonaESP11626.1021731939-12-10
1939-12-11NaNBarcelonaESP11636.7282711939-12-17
1939-12-18NaNBarcelonaESP11646.9516601939-12-24
1939-12-25NaNBarcelonaESP11637.4243161939-12-31
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " rank team country level elo to\n", 340 | "from \n", 341 | "1939-10-22 NaN Barcelona ESP 1 1636.704590 1939-12-03\n", 342 | "1939-12-04 NaN Barcelona ESP 1 1626.102173 1939-12-10\n", 343 | "1939-12-11 NaN Barcelona ESP 1 1636.728271 1939-12-17\n", 344 | "1939-12-18 NaN Barcelona ESP 1 1646.951660 1939-12-24\n", 345 | "1939-12-25 NaN Barcelona ESP 1 1637.424316 1939-12-31" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "barca_elo = elo.read_team_history(\"Barcelona\")\n", 355 | "barca_elo.head()" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "soccerdata", 362 | "language": "python", 363 | "name": "soccerdata" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.9.6" 376 | }, 377 | "toc": { 378 | "base_numbering": 1, 379 | "nav_menu": {}, 380 | "number_sections": true, 381 | "sideBar": true, 382 | "skip_h1_title": false, 383 | "title_cell": "Table of Contents", 384 | "title_sidebar": "Contents", 385 | "toc_cell": false, 386 | "toc_position": {}, 387 | "toc_section_display": true, 388 | "toc_window_display": true 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 5 393 | } 394 | -------------------------------------------------------------------------------- /soccerdata/espn.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://site.api.espn.com/apis/site/v2/sports/soccer.""" 2 | import datetime 3 | import itertools 4 | import json 5 | import re 6 | from pathlib import Path 7 | from typing import Callable, Dict, List, Optional, Union 8 | 9 | import pandas as pd 10 | import requests 11 | 12 | from ._common import BaseRequestsReader, make_game_id, standardize_colnames 13 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS 14 | 15 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/summary?event=513466 16 | # http://site.api.espn.com/apis/site/v2/sports/soccer/eng.1/scoreboard?dates=20180901 17 | 18 | ESPN_DATADIR = DATA_DIR / 'ESPN' 19 | ESPN_API = 'http://site.api.espn.com/apis/site/v2/sports/soccer' 20 | 21 | 22 | class ESPN(BaseRequestsReader): 23 | """Provides pd.DataFrames from JSON api available at http://site.api.espn.com. 24 | 25 | Data will be downloaded as necessary and cached locally in 26 | ``~/soccerdata/data/ESPN``. 27 | 28 | Parameters 29 | ---------- 30 | leagues : string or iterable, optional 31 | IDs of leagues to include. 32 | 33 | seasons : string, int or list, optional 34 | Seasons to include. Supports multiple formats. 35 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 36 | proxy : 'tor' or dict or list(dict) or callable, optional 37 | Use a proxy to hide your IP address. Valid options are: 38 | - "tor": Uses the Tor network. Tor should be running in 39 | the background on port 9050. 40 | - dict: A dictionary with the proxy to use. The dict should be 41 | a mapping of supported protocols to proxy addresses. For example:: 42 | 43 | { 44 | 'http': 'http://10.10.1.10:3128', 45 | 'https': 'http://10.10.1.10:1080', 46 | } 47 | 48 | - list(dict): A list of proxies to choose from. 
A different proxy will 49 | be selected from this list after failed requests, allowing rotating 50 | proxies. 51 | - callable: A function that returns a valid proxy. This function will 52 | be called after failed requests, allowing rotating proxies. 53 | no_cache : bool 54 | If True, will not use cached data. 55 | no_store : bool 56 | If True, will not store downloaded data. 57 | data_dir : Path 58 | Path to directory where data will be cached. 59 | """ 60 | 61 | def __init__( 62 | self, 63 | leagues: Optional[Union[str, List[str]]] = None, 64 | seasons: Optional[Union[str, int, List]] = None, 65 | proxy: Optional[ 66 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 67 | ] = None, 68 | no_cache: bool = NOCACHE, 69 | no_store: bool = NOSTORE, 70 | data_dir: Path = ESPN_DATADIR, 71 | ): 72 | """Initialize a new ESPN reader.""" 73 | super().__init__( 74 | leagues=leagues, 75 | proxy=proxy, 76 | no_cache=no_cache, 77 | no_store=no_store, 78 | data_dir=data_dir, 79 | ) 80 | self.seasons = seasons # type: ignore 81 | 82 | def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: 83 | """Retrieve the game schedule for the selected leagues and seasons. 84 | 85 | Parameters 86 | ---------- 87 | force_cache : bool 88 | By default no cached data is used for the current season. 89 | If True, will force the use of cached data anyway. 90 | 91 | Returns 92 | ------- 93 | pd.DataFrame 94 | """ 95 | urlmask = ESPN_API + '/{}/scoreboard?dates={}' 96 | filemask = 'Schedule_{}_{}.json' 97 | 98 | df_list = [] 99 | # Get match days 100 | for lkey, skey in itertools.product(self._selected_leagues.values(), self.seasons): 101 | if int(skey[:2]) > int(str(datetime.datetime.now().year + 1)[-2:]): 102 | start_date = ''.join(['19', skey[:2], '07', '01']) 103 | else: 104 | start_date = ''.join(['20', skey[:2], '07', '01']) 105 | 106 | url = urlmask.format(lkey, start_date) 107 | resp = requests.get(url=url) 108 | data = resp.json() 109 | 110 | match_dates = [ 111 | datetime.datetime.strptime(d, '%Y-%m-%dT%H:%MZ').strftime('%Y%m%d') 112 | for d in data['leagues'][0]['calendar'] 113 | ] 114 | for date in match_dates: 115 | url = urlmask.format(lkey, date) 116 | filepath = self.data_dir / filemask.format(lkey, date) 117 | current_season = not self._is_complete(lkey, skey) 118 | reader = self.get(url, filepath, no_cache=current_season and not force_cache) 119 | 120 | data = json.load(reader) 121 | df_list.extend( 122 | [ 123 | { 124 | 'league': lkey, 125 | 'season': skey, 126 | 'date': e['date'], 127 | 'home_team': e['competitions'][0]['competitors'][0]['team']['name'], 128 | 'away_team': e['competitions'][0]['competitors'][1]['team']['name'], 129 | 'game_id': int(e['id']), 130 | 'league_id': lkey, 131 | } 132 | for e in data['events'] 133 | ] 134 | ) 135 | df = ( 136 | pd.DataFrame(df_list) 137 | .pipe(self._translate_league) 138 | .replace({'home_team': TEAMNAME_REPLACEMENTS, 'away_team': TEAMNAME_REPLACEMENTS}) 139 | .assign(date=lambda x: pd.to_datetime(x['date'])) 140 | .dropna(subset=['home_team', 'away_team', 'date']) 141 | .assign(game=lambda df: df.apply(make_game_id, axis=1)) 142 | .set_index(['league', 'season', 'game']) 143 | .sort_index() 144 | ) 145 | 146 | return df 147 | 148 | def read_matchsheet(self, match_id: Optional[Union[int, List[int]]] = None) -> pd.DataFrame: 149 | """Retrieve match sheets for the selected leagues and seasons. 
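A minimal usage sketch (the league and season values below are illustrative)::

            espn = ESPN(leagues="ENG-Premier League", seasons="2018-19")
            sheets = espn.read_matchsheet()  # or pass match_id=... to fetch a single game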
150 | 151 | Parameters 152 | ---------- 153 | match_id : int or list of int, optional 154 | Retrieve the match sheet for a specific game. 155 | 156 | Raises 157 | ------ 158 | ValueError 159 | If no games with the given IDs were found for the selected seasons and leagues. 160 | 161 | Returns 162 | ------- 163 | pd.DataFrame. 164 | """ 165 | urlmask = ESPN_API + '/{}/summary?event={}' 166 | filemask = 'Summary_{}.json' 167 | 168 | df_schedule = self.read_schedule().reset_index() 169 | if match_id is not None: 170 | iterator = df_schedule[ 171 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 172 | ] 173 | if len(iterator) == 0: 174 | raise ValueError( 175 | 'No games with the given IDs found for the selected seasons and leagues.' 176 | ) 177 | else: 178 | iterator = df_schedule 179 | 180 | df_list = [] 181 | for i, match in iterator.iterrows(): 182 | url = urlmask.format(match['league_id'], match['game_id']) 183 | filepath = self.data_dir / filemask.format(match['game_id']) 184 | reader = self.get(url, filepath) 185 | 186 | data = json.load(reader) 187 | for i in range(2): 188 | match_sheet = { 189 | 'game': match['game'], 190 | 'league': match['league'], 191 | 'season': match['season'], 192 | 'team': data['boxscore']['form'][i]['team']['displayName'], 193 | 'is_home': (i == 0), 194 | 'venue': data['gameInfo']['venue']['fullName'] 195 | if 'venue' in data['gameInfo'] 196 | else None, 197 | 'attendance': data['gameInfo']['attendance'], 198 | 'capacity': data['gameInfo']['venue']['capacity'] 199 | if 'venue' in data['gameInfo'] 200 | else None, 201 | 'roster': data['rosters'][i]['roster'], 202 | } 203 | if 'statistics' in data['boxscore']['teams'][i]: 204 | for stat in data['boxscore']['teams'][i]['statistics']: 205 | match_sheet[stat['name']] = stat['displayValue'] 206 | df_list.append(match_sheet) 207 | df = ( 208 | pd.DataFrame(df_list) 209 | .replace({'team': TEAMNAME_REPLACEMENTS}) 210 | .pipe(standardize_colnames) 211 | .set_index(['league', 'season', 'game', 'team']) 212 | .sort_index() 213 | ) 214 | return df 215 | 216 | def read_lineup( # noqa: C901 217 | self, match_id: Optional[Union[int, List[int]]] = None 218 | ) -> pd.DataFrame: 219 | """Retrieve lineups for the selected leagues and seasons. 220 | 221 | Parameters 222 | ---------- 223 | match_id : int or list of int, optional 224 | Retrieve the lineup for a specific game. 225 | 226 | Raises 227 | ------ 228 | ValueError 229 | If no games with the given IDs were found for the selected seasons and leagues. 230 | 231 | Returns 232 | ------- 233 | pd.DataFrame. 234 | """ 235 | urlmask = ESPN_API + '/{}/summary?event={}' 236 | filemask = 'Summary_{}.json' 237 | 238 | df_schedule = self.read_schedule().reset_index() 239 | if match_id is not None: 240 | iterator = df_schedule[ 241 | df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id) 242 | ] 243 | if len(iterator) == 0: 244 | raise ValueError( 245 | 'No games with the given IDs found for the selected seasons and leagues.' 
246 | ) 247 | else: 248 | iterator = df_schedule 249 | 250 | df_list = [] 251 | for _, match in iterator.iterrows(): 252 | url = urlmask.format(match['league_id'], match['game_id']) 253 | filepath = self.data_dir / filemask.format(match['game_id']) 254 | reader = self.get(url, filepath) 255 | 256 | data = json.load(reader) 257 | for i in range(2): 258 | for p in data['rosters'][i]['roster']: 259 | match_sheet = { 260 | 'game': match['game'], 261 | 'league': match['league'], 262 | 'season': match['season'], 263 | 'team': data['boxscore']['form'][i]['team']['displayName'], 264 | 'is_home': (i == 0), 265 | 'player': p['athlete']['displayName'], 266 | 'position': p['position']['name'] if 'position' in p else None, 267 | 'formation_place': p['formationPlace'] if 'formationPlace' in p else None, 268 | } 269 | 270 | if p['starter']: 271 | match_sheet['sub_in'] = 'start' 272 | elif p['subbedIn']: 273 | ii = [i for i, x in enumerate(p['plays']) if x['substitution']][0] 274 | match_sheet['sub_in'] = sum( 275 | map( 276 | int, 277 | re.findall( 278 | r'(\d{1,3})', 279 | p['plays'][ii]['clock']['displayValue'], 280 | ), 281 | ) 282 | ) 283 | else: 284 | match_sheet['sub_in'] = None 285 | 286 | if (p['starter'] or p['subbedIn']) and not p['subbedOut']: 287 | match_sheet['sub_out'] = 'end' 288 | elif p['subbedOut']: 289 | j = 0 if not p['subbedIn'] else 1 290 | ii = [i for i, x in enumerate(p['plays']) if x['substitution']][j] 291 | match_sheet['sub_out'] = sum( 292 | map( 293 | int, 294 | re.findall( 295 | r'(\d{1,3})', 296 | p['plays'][ii]['clock']['displayValue'], 297 | ), 298 | ) 299 | ) 300 | else: 301 | match_sheet['sub_out'] = None 302 | 303 | if 'stats' in p: 304 | for stat in p['stats']: 305 | match_sheet[stat['name']] = stat['value'] 306 | 307 | df_list.append(match_sheet) 308 | df = ( 309 | pd.DataFrame(df_list) 310 | .replace({'team': TEAMNAME_REPLACEMENTS}) 311 | .pipe(standardize_colnames) 312 | .set_index(['league', 'season', 'game', 'team', 'player']) 313 | .sort_index() 314 | ) 315 | return df 316 | -------------------------------------------------------------------------------- /soccerdata/_common.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import pprint 4 | import random 5 | import re 6 | import time 7 | import warnings 8 | from abc import ABC, abstractmethod 9 | from datetime import date, datetime, timedelta 10 | from pathlib import Path 11 | from typing import IO, Callable, Dict, Iterable, List, Optional, Union 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import requests 16 | import undetected_chromedriver as uc 17 | from dateutil.relativedelta import relativedelta 18 | from selenium.common.exceptions import WebDriverException 19 | 20 | from ._config import DATA_DIR, LEAGUE_DICT, logger 21 | 22 | 23 | class BaseReader(ABC): 24 | """Base class for data readers. 25 | 26 | Parameters 27 | ---------- 28 | leagues : str or list of str, optional 29 | The leagues to read. If None, all available leagues are read. 30 | proxy : 'tor' or dict or list(dict) or callable, optional 31 | Use a proxy to hide your IP address. Valid options are: 32 | - "tor": Uses the Tor network. Tor should be running in 33 | the background on port 9050. 34 | - dict: A dictionary with the proxy to use. The dict should be 35 | a mapping of supported protocols to proxy addresses.
For example:: 36 | 37 | { 38 | 'http': 'http://10.10.1.10:3128', 39 | 'https': 'http://10.10.1.10:1080', 40 | } 41 | 42 | - list(dict): A list of proxies to choose from. A different proxy will 43 | be selected from this list after failed requests, allowing rotating 44 | proxies. 45 | - callable: A function that returns a valid proxy. This function will 46 | be called after failed requests, allowing rotating proxies. 47 | no_cache : bool 48 | If True, will not use cached data. 49 | no_store : bool 50 | If True, will not store downloaded data. 51 | data_dir : Path 52 | Path to directory where data will be cached. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | leagues: Optional[Union[str, List[str]]] = None, 58 | proxy: Optional[ 59 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 60 | ] = None, 61 | no_cache: bool = False, 62 | no_store: bool = False, 63 | data_dir: Path = DATA_DIR, 64 | ): 65 | """Create a new data reader.""" 66 | if isinstance(proxy, str) and proxy.lower() == "tor": 67 | self.proxy = lambda: { 68 | "http": "socks5://127.0.0.1:9050", 69 | "https": "socks5://127.0.0.1:9050", 70 | } 71 | elif isinstance(proxy, dict): 72 | self.proxy = lambda: proxy # type: ignore 73 | elif isinstance(proxy, list): 74 | self.proxy = lambda: random.choice(proxy) # type: ignore 75 | elif callable(proxy): 76 | self.proxy = proxy 77 | else: 78 | self.proxy = lambda: {} 79 | 80 | self._selected_leagues = leagues # type: ignore 81 | self.no_cache = no_cache 82 | self.no_store = no_store 83 | self.data_dir = data_dir 84 | self.rate_limit = 0 85 | self.max_delay = 0 86 | if self.no_store: 87 | logger.info("Caching is disabled") 88 | else: 89 | logger.info("Saving cached data to %s", self.data_dir) 90 | self.data_dir.mkdir(parents=True, exist_ok=True) 91 | 92 | def get( 93 | self, 94 | url: str, 95 | filepath: Optional[Path] = None, 96 | max_age: Optional[Union[int, timedelta]] = None, 97 | no_cache: bool = False, 98 | var: Optional[str] = None, 99 | ) -> IO[bytes]: 100 | """Load data from `url`. 101 | 102 | By default, the source of `url` is downloaded and saved to `filepath`. 103 | If `filepath` exists, the `url` is not visited and the cached data is 104 | returned. 105 | 106 | Parameters 107 | ---------- 108 | url : str 109 | URL to download. 110 | filepath : Path, optional 111 | Path to save downloaded file. If None, downloaded data is not cached. 112 | max_age : int for age in days, or timedelta object 113 | The max. age of locally cached file before re-download. 114 | no_cache : bool 115 | If True, will not use cached data. Overrides the class property. 116 | var : str, optional 117 | Return a javascript variable instead of the page source. 118 | 119 | Raises 120 | ------ 121 | TypeError 122 | If max_age is not an integer or timedelta object. 123 | 124 | Returns 125 | ------- 126 | io.BufferedIOBase 127 | File-like object of downloaded data. 128 | """ 129 | is_cached = self._is_cached(filepath, max_age) 130 | if no_cache or self.no_cache or not is_cached: 131 | logger.debug("Scraping %s", url) 132 | return self._download_and_save(url, filepath, var) 133 | logger.debug("Retrieving %s from cache", url) 134 | assert filepath is not None 135 | return filepath.open(mode="rb") 136 | 137 | def _is_cached( 138 | self, 139 | filepath: Optional[Path] = None, 140 | max_age: Optional[Union[int, timedelta]] = None, 141 | ) -> bool: 142 | """Check if `filepath` contains valid cached data. 
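For instance, a file cached more than a week ago could be treated as stale (a sketch; ``reader`` stands for any concrete reader instance)::

            reader._is_cached(filepath, max_age=7)  # equivalently max_age=timedelta(days=7)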
143 | 144 | Parameters 145 | ---------- 146 | filepath : Path, optional 147 | Path where file should be cached. If None, return False. 148 | max_age : int for age in days, or timedelta object 149 | The max. age of locally cached file. 150 | 151 | Raises 152 | ------ 153 | TypeError 154 | If max_age is not an integer or timedelta object. 155 | 156 | Returns 157 | ------- 158 | bool 159 | True in case of a cache hit, otherwise False. 160 | """ 161 | # Validate inputs 162 | if max_age is not None: 163 | if isinstance(max_age, int): 164 | _max_age = timedelta(days=max_age) 165 | elif isinstance(max_age, timedelta): 166 | _max_age = max_age 167 | else: 168 | raise TypeError("max_age must be of type int or datetime.timedelta") 169 | else: 170 | _max_age = None 171 | 172 | cache_invalid = False 173 | # Check if cached file is too old 174 | if _max_age is not None and filepath is not None and filepath.exists(): 175 | last_modified = datetime.fromtimestamp(filepath.stat().st_mtime) 176 | now = datetime.now() 177 | if (now - last_modified) > _max_age: 178 | cache_invalid = True 179 | 180 | return not cache_invalid and filepath is not None and filepath.exists() 181 | 182 | @abstractmethod 183 | def _download_and_save( 184 | self, 185 | url: str, 186 | filepath: Optional[Path] = None, 187 | var: Optional[str] = None, 188 | ) -> IO[bytes]: 189 | """Download data at `url` to `filepath`. 190 | 191 | Parameters 192 | ---------- 193 | url : str 194 | URL to download. 195 | filepath : Path, optional 196 | Path to save downloaded file. If None, downloaded data is not cached. 197 | var : str, optional 198 | Return a javascript variable instead of the page source. 199 | 200 | Returns 201 | ------- 202 | io.BufferedIOBase 203 | File-like object of downloaded data. 204 | """ 205 | 206 | @classmethod 207 | def available_leagues(cls) -> List[str]: 208 | """Return a list of league IDs available for this source.""" 209 | return sorted(cls._all_leagues().keys()) 210 | 211 | @classmethod 212 | def _all_leagues(cls) -> Dict[str, str]: 213 | """Return a dict mapping all canonical league IDs to source league IDs.""" 214 | if not hasattr(cls, "_all_leagues_dict"): 215 | cls._all_leagues_dict = { # type: ignore 216 | k: v[cls.__name__] for k, v in LEAGUE_DICT.items() if cls.__name__ in v 217 | } 218 | return cls._all_leagues_dict # type: ignore 219 | 220 | @classmethod 221 | def _translate_league(cls, df: pd.DataFrame, col: str = "league") -> pd.DataFrame: 222 | """Map source league ID to canonical ID.""" 223 | flip = {v: k for k, v in cls._all_leagues().items()} 224 | mask = ~df[col].isin(flip) 225 | df.loc[mask, col] = np.nan 226 | df[col] = df[col].replace(flip) 227 | return df 228 | 229 | @property 230 | def _selected_leagues(self) -> Dict[str, str]: 231 | """Return a dict mapping selected canonical league IDs to source league IDs.""" 232 | return self._leagues_dict 233 | 234 | @_selected_leagues.setter 235 | def _selected_leagues(self, ids: Optional[Union[str, List[str]]] = None) -> None: 236 | if ids is None: 237 | self._leagues_dict = self._all_leagues() 238 | else: 239 | if len(ids) == 0: 240 | raise ValueError("Empty iterable not allowed for 'leagues'") 241 | if isinstance(ids, str): 242 | ids = [ids] 243 | tmp_league_dict = {} 244 | for i in ids: 245 | if i not in self._all_leagues(): 246 | raise ValueError( 247 | f""" 248 | Invalid league '{i}'. 
Valid leagues are: 249 | { pprint.pformat(self.available_leagues()) } 250 | """ 251 | ) 252 | tmp_league_dict[i] = self._all_leagues()[i] 253 | self._leagues_dict = tmp_league_dict 254 | 255 | def _is_complete(self, league: str, season: str) -> bool: 256 | """Check if a season is complete.""" 257 | if league in LEAGUE_DICT: 258 | league_dict = LEAGUE_DICT[league] 259 | else: 260 | flip = {v: k for k, v in self._all_leagues().items()} 261 | if league in flip: 262 | league_dict = LEAGUE_DICT[flip[league]] 263 | else: 264 | raise ValueError(f"Invalid league '{league}'") 265 | if "season_end" not in league_dict: 266 | season_ends = date(datetime.strptime(season[-2:], "%y").year, 7, 1) 267 | else: 268 | season_ends = ( 269 | date( 270 | datetime.strptime(season[-2:], "%y").year, 271 | datetime.strptime(league_dict["season_end"], "%b").month, 272 | 1, 273 | ) 274 | + relativedelta(months=1) 275 | ) 276 | return date.today() >= season_ends 277 | 278 | @property 279 | def leagues(self) -> List[str]: 280 | """Return a list of selected leagues.""" 281 | return list(self._leagues_dict.keys()) 282 | 283 | @property 284 | def seasons(self) -> List[str]: 285 | """Return a list of selected seasons.""" 286 | return self._season_ids 287 | 288 | @seasons.setter 289 | def seasons(self, seasons: Optional[Union[str, int, Iterable[Union[str, int]]]]) -> None: 290 | if seasons is None: 291 | logger.info("No seasons provided. Will retrieve data for the last 5 seasons.") 292 | year = datetime.today().year 293 | seasons = range(year, year - 6, -1) 294 | if isinstance(seasons, str) or isinstance(seasons, int): 295 | seasons = [seasons] 296 | self._season_ids = [season_code(s) for s in seasons] 297 | 298 | 299 | class BaseRequestsReader(BaseReader): 300 | """Base class for readers that use the Python requests module.""" 301 | 302 | def __init__( 303 | self, 304 | leagues: Optional[Union[str, List[str]]] = None, 305 | proxy: Optional[ 306 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 307 | ] = None, 308 | no_cache: bool = False, 309 | no_store: bool = False, 310 | data_dir: Path = DATA_DIR, 311 | ): 312 | """Initialize the reader.""" 313 | super().__init__( 314 | no_cache=no_cache, 315 | no_store=no_store, 316 | leagues=leagues, 317 | proxy=proxy, 318 | data_dir=data_dir, 319 | ) 320 | 321 | self._session = self._init_session() 322 | 323 | def _init_session(self) -> requests.Session: 324 | session = requests.Session() 325 | session.proxies.update(self.proxy()) 326 | return session 327 | 328 | def _download_and_save( 329 | self, 330 | url: str, 331 | filepath: Optional[Path] = None, 332 | var: Optional[str] = None, 333 | ) -> IO[bytes]: 334 | """Download file at url to filepath. Overwrites if filepath exists.""" 335 | for i in range(5): 336 | try: 337 | response = self._session.get(url, stream=True) 338 | time.sleep(self.rate_limit + random.random() * self.max_delay) 339 | response.raise_for_status() 340 | if not self.no_store and filepath is not None: 341 | with filepath.open(mode="wb") as fh: 342 | fh.write(response.content) 343 | return io.BytesIO(response.content) 344 | except Exception: 345 | logger.exception( 346 | "Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1 347 | ) 348 | self._session = self._init_session() 349 | continue 350 | 351 | raise ConnectionError("Could not download %s." 
% url) 352 | 353 | 354 | class BaseSeleniumReader(BaseReader): 355 | """Base class for readers that use Selenium.""" 356 | 357 | def __init__( 358 | self, 359 | leagues: Optional[Union[str, List[str]]] = None, 360 | proxy: Optional[ 361 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 362 | ] = None, 363 | no_cache: bool = False, 364 | no_store: bool = False, 365 | data_dir: Path = DATA_DIR, 366 | path_to_browser: Optional[Path] = None, 367 | headless: bool = True, 368 | ): 369 | """Initialize the reader.""" 370 | super().__init__( 371 | no_cache=no_cache, 372 | no_store=no_store, 373 | leagues=leagues, 374 | proxy=proxy, 375 | data_dir=data_dir, 376 | ) 377 | self.path_to_browser = path_to_browser 378 | self.headless = headless 379 | 380 | try: 381 | self._driver = self._init_webdriver() 382 | except WebDriverException as e: 383 | logger.error( 384 | """ 385 | The ChromeDriver was unable to initiate/spawn a new 386 | WebBrowser. You will not be able to scrape new data. 387 | %s 388 | """, 389 | e, 390 | ) 391 | 392 | def _init_webdriver(self) -> "uc.Chrome": 393 | """Start the Selenium driver.""" 394 | # Quit existing driver 395 | if hasattr(self, "_driver"): 396 | self._driver.quit() 397 | # Start a new driver 398 | chrome_options = uc.ChromeOptions() 399 | if self.headless: 400 | chrome_options.add_argument("--headless") 401 | if self.path_to_browser is not None: 402 | chrome_options.add_argument("--binary-location=" + str(self.path_to_browser)) 403 | proxy = self.proxy() 404 | if len(proxy): 405 | proxy_str = ";".join(f"{prot}={url}" for prot, url in proxy.items()) 406 | resolver_rules = "MAP * ~NOTFOUND , EXCLUDE 127.0.0.1" 407 | chrome_options.add_argument("--proxy-server=" + proxy_str) 408 | chrome_options.add_argument("--host-resolver-rules=" + resolver_rules) 409 | return uc.Chrome(options=chrome_options) 410 | 411 | def _download_and_save( # noqa: C901 412 | self, 413 | url: str, 414 | filepath: Optional[Path] = None, 415 | var: Optional[str] = None, 416 | ) -> IO[bytes]: 417 | """Download file at url to filepath. Overwrites if filepath exists.""" 418 | for i in range(5): 419 | try: 420 | self._driver.get(url) 421 | time.sleep(self.rate_limit + random.random() * self.max_delay) 422 | if "Incapsula incident ID" in self._driver.page_source: 423 | raise WebDriverException( 424 | "Your IP is blocked. Use tor or a proxy to continue scraping." 425 | ) 426 | if var is None: 427 | response = self._driver.execute_script( 428 | "return document.body.innerHTML;" 429 | ).encode("utf-8") 430 | else: 431 | response = json.dumps(self._driver.execute_script("return " + var)).encode( 432 | "utf-8" 433 | ) 434 | if not self.no_store and filepath is not None: 435 | filepath.parent.mkdir(parents=True, exist_ok=True) 436 | with filepath.open(mode="wb") as fh: 437 | fh.write(response) 438 | return io.BytesIO(response) 439 | except Exception: 440 | logger.exception( 441 | "Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1 442 | ) 443 | self._driver = self._init_webdriver() 444 | continue 445 | 446 | raise ConnectionError("Could not download %s." 
% url) 447 | 448 | 449 | def season_code(season: Union[str, int]) -> str: # noqa: C901 450 | """Convert a string or int to a season code like '1718'.""" 451 | season = str(season) 452 | pat1 = re.compile(r"^[0-9]{4}$") # 1994 | 9495 453 | pat2 = re.compile(r"^[0-9]{2}$") # 94 454 | pat3 = re.compile(r"^[0-9]{4}-[0-9]{4}$") # 1994-1995 455 | pat4 = re.compile(r"^[0-9]{4}/[0-9]{4}$") # 1994/1995 456 | pat5 = re.compile(r"^[0-9]{4}-[0-9]{2}$") # 1994-95 457 | pat6 = re.compile(r"^[0-9]{2}-[0-9]{2}$") # 94-95 458 | 459 | if re.match(pat1, season): 460 | if int(season[2:]) == int(season[:2]) + 1: 461 | if season == "1920" or season == "2021": 462 | msg = 'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format( 463 | season, season[:2], season[-2:] 464 | ) 465 | warnings.warn(msg) 466 | return season # 9495 467 | elif season[2:] == "99": 468 | return "".join([season[2:], "00"]) # 1999 469 | else: 470 | return "".join([season[-2:], f"{int(season[-2:]) + 1:02d}"]) # 1994 471 | elif re.match(pat2, season): 472 | if season == "99": 473 | return "".join([season, "00"]) # 99 474 | else: 475 | return "".join([season, f"{int(season) + 1:02d}"]) # 94 476 | elif re.match(pat3, season): 477 | return "".join([season[2:4], season[-2:]]) # 1994-1995 478 | elif re.match(pat4, season): 479 | return "".join([season[2:4], season[-2:]]) # 1994/1995 480 | elif re.match(pat5, season): 481 | return "".join([season[2:4], season[-2:]]) # 1994-95 482 | elif re.match(pat6, season): 483 | return "".join([season[:2], season[-2:]]) # 94-95 484 | else: 485 | return season 486 | 487 | 488 | def make_game_id(row: pd.Series) -> str: 489 | """Return a game id based on date, home and away team.""" 490 | if pd.isnull(row["date"]): 491 | game_id = "{}-{}".format( 492 | row["home_team"], 493 | row["away_team"], 494 | ) 495 | else: 496 | game_id = "{} {}-{}".format( 497 | row["date"].strftime("%Y-%m-%d"), 498 | row["home_team"], 499 | row["away_team"], 500 | ) 501 | return game_id 502 | 503 | 504 | def standardize_colnames(df: pd.DataFrame, cols: Optional[List[str]] = None) -> pd.DataFrame: 505 | """Convert DataFrame column names to snake case.""" 506 | 507 | def to_snake(name: str) -> str: 508 | name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) 509 | name = re.sub("__([A-Z])", r"_\1", name) 510 | name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name) 511 | return name.lower().replace("-", "_").replace(" ", "") 512 | 513 | if cols is None: 514 | cols = list(df.columns) 515 | 516 | return df.rename(columns={c: to_snake(c) for c in cols}) 517 | 518 | 519 | def get_proxy() -> Dict[str, str]: 520 | """Return a public proxy.""" 521 | # list of free proxy apis 522 | # protocols: http, https, socks4 and socks5 523 | list_of_proxy_content = [ 524 | "https://proxylist.geonode.com/api/proxy-list?sort_by=lastChecked&sort_type=desc", 525 | ] 526 | 527 | # extracting json data from this list of proxies 528 | full_proxy_list = [] 529 | for proxy_url in list_of_proxy_content: 530 | proxy_json = json.loads(requests.get(proxy_url).text)["data"] 531 | full_proxy_list.extend(proxy_json) 532 | 533 | if not full_proxy_list: 534 | logger.info("There are currently no proxies available. Exiting...") 535 | return {} 536 | else: 537 | logger.info(f"Found {len(full_proxy_list)} proxy servers. 
Checking...\n") 538 | 539 | # creating proxy dict 540 | final_proxy_list = [] 541 | for proxy in full_proxy_list: 542 | protocol = proxy["protocols"][0] 543 | ip_ = proxy["ip"] 544 | port = proxy["port"] 545 | 546 | proxy = { 547 | "https": protocol + "://" + ip_ + ":" + port, 548 | "http": protocol + "://" + ip_ + ":" + port, 549 | } 550 | 551 | final_proxy_list.append(proxy) 552 | 553 | # trying proxy 554 | for proxy in final_proxy_list: 555 | if check_proxy(proxy): 556 | return proxy 557 | 558 | logger.info("There are currently no proxies available. Exiting...") 559 | return {} 560 | 561 | 562 | def check_proxy(proxy: dict) -> bool: 563 | """Check if proxy is working.""" 564 | try: 565 | r0 = requests.get("https://ipinfo.io/json", proxies=proxy, timeout=15) 566 | return r0.status_code == 200 567 | except Exception as error: 568 | logger.error(f"BAD PROXY: Reason: {str(error)}\n") 569 | return False 570 | -------------------------------------------------------------------------------- /docs/datasources/SoFIFA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e621e3ae", 7 | "metadata": { 8 | "nbsphinx": "hidden" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd\n", 13 | "pd.set_option('display.max_columns', None)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "0f792a6b", 20 | "metadata": { 21 | "nbsphinx": "hidden" 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "env: SOCCERDATA_LOGLEVEL=ERROR\n", 29 | "env: SOCCERDATA_NOCACHE=True\n", 30 | "env: SOCCERDATA_NOSTORE=True\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "%env SOCCERDATA_LOGLEVEL=ERROR\n", 36 | "%env SOCCERDATA_NOCACHE=True\n", 37 | "%env SOCCERDATA_NOSTORE=True" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "2454afe6", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import soccerdata as sd" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "b5784f2d", 53 | "metadata": {}, 54 | "source": [ 55 | "# SoFIFA" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "8dab5be9", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/soccerdata/_common.py:466: UserWarning: Season id \"2021\" is ambiguous: interpreting as \"20-21\"\n", 69 | " warnings.warn(msg)\n" 70 | ] 71 | }, 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Provides pd.DataFrames from data at http://sofifa.com.\n", 77 | "\n", 78 | " Data will be downloaded as necessary and cached locally in\n", 79 | " ``~/soccerdata/data/SoFIFA``.\n", 80 | "\n", 81 | " Parameters\n", 82 | " ----------\n", 83 | " leagues : string or iterable, optional\n", 84 | " IDs of leagues to include.\n", 85 | " seasons : string, int or list, optional\n", 86 | " Seasons to include. Supports multiple formats.\n", 87 | " Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]\n", 88 | " proxy : 'tor' or or dict or list(dict) or callable, optional\n", 89 | " Use a proxy to hide your IP address. Valid options are:\n", 90 | " - \"tor\": Uses the Tor network. Tor should be running in\n", 91 | " the background on port 9050.\n", 92 | " - dict: A dictionary with the proxy to use. 
The dict should be\n", 93 | " a mapping of supported protocols to proxy addresses. For example::\n", 94 | "\n", 95 | " {\n", 96 | " 'http': 'http://10.10.1.10:3128',\n", 97 | " 'https': 'http://10.10.1.10:1080',\n", 98 | " }\n", 99 | "\n", 100 | " - list(dict): A list of proxies to choose from. A different proxy will\n", 101 | " be selected from this list after failed requests, allowing rotating\n", 102 | " proxies.\n", 103 | " - callable: A function that returns a valid proxy. This function will\n", 104 | " be called after failed requests, allowing rotating proxies.\n", 105 | " no_cache : bool\n", 106 | " If True, will not use cached data.\n", 107 | " no_store : bool\n", 108 | " If True, will not store downloaded data.\n", 109 | " data_dir : Path\n", 110 | " Path to directory where data will be cached.\n", 111 | " \n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "sofifa = sd.SoFIFA(leagues=\"ENG-Premier League\", seasons=2021)\n", 117 | "print(sofifa.__doc__)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "3a4c2916", 123 | "metadata": {}, 124 | "source": [ 125 | "## EA Sports FIFA player ratings" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "id": "745be31a", 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/html": [ 137 | "
\n", 138 | "\n", 151 | "\n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 
423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | "
overall_ratingpotentialcrossingfinishingheading_accuracyshort_passingvolleysdribblingcurvefk_accuracylong_passingball_controlaccelerationsprint_speedagilityreactionsbalanceshot_powerjumpingstaminastrengthlong_shotsaggressioninterceptionspositioningvisionpenaltiescomposuremarkingstanding_tacklesliding_tacklegk_divinggk_handlinggk_kickinggk_positioninggk_reflexes
leagueseasonplayer
ENG-Premier League2021Aaron CresswellBest Overall RatingNone8354687748768076697772677378817186756166737760685973None78791479912
Aaron LennonBest Overall RatingNone7458267268796653647280778173936460685357555072726273None504314771611
Aaron RamsdaleBest Overall RatingNone1514155416181716543547505380496366355915352418642565None16158277847884
Abdoulaye DoucouréBest Overall RatingNone6975698168794442778167766579688473908277788177755477None80761512121514
Adam WebsterBest Overall RatingNone5030787427682725707364746374573777697732757626532477None777510814712
\n", 436 | "
" 437 | ], 438 | "text/plain": [ 439 | " overall_rating potential \\\n", 440 | "league season player \n", 441 | "ENG-Premier League 2021 Aaron Cresswell Best Overall Rating None \n", 442 | " Aaron Lennon Best Overall Rating None \n", 443 | " Aaron Ramsdale Best Overall Rating None \n", 444 | " Abdoulaye Doucouré Best Overall Rating None \n", 445 | " Adam Webster Best Overall Rating None \n", 446 | "\n", 447 | " crossing finishing \\\n", 448 | "league season player \n", 449 | "ENG-Premier League 2021 Aaron Cresswell 83 54 \n", 450 | " Aaron Lennon 74 58 \n", 451 | " Aaron Ramsdale 15 14 \n", 452 | " Abdoulaye Doucouré 69 75 \n", 453 | " Adam Webster 50 30 \n", 454 | "\n", 455 | " heading_accuracy short_passing \\\n", 456 | "league season player \n", 457 | "ENG-Premier League 2021 Aaron Cresswell 68 77 \n", 458 | " Aaron Lennon 26 72 \n", 459 | " Aaron Ramsdale 15 54 \n", 460 | " Abdoulaye Doucouré 69 81 \n", 461 | " Adam Webster 78 74 \n", 462 | "\n", 463 | " volleys dribbling curve \\\n", 464 | "league season player \n", 465 | "ENG-Premier League 2021 Aaron Cresswell 48 76 80 \n", 466 | " Aaron Lennon 68 79 66 \n", 467 | " Aaron Ramsdale 16 18 17 \n", 468 | " Abdoulaye Doucouré 68 79 44 \n", 469 | " Adam Webster 27 68 27 \n", 470 | "\n", 471 | " fk_accuracy long_passing \\\n", 472 | "league season player \n", 473 | "ENG-Premier League 2021 Aaron Cresswell 76 69 \n", 474 | " Aaron Lennon 53 64 \n", 475 | " Aaron Ramsdale 16 54 \n", 476 | " Abdoulaye Doucouré 42 77 \n", 477 | " Adam Webster 25 70 \n", 478 | "\n", 479 | " ball_control acceleration \\\n", 480 | "league season player \n", 481 | "ENG-Premier League 2021 Aaron Cresswell 77 72 \n", 482 | " Aaron Lennon 72 80 \n", 483 | " Aaron Ramsdale 35 47 \n", 484 | " Abdoulaye Doucouré 81 67 \n", 485 | " Adam Webster 73 64 \n", 486 | "\n", 487 | " sprint_speed agility reactions \\\n", 488 | "league season player \n", 489 | "ENG-Premier League 2021 Aaron Cresswell 67 73 78 \n", 490 | " Aaron Lennon 77 81 73 \n", 491 | " Aaron Ramsdale 50 53 80 \n", 492 | " Abdoulaye Doucouré 76 65 79 \n", 493 | " Adam Webster 74 63 74 \n", 494 | "\n", 495 | " balance shot_power jumping \\\n", 496 | "league season player \n", 497 | "ENG-Premier League 2021 Aaron Cresswell 81 71 86 \n", 498 | " Aaron Lennon 93 64 60 \n", 499 | " Aaron Ramsdale 49 63 66 \n", 500 | " Abdoulaye Doucouré 68 84 73 \n", 501 | " Adam Webster 57 37 77 \n", 502 | "\n", 503 | " stamina strength long_shots \\\n", 504 | "league season player \n", 505 | "ENG-Premier League 2021 Aaron Cresswell 75 61 66 \n", 506 | " Aaron Lennon 68 53 57 \n", 507 | " Aaron Ramsdale 35 59 15 \n", 508 | " Abdoulaye Doucouré 90 82 77 \n", 509 | " Adam Webster 69 77 32 \n", 510 | "\n", 511 | " aggression interceptions \\\n", 512 | "league season player \n", 513 | "ENG-Premier League 2021 Aaron Cresswell 73 77 \n", 514 | " Aaron Lennon 55 50 \n", 515 | " Aaron Ramsdale 35 24 \n", 516 | " Abdoulaye Doucouré 78 81 \n", 517 | " Adam Webster 75 76 \n", 518 | "\n", 519 | " positioning vision penalties \\\n", 520 | "league season player \n", 521 | "ENG-Premier League 2021 Aaron Cresswell 60 68 59 \n", 522 | " Aaron Lennon 72 72 62 \n", 523 | " Aaron Ramsdale 18 64 25 \n", 524 | " Abdoulaye Doucouré 77 75 54 \n", 525 | " Adam Webster 26 53 24 \n", 526 | "\n", 527 | " composure marking \\\n", 528 | "league season player \n", 529 | "ENG-Premier League 2021 Aaron Cresswell 73 None \n", 530 | " Aaron Lennon 73 None \n", 531 | " Aaron Ramsdale 65 None \n", 532 | " Abdoulaye Doucouré 77 None \n", 533 | " Adam Webster 77 None 
\n", 534 | "\n", 535 | " standing_tackle sliding_tackle \\\n", 536 | "league season player \n", 537 | "ENG-Premier League 2021 Aaron Cresswell 78 79 \n", 538 | " Aaron Lennon 50 43 \n", 539 | " Aaron Ramsdale 16 15 \n", 540 | " Abdoulaye Doucouré 80 76 \n", 541 | " Adam Webster 77 75 \n", 542 | "\n", 543 | " gk_diving gk_handling gk_kicking \\\n", 544 | "league season player \n", 545 | "ENG-Premier League 2021 Aaron Cresswell 14 7 9 \n", 546 | " Aaron Lennon 14 7 7 \n", 547 | " Aaron Ramsdale 82 77 84 \n", 548 | " Abdoulaye Doucouré 15 12 12 \n", 549 | " Adam Webster 10 8 14 \n", 550 | "\n", 551 | " gk_positioning gk_reflexes \n", 552 | "league season player \n", 553 | "ENG-Premier League 2021 Aaron Cresswell 9 12 \n", 554 | " Aaron Lennon 16 11 \n", 555 | " Aaron Ramsdale 78 84 \n", 556 | " Abdoulaye Doucouré 15 14 \n", 557 | " Adam Webster 7 12 " 558 | ] 559 | }, 560 | "execution_count": 5, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "ratings = sofifa.read_ratings()\n", 567 | "ratings.head()" 568 | ] 569 | } 570 | ], 571 | "metadata": { 572 | "kernelspec": { 573 | "display_name": "soccerdata", 574 | "language": "python", 575 | "name": "soccerdata" 576 | }, 577 | "language_info": { 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "file_extension": ".py", 583 | "mimetype": "text/x-python", 584 | "name": "python", 585 | "nbconvert_exporter": "python", 586 | "pygments_lexer": "ipython3", 587 | "version": "3.9.6" 588 | }, 589 | "toc": { 590 | "base_numbering": 1, 591 | "nav_menu": {}, 592 | "number_sections": true, 593 | "sideBar": true, 594 | "skip_h1_title": false, 595 | "title_cell": "Table of Contents", 596 | "title_sidebar": "Contents", 597 | "toc_cell": false, 598 | "toc_position": {}, 599 | "toc_section_display": true, 600 | "toc_window_display": true 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 5 605 | } 606 | -------------------------------------------------------------------------------- /soccerdata/fbref.py: -------------------------------------------------------------------------------- 1 | """Scraper for http://fbref.com.""" 2 | import itertools 3 | import warnings 4 | from functools import reduce 5 | from pathlib import Path 6 | from typing import Callable, Dict, List, Optional, Union 7 | 8 | import pandas as pd 9 | from lxml import etree, html 10 | 11 | from ._common import ( 12 | BaseRequestsReader, 13 | make_game_id, 14 | season_code, 15 | standardize_colnames, 16 | ) 17 | from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger 18 | 19 | FBREF_DATADIR = DATA_DIR / "FBref" 20 | FBREF_API = "https://fbref.com" 21 | 22 | BIG_FIVE_DICT = { 23 | "it Serie A": "ITA-Serie A", 24 | "fr Ligue 1": "FRA-Ligue 1", 25 | "es La Liga": "ESP-La Liga", 26 | "eng Premier League": "ENG-Premier League", 27 | "de Bundesliga": "GER-Bundesliga", 28 | } 29 | 30 | 31 | class FBref(BaseRequestsReader): 32 | """Provides pd.DataFrames from data at http://fbref.com. 33 | 34 | Data will be downloaded as necessary and cached locally in 35 | ``~/soccerdata/data/FBref``. 36 | 37 | Parameters 38 | ---------- 39 | leagues : string or iterable, optional 40 | IDs of leagues to include. For efficiently reading data from the Top-5 41 | European leagues, use "Big 5 European Leagues Combined". 42 | seasons : string, int or list, optional 43 | Seasons to include. Supports multiple formats. 
44 | Examples: '16-17'; 2016; '2016-17'; [14, 15, 16] 45 | proxy : 'tor' or dict or list(dict) or callable, optional 46 | Use a proxy to hide your IP address. Valid options are: 47 | - "tor": Uses the Tor network. Tor should be running in 48 | the background on port 9050. 49 | - dict: A dictionary with the proxy to use. The dict should be 50 | a mapping of supported protocols to proxy addresses. For example:: 51 | 52 | { 53 | 'http': 'http://10.10.1.10:3128', 54 | 'https': 'http://10.10.1.10:1080', 55 | } 56 | 57 | - list(dict): A list of proxies to choose from. A different proxy will 58 | be selected from this list after failed requests, allowing rotating 59 | proxies. 60 | - callable: A function that returns a valid proxy. This function will 61 | be called after failed requests, allowing rotating proxies. 62 | no_cache : bool 63 | If True, will not use cached data. 64 | no_store : bool 65 | If True, will not store downloaded data. 66 | data_dir : Path 67 | Path to directory where data will be cached. 68 | """ 69 | 70 | def __init__( 71 | self, 72 | leagues: Optional[Union[str, List[str]]] = None, 73 | seasons: Optional[Union[str, int, List]] = None, 74 | proxy: Optional[ 75 | Union[str, Dict[str, str], List[Dict[str, str]], Callable[[], Dict[str, str]]] 76 | ] = None, 77 | no_cache: bool = NOCACHE, 78 | no_store: bool = NOSTORE, 79 | data_dir: Path = FBREF_DATADIR, 80 | ): 81 | """Initialize FBref reader.""" 82 | super().__init__( 83 | leagues=leagues, 84 | proxy=proxy, 85 | no_cache=no_cache, 86 | no_store=no_store, 87 | data_dir=data_dir, 88 | ) 89 | self.rate_limit = 3 90 | self.seasons = seasons # type: ignore 91 | # check if all top 5 leagues are selected 92 | selected_leagues = set(self._leagues_dict.keys()) 93 | if set(BIG_FIVE_DICT.values()).issubset(selected_leagues): 94 | warnings.warn( 95 | "You are trying to scrape data for all of the Big 5 European leagues. " 96 | "This can be done more efficiently by setting " 97 | "leagues='Big 5 European Leagues Combined'." 98 | ) 99 | 100 | @property 101 | def leagues(self) -> List[str]: 102 | """Return a list of selected leagues.""" 103 | selected_leagues = set(self._leagues_dict.keys()) 104 | if "Big 5 European Leagues Combined" in selected_leagues: 105 | selected_leagues -= set(BIG_FIVE_DICT.values()) 106 | return list(selected_leagues) 107 | 108 | @classmethod 109 | def _all_leagues(cls) -> Dict[str, str]: 110 | """Return a dict mapping all canonical league IDs to source league IDs.""" 111 | res = super()._all_leagues() 112 | res.update({"Big 5 European Leagues Combined": "Big 5 European Leagues Combined"}) 113 | return res 114 | 115 | def read_leagues(self) -> pd.DataFrame: 116 | """Retrieve selected leagues from the datasource. 
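For example (the league ID below is illustrative)::

            fbref = FBref(leagues="ENG-Premier League")
            fbref.read_leagues()  # one row per selected league, indexed by league ID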
117 | 118 | Returns 119 | ------- 120 | pd.DataFrame 121 | """ 122 | url = f"{FBREF_API}/en/comps/" 123 | filepath = self.data_dir / "leagues.html" 124 | reader = self.get(url, filepath) 125 | 126 | # extract league links 127 | leagues = [] 128 | tree = html.parse(reader) 129 | for table in tree.xpath("//table[contains(@id, 'comps')]"): 130 | df_table = pd.read_html(etree.tostring(table, method="html"))[0] 131 | df_table["url"] = table.xpath(".//th[@data-stat='league_name']/a/@href") 132 | leagues.append(df_table) 133 | df = ( 134 | pd.concat(leagues) 135 | .pipe(standardize_colnames) 136 | .rename(columns={"competition_name": "league"}) 137 | .pipe(self._translate_league) 138 | .drop_duplicates(subset="league") 139 | .set_index("league") 140 | .sort_index() 141 | ) 142 | df["country"] = df["country"].apply( 143 | lambda x: x.split(" ")[1] if isinstance(x, str) else None 144 | ) 145 | return df[df.index.isin(self.leagues)] 146 | 147 | def read_seasons(self) -> pd.DataFrame: 148 | """Retrieve the selected seasons for the selected leagues. 149 | 150 | Returns 151 | ------- 152 | pd.DataFrame 153 | """ 154 | filemask = "seasons_{}.html" 155 | df_leagues = self.read_leagues() 156 | 157 | seasons = [] 158 | for lkey, league in df_leagues.iterrows(): 159 | url = FBREF_API + league.url 160 | filepath = self.data_dir / filemask.format(lkey) 161 | reader = self.get(url, filepath) 162 | 163 | # extract season links 164 | tree = html.parse(reader) 165 | df_table = pd.read_html(etree.tostring(tree), attrs={"id": "seasons"})[0] 166 | df_table["url"] = tree.xpath( 167 | "//table[@id='seasons']//th[@data-stat='year_id']/a/@href" 168 | ) 169 | seasons.append(df_table) 170 | 171 | df = pd.concat(seasons).pipe(standardize_colnames) 172 | # A competition name field is not included in the Big 5 European Leagues Combined 173 | if "competition_name" in df.columns: 174 | df = df.rename(columns={"competition_name": "league"}).pipe(self._translate_league) 175 | else: 176 | df["league"] = "Big 5 European Leagues Combined" 177 | df["season"] = df["season"].apply(lambda x: season_code(x)) 178 | df = df.set_index(["league", "season"]).sort_index() 179 | return df.loc[df.index.isin(itertools.product(self.leagues, self.seasons))] 180 | 181 | def read_team_season_stats( 182 | self, stat_type: str = "standard", opponent_stats: bool = False 183 | ) -> pd.DataFrame: 184 | """Retrieve teams from the datasource for the selected leagues. 185 | 186 | The following stat types are available: 187 | * 'standard' 188 | * 'keeper' 189 | * 'keeper_adv' 190 | * 'shooting' 191 | * 'passing' 192 | * 'passing_types' 193 | * 'goal_shot_creation' 194 | * 'defense' 195 | * 'possession' 196 | * 'playing_time' 197 | * 'misc' 198 | 199 | Parameters 200 | ---------- 201 | stat_type : str 202 | Type of stats to retrieve. 203 | opponent_stats : bool 204 | If True, will retrieve opponent stats. 205 | 206 | Raises 207 | ------ 208 | TypeError 209 | If ``stat_type`` is not valid.
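For example, shooting stats conceded by each team could be retrieved as follows (a sketch; league and season are illustrative)::

            fbref = FBref(leagues="ENG-Premier League", seasons="21-22")
            df = fbref.read_team_season_stats(stat_type="shooting", opponent_stats=True)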
210 | 211 | Returns 212 | ------- 213 | pd.DataFrame 214 | """ 215 | team_stats = [ 216 | "standard", 217 | "keeper", 218 | "keeper_adv", 219 | "shooting", 220 | "passing", 221 | "passing_types", 222 | "goal_shot_creation", 223 | "defense", 224 | "possession", 225 | "playing_time", 226 | "misc", 227 | ] 228 | 229 | filemask = "teams_{}_{}_{}.html" 230 | 231 | if stat_type not in team_stats: 232 | raise TypeError(f"Invalid argument: stat_type should be in {team_stats}") 233 | 234 | if stat_type == "standard": 235 | page = "stats" 236 | elif stat_type == "keeper": 237 | page = "keepers" 238 | elif stat_type == "keeper_adv": 239 | page = "keepersadv" 240 | elif stat_type == "goal_shot_creation": 241 | page = "gca" 242 | stat_type = "gca" 243 | elif stat_type == "playing_time": 244 | page = "playingtime" 245 | else: 246 | page = stat_type 247 | 248 | if opponent_stats: 249 | stat_type += "_against" 250 | else: 251 | stat_type += "_for" 252 | 253 | # get league IDs 254 | seasons = self.read_seasons() 255 | 256 | # collect teams 257 | teams = [] 258 | for (lkey, skey), season in seasons.iterrows(): 259 | big_five = lkey == "Big 5 European Leagues Combined" 260 | # read html page (league overview) 261 | filepath = self.data_dir / filemask.format( 262 | lkey, skey, stat_type if big_five else "all" 263 | ) 264 | url = ( 265 | FBREF_API 266 | + "/".join(season.url.split("/")[:-1]) 267 | + (f"/{page}/squads/" if big_five else "/") 268 | + season.url.split("/")[-1] 269 | ) 270 | reader = self.get(url, filepath) 271 | 272 | # extract team links 273 | tree = html.parse(reader) 274 | if big_five: 275 | df_table = pd.read_html( 276 | etree.tostring(tree), attrs={"id": f"stats_teams_{stat_type}"} 277 | )[0] 278 | df_table["url"] = tree.xpath( 279 | f"//table[@id='stats_teams_{stat_type}']//td[@data-stat='team']/a/@href" 280 | ) 281 | df_table["league"] = ( 282 | df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT) 283 | ) 284 | df_table["season"] = skey 285 | df_table = df_table.drop("Rk", axis=1, level=1).drop("Comp", axis=1, level=1) 286 | else: 287 | df_table = pd.read_html( 288 | etree.tostring(tree), attrs={"id": f"stats_squads_{stat_type}"} 289 | )[0] 290 | df_table["url"] = tree.xpath( 291 | f"//table[@id='stats_squads_{stat_type}']//th[@data-stat='team']/a/@href" 292 | ) 293 | df_table["league"] = lkey 294 | df_table["season"] = skey 295 | teams.append(df_table) 296 | 297 | # return data frame 298 | df = ( 299 | _concat(teams) 300 | .rename(columns={"Squad": "team"}) 301 | .replace({"team": TEAMNAME_REPLACEMENTS}) 302 | .set_index(["league", "season", "team"]) 303 | .sort_index() 304 | ) 305 | return df 306 | 307 | def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: 308 | """Retrieve players from the datasource for the selected leagues. 309 | 310 | The following stat types are available: 311 | * 'standard' 312 | * 'shooting' 313 | * 'passing' 314 | * 'passing_types' 315 | * 'goal_shot_creation' 316 | * 'defense' 317 | * 'possession' 318 | * 'playing_time' 319 | * 'misc' 320 | * 'keeper' 321 | * 'keeper_adv' 322 | 323 | Parameters 324 | ---------- 325 | stat_type :str 326 | Type of stats to retrieve. 327 | 328 | Raises 329 | ------ 330 | TypeError 331 | If ``stat_type`` is not valid. 
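For example, per-player passing stats could be retrieved as follows (a sketch; league and season are illustrative)::

            fbref = FBref(leagues="Big 5 European Leagues Combined", seasons="2020-21")
            df = fbref.read_player_season_stats(stat_type="passing")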
        """
        player_stats = [
            "standard",
            "keeper",
            "keeper_adv",
            "shooting",
            "passing",
            "passing_types",
            "goal_shot_creation",
            "defense",
            "possession",
            "playing_time",
            "misc",
        ]

        filemask = "players_{}_{}_{}.html"

        if stat_type not in player_stats:
            raise TypeError(f"Invalid argument: stat_type should be in {player_stats}")

        if stat_type == "standard":
            page = "stats"
        elif stat_type == "goal_shot_creation":
            page = "gca"
            stat_type = "gca"
        elif stat_type == "playing_time":
            page = "playingtime"
        elif stat_type == "keeper":
            page = "keepers"
        elif stat_type == "keeper_adv":
            page = "keepersadv"
        else:
            page = stat_type

        # get league IDs
        seasons = self.read_seasons()

        # collect players
        players = []
        for (lkey, skey), season in seasons.iterrows():
            big_five = lkey == "Big 5 European Leagues Combined"
            filepath = self.data_dir / filemask.format(lkey, skey, stat_type)
            url = (
                FBREF_API
                + "/".join(season.url.split("/")[:-1])
                + f"/{page}"
                + ("/players/" if big_five else "/")
                + season.url.split("/")[-1]
            )
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            if big_five:
                df_table = pd.read_html(etree.tostring(tree))[0]
                df_table[("Unnamed: league", "league")] = (
                    df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT)
                )
                df_table[("Unnamed: season", "season")] = skey
                df_table = df_table.drop("Comp", axis=1, level=1)
            else:
                el = tree.xpath(f"//comment()[contains(.,'div_stats_{stat_type}')]")
                df_table = pd.read_html(el[0].text, attrs={"id": f"stats_{stat_type}"})[0]
                df_table[("Unnamed: league", "league")] = lkey
                df_table[("Unnamed: season", "season")] = skey
            players.append(df_table)

        # return dataframe
        df = _concat(players)
        df = df[df.Player != "Player"]
        df = (
            df.drop("Matches", axis=1, level=0)
            .drop("Rk", axis=1, level=0)
            .rename(columns={"Player": "player", "Squad": "team"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .set_index(["league", "season", "team", "player"])
            .sort_index()
        )
        df["Nation"] = df["Nation"].apply(
            lambda x: x.split(" ")[1] if isinstance(x, str) and " " in x else None
        )
        return df

    def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
        """Retrieve the game schedule for the selected leagues and seasons.

        Parameters
        ----------
        force_cache : bool
            By default, no cached data is used for the current season.
            If True, will force the use of cached data anyway.

        Returns
        -------
        pd.DataFrame
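
        Examples
        --------
        A hedged sketch; note that the returned frame keeps ``game_id`` as a
        column, which the per-match readers below can consume:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> schedule = fbref.read_schedule()  # doctest: +SKIP
        >>> schedule = fbref.read_schedule(force_cache=True)  # doctest: +SKIP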
        """
        # get league IDs
        seasons = self.read_seasons()

        # collect teams
        schedule = []
        for (lkey, skey), season in seasons.iterrows():
            # read html page (league overview)
            url_stats = FBREF_API + season.url
            filepath_stats = self.data_dir / f"teams_{lkey}_{skey}.html"
            reader = self.get(url_stats, filepath_stats)
            tree = html.parse(reader)

            url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get("href")
            filepath_fixtures = self.data_dir / f"schedule_{lkey}_{skey}.html"
            current_season = not self._is_complete(lkey, skey)
            reader = self.get(
                url_fixtures, filepath_fixtures, no_cache=current_season and not force_cache
            )
            tree = html.parse(reader)
            table = tree.xpath("//table[contains(@id, 'sched')]")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["Match Report"] = [
                mlink.xpath("./a/@href")[0]
                if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
                else None
                for mlink in table.xpath(".//td[@data-stat='match_report']")
            ]
            df_table["league"] = lkey
            df_table["season"] = skey
            df_table = df_table.dropna(how="all")
            schedule.append(df_table)
        df = (
            pd.concat(schedule)
            .rename(
                columns={
                    "Wk": "week",
                    "Home": "home_team",
                    "Away": "away_team",
                    "xG": "home_xg",
                    "xG.1": "away_xg",
                }
            )
            .replace(
                {
                    "home_team": TEAMNAME_REPLACEMENTS,
                    "away_team": TEAMNAME_REPLACEMENTS,
                }
            )
            .pipe(standardize_colnames)
        )
        df["date"] = pd.to_datetime(df["date"]).ffill()
        df["game"] = df.apply(make_game_id, axis=1)
        df.loc[~df.match_report.isna(), "game_id"] = (
            df.loc[~df.match_report.isna(), "match_report"].str.split("/").str[3]
        )
        df = df.set_index(["league", "season", "game"]).sort_index()
        return df

    def _parse_teams(self, tree: etree.ElementTree) -> List[Dict]:
        """Parse the teams from a match summary page.

        Parameters
        ----------
        tree : etree.ElementTree
            The match summary page.

        Returns
        -------
        list of dict
        """
        team_nodes = tree.xpath("//div[@class='scorebox']//strong/a")[:2]
        teams = []
        for team in team_nodes:
            teams.append({"id": team.get("href").split("/")[3], "name": team.text.strip()})
        return teams

    def read_lineup(
        self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
    ) -> pd.DataFrame:
        """Retrieve lineups for the selected leagues and seasons.

        Parameters
        ----------
        match_id : str or list of str, optional
            Retrieve the lineup for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.

        Returns
        -------
        pd.DataFrame
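
        Examples
        --------
        A hedged sketch that takes game IDs from the schedule instead of
        hard-coding them; the reader configuration is a placeholder:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> lineups = fbref.read_lineup(match_id=game_id)  # doctest: +SKIP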
        """
        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        lineups = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            teams = self._parse_teams(tree)
            tables = tree.xpath("//div[@class='lineup']")
            for j, table in enumerate(tables):
                df_table = pd.read_html(etree.tostring(table))[0]
                df_table.columns = ["jersey_number", "player"]
                df_table["team"] = teams[j]["name"]
                if "Bench" in df_table.jersey_number.values:
                    bench_idx = df_table.index[df_table.jersey_number == "Bench"][0]
                    df_table.loc[:bench_idx, "is_starter"] = True
                    df_table.loc[bench_idx:, "is_starter"] = False
                    # drop the "Bench" separator row
                    df_table = df_table.drop(bench_idx)
                df_table["game"] = game["game"]
                df_table["league"] = game["league"]
                df_table["season"] = game["season"]
                lineups.append(df_table)
        df = pd.concat(lineups).set_index(["league", "season", "game", "team", "player"])
        # TODO: sub in, sub out, position
        return df

    def read_player_match_stats(
        self,
        stat_type: str = "summary",
        match_id: Optional[Union[str, List[str]]] = None,
        force_cache: bool = False,
    ) -> pd.DataFrame:
        """Retrieve per-player match stats for the selected leagues and seasons.

        The following stat types are available:

        * 'summary'
        * 'keepers'
        * 'passing'
        * 'passing_types'
        * 'defense'
        * 'possession'
        * 'misc'

        Parameters
        ----------
        stat_type : str
            Type of stats to retrieve.
        match_id : str or list of str, optional
            Retrieve the stats for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.
        TypeError
            If ``stat_type`` is not valid.

        Returns
        -------
        pd.DataFrame
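
        Examples
        --------
        A hedged sketch; ``game_id`` is obtained from the schedule as in the
        ``read_lineup`` example rather than hard-coded:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> fbref.read_player_match_stats(stat_type="passing", match_id=game_id)  # doctest: +SKIP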
        """
        match_stats = [
            "summary",
            "keepers",
            "passing",
            "passing_types",
            "defense",
            "possession",
            "misc",
        ]

        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        if stat_type not in match_stats:
            raise TypeError(f"Invalid argument: stat_type should be in {match_stats}")

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        stats = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            (home_team, away_team) = self._parse_teams(tree)
            if stat_type == "keepers":
                id_format = "keeper_stats_{}"
            else:
                id_format = "stats_{}_" + stat_type
            table = tree.xpath("//table[@id='" + id_format.format(home_team["id"]) + "']")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["team"] = home_team["name"]
            df_table["game"] = game["game"]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game_id"] = game["game_id"]
            stats.append(df_table)
            table = tree.xpath("//table[@id='" + id_format.format(away_team["id"]) + "']")[0]
            df_table = pd.read_html(etree.tostring(table))[0]
            df_table["team"] = away_team["name"]
            df_table["game"] = game["game"]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game_id"] = game["game_id"]
            stats.append(df_table)

        df = _concat(stats)
        df = df[~df.Player.str.contains(r"^\d+\sPlayers$")]
        df = (
            df.rename(columns={"Player": "player"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .set_index(["league", "season", "game", "team", "player"])
            .sort_index()
        )
        return df

    def read_shot_events(
        self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
    ) -> pd.DataFrame:
        """Retrieve shooting and shot creation event data for the selected seasons or matches.

        The data returned includes who took the shot, when, with which body
        part and from how far away. Additionally, the player who created the
        chance and the action preceding that are included in the data.

        Parameters
        ----------
        match_id : str or list of str, optional
            Retrieve the shot events for a specific game.
        force_cache : bool
            By default, no cached data is used to scrape the list of available
            games for the current season. If True, will force the use of
            cached data anyway.

        Raises
        ------
        ValueError
            If no games with the given IDs were found for the selected seasons and leagues.

        Returns
        -------
        pd.DataFrame
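
        Examples
        --------
        A hedged sketch; omitting ``match_id`` retrieves shots for every game
        in the selected seasons:

        >>> import soccerdata as sd
        >>> fbref = sd.FBref(leagues="ENG-Premier League", seasons=2021)  # doctest: +SKIP
        >>> game_id = fbref.read_schedule().game_id.dropna().iloc[0]  # doctest: +SKIP
        >>> fbref.read_shot_events(match_id=game_id)  # doctest: +SKIP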
        """
        urlmask = FBREF_API + "/en/matches/{}"
        filemask = "match_{}.html"

        # Retrieve games for which a match report is available
        df_schedule = self.read_schedule(force_cache).reset_index()
        df_schedule = df_schedule[~df_schedule.game_id.isna() & ~df_schedule.match_report.isnull()]
        # Select requested games if available
        if match_id is not None:
            iterator = df_schedule[
                df_schedule.game_id.isin([match_id] if isinstance(match_id, str) else match_id)
            ]
            if len(iterator) == 0:
                raise ValueError("No games found with the given IDs in the selected seasons.")
        else:
            iterator = df_schedule

        shots = []
        for i, (_, game) in enumerate(iterator.iterrows()):
            url = urlmask.format(game["game_id"])
            # get league and season
            logger.info(
                "[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
            )
            filepath = self.data_dir / filemask.format(game["game_id"])
            reader = self.get(url, filepath)
            tree = html.parse(reader)
            df_table = pd.read_html(etree.tostring(tree), attrs={"id": "shots_all"})[0]
            df_table["league"] = game["league"]
            df_table["season"] = game["season"]
            df_table["game"] = game["game"]
            shots.append(df_table)

        df = (
            _concat(shots)
            .rename(columns={"Squad": "team"})
            .replace({"team": TEAMNAME_REPLACEMENTS})
            .pipe(
                standardize_colnames,
                cols=["Outcome", "Minute", "Distance", "Player", "Body Part", "Notes", "Event"],
            )
            .set_index(["league", "season", "game", "team", "player"])
            .sort_index()
            .dropna(how="all")
        )
        return df


def _concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Merge matching tables scraped from different pages.

    The level-0 headers are not consistent across seasons and leagues, so
    this function tries to determine uniform column names.

    Parameters
    ----------
    dfs : list(pd.DataFrame)
        Input dataframes.

    Returns
    -------
    pd.DataFrame
        Concatenated dataframe with uniform column names.
    """
    # Look for the most complete level 0 columns
    all_columns = []
    for df in dfs:
        columns = pd.DataFrame(df.columns.tolist())
        # Move missing columns to level 0
        columns.replace({"": None}, inplace=True)
        mask = pd.isnull(columns[1])
        columns.loc[mask, [0, 1]] = columns.loc[mask, [1, 0]].values
        # Rename unnamed columns
        mask = columns[0].str.startswith("Unnamed:").fillna(False)
        columns.loc[mask, 0] = None
        all_columns.append(columns)
    columns = reduce(lambda left, right: left.combine_first(right), all_columns)

    # Move the remaining missing columns back to level 1 and replace them with an empty string
    mask = pd.isnull(columns[0])
    columns.loc[mask, [0, 1]] = columns.loc[mask, [1, 0]].values
    columns.loc[mask, 1] = ""

    for df in dfs:
        df.columns = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

    return pd.concat(dfs)
--------------------------------------------------------------------------------