├── laliga-logo.png ├── .gitignore ├── requirements.txt ├── .github └── workflows │ └── ci.yml ├── main.py ├── LICENSE ├── laliga ├── player.py ├── wrangling.py ├── utils.py ├── network.py └── core.py ├── README.md ├── settings.py └── datasets └── col-specs.csv /laliga-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdelquin/laliga-data/HEAD/laliga-logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .env 4 | .DS_Store 5 | *.log 6 | *.log.* 7 | prod-test.sh 8 | .python-version 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | selenium==3.14.1 2 | pandas==1.3.5 3 | lxml==4.7.1 4 | html5lib==1.1 5 | bs4==0.0.1 6 | requests==2.26.0 7 | prettyconf==2.2.1 8 | logzero==1.7.0 9 | click==8.0.3 10 | typer==0.4.0 11 | user_agent==0.1.10 12 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | # schedule: 4 | # - cron: "0 3 * * 3" 5 | workflow_dispatch: 6 | jobs: 7 | laliga-scraping: 8 | name: Scrap laliga.com and generate csv dataset files 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: "3.10" 15 | - uses: BSFishy/pip-action@v1 16 | with: 17 | requirements: requirements.txt 18 | - run: python main.py -v 19 | env: 20 | PAGINATOR_XPATH: ${{ secrets.PAGINATOR_XPATH }} 21 | COMPETITIONS_DIV_XPATH: ${{ secrets.COMPETITIONS_DIV_XPATH }} 22 | COMPETITIONS_UL_XPATH: ${{ secrets.COMPETITIONS_UL_XPATH }} 23 | - uses: stefanzweifel/git-auto-commit-action@v4 24 | 
with: 25 | commit_message: Update dataset file 26 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logzero 2 | import typer 3 | 4 | from laliga.core import LaLigaScraper 5 | from laliga.utils import init_logger 6 | 7 | app = typer.Typer(add_completion=False) 8 | logger = init_logger() 9 | 10 | 11 | @app.command() 12 | def run( 13 | verbose: bool = typer.Option( 14 | False, '--verbose', '-v', show_default=False, help='Loglevel increased to debug.' 15 | ), 16 | num_players: int = typer.Option( 17 | 0, 18 | '--num_players', 19 | '-n', 20 | help='Num players (per competition) to be scraped. ' 21 | 'If 0, all available players will be retrieved.', 22 | ), 23 | ): 24 | logger.setLevel(logzero.DEBUG if verbose else logzero.INFO) 25 | 26 | scraper = LaLigaScraper() 27 | scraper.get_player_data(num_players) 28 | scraper.to_dataframe() 29 | scraper.wrangle_dataframe() 30 | scraper.to_csv() 31 | 32 | 33 | if __name__ == "__main__": 34 | app() 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Sergio Delgado Quintero 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /laliga/player.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | import settings 6 | 7 | from . import network, utils 8 | 9 | 10 | class Player: 11 | def __init__( 12 | self, url, competition, selected_properties=settings.PLAYER_PROPS_SELECTION 13 | ): 14 | self.url = url 15 | self.competition = competition 16 | self.selected_properties = selected_properties 17 | self.data = {} 18 | self._add_custom_properties() 19 | 20 | def _add_custom_properties(self): 21 | self.data[settings.COMPETITION_COLUMN] = self.competition 22 | self.data[settings.PLAYER_URL_COLUMN] = self.url 23 | 24 | def _extract_properties(self): 25 | # custom properties 26 | for target_key, source_keys in self.selected_properties.items(): 27 | self.data[target_key] = utils.get_value_from_nested_keys( 28 | self.source_properties, source_keys 29 | ) 30 | # player stats 31 | for item in self.source_properties['playerStats']: 32 | self.data[item['name']] = item['stat'] 33 | 34 | def get_data(self): 35 | if response := network.make_request(self.url): 36 | soup = BeautifulSoup(response.text, 'html.parser') 37 | script_contents = soup.find('script', id=settings.SCRIPT_DATA_ID).string 38 | source = json.loads(script_contents) 39 | self.source_properties = source['props']['pageProps'] 40 | self._extract_properties() 41 | return self.data 
42 | 43 | def __str__(self): 44 | return self.data.get('name', self.url) 45 | -------------------------------------------------------------------------------- /laliga/wrangling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from logzero import logger 4 | 5 | import settings 6 | 7 | 8 | def fill_na(df: pd.DataFrame, last_prop_idx, replacement_value=0): 9 | df.iloc[:, last_prop_idx + 1 :] = df.iloc[:, last_prop_idx + 1 :].fillna( 10 | replacement_value 11 | ) 12 | return df 13 | 14 | 15 | def cols_to_int64(df: pd.DataFrame): 16 | logger.debug('Converting non-decimal columns to Int64Dtype') 17 | for column in df.columns: 18 | if df[column].dtype == 'float64': 19 | if all((df[column] / np.floor(df[column])).dropna() == 1): 20 | df[column] = df[column].astype(pd.Int64Dtype()) 21 | return df 22 | 23 | 24 | def fix_twitter_col(df: pd.DataFrame, twitter_base_url, twitter_col='twitter'): 25 | logger.debug('Adding url to Twitter nicknames') 26 | df[twitter_col] = df[twitter_col].str.replace('@', twitter_base_url) 27 | return df 28 | 29 | 30 | def sort_stats_cols(df: pd.DataFrame, last_prop_idx): 31 | logger.debug('Sorting player stats columns') 32 | sorted_columns = list(df.columns[: last_prop_idx + 1]) + sorted( 33 | df.columns[last_prop_idx + 1 :] 34 | ) 35 | return df[sorted_columns] 36 | 37 | 38 | def wrangle_dataframe( 39 | df: pd.DataFrame, 40 | twitter_base_url=settings.TWITTER_BASE_URL, 41 | player_props_selection=settings.PLAYER_PROPS_SELECTION, 42 | ): 43 | last_prop_idx = df.columns.get_loc(list(player_props_selection.keys())[-1]) 44 | df = cols_to_int64(df) 45 | df = fix_twitter_col(df, twitter_base_url) 46 | df = sort_stats_cols(df, last_prop_idx) 47 | return df 48 | -------------------------------------------------------------------------------- /laliga/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 
from urllib.parse import urljoin 4 | 5 | import logzero 6 | from selenium import webdriver 7 | from selenium.webdriver.firefox.options import Options 8 | 9 | import settings 10 | 11 | 12 | def init_logger(): 13 | console_logformat = ( 14 | '%(asctime)s ' 15 | '%(color)s' 16 | '[%(levelname)-8s] ' 17 | '%(end_color)s ' 18 | '%(message)s ' 19 | '%(color)s' 20 | '(%(filename)s:%(lineno)d)' 21 | '%(end_color)s' 22 | ) 23 | # remove colors on logfile 24 | file_logformat = re.sub(r'%\((end_)?color\)s', '', console_logformat) 25 | 26 | console_formatter = logzero.LogFormatter(fmt=console_logformat) 27 | file_formatter = logzero.LogFormatter(fmt=file_logformat) 28 | logzero.setup_default_logger(formatter=console_formatter) 29 | logzero.logfile( 30 | settings.LOGFILE, 31 | maxBytes=settings.LOGFILE_SIZE, 32 | backupCount=settings.LOGFILE_BACKUP_COUNT, 33 | formatter=file_formatter, 34 | ) 35 | return logzero.logger 36 | 37 | 38 | def init_webdriver(headless=settings.SELENIUM_HEADLESS): 39 | options = Options() 40 | options.headless = headless 41 | profile = webdriver.FirefoxProfile() 42 | return webdriver.Firefox( 43 | options=options, firefox_profile=profile, service_log_path=os.devnull 44 | ) 45 | 46 | 47 | def build_url(path, base_url=settings.LALIGA_ADV_STATS_URL): 48 | return urljoin(base_url, path) 49 | 50 | 51 | def get_value_from_nested_keys(data: dict, keys: tuple, k=0): 52 | if data is None: 53 | return data 54 | if len(keys) == 1: 55 | return data.get(keys[0]) 56 | return get_value_from_nested_keys(data.get(keys[0]), keys[1:], k + 1) 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # laliga-data 2 | 3 | Scraping data from spanish football [La Liga](https://laliga.com/) website. 
4 | 5 | ![LaLiga Logo](laliga-logo.png) 6 | 7 | This is a tool entirely written in Python that allows you to **scrape all player data available on the official website of the Spanish football league** for the current season. At the time of writing, three competitions are available: female first division, male first division and male second division. 8 | 9 | ## Setup 10 | 11 | Create a Python virtualenv and install requirements: 12 | 13 | ```console 14 | $ python3.10 -m venv venv 15 | $ source venv/bin/activate 16 | $ pip install -r requirements.txt 17 | ``` 18 | 19 | Optionally, you can create a `.env` file in the working directory to override settings from [settings.py](settings.py). 20 | 21 | ### Other requirements 22 | 23 | There are a few external requirements for the project to work properly: 24 | 25 | - [geckodriver](https://github.com/mozilla/geckodriver/releases) 26 | - [Firefox Browser](https://www.mozilla.org/firefox/download/) 27 | 28 | ## Usage 29 | 30 | ```console 31 | $ python main.py --help 32 | Usage: main.py [OPTIONS] 33 | 34 | Options: 35 | -v, --verbose Loglevel increased to debug. 36 | -n, --num_players INTEGER Num players (per competition) to be scraped. If 37 | 0, all available players will be retrieved. 38 | [default: 0] 39 | --help Show this message and exit. 40 | ``` 41 | 42 | A common usage would be just `python main.py -v`. It takes approximately 2 hours to finish execution (depending on network conditions). 43 | 44 | Once finished, a **csv file** will be present in the repo containing all scraped data from players. 45 | 46 | ## Data 47 | 48 | - Generated datasets are stored in the [datasets](datasets) folder and **updated weekly**. 49 | - Files will have a name like `S2122-laliga-players.csv` depending on the football season. 50 | - A description of the columns can be found in [col-specs.csv](datasets/col-specs.csv). 51 | - Datasets are also available at [Kaggle](https://www.kaggle.com/sdelquin/laliga-data).
52 | -------------------------------------------------------------------------------- /laliga/network.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | import user_agent 5 | from logzero import logger 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver import Firefox as FirefoxWebDriver 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | 10 | import settings 11 | 12 | 13 | def get_user_agent_header(): 14 | return {'User-Agent': user_agent.generate_user_agent()} 15 | 16 | 17 | def make_request( 18 | url, 19 | method='get', 20 | include_user_agent=True, 21 | timeout=settings.REQUESTS_TIMEOUT, 22 | num_retries=settings.REQUESTS_RETRIES, 23 | req_delay=settings.REQUESTS_DELAY, 24 | ): 25 | logger.debug(f'Requesting {url}') 26 | 27 | req = getattr(requests, method) 28 | retry = 0 29 | while True: 30 | try: 31 | headers = get_user_agent_header() if include_user_agent else {} 32 | response = req(url, headers=headers, timeout=timeout) 33 | except requests.exceptions.ReadTimeout as err: 34 | logger.error(err) 35 | else: 36 | logger.debug(f'Response status code: {response.status_code}') 37 | if response.status_code // 100 == 2: # 2XX 38 | return response 39 | logger.debug(f'Request delay: {req_delay} seconds') 40 | time.sleep(req_delay) 41 | if retry >= num_retries: 42 | break 43 | retry += 1 44 | logger.debug(f'Network retry {retry}') 45 | 46 | 47 | def selenium_wait( 48 | driver: FirefoxWebDriver, 49 | until, 50 | timeout=settings.SELENIUM_TIMEOUT, 51 | num_retries=settings.SELENIUM_RETRIES, 52 | req_delay=settings.SELENIUM_DELAY, 53 | ): 54 | retry = 0 55 | while True: 56 | try: 57 | response = WebDriverWait(driver, timeout=timeout).until(until) 58 | except TimeoutException as err: 59 | # This exception does not include any message 60 | logger.error('TimeoutException by Selenium') 61 | if retry >= num_retries: 62 | raise err 63 | else: 64 | return 
response 65 | logger.debug(f'Request delay: {req_delay} seconds') 66 | time.sleep(req_delay) 67 | retry += 1 68 | logger.debug(f'Network retry {retry}') 69 | driver.refresh() 70 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from prettyconf import config 4 | 5 | PROJECT_DIR = Path(__file__).resolve().parent 6 | PROJECT_NAME = PROJECT_DIR.name 7 | 8 | LALIGA_LANGCODE = config('LALIGA_LANGCODE', default='en-ES') 9 | LALIGA_STATS_URL = config( 10 | 'LALIGA_STATS_URL', 11 | default='https://www.laliga.com/en-ES/stats/laliga-santander/scorers', 12 | ) 13 | LALIGA_ADV_STATS_URL = config( 14 | 'LALIGA_ADV_STATS_URL', 15 | default=f'https://www.laliga.com/{LALIGA_LANGCODE}/advanced-stats', 16 | ) 17 | 18 | SELENIUM_HEADLESS = config('SELENIUM_HEADLESS', default=True, cast=lambda v: bool(int(v))) 19 | 20 | COMPETITIONS_DIV_XPATH = config( 21 | 'COMPETITIONS_DIV_XPATH', 22 | default='/html/body/div[1]/div[6]/div[1]/div/div[2]/div[3]/div[1]/div/div[1]', 23 | ) 24 | COMPETITIONS_UL_XPATH = config( 25 | 'COMPETITIONS_UL_XPATH', 26 | default='/html/body/div[1]/div[6]/div[1]/div/div[2]/div[3]/div[1]/ul', 27 | ) 28 | PAGINATOR_XPATH = config('PAGINATOR_XPATH', default='/html/body/div[1]/div[6]/div[4]/div/div/div') 29 | PAGINATOR_TOP = config('PAGINATOR_TOP', default=1500, cast=int) 30 | SCRIPT_DATA_ID = config('SCRIPT_DATA_ID', default='__NEXT_DATA__') 31 | DROPDOWN_OFFSET = config('DROPDOWN_OFFSET', default=30, cast=int) 32 | 33 | DATASETS_FOLDER = config('DATASETS_FOLDER', default=PROJECT_DIR / 'datasets', cast=Path) 34 | PLAYERS_FILEPATH = config( 35 | 'PLAYERS_FILEPATH', default=DATASETS_FOLDER / 'laliga-players.csv', cast=Path 36 | ) 37 | 38 | COMPETITION_COLUMN = config('COMPETITION_COLUMN', default='competition') 39 | PLAYER_URL_COLUMN = config('PLAYER_URL_COLUMN', default='player.url') 40 | TWITTER_BASE_URL 
= config('TWITTER_BASE_URL', default='https://twitter.com/') 41 | 42 | LOGFILE = config('LOGFILE', default=PROJECT_DIR / (PROJECT_NAME + '.log'), cast=Path) 43 | LOGFILE_SIZE = config('LOGFILE_SIZE', cast=float, default=1e6) 44 | LOGFILE_BACKUP_COUNT = config('LOGFILE_BACKUP_COUNT', cast=int, default=3) 45 | 46 | REQUESTS_TIMEOUT = config('REQUESTS_TIMEOUT', default=5, cast=int) # seconds 47 | REQUESTS_DELAY = config('REQUESTS_DELAY', default=1, cast=int) # seconds 48 | REQUESTS_RETRIES = config('REQUESTS_RETRIES', default=3, cast=int) 49 | 50 | SELENIUM_TIMEOUT = config('SELENIUM_TIMEOUT', default=30, cast=int) # seconds 51 | SELENIUM_DELAY = config('SELENIUM_DELAY', default=1, cast=int) # seconds 52 | SELENIUM_RETRIES = config('SELENIUM_RETRIES', default=3, cast=int) 53 | 54 | 55 | PLAYER_PROPS_SELECTION = { 56 | 'id': ('player', 'id'), 57 | 'slug': ('player', 'slug'), 58 | 'name': ('player', 'name'), 59 | 'nickname': ('player', 'nickname'), 60 | 'firstname': ('player', 'firstname'), 61 | 'lastname': ('player', 'lastname'), 62 | 'gender': ('player', 'gender'), 63 | 'date_of_birth': ('player', 'date_of_birth'), 64 | 'place_of_birth': ('player', 'place_of_birth'), 65 | 'weight': ('player', 'weight'), 66 | 'height': ('player', 'height'), 67 | 'international': ('player', 'international'), 68 | 'twitter': ('player', 'twitter'), 69 | 'instagram': ('player', 'instagram'), 70 | 'country': ('player', 'country', 'id'), 71 | 'team': ('player', 'team', 'nickname'), 72 | 'team.shortname': ('player', 'team', 'shortname'), 73 | 'team.foundation': ('player', 'team', 'foundation'), 74 | 'team.shield': ('player', 'team', 'shield', 'resizes', 'medium'), 75 | 'shirt_number': ('player', 'squad', 'shirt_number'), 76 | 'position': ('player', 'squad', 'position', 'name'), 77 | 'photo': ('player', 'photos', '001', '512x556'), 78 | 'stadium': ('club', 'venue', 'name'), 79 | 'stadium.image': ('club', 'venue', 'image', 'resizes', 'medium'), 80 | } 81 | 82 | SKIPPED_COMPETITIONS = config( 83 
| 'SKIPPED_COMPETITIONS', default='Mundial FIFA', cast=lambda v: v.split(',') 84 | ) 85 | -------------------------------------------------------------------------------- /laliga/core.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | import pandas as pd 5 | from bs4 import BeautifulSoup 6 | from logzero import logger 7 | from selenium.common.exceptions import TimeoutException 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | import settings 13 | 14 | from . import network 15 | from .player import Player 16 | from .utils import build_url, init_webdriver 17 | from .wrangling import wrangle_dataframe 18 | 19 | 20 | class LaLigaScraper: 21 | def __init__( 22 | self, 23 | url=settings.LALIGA_ADV_STATS_URL, 24 | paginator_xpath=settings.PAGINATOR_XPATH, 25 | paginator_top=settings.PAGINATOR_TOP, 26 | competitions_div_xpath=settings.COMPETITIONS_DIV_XPATH, 27 | competitions_ul_xpath=settings.COMPETITIONS_UL_XPATH, 28 | output_filepath=settings.PLAYERS_FILEPATH, 29 | stats_url=settings.LALIGA_STATS_URL, 30 | ): 31 | self.url = url 32 | self.paginator_xpath = paginator_xpath 33 | self.paginator_top = paginator_top 34 | self.competitions_div_xpath = competitions_div_xpath 35 | self.competitions_ul_xpath = competitions_ul_xpath 36 | self.output_filepath = output_filepath 37 | self.stats_url = stats_url 38 | self.current_page = 0 39 | self.current_competition = 0 40 | self.player_data = [] 41 | self.webdriver = init_webdriver() 42 | 43 | logger.info(f'Moving to {self.url}') 44 | self.webdriver.get(self.url) 45 | 46 | self._accept_cookies() 47 | self._get_season() 48 | 49 | def __del__(self): 50 | self.webdriver.quit() 51 | 52 | def _accept_cookies(self): 53 | logger.debug('Accepting cookies') 54 | accept_cookies_btn = network.selenium_wait( 55 | self.webdriver, 
56 | EC.presence_of_element_located((By.ID, 'onetrust-accept-btn-handler')), 57 | ) 58 | accept_cookies_btn.click() 59 | time.sleep(1) 60 | 61 | def _close_advertisement(self): 62 | logger.debug('Closing advertisement') 63 | try: 64 | adv_button = network.selenium_wait( 65 | self.webdriver, 66 | EC.element_to_be_clickable((By.CLASS_NAME, 'rctfl-close')), 67 | num_retries=0, 68 | ) 69 | adv_button.click() 70 | time.sleep(1) 71 | except TimeoutException: 72 | logger.warning('No advertisements found') 73 | 74 | def _get_season(self): 75 | logger.info('Getting season') 76 | response = network.make_request(self.stats_url) 77 | soup = BeautifulSoup(response.text, 'html.parser') 78 | self.season = ''.join(re.search(r'(\d\d)/(\d\d)', soup.find('h1').text).groups()) 79 | 80 | @property 81 | def seasoned_output_filepath(self): 82 | new_file_stem = f'S{self.season}-{self.output_filepath.stem}' 83 | return self.output_filepath.with_stem(new_file_stem) 84 | 85 | def _scroll_to_paginator(self): 86 | logger.debug('Scrolling to paginator') 87 | js_code = f"window.scrollTo({{'top': {self.paginator_top}}})" 88 | self.webdriver.execute_script(js_code) 89 | time.sleep(1) 90 | 91 | def _load_next_players_table(self): 92 | paginator = network.selenium_wait( 93 | self.webdriver, 94 | EC.element_to_be_clickable((By.XPATH, self.paginator_xpath)), 95 | ) 96 | for div in paginator.find_elements_by_tag_name('div'): 97 | page = div.text.strip() 98 | if page.isnumeric(): 99 | if int(page) == self.current_page + 1: 100 | self._scroll_to_paginator() 101 | div.click() 102 | table = network.selenium_wait( 103 | self.webdriver, 104 | EC.presence_of_element_located((By.TAG_NAME, 'table')), 105 | ) 106 | self.current_page += 1 107 | return table 108 | 109 | def get_player_urls(self): 110 | self.current_page = 0 111 | while table := self._load_next_players_table(): 112 | logger.debug(f'Getting player urls from table in page {self.current_page}') 113 | soup = 
BeautifulSoup(table.get_attribute('outerHTML'), 'html.parser') 114 | for tr in soup.tbody.find_all('tr'): 115 | yield build_url(tr.td.a['href']) 116 | 117 | def _load_next_competition(self): 118 | competitions_div = self.webdriver.find_element_by_xpath(self.competitions_div_xpath) 119 | self.webdriver.execute_script('window.scrollTo(0, 0);') 120 | time.sleep(1) 121 | competitions_div.click() 122 | competitions_ul = self.webdriver.find_element_by_xpath(self.competitions_ul_xpath) 123 | competitions = competitions_ul.find_elements_by_tag_name('li') 124 | if self.current_competition >= len(competitions): 125 | return None 126 | competition_name = competitions[self.current_competition].text 127 | logger.info(f'Loading competition "{competition_name}"') 128 | actions = ActionChains(self.webdriver) 129 | actions.move_to_element(competitions_div) 130 | actions.move_by_offset(0, (self.current_competition + 1) * settings.DROPDOWN_OFFSET) 131 | actions.click() 132 | actions.perform() 133 | self.current_competition += 1 134 | return competition_name 135 | 136 | def get_player_data_by_competition(self, competition: str, num_players=0): 137 | logger.info('Getting player data') 138 | num_checked_players = 1 139 | for player_url in self.get_player_urls(): 140 | logger.debug(f'[{num_checked_players:03d}] {player_url}') 141 | player = Player(player_url, competition) 142 | if data := player.get_data(): 143 | self.player_data.append(data) 144 | if num_checked_players == num_players: 145 | break 146 | num_checked_players += 1 147 | else: 148 | logger.error('Unable to retrieve data') 149 | 150 | def get_player_data(self, num_players=0): 151 | while competition := self._load_next_competition(): 152 | if competition in settings.SKIPPED_COMPETITIONS: 153 | continue 154 | time.sleep(10) 155 | self.get_player_data_by_competition(competition, num_players) 156 | 157 | def to_dataframe(self): 158 | logger.info('Converting player data to Pandas DataFrame') 159 | self.df = 
pd.DataFrame(self.player_data) 160 | 161 | def wrangle_dataframe(self): 162 | logger.info('Wrangling player dataframe') 163 | self.df = wrangle_dataframe(self.df) 164 | 165 | def to_csv(self): 166 | logger.info(f'Dumping player dataframe to {self.seasoned_output_filepath}') 167 | self.df.to_csv(self.seasoned_output_filepath, index=False) 168 | -------------------------------------------------------------------------------- /datasets/col-specs.csv: -------------------------------------------------------------------------------- 1 | Column,Category,Description,Type country,Personal,Birth country of the player (Alpha-2),string date_of_birth,Personal,Date of birth of the player,date firstname,Personal,First name of the player,string gender,Personal,Gender of the player,string height,Personal,Height (cm) of the player,integer id,Personal,Player id for La Liga,integer instagram,Personal,Instagram url of the player,string international,Personal,Indicate if the player has played with a national team,boolean lastname,Personal,Last name of the player,string name,Personal,Full name of the player,string nickname,Personal,Shortname as the player is usually known,string photo,Personal,Url for the player photo (image),url place_of_birth,Personal,Place of birth of the player,string player.url,Personal,Individual url for player in LaLiga,string position,Personal,Position in field where the player plays,string shirt_number,Personal,Shirt number of the player,int slug,Personal,Player slug (no spaces string identifier),string stadium,Personal,Stadium where the player plays,string stadium.image,Personal,Url for the stadium image,url team,Personal,Team of the player,string team.foundation,Personal,Date of team foundation,date team.shield,Personal,Url for the team shield (image),url team.shortname,Personal,Team shortname (3 chars) of the player,string twitter,Personal,Twitter url of the player,string weight,Personal,Weight (kg) of the player,integer appearances,Classics,Matches where the 
player took part in,integer competition,Classics,Name of the competition,string games_played,Classics,Matches where the player took part in,integer goals,Classics,Total goals made by player,integer index,Classics,Player index,integer penalty_goals,Classics,Penalty goals made by player,integer recoveries,Classics,Recoveries made by player,integer starts,Classics,Games played in startline,integer substitute_off,Classics,Times the player has been substitute off,integer substitute_on,Classics,Times the player has been substitute on,integer team_games_played,Classics,Number of games played by the player team,integer time_played,Classics,Time played (minutes) by player,integer total_passes,Classics,Total passes made by player,integer total_shots,Classics,Total shots made by player,integer touches,Classics,Ball touches made by player,integer aerial_duels,Deffensives,Aerials duels where the player was involved,integer aerial_duels_lost,Deffensives,Aerials duels lost where the player was involved,integer aerial_duels_won,Deffensives,Aerials duels won where the player was involved,integer attempts_from_set_pieces,Deffensives,Shots on target by player,integer backward_passes,Deffensives,Passes made backwards by player,integer blocked_shots,Deffensives,Blocked shots by player,integer blocks,Deffensives,Blocks made by player,integer catches,Deffensives,Catches made by player (usually goalkeepers),integer clean_sheets,Deffensives,Matches where the team gave up no goals,integer clearances_off_the_line,Deffensives,Goal clearances off the line,integer drops,Deffensives,Restarting plays due to reasons other than normal gameplay,integer duels,Deffensives,Total duels where the player took part in,integer duels_lost,Deffensives,Lost duels where the player took part in,integer duels_won,Deffensives,Won duels where the player took part in,integer fifty_fifty,Deffensives,Challenge between two players that allows equal chances of acquiring the control of the ball.,integer 
gk_successful_distribution,Deffensives,Successful passes by goalkeeper,integer gk_unsuccessful_distribution,Deffensives,Unsuccessful passes by goalkeeper,integer goalkeeper_smother,Deffensives,Smother technique to catch the ball (goalkeepers),integer goals_conceded,Deffensives,Goals conceded when player is in the match,integer goals_conceded_inside_box,Deffensives,Goals conceded inside box when player is in the match,integer goals_conceded_outside_box,Deffensives,Goals conceded outside box when player is in the match,integer ground_duels,Deffensives,Ground duels where the player took part in,integer ground_duels_lost,Deffensives,Lost ground duels where the player took part in,integer ground_duels_won,Deffensives,Won ground duels where the player took part in,integer handballs_conceded,Deffensives,Handballs conceded by player,integer interceptions,Deffensives,Interceptions made by player,integer last_man_tackle,Deffensives,Tackles when player is the last player (defense),integer own_goal_scored,Deffensives,Own goal scored by player,integer penalties_faced,Deffensives,Faced penalties by player,integer penalties_saved,Deffensives,Saved penalties by player (usually goalkeeper),integer penalty_goals_conceded,Deffensives,Penalty goals conceded when player is in the match,integer saves_from_penalty,Deffensives,Saves from penalty made by player (usually goalkeeper),integer saves_made,Deffensives,Saves made by player (usually goalkeeper),integer saves_made_caught,Deffensives,Saves with caught made by player (usually goalkeeper),integer saves_made_from_inside_box,Deffensives,Saves made from inside box by player (usually goalkeeper),integer saves_made_from_outside_box,Deffensives,Saves made from outside box by player (usually goalkeeper),integer saves_made_parried,Deffensives,Saves made by player not holding the ball (usually goalkeeper),integer tackles_lost,Deffensives,Lost tackles made by player,integer tackles_won,Deffensives,Won tackles made by player,integer
times_tackled,Deffensives,Times the player has been tackled,integer total_clearances,Deffensives,Total clearances made by player,integer total_tackles,Deffensives,Total tackles made by player,integer foul_attempted_tackle,Discipline,Foul attempted tackle by player,integer foul_won_penalty,Discipline,Foul penalties won by player,integer offsides,Discipline,Offsides of player,integer penalties_conceded,Discipline,Penalties conceded by player,integer punches,Discipline,Punches made by player,integer red_cards_2nd_yellow,Discipline,Red cards to the player after second yellow card,integer straight_red_cards,Discipline,Straight red cards received by player,integer total_fouls_conceded,Discipline,Total fouls conceded by player,integer total_red_cards,Discipline,Total red cards received by player,integer yellow_cards,Discipline,Yellow cards received by player,integer assists_intentional ,Offensives,Assists given by player,integer away_goals,Offensives,Away goals made by player,integer corners_taken_incl_short_corners,Offensives,Taken corners by player (includes short corners),integer corners_won,Offensives,Won corners by player,integer crosses_not_claimed,Offensives,Crosses not claimed by player,integer forward_passes,Offensives,Passes made forwards by player,integer goal_assists,Offensives,Goal assists made by player,integer goals_from_inside_box,Offensives,Goals from inside box made by player,integer goals_from_outside_box,Offensives,Goals from outside box made by player,integer headed_goals,Offensives,Headed goals made by player,integer hit_woodwork,Offensives,Woodwork hits made by player,integer home_goals,Offensives,Home goals made by player,integer key_passes_attempt_assists,Offensives,Key passes with attempt of assist made by player,integer left_foot_goals,Offensives,Left foot goals made by player,integer leftside_passes,Offensives,Passes to the left side of the field made by player,integer open_play_passes,Offensives,Passes in open play made by player,integer
other_goals,Offensives,Goals not covered by other categories,integer penalties_off_target,Offensives,Penalties made out of the goal by player,integer penalties_taken,Offensives,Taken penalties by player,integer putthrough_blocked_distribution,Offensives,Put through blocked distribution,integer putthrough_blocked_distribution_won,Offensives,Put through blocked distribution won,integer right_foot_goals,Offensives,Right foot goals made by player,integer rightside_passes,Offensives,Passes to the right side made by player,integer second_goal_assists,Offensives,Second goal assists made by player,integer set_pieces_goals,Offensives,Set pieces goals made by player,integer shots_off_target_inc_woodwork,Offensives,Shots off target including woodwork made by player,integer shots_on_target_inc_goals,Offensives,Shots on target including goals made by player,integer successful_corners_into_box,Offensives,Successful corners hit into box by player,integer successful_crosses_corners,Offensives,Successful corner crosses made by player,integer successful_crosses_open_play,Offensives,Successful crosses on open play made by player,integer successful_dribbles,Offensives,Successful dribbles made by player,integer successful_fifty_fifty,Offensives,Successful fifty fifty challenges won by player,integer successful_launches,Offensives,Successful launches made by player,integer successful_layoffs,Offensives,Successful layoffs made by player,integer successful_long_passes,Offensives,Successful long passes made by player,integer successful_open_play_passes,Offensives,Successful open play passes made by player,integer successful_passes_opposition_half,Offensives,Successful passes in opposition half made by player,integer successful_passes_own_half,Offensives,Successful passes in own half made by player,integer successful_short_passes,Offensives,Successful short passes made by player,integer through_balls,Offensives,Passes into space between defenders made by player,integer
throw_ins_to_opposition_player,Offensives,Throw-ins to an opponent made by player,integer throw_ins_to_own_player,Offensives,Throw-ins to a teammate made by player,integer total_fouls_won,Offensives,Total fouls won by player,integer total_losses_of_possession,Offensives,Total losses of possession of player,integer total_successful_passes_excl_crosses_corners,Offensives,Total successful passes excluding crosses corners,integer total_touches_in_opposition_box,Offensives,Total touches in opposition box made by player,integer total_unsuccessful_passes_excl_crosses_corners,Offensives,Total unsuccessful passes excluding crosses corners,integer unsuccessful_corners_into_box,Offensives,Unsuccessful corners into the box made by player,integer unsuccessful_crosses_corners,Offensives,Unsuccessful crosses corners made by player,integer unsuccessful_crosses_open_play,Offensives,Unsuccessful crosses in open play made by player,integer unsuccessful_dribbles,Offensives,Unsuccessful dribbles made by player,integer unsuccessful_launches,Offensives,Unsuccessful launches made by player,integer unsuccessful_layoffs,Offensives,Unsuccessful layoffs made by player,integer unsuccessful_long_passes,Offensives,Unsuccessful long passes made by player,integer unsuccessful_passes_opposition_half,Offensives,Unsuccessful passes in opposition half made by player,integer unsuccessful_passes_own_half,Offensives,Unsuccessful passes in own half made by player,integer unsuccessful_short_passes,Offensives,Unsuccessful short passes made by player,integer winning_goal,Offensives,Winning goals made by player,integer --------------------------------------------------------------------------------