├── .gitignore
├── README.md
├── data
│   └── .gitkeep
├── main.py
├── parser.py
├── requirements.txt
└── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/*.csv
__init__.py
__pycache__

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# fbref scraper

Python scraper for https://fbref.com as a CLI.

## Installation

`pip install -r requirements.txt`

> TODO: package this as a real Python module with an installable CLI.

## Example

Two "endpoints" are available for scraping data:

👉 `/matches`

```bash
python main.py --date 2023-01-01 --page matches/ --data_dir=/tmp/data
```

👉 `/fixtures`

```bash
python main.py --date 2023-01-01 --page fixtures/ --fixture_url=https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures --data_dir=/tmp/data
```

The CLI has several common options:

* `--date`: the date to scrape (useful for date-specific pages) - format is `YYYY-MM-DD`
* `--page`: the "endpoint", i.e. the page to scrape (`matches/` or `fixtures/`)
* `--fixture_url`: the full fixture-page URL (required when `--page` is `fixtures/`)
* `--data_dir`: the path where scraped data will be stored
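The `--date` value is validated by `is_date_format` in `utils.py`, which checks only the `YYYY-MM-DD` *shape* with a regex, not calendar validity. A minimal sketch of that behaviour:

```python
from utils import is_date_format

print(is_date_format("2023-01-01"))  # True
print(is_date_format("01-01-2023"))  # False: not YYYY-MM-DD shaped
print(is_date_format("2023-99-99"))  # True: the shape matches even though the date is invalid
```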
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ben8t/fbref_scraper/e399f12cdffd2755682ad388b62625ad5fb903e7/data/.gitkeep

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import os

from parser import match_parser, fixture_parser
from utils import is_date_format

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape fbref match and fixture pages")
    parser.add_argument("--date", help="The match date (YYYY-MM-DD)")
    parser.add_argument("--page", help="matches/ or fixtures/")
    parser.add_argument("--fixture_url", help="fixture url")
    parser.add_argument("--data_dir", help="Directory to store downloaded data", default="data")
    args = parser.parse_args()

    if not args.date or not is_date_format(args.date):
        parser.error(f"The --date parameter '{args.date}' is not in 'YYYY-MM-DD' format")

    date = args.date  # e.g. "2023-01-03"
    os.makedirs(args.data_dir, exist_ok=True)

    if args.page == "matches/":
        url = f"https://fbref.com/en/matches/{date}"
        parsed_data = match_parser(url, date=date)
        parsed_data.to_csv(os.path.join(args.data_dir, f"matches_{date.replace('-', '')}.csv"), index=False)

    if args.page == "fixtures/" and args.fixture_url is not None:
        parsed_data, league = fixture_parser(args.fixture_url)
        parsed_data.to_csv(os.path.join(args.data_dir, f'fixtures_{league.lower().replace(" ", "_")}.csv'), index=False)

--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
from lxml import etree, html
import pandas as pd
import requests

from utils import snake_case_column_names


def match_parser(match_url: str, date: str) -> pd.DataFrame:
    """Parse every league table on a daily fbref matches page into one DataFrame."""
    response = requests.get(match_url)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    items = tree.xpath('//div[@class="table_wrapper tabbed"]')

    results = []
    for item in items:
        # The wrapper's <span> holds the league name; strip stray quoting characters.
        league = item.find("span").text.replace('"', '').replace('>', '')
        table = item.xpath(".//table")[0]
        df = pd.read_html(etree.tostring(table))[0].assign(league=league, date=date)
        results.append(df)

    final_data = snake_case_column_names(pd.concat(results))
    return final_data


def fixture_parser(fixture_url: str) -> tuple[pd.DataFrame, str]:
    """Parse a league fixtures page; return the fixtures DataFrame and the league name."""
    response = requests.get(fixture_url)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    # Derive the league name from the URL slug, e.g.
    # ".../Premier-League-Scores-and-Fixtures" -> "Premier League".
    league = fixture_url.split("/")[-1].replace("-Scores-and-Fixtures", "").replace("-", " ")
    table = tree.xpath('//*[@class="table_wrapper tabbed"]')[0].xpath(".//table")[0]
    final_data = pd.read_html(etree.tostring(table))[0].assign(league=league)
    final_data = snake_case_column_names(final_data)
    return final_data, league

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
pandas
beautifulsoup4
html5lib

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import re

def is_date_format(input_string):
    """Return True if input_string matches the YYYY-MM-DD shape."""
    pattern = r'^\d{4}-\d{2}-\d{2}$'
    return bool(re.match(pattern, input_string))

def snake_case_column_names(df):
    # Convert a single column name to snake_case
    def to_snake_case(s):
        s = re.sub(r'[^a-zA-Z0-9]+', '_', s)  # Replace runs of non-alphanumeric characters with underscores
        s = re.sub(r'(?<=[a-z])([A-Z0-9])', r'_\1', s)  # Insert an underscore before a capital or digit that follows a lowercase letter
        return s.lower()

    # Rename the columns in place using to_snake_case
    df.columns = [to_snake_case(col) for col in df.columns]
    return df
--------------------------------------------------------------------------------
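For illustration, a minimal usage sketch of `snake_case_column_names` (the column names below are hypothetical examples, not actual fbref headers):

```python
import pandas as pd
from utils import snake_case_column_names

df = pd.DataFrame(columns=["Home Team", "xG", "Match Report"])
print(snake_case_column_names(df).columns.tolist())
# ['home_team', 'x_g', 'match_report']
```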