├── .gitignore
├── README.md
├── data
│   └── .gitkeep
├── main.py
├── parser.py
├── requirements.txt
└── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/*.csv
__init__.py
__pycache__

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# fbref scraper

Python scraper for https://fbref.com as a CLI.

## Installation

`pip install -r requirements.txt`

> TODO: package this as a real Python module with an installable CLI.

## Example

Two "endpoints" are available for scraping data:

👉 `/matches`

```bash
python main.py --date 2023-01-01 --page matches/ --data_dir=/tmp/data
```

👉 `/fixtures`

```bash
python main.py --date 2023-01-01 --page fixtures/ --fixture_url=https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures --data_dir=/tmp/data
```

The CLI has several common options:

* `--date`: the date to scrape (useful for date-specific pages) - format is `YYYY-MM-DD`
* `--page`: the "endpoint", i.e. the page to scrape (`matches/` or `fixtures/`)
* `--fixture_url`: the full fixture-page URL (required when `--page` is `fixtures/`)
* `--data_dir`: the path where scraped data will be stored
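The `--date` value is validated by `is_date_format` in `utils.py`, which checks only the `YYYY-MM-DD` *shape* with a regex, not calendar validity. A minimal sketch of that behaviour:

```python
from utils import is_date_format

print(is_date_format("2023-01-01"))  # True
print(is_date_format("01-01-2023"))  # False: not YYYY-MM-DD shaped
print(is_date_format("2023-99-99"))  # True: the shape matches even though the date is invalid
```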
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ben8t/fbref_scraper/e399f12cdffd2755682ad388b62625ad5fb903e7/data/.gitkeep

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import os

from parser import match_parser, fixture_parser
from utils import is_date_format

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape fbref match and fixture pages")
    parser.add_argument("--date", help="The match date (YYYY-MM-DD)")
    parser.add_argument("--page", help="matches/ or fixtures/")
    parser.add_argument("--fixture_url", help="fixture url")
    parser.add_argument("--data_dir", help="Directory to store downloaded data", default="data")
    args = parser.parse_args()

    if not args.date or not is_date_format(args.date):
        parser.error(f"The --date parameter '{args.date}' is not in 'YYYY-MM-DD' format")

    date = args.date  # e.g. "2023-01-03"
    os.makedirs(args.data_dir, exist_ok=True)

    if args.page == "matches/":
        url = f"https://fbref.com/en/matches/{date}"
        parsed_data = match_parser(url, date=date)
        parsed_data.to_csv(os.path.join(args.data_dir, f"matches_{date.replace('-', '')}.csv"), index=False)

    if args.page == "fixtures/" and args.fixture_url is not None:
        parsed_data, league = fixture_parser(args.fixture_url)
        parsed_data.to_csv(os.path.join(args.data_dir, f'fixtures_{league.lower().replace(" ", "_")}.csv'), index=False)

--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
from lxml import etree, html
import pandas as pd
import requests

from utils import snake_case_column_names


def match_parser(match_url: str, date: str) -> pd.DataFrame:
    """Parse every league table on a daily fbref matches page into one DataFrame."""
    response = requests.get(match_url)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    items = tree.xpath('//div[@class="table_wrapper tabbed"]')

    results = []
    for item in items:
        # The wrapper's <span> holds the league name; strip stray quoting characters.
        league = item.find("span").text.replace('"', '').replace('>', '')
        table = item.xpath(".//table")[0]
        df = pd.read_html(etree.tostring(table))[0].assign(league=league, date=date)
        results.append(df)

    final_data = snake_case_column_names(pd.concat(results))
    return final_data


def fixture_parser(fixture_url: str) -> tuple[pd.DataFrame, str]:
    """Parse a league fixtures page; return the fixtures DataFrame and the league name."""
    response = requests.get(fixture_url)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    # Derive the league name from the URL slug, e.g.
    # ".../Premier-League-Scores-and-Fixtures" -> "Premier League".
    league = fixture_url.split("/")[-1].replace("-Scores-and-Fixtures", "").replace("-", " ")
    table = tree.xpath('//*[@class="table_wrapper tabbed"]')[0].xpath(".//table")[0]
    final_data = pd.read_html(etree.tostring(table))[0].assign(league=league)
    final_data = snake_case_column_names(final_data)
    return final_data, league

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
pandas
beautifulsoup4
html5lib

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import re

def is_date_format(input_string):
    """Return True if input_string matches the YYYY-MM-DD shape."""
    pattern = r'^\d{4}-\d{2}-\d{2}$'
    return bool(re.match(pattern, input_string))

def snake_case_column_names(df):
    # Convert a single column name to snake_case
    def to_snake_case(s):
        s = re.sub(r'[^a-zA-Z0-9]+', '_', s)  # Replace runs of non-alphanumeric characters with underscores
        s = re.sub(r'(?<=[a-z])([A-Z0-9])', r'_\1', s)  # Insert an underscore before a capital or digit that follows a lowercase letter
        return s.lower()

    # Rename the columns in place using to_snake_case
    df.columns = [to_snake_case(col) for col in df.columns]
    return df
--------------------------------------------------------------------------------
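For illustration, a minimal usage sketch of `snake_case_column_names` (the column names below are hypothetical examples, not actual fbref headers):

```python
import pandas as pd
from utils import snake_case_column_names

df = pd.DataFrame(columns=["Home Team", "xG", "Match Report"])
print(snake_case_column_names(df).columns.tolist())
# ['home_team', 'x_g', 'match_report']
```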