├── .github ├── FlashscoreScraping.gif └── Logo.svg ├── src ├── cli │ ├── loader │ │ └── index.js │ ├── progressbar │ │ └── index.js │ ├── prompts │ │ ├── leagues │ │ │ └── index.js │ │ ├── season │ │ │ └── index.js │ │ ├── fileType │ │ │ └── index.js │ │ ├── countries │ │ │ └── index.js │ │ └── index.js │ └── arguments │ │ └── index.js ├── constants │ └── index.js ├── files │ ├── handle │ │ └── index.js │ ├── json │ │ └── index.js │ └── csv │ │ └── index.js ├── scraper │ ├── services │ │ ├── seasons │ │ │ └── index.js │ │ ├── countries │ │ │ └── index.js │ │ ├── leagues │ │ │ └── index.js │ │ └── matches │ │ │ └── index.js │ └── index.js └── index.js ├── package.json ├── LICENSE ├── .gitignore └── README.md /.github/FlashscoreScraping.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gustavofariaa/FlashscoreScraping/HEAD/.github/FlashscoreScraping.gif -------------------------------------------------------------------------------- /src/cli/loader/index.js: -------------------------------------------------------------------------------- 1 | import { loading } from "cli-loading-animation"; 2 | 3 | export const { start, stop } = loading("Loading..."); 4 | -------------------------------------------------------------------------------- /src/cli/progressbar/index.js: -------------------------------------------------------------------------------- 1 | import cliProgress from "cli-progress"; 2 | 3 | export const initializeProgressbar = (total) => { 4 | const progressbar = new cliProgress.SingleBar({ 5 | format: "Progress: {bar} | {percentage}% | {value}/{total} matches", 6 | barCompleteChar: "\u2588", 7 | barIncompleteChar: "\u2591", 8 | hideCursor: true, 9 | }); 10 | console.info(""); 11 | progressbar.start(total, 0); 12 | return progressbar; 13 | }; 14 | -------------------------------------------------------------------------------- /package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "flashscore-scraping", 3 | "version": "1.4.0", 4 | "description": "A scraping tool for retrieving football data.", 5 | "type": "module", 6 | "scripts": { 7 | "start": "node src/index.js" 8 | }, 9 | "dependencies": { 10 | "chalk": "^5.6.2", 11 | "cli-loading-animation": "^1.0.6", 12 | "cli-progress": "^3.12.0", 13 | "inquirer": "^12.10.0", 14 | "jsonexport": "^3.2.0", 15 | "p-limit": "^7.2.0", 16 | "playwright": "^1.56.1" 17 | }, 18 | "license": "Unlicense" 19 | } 20 | -------------------------------------------------------------------------------- /src/constants/index.js: -------------------------------------------------------------------------------- 1 | export const BASE_URL = "https://www.flashscore.com"; 2 | export const OUTPUT_PATH = "./src/data"; 3 | export const TIMEOUT = 2500; 4 | export const FileTypes = Object.freeze({ 5 | JSON: { 6 | label: "JSON (Padrão)", 7 | argument: "json", 8 | extension: ".json", 9 | }, 10 | JSON_ARRAY: { 11 | label: "JSON Array (Lista)", 12 | argument: "json-array", 13 | extension: ".array.json", 14 | }, 15 | CSV: { 16 | label: "Arquivo CSV", 17 | argument: "csv", 18 | extension: ".csv", 19 | }, 20 | }); 21 | -------------------------------------------------------------------------------- /src/files/handle/index.js: -------------------------------------------------------------------------------- 1 | import { FileTypes } from "../../constants/index.js"; 2 | 3 | import { writeJsonToFile } from "../../files/json/index.js"; 4 | import { writeCsvToFile } from "../../files/csv/index.js"; 5 | 6 | export const writeDataToFile = (data, fileName, fileType) => { 7 | const outputFileName = `${fileName}${fileType.extension}`; 8 | 9 | switch (fileType) { 10 | case FileTypes.JSON: 11 | case FileTypes.JSON_ARRAY: 12 | writeJsonToFile(data, outputFileName, fileType === FileTypes.JSON_ARRAY); 13 | break; 14 | 15 | case FileTypes.CSV: 16 | 
writeCsvToFile(data, outputFileName); 17 | break; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /src/scraper/services/seasons/index.js: -------------------------------------------------------------------------------- 1 | import { TIMEOUT } from "../../../constants/index.js"; 2 | import { openPageAndNavigate, waitForSelectorSafe } from "../../index.js"; 3 | 4 | export const getListOfSeasons = async (context, leagueUrl) => { 5 | const page = await openPageAndNavigate(context, `${leagueUrl}/archive`); 6 | 7 | await waitForSelectorSafe(page, ["div.archive__season > a"], TIMEOUT); 8 | 9 | const listOfLeagueSeasons = await page.evaluate(() => { 10 | return Array.from(document.querySelectorAll("div.archive__season > a")).map( 11 | (element) => { 12 | return { name: element.innerText.trim(), url: element.href }; 13 | } 14 | ); 15 | }); 16 | 17 | await page.close(); 18 | return listOfLeagueSeasons; 19 | }; 20 | -------------------------------------------------------------------------------- /src/files/json/index.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import path from "path"; 3 | 4 | import { OUTPUT_PATH } from "../../constants/index.js"; 5 | 6 | export const writeJsonToFile = (data, fileName, asArray) => { 7 | const preparedData = asArray ? 
toArray(data) : data; 8 | 9 | const filePath = path.join(OUTPUT_PATH, fileName); 10 | const fileContent = JSON.stringify(preparedData, null, 2); 11 | 12 | try { 13 | fs.mkdirSync(path.dirname(filePath), { recursive: true }); 14 | fs.writeFileSync(filePath, fileContent, "utf-8"); 15 | } catch (error) { 16 | throw Error(`❌ Failed to create directories or write the JSON file`); 17 | } 18 | }; 19 | 20 | const toArray = (data) => 21 | Object.entries(data).map(([matchId, matchData]) => ({ 22 | matchId, 23 | ...structuredClone(matchData), 24 | })); 25 | -------------------------------------------------------------------------------- /src/cli/prompts/leagues/index.js: -------------------------------------------------------------------------------- 1 | import inquirer from "inquirer"; 2 | 3 | import { getListOfLeagues } from "../../../scraper/services/leagues/index.js"; 4 | 5 | import { start, stop } from "../../loader/index.js"; 6 | 7 | export const selectLeague = async (context, countryId) => { 8 | start(); 9 | const leagues = await getListOfLeagues(context, countryId); 10 | stop(); 11 | const options = leagues.map((element) => element.name); 12 | 13 | const { choice } = await inquirer.prompt([ 14 | { 15 | type: "list", 16 | name: "choice", 17 | message: "Select a league:", 18 | choices: [...options, "Cancel", new inquirer.Separator()], 19 | }, 20 | ]); 21 | 22 | if (choice === "Cancel") { 23 | console.info("\nNo option selected. 
Exiting...\n"); 24 | throw Error; 25 | } 26 | 27 | return leagues.find((league) => league.name === choice); 28 | }; 29 | -------------------------------------------------------------------------------- /src/cli/prompts/season/index.js: -------------------------------------------------------------------------------- 1 | import inquirer from "inquirer"; 2 | 3 | import { getListOfSeasons } from "../../../scraper/services/seasons/index.js"; 4 | 5 | import { start, stop } from "../../loader/index.js"; 6 | 7 | export const selectSeason = async (context, leagueUrl) => { 8 | start(); 9 | const seasons = await getListOfSeasons(context, leagueUrl); 10 | stop(); 11 | const options = seasons.map((season) => season.name); 12 | 13 | const { choice } = await inquirer.prompt([ 14 | { 15 | type: "list", 16 | name: "choice", 17 | message: "Select a league season:", 18 | choices: [...options, "Cancel", new inquirer.Separator()], 19 | }, 20 | ]); 21 | 22 | if (choice === "Cancel") { 23 | console.info("\nNo option selected. 
Exiting...\n"); 24 | throw Error; 25 | } 26 | 27 | return seasons.find((season) => season.name === choice); 28 | }; 29 | -------------------------------------------------------------------------------- /src/cli/prompts/fileType/index.js: -------------------------------------------------------------------------------- 1 | import inquirer from "inquirer"; 2 | import chalk from "chalk"; 3 | 4 | import { FileTypes } from "../../../constants/index.js"; 5 | 6 | export const selectFileType = async (fileType) => { 7 | if (fileType) { 8 | console.info( 9 | `${chalk.green("✔")} File type: ${chalk.cyan(fileType.label)}` 10 | ); 11 | return fileType; 12 | } 13 | 14 | const choices = Object.values(FileTypes).map((type) => type.label); 15 | const { choice } = await inquirer.prompt([ 16 | { 17 | type: "list", 18 | name: "choice", 19 | message: "Select a output file type:", 20 | choices: [...choices, "Cancel"], 21 | }, 22 | ]); 23 | 24 | if (choice === "Cancel") { 25 | console.info("\nNo option selected. 
Exiting...\n"); 26 | throw Error; 27 | } 28 | 29 | return Object.values(FileTypes).find((type) => type.label === choice); 30 | }; 31 | -------------------------------------------------------------------------------- /src/scraper/services/countries/index.js: -------------------------------------------------------------------------------- 1 | import { BASE_URL } from "../../../constants/index.js"; 2 | import { 3 | openPageAndNavigate, 4 | waitAndClick, 5 | waitForSelectorSafe, 6 | } from "../../index.js"; 7 | 8 | export const getListOfCountries = async (context) => { 9 | const page = await openPageAndNavigate(context, BASE_URL); 10 | 11 | await waitAndClick(page, "#category-left-menu > div > span"); 12 | await waitForSelectorSafe(page, ["#category-left-menu > div > div > a"]); 13 | 14 | const listOfCountries = await page.evaluate(() => { 15 | return Array.from( 16 | document.querySelectorAll("#category-left-menu > div > div > a") 17 | ).map((element) => { 18 | return { 19 | name: element.innerText.trim(), 20 | url: element.href, 21 | id: element.id, 22 | }; 23 | }); 24 | }); 25 | 26 | await page.close(); 27 | return listOfCountries; 28 | }; 29 | -------------------------------------------------------------------------------- /src/scraper/services/leagues/index.js: -------------------------------------------------------------------------------- 1 | import { BASE_URL, TIMEOUT } from "../../../constants/index.js"; 2 | import { 3 | openPageAndNavigate, 4 | waitAndClick, 5 | waitForSelectorSafe, 6 | } from "../../index.js"; 7 | 8 | export const getListOfLeagues = async (context, countryId) => { 9 | const page = await openPageAndNavigate(context, BASE_URL); 10 | 11 | await waitAndClick(page, "#category-left-menu > div > span"); 12 | await waitAndClick(page, `#${countryId}`); 13 | await waitForSelectorSafe(page, [`#${countryId} ~ span > a`], TIMEOUT); 14 | 15 | const listOfLeagues = await page.evaluate((countryId) => { 16 | return Array.from( 17 | 
document.querySelectorAll(`#${countryId} ~ span > a`) 18 | ).map((element) => { 19 | return { name: element.innerText.trim(), url: element.href }; 20 | }); 21 | }, countryId); 22 | 23 | await page.close(); 24 | return listOfLeagues; 25 | }; 26 | -------------------------------------------------------------------------------- /src/scraper/index.js: -------------------------------------------------------------------------------- 1 | import { TIMEOUT } from "../constants/index.js"; 2 | 3 | export const openPageAndNavigate = async (context, url) => { 4 | const page = await context.newPage(); 5 | await page.goto(url, { waitUntil: "domcontentloaded" }); 6 | return page; 7 | }; 8 | 9 | export const waitAndClick = async (page, selector, timeout = TIMEOUT) => { 10 | await page.waitForSelector(selector, { timeout }); 11 | await page.evaluate(async (selector) => { 12 | await new Promise((resolve) => setTimeout(resolve, 500)); 13 | const element = document.querySelector(selector); 14 | if (element) { 15 | element.scrollIntoView(); 16 | element.click(); 17 | } 18 | }, selector); 19 | }; 20 | 21 | export const waitForSelectorSafe = async ( 22 | page, 23 | selectors = [], 24 | timeout = TIMEOUT 25 | ) => { 26 | return Promise.all( 27 | selectors.map(async (selector) => { 28 | try { 29 | await page.waitForSelector(selector, { timeout }); 30 | } catch {} 31 | }) 32 | ); 33 | }; 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 
7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to https://unlicense.org -------------------------------------------------------------------------------- /src/files/csv/index.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import path from "path"; 3 | import jsonexport from "jsonexport"; 4 | 5 | import { OUTPUT_PATH } from "../../constants/index.js"; 6 | 7 | export const writeCsvToFile = (data, fileName) => { 8 | const filePath = path.join(OUTPUT_PATH, fileName); 9 | 10 | const csvData = convertDataToCsv(data); 11 | 12 | jsonexport(csvData, (error, fileContent) => { 13 | if (error) throw error; 14 | 15 | try { 16 | fs.mkdirSync(path.dirname(filePath), { recursive: true }); 17 | fs.writeFileSync(filePath, fileContent); 18 | } catch (error) { 19 | throw Error(`❌ Failed to create directories or write the CSV file`); 20 | } 21 | }); 22 | }; 23 | 24 | const convertDataToCsv = (data) => 25 | Object.keys(data).map((matchId) => { 26 | const { stage, status, date, home, away, result, information, statistics } 
= 27 | data[matchId]; 28 | const informationObject = {}; 29 | const statisticsObject = {}; 30 | 31 | information.forEach((info) => { 32 | informationObject[info.category.toLowerCase().replace(/ /g, "_")] = 33 | info.value; 34 | }); 35 | 36 | statistics.forEach((stat) => { 37 | statisticsObject[stat.category.toLowerCase().replace(/ /g, "_")] = { 38 | home: stat.homeValue, 39 | away: stat.awayValue, 40 | }; 41 | }); 42 | 43 | return { 44 | matchId, 45 | stage, 46 | status, 47 | date, 48 | home, 49 | away, 50 | result, 51 | information: { ...informationObject }, 52 | statistics: { ...statisticsObject }, 53 | }; 54 | }); 55 | -------------------------------------------------------------------------------- /src/cli/prompts/countries/index.js: -------------------------------------------------------------------------------- 1 | import inquirer from "inquirer"; 2 | import chalk from "chalk"; 3 | 4 | import { getListOfCountries } from "../../../scraper/services/countries/index.js"; 5 | 6 | import { start, stop } from "../../loader/index.js"; 7 | 8 | export const selectCountry = async (context, inputCountry) => { 9 | start(); 10 | const countries = await getListOfCountries(context); 11 | stop(); 12 | 13 | const selected = findCountry(countries, inputCountry); 14 | if (selected) { 15 | console.info(`${chalk.green("✔")} Country: ${chalk.cyan(selected.name)}`); 16 | return selected; 17 | } else if (inputCountry) { 18 | throw Error( 19 | `❌ No country found for "${inputCountry}"\n` + 20 | `Please verify that the country name provided is correct` 21 | ); 22 | } 23 | 24 | const choices = countries.map(({ name }) => name).sort(); 25 | const { choice } = await inquirer.prompt([ 26 | { 27 | type: "list", 28 | name: "choice", 29 | message: "Select a country:", 30 | choices: [...choices, "Cancel", new inquirer.Separator()], 31 | }, 32 | ]); 33 | 34 | if (choice === "Cancel") { 35 | console.info("\nNo option selected. 
Exiting...\n"); 36 | throw Error; 37 | } 38 | 39 | return findCountry(countries, choice); 40 | }; 41 | 42 | const findCountry = (countries, targetName) => { 43 | if (!targetName) return null; 44 | return countries.find( 45 | ({ name }) => formatCountryName(name) === formatCountryName(targetName) 46 | ); 47 | }; 48 | 49 | const formatCountryName = (name) => { 50 | return name 51 | .toLowerCase() 52 | .replace(/[^a-z0-9\s]/g, "") 53 | .trim() 54 | .replace(/\s+/g, "-"); 55 | }; 56 | -------------------------------------------------------------------------------- /src/cli/prompts/index.js: -------------------------------------------------------------------------------- 1 | import chalk from "chalk"; 2 | 3 | import { BASE_URL, OUTPUT_PATH } from "../../constants/index.js"; 4 | 5 | import { selectFileType } from "./fileType/index.js"; 6 | import { selectCountry } from "./countries/index.js"; 7 | import { selectLeague } from "./leagues/index.js"; 8 | import { selectSeason } from "./season/index.js"; 9 | 10 | export const promptUserOptions = async (context, cliOptions) => { 11 | const fileType = await selectFileType(cliOptions?.fileType); 12 | const country = await selectCountry(context, cliOptions?.country); 13 | const season = await resolveSeason(context, cliOptions, country); 14 | 15 | const fileName = generateFileName(country?.name, season?.name); 16 | 17 | console.info(`\n📝 Starting data collection...`); 18 | console.info( 19 | `📁 File will be saved to: ${chalk.cyan( 20 | `${OUTPUT_PATH}/${fileName}${fileType.extension}` 21 | )}` 22 | ); 23 | 24 | return { fileName, season, fileType }; 25 | }; 26 | 27 | const resolveSeason = async (context, cliOptions, country) => { 28 | if (!cliOptions?.league) { 29 | const league = await selectLeague(context, country?.id); 30 | return await selectSeason(context, league?.url); 31 | } 32 | 33 | const leagueName = capitalizeWords(cliOptions.league); 34 | console.info(`${chalk.green("✔")} League season: ${chalk.cyan(leagueName)}`); 35 
| 36 | return { 37 | name: leagueName, 38 | url: `${BASE_URL}/football/${country?.name}/${cliOptions.league}`.toLowerCase(), 39 | }; 40 | }; 41 | 42 | const generateFileName = (countryName = "", seasonName = "") => { 43 | return `${countryName}_${seasonName}` 44 | .toLowerCase() 45 | .replace(/[^a-z0-9]+/g, "_") 46 | .replace(/^_+|_+$/g, ""); 47 | }; 48 | 49 | const capitalizeWords = (str) => { 50 | return str 51 | .replace(/[-_]/g, " ") 52 | .replace(/\b\w/g, (char) => char.toUpperCase()); 53 | }; 54 | -------------------------------------------------------------------------------- /src/cli/arguments/index.js: -------------------------------------------------------------------------------- 1 | import { FileTypes } from "../../constants/index.js"; 2 | 3 | export const parseArguments = () => { 4 | const args = process.argv.slice(2); 5 | const options = { 6 | country: null, 7 | league: null, 8 | fileType: null, 9 | concurrency: 10, 10 | saveInterval: 10, 11 | headless: true, 12 | }; 13 | 14 | args.forEach((arg) => { 15 | if (arg.startsWith("country=")) options.country = arg.split("=")[1]; 16 | if (arg.startsWith("league=")) options.league = arg.split("=")[1]; 17 | if (arg.startsWith("fileType=")) options.fileType = arg.split("=")[1]; 18 | if (arg.startsWith("concurrency=")) 19 | options.concurrency = Number(arg.split("=")[1]); 20 | if (arg.startsWith("saveInterval=")) 21 | options.saveInterval = Number(arg.split("=")[1]); 22 | if (arg.startsWith("headless=")) 23 | options.headless = arg.split("=")[1] !== "false"; 24 | if (arg === "--no-headless") options.headless = false; 25 | if (arg === "--headless") options.headless = true; 26 | }); 27 | 28 | if (options.fileType) { 29 | const userInput = options.fileType; 30 | const matchedType = Object.values(FileTypes).find( 31 | (type) => type.argument === userInput 32 | ); 33 | 34 | if (!matchedType) { 35 | const acceptedTypes = Object.values(FileTypes) 36 | .map((type) => `"${type.argument}"`) 37 | .join(", "); 38 | throw 
Error( 39 | `❌ Invalid fileType: "${userInput}"\n` + 40 | `Accepted file types are: ${acceptedTypes}` 41 | ); 42 | } 43 | 44 | options.fileType = matchedType; 45 | } 46 | 47 | if (options.league && !options.country) { 48 | throw Error( 49 | `❌ Missing required argument: country=\n` + 50 | `You provided a league "${options.league}" but did not specify a country\n` + 51 | `Usage example: country= league=` 52 | ); 53 | } 54 | 55 | return options; 56 | }; 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | 
# Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | 132 | data -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import { chromium } from "playwright"; 2 | import pLimit from "p-limit"; 3 | import chalk from "chalk"; 4 | 5 | import { OUTPUT_PATH } from "./constants/index.js"; 6 | import { parseArguments } from "./cli/arguments/index.js"; 7 | import { promptUserOptions } from "./cli/prompts/index.js"; 8 | import { start, stop } from "./cli/loader/index.js"; 9 | import { initializeProgressbar } from "./cli/progressbar/index.js"; 10 | 11 | import { 12 | getMatchLinks, 13 | getMatchData, 
14 | } from "./scraper/services/matches/index.js"; 15 | 16 | import { writeDataToFile } from "./files/handle/index.js"; 17 | 18 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 19 | const withRetry = async (fn, retries = 3) => { 20 | try { 21 | return await fn(); 22 | } catch (err) { 23 | if (retries === 0) throw err; 24 | const delay = (4 - retries) * 500; 25 | console.warn(`⚠️ Retry in ${delay}ms...`); 26 | await sleep(delay); 27 | return withRetry(fn, retries - 1); 28 | } 29 | }; 30 | 31 | (async () => { 32 | let browser; 33 | let context; 34 | 35 | try { 36 | const cliOptions = parseArguments(); 37 | 38 | browser = await chromium.launch({ headless: cliOptions.headless }); 39 | context = await browser.newContext(); 40 | 41 | const { fileName, season, fileType } = await promptUserOptions( 42 | context, 43 | cliOptions 44 | ); 45 | 46 | start(); 47 | 48 | const matchLinksResults = await getMatchLinks( 49 | context, 50 | season?.url, 51 | "results" 52 | ); 53 | const matchLinksFixtures = await getMatchLinks( 54 | context, 55 | season?.url, 56 | "fixtures" 57 | ); 58 | const matchLinks = [...matchLinksFixtures, ...matchLinksResults]; 59 | 60 | if (matchLinks.length === 0) { 61 | throw Error( 62 | `❌ No matches found on the results page\n` + 63 | `Please verify that the league name provided is correct` 64 | ); 65 | } 66 | 67 | stop(); 68 | 69 | const progressbar = initializeProgressbar(matchLinks.length); 70 | const limit = pLimit(cliOptions.concurrency); 71 | 72 | const matchData = {}; 73 | let processedCount = 0; 74 | 75 | const tasks = matchLinks.map((matchLink) => 76 | limit(async () => { 77 | const data = await withRetry(() => getMatchData(context, matchLink)); 78 | matchData[matchLink.id] = data; 79 | 80 | processedCount += 1; 81 | if (processedCount % cliOptions.saveInterval === 0) { 82 | writeDataToFile(matchData, fileName, fileType); 83 | } 84 | 85 | progressbar.increment(); 86 | }) 87 | ); 88 | 89 | await Promise.all(tasks); 90 | 91 | 
progressbar.stop(); 92 | writeDataToFile(matchData, fileName, fileType); 93 | 94 | console.info("\n✅ Data collection and file writing completed!"); 95 | console.info( 96 | `📁 File saved to: ${chalk.cyan( 97 | `${OUTPUT_PATH}/${fileName}${fileType.extension}` 98 | )}\n` 99 | ); 100 | } catch (error) { 101 | stop(); 102 | if (error.message) console.error(`\n${error.message}\n`); 103 | } finally { 104 | await context?.close(); 105 | await browser?.close(); 106 | } 107 | })(); 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | FlashscoreScraping logo 3 |

4 | 5 |

6 | Scrape match results, statistics, and league data from Flashscore 7 |

8 | 9 |

10 | 11 | 12 | 13 |

14 | 15 |

16 | Demo 17 |

18 | 19 | ## About the Project 20 | 21 | [Flashscore](https://flashscore.com) is one of the most popular platforms for real-time sports results, statistics, and standings, but it has **no official API**. 22 | 23 | **FlashscoreScraping** bridges this gap by extracting structured match data directly from the site, enabling use in: 24 | 25 | - 🔎 Tracking favorite teams and leagues 26 | - 📊 Building analytics dashboards 27 | - 🧠 Research & academic datasets 28 | - 🤖 Bots, automations, ML training, data pipelines 29 | 30 | ## Getting Started 31 | 32 | ```bash 33 | git clone https://github.com/gustavofariaa/FlashscoreScraping.git 34 | cd FlashscoreScraping 35 | npm install 36 | npx playwright install-deps chromium 37 | npm run start 38 | ``` 39 | 40 | ## Command-Line Parameters 41 | 42 | | Parameter | Default | Required | Description | 43 | | :------------- | :-----: | :------------------: | :--------------------------------------------------------------------------------- | 44 | | `country` | - | ✅ if using `league` | Country to scrape (e.g. `brazil`) | 45 | | `league` | - | ❌ | Specific league to scrape (e.g. `serie-a`) | 46 | | `fileType` | `json` | ❌ | Output format: `json`, `json-array` or `csv` | 47 | | `concurrency` | `10` | ❌ | Number of matches scraped in parallel. 
Higher = faster, but heavier on CPU/network | 48 | | `saveInterval` | `10` | ❌ | Number of matches processed before data is saved to disk | 49 | | `headless` | `true` | ❌ | Show browser UI (`false`) or run hidden (`true`) | 50 | 51 | ### Examples 52 | 53 | Scrape Brazilian Serie A 2023 into JSON: 54 | 55 | ```bash 56 | npm run start country=brazil league=serie-a-2023 fileType=json 57 | ``` 58 | 59 | Scrape Premier League 22/23 with visible browser and export CSV: 60 | 61 | ```bash 62 | npm run start country=england league=premier-league-2022-2023 headless=false fileType=csv 63 | ``` 64 | 65 | ## Data Structure 66 | 67 | Each match result includes: 68 | 69 | ```json 70 | { 71 | "IHCq3ARL": { 72 | "stage": "FINAL", 73 | "date": "20.02.2022 16:00", 74 | "status": "AFTER PENALTIES", 75 | "home": { 76 | "name": "Atletico-MG", 77 | "image": "https://static.flashscore.com/res/image/data/WbSJHDh5-pCk2vaSD.png" 78 | }, 79 | "away": { 80 | "name": "Flamengo RJ", 81 | "image": "https://static.flashscore.com/res/image/data/ADvIaiZA-2R2JjDQC.png" 82 | }, 83 | "result": { 84 | "home": "3", 85 | "away": "2", 86 | "regulationTime": "2-2", 87 | "penalties": "8-7" 88 | }, 89 | "information": [ 90 | { 91 | "category": "Referee", 92 | "value": "Daronco A. 
(Bra)" 93 | }, 94 | { 95 | "category": "Venue", 96 | "value": "Arena Pantanal (Cuiabá)" 97 | }, 98 | { 99 | "category": "Capacity", 100 | "value": "44 000" 101 | } 102 | ], 103 | "statistics": [ 104 | { 105 | "category": "Ball Possession", 106 | "homeValue": "57%", 107 | "awayValue": "43%" 108 | } 109 | // statistics are dynamic and may vary per match 110 | ] 111 | } 112 | } 113 | ``` 114 | 115 | ### Field Reference 116 | 117 | | Field | Type | Description | 118 | | :-------------- | :------- | :----------------------------------------------------- | 119 | | `matchId` | `string` | Unique identifier for the match | 120 | | `stage` | `string` | Competition phase or round (e.g., `QUARTER-FINALS`) | 121 | | `status` | `string` | Current or final state of the match (e.g., `FINISHED`) | 122 | | `date` | `string` | Full match date & time (dd.mm.yyyy hh:mm) | 123 | | `home` / `away` | `object` | Team data (name + logo URL) | 124 | | `result` | `object` | Match score data (may be empty if not available) | 125 | | `information` | `array` | Extra match info (referee, stadium, etc.) | 126 | | `statistics` | `array` | Variable-length list of stats (depends on match) | 127 | 128 | ## Issues & Contribution 129 | 130 | Found a bug? Want to suggest a feature? [Open an issue](https://github.com/gustavofariaa/FlashscoreScraping/issues) 131 | 132 | ## Support 133 | 134 | If this project helped you, consider leaving a star. It motivates development and helps more people find the repo. 
import { openPageAndNavigate, waitForSelectorSafe } from "../../index.js";

// Selector for one match row in a league listing page.
const MATCH_SELECTOR =
  ".event__match.event__match--static.event__match--twoLine";
// Selector for the button that appends further matches to the listing.
const LOAD_MORE_SELECTOR = '[data-testid="wcl-buttonLink"]';
// Pause (ms) after each "load more" click before re-counting the rows.
const CLICK_DELAY = 600;
// Give up after this many consecutive clicks that add no new rows.
const MAX_EMPTY_CYCLES = 4;
// Cells of the "match information" box on the summary page; read in pairs.
const MATCH_INFORMATION_SELECTOR =
  "div[data-testid='wcl-summaryMatchInformation'] > div";

/**
 * Keeps clicking the "load more" button until it is gone, a click fails, or
 * MAX_EMPTY_CYCLES consecutive clicks add no new match rows.
 *
 * @param {object} page - Page handle returned by openPageAndNavigate.
 * @returns {Promise<void>}
 */
const loadAllMatches = async (page) => {
  const countMatches = () => page.$$eval(MATCH_SELECTOR, (els) => els.length);

  let emptyCycles = 0;
  while (emptyCycles < MAX_EMPTY_CYCLES) {
    const countBefore = await countMatches();

    const loadMoreBtn = await page.$(LOAD_MORE_SELECTOR);
    if (!loadMoreBtn) break;

    try {
      await loadMoreBtn.click();
      await page.waitForTimeout(CLICK_DELAY);
    } catch {
      // Best effort: a failed click ends the expansion and the rows loaded
      // so far are used.
      break;
    }

    const countAfter = await countMatches();
    emptyCycles = countAfter === countBefore ? emptyCycles + 1 : 0;
  }
};

/**
 * Collects the id and detail URL of every match listed on a league page.
 *
 * @param {object} context - Browser context handed to openPageAndNavigate.
 * @param {string} leagueSeasonUrl - URL of the league season page.
 * @param {string} type - Path segment appended to leagueSeasonUrl.
 * @returns {Promise<Array<{id: string, url: (string|null)}>>} One entry per
 *   match row; `url` is null when the row has no `a.eventRowLink` anchor.
 */
export const getMatchLinks = async (context, leagueSeasonUrl, type) => {
  const page = await openPageAndNavigate(context, `${leagueSeasonUrl}/${type}`);

  try {
    await loadAllMatches(page);
    await waitForSelectorSafe(page, [MATCH_SELECTOR]);

    const matchIdList = await page.$$eval(MATCH_SELECTOR, (rows) =>
      rows.map((row) => ({
        id: row.id.replace("g_1_", ""),
        url: row.querySelector("a.eventRowLink")?.href ?? null,
      }))
    );

    console.info(`✅ Found ${matchIdList.length} matches for ${type}`);
    return matchIdList;
  } finally {
    // Close the page even when scraping throws, so pages do not pile up.
    await page.close();
  }
};

/**
 * Scrapes header data, match information and statistics for one match.
 *
 * @param {object} context - Browser context handed to openPageAndNavigate.
 * @param {{id: string, url: string}} match - Entry produced by getMatchLinks.
 * @returns {Promise<object>} `{ matchId, ...header fields, information,
 *   statistics }`; `statistics` is empty when no statistics URL can be built.
 */
export const getMatchData = async (context, { id: matchId, url }) => {
  const page = await openPageAndNavigate(context, url);

  try {
    await waitForSelectorSafe(page, [
      ".duelParticipant__startTime",
      MATCH_INFORMATION_SELECTOR,
    ]);

    const matchData = await extractMatchData(page);
    const information = await extractMatchInformation(page);

    let statistics = [];
    const statsLink = buildStatsUrl(url);
    if (statsLink) {
      await page.goto(statsLink, { waitUntil: "domcontentloaded" });

      await waitForSelectorSafe(page, [
        "div[data-testid='wcl-statistics']",
        "div[data-testid='wcl-statistics-value']",
      ]);

      statistics = await extractMatchStatistics(page);
    }

    return { matchId, ...matchData, information, statistics };
  } finally {
    // Close the page even when scraping throws, so pages do not pile up.
    await page.close();
  }
};

/**
 * Derives the statistics page URL from a match URL.
 *
 * @param {string|null|undefined} matchUrl - Absolute match URL.
 * @returns {string|null} `<origin><path>/summary/stats/0/?mid=<mid>`, with the
 *   query omitted when the match URL has no `mid`; null when matchUrl is empty.
 */
const buildStatsUrl = (matchUrl) => {
  if (!matchUrl) return null;

  const source = new URL(matchUrl);
  const base = source.origin + source.pathname.replace(/\/$/, "");
  const statsUrl = new URL(`${base}/summary/stats/0/`);

  const mid = source.searchParams.get("mid");
  if (mid !== null) statsUrl.searchParams.set("mid", mid);

  return statsUrl.href;
};

/**
 * Reads stage, date, status, teams and result from the match header.
 * Fields whose element is missing come back undefined, except `status`,
 * which falls back to "NOT STARTED".
 *
 * @param {object} page - Page currently showing the match summary.
 * @returns {Promise<object>} `{ stage, date, status, home, away, result }`.
 */
const extractMatchData = async (page) => {
  await waitForSelectorSafe(page, [
    "span[data-testid='wcl-scores-overline-03']",
    ".duelParticipant__startTime",
    ".fixedHeaderDuel__detailStatus",
    ".tournamentHeader__country > a",
    ".detailScore__wrapper span:not(.detailScore__divider)",
    ".duelParticipant__home .participant__image",
    ".duelParticipant__away .participant__image",
    ".duelParticipant__home .participant__participantName.participant__overflow",
    ".duelParticipant__away .participant__participantName.participant__overflow",
  ]);

  // The callback runs in the browser, so its helpers must live inside it.
  return page.evaluate(() => {
    const textOf = (selector) =>
      document.querySelector(selector)?.innerText.trim();

    const participant = (side) => ({
      name: textOf(
        `.duelParticipant__${side} .participant__participantName.participant__overflow`
      ),
      image: document.querySelector(
        `.duelParticipant__${side} .participant__image`
      )?.src,
    });

    // The third overline label holds the stage as its last " - " segment.
    const stageLabel = document
      .querySelectorAll("span[data-testid='wcl-scores-overline-03']")[2]
      ?.innerText.trim();

    const scores = document.querySelectorAll(
      ".detailScore__wrapper span:not(.detailScore__divider)"
    );

    const penaltiesLabel = Array.from(
      document.querySelectorAll('[data-testid="wcl-scores-overline-02"]')
    ).find((element) => element.innerText.trim().toLowerCase() === "penalties");

    return {
      stage: stageLabel?.split(" - ").pop().trim(),
      date: textOf(".duelParticipant__startTime"),
      status: textOf(".fixedHeaderDuel__detailStatus") ?? "NOT STARTED",
      home: participant("home"),
      away: participant("away"),
      result: {
        home: scores[0]?.innerText.trim(),
        away: scores[1]?.innerText.trim(),
        regulationTime: textOf(".detailScore__fullTime")?.replace(
          /[\n()]/g,
          ""
        ),
        penalties: penaltiesLabel?.nextElementSibling?.innerText
          ?.trim()
          .replace(/\s+/g, ""),
      },
    };
  });
};

/**
 * Reads the match information box as category/value pairs.
 * Cells alternate: an even-indexed cell is the category and the next cell is
 * its value (undefined when the final category has no following cell).
 *
 * @param {object} page - Page currently showing the match summary.
 * @returns {Promise<Array<{category: string, value: (string|undefined)}>>}
 */
const extractMatchInformation = (page) =>
  page.$$eval(MATCH_INFORMATION_SELECTOR, (cells) => {
    // Collapse whitespace and strip every colon plus edge whitespace.
    const clean = (text) =>
      text
        ?.trim()
        .replace(/\s+/g, " ")
        .replace(/(^[:\s]+|[:\s]+$|:)/g, "");

    const pairs = [];
    for (let index = 0; index < cells.length; index += 2) {
      pairs.push({
        category: clean(cells[index].textContent),
        value: clean(cells[index + 1]?.innerText),
      });
    }
    return pairs;
  });

/**
 * Reads every row of the statistics page.
 *
 * @param {object} page - Page currently showing the match statistics.
 * @returns {Promise<Array<{category: (string|undefined),
 *   homeValue: (string|undefined), awayValue: (string|undefined)}>>}
 */
const extractMatchStatistics = (page) =>
  page.$$eval("div[data-testid='wcl-statistics']", (rows) =>
    rows.map((row) => {
      // First <strong> is the home value, second is the away value.
      const [home, away] = row.querySelectorAll(
        "div[data-testid='wcl-statistics-value'] > strong"
      );
      return {
        category: row
          .querySelector("div[data-testid='wcl-statistics-category']")
          ?.innerText.trim(),
        homeValue: home?.innerText.trim(),
        awayValue: away?.innerText.trim(),
      };
    })
  );