├── .gitignore ├── README.md ├── requirements.txt └── webscraping.py /.gitignore: -------------------------------------------------------------------------------- 1 | geckodriver.log 2 | ranking.json 3 | 4 | # ignore directory .vscode/ and venv/ 5 | .vscode/ 6 | venv/ 7 | 8 | # Ignore driver 9 | geckodriver.exe 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping JavaScript Generated Pages with Python 2 | 3 | This project was created just for educational purposes. 4 | 5 | The code shows how to do web scraping of dynamic content pages generated from JavaScript using Python and Selenium. 6 | 7 | We use as data the NBA site to extract stats information from players and generate a json file with some top 10 rankings. 8 | **Important: Educational Purposes Only** 9 | 10 | ## Getting Started 11 | 12 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 13 | 14 | ## Prerequisites 15 | 16 | What things you need to install the software and how to install them 17 | 18 | * Python 3.x 19 | * Geckodriver 20 | * Firefox (you can use another browser) 21 | * The Python libraries listed below 22 | 23 | ## Installing 24 | 25 | A step by step series of examples that tell you how to get a development env running 26 | 27 | ### Install the following Python libraries: 28 | 29 | * **requests** - Requests is the only Non-GMO HTTP library for Python, safe for human consumption; 30 | * **pandas** - A great Python Data Analysis Library; 31 | * **lxml** - Library for processing XML and HTML; 32 | * **beautifulsoup4** - Library for pulling data out of HTML and XML files; 33 | * **selenium** - An API to write functional/acceptance tests using Selenium WebDriver. 
34 | 35 | With: 36 | ``` 37 | pip install -r requirements.txt 38 | ``` 39 | 40 | ### Geckodriver 41 | 42 | [You can find install instructions in the official repository.](https://github.com/mozilla/geckodriver/releases) 43 | 44 | 45 | ## Running the code 46 | 47 | ``` 48 | python webscraping.py 49 | ``` 50 | 51 | ## Contributing 52 | 53 | Feel free to submit pull requests to us. 54 | 55 | ## Authors 56 | 57 | * **Gabriel Froes** - *Initial work* - [Twitter](https://www.twitter.com/gabrielfroes) 58 | * **Vanessa Weber** - *Initial work* - [Twitter](https://www.twitter.com/nessaweberfroes) 59 | 60 | ## License 61 | 62 | This project is licensed under the [GNU General Public License](https://opensource.org/licenses/GPL-3.0). 63 | 64 | ## Acknowledgments 65 | 66 | * First steps in Python language 67 | * Create simple and useful things 68 | * Build content for [Código Fonte TV](https://youtu.be/Vxl5jUltHBo), our Youtube Channel. 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.16.0 2 | pandas==1.0.1 3 | lxml==4.9.1 4 | beautifulsoup4==4.8.2 5 | selenium==3.141.0 6 | -------------------------------------------------------------------------------- /webscraping.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import requests 4 | import pandas as pd 5 | from bs4 import BeautifulSoup 6 | from selenium import webdriver 7 | from selenium.webdriver.firefox.options import Options 8 | import json 9 | 10 | # Grab content from URL (Pegar conteúdo HTML a partir da URL) 11 | url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1" 12 | top10ranking = {} 13 | 14 | rankings = { 15 | '3points': {'field': 'FG3M', 'label': '3PM'}, 16 | 'points': {'field': 'PTS', 'label': 'PTS'}, 17 | 
# -*- encoding: utf-8 -*-
"""Scrape the NBA stats site (a JavaScript-rendered page) with Selenium
and build top-10 player rankings, saved to ranking.json.

Educational purposes only.
"""

import json

import pandas as pd
import requests  # noqa: F401 -- kept for parity with requirements.txt/README
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Grab content from URL (Pegar conteúdo HTML a partir da URL):
# season totals for 2019-20, initially sorted by player name.
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"
top10ranking = {}

# Ranking key -> table column: 'field' is the sortable header's data-field
# attribute on the page, 'label' is the visible column name pandas reads.
rankings = {
    '3points': {'field': 'FG3M', 'label': '3PM'},
    'points': {'field': 'PTS', 'label': 'PTS'},
    'assistants': {'field': 'AST', 'label': 'AST'},
    'rebounds': {'field': 'REB', 'label': 'REB'},
    'steals': {'field': 'STL', 'label': 'STL'},
    'blocks': {'field': 'BLK', 'label': 'BLK'},
}


def buildrank(stat):
    """Return the top-10 ranking for *stat* as a list of record dicts.

    ``stat`` must be a key of ``rankings`` (renamed from ``type`` to avoid
    shadowing the builtin; callers pass it positionally). Relies on the
    module-level ``driver`` already having loaded ``url``.

    Each record has the keys ``pos``, ``player``, ``team`` and ``total``.
    """
    field = rankings[stat]['field']
    label = rankings[stat]['label']

    # Click the column header so the stats table is re-sorted by this field.
    driver.find_element_by_xpath(
        f"//div[@class='nba-stat-table']//table//thead//tr//th[@data-field='{field}']").click()

    element = driver.find_element_by_xpath(
        "//div[@class='nba-stat-table']//table")
    html_content = element.get_attribute('outerHTML')

    # Parse HTML (Parsear o conteúdo HTML) - BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # Data Structure Conversion (Estruturar conteúdo em um Data Frame) - Pandas.
    # .copy() avoids pandas' SettingWithCopyWarning when renaming the columns
    # of what would otherwise be a slice of df_full.
    df_full = pd.read_html(str(table))[0].head(10)
    df = df_full[['Unnamed: 0', 'PLAYER', 'TEAM', label]].copy()
    df.columns = ['pos', 'player', 'team', 'total']

    # Convert to Dict (Transformar os Dados em um Dicionário de dados próprio)
    return df.to_dict('records')


option = Options()
option.headless = True
driver = webdriver.Firefox(options=option)

# try/finally guarantees the Firefox/geckodriver processes are cleaned up
# even if the page fails to load or a selector breaks mid-scrape.
try:
    driver.get(url)
    driver.implicitly_wait(10)  # in seconds

    for k in rankings:
        top10ranking[k] = buildrank(k)
finally:
    driver.quit()

# Dump and Save to JSON file (Converter e salvar em um arquivo JSON).
# ensure_ascii=False keeps accented player names human-readable in the output.
with open('ranking.json', 'w', encoding='utf-8') as jp:
    json.dump(top10ranking, jp, indent=4, ensure_ascii=False)