├── download all django videos ├── __init__.py ├── go_django.py └── download_videos_godjango.py ├── automate your hack nyu form ├── __init__.py ├── data1.csv ├── hack_nyu.py └── fill_up_nyu.py ├── scrape all donald trump quotes ├── __init__.py ├── brain_quote_page.py ├── extract_donald_trump_quotes.py └── write_data_1.csv ├── short tutorial to handle csv files ├── __init__.py └── read_write_csv.py ├── scrape top tech news from hacker news website ├── __init__.py ├── hacker_news.py ├── extact_hacker_news.py └── write_data_1.csv ├── html dom tree.png ├── .gitignore ├── selenium-installation-guide.md ├── identify xpath.md └── README.md /download all django videos/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /automate your hack nyu form/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrape all donald trump quotes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /short tutorial to handle csv files/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrape top tech news from hacker news website/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /html dom tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rakeshsukla53/webscraping-selenium/HEAD/html dom tree.png -------------------------------------------------------------------------------- /scrape top tech 
news from hacker news website/hacker_news.py: -------------------------------------------------------------------------------- 1 | class HackerNews(object): 2 | title_url = '.title>a' 3 | -------------------------------------------------------------------------------- /automate your hack nyu form/data1.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,email_id 2 | Terri, Burns, terri.burns@nyu.edu 3 | Freia, Lobo, freia@nyu.edu 4 | jhishan, khan, jhishan@nyu.edu 5 | 6 | -------------------------------------------------------------------------------- /scrape all donald trump quotes/brain_quote_page.py: -------------------------------------------------------------------------------- 1 | 2 | class BrainQuotePage(object): 3 | donald_trump_quotes = 'span[class="bqQuoteLink"]' 4 | donald_trump_links = 'span[class="bqQuoteLink"] a' 5 | donald_trump_next_page = '.pagination-sm .active + li a ' 6 | -------------------------------------------------------------------------------- /download all django videos/go_django.py: -------------------------------------------------------------------------------- 1 | class GoDjango(object): 2 | pro_video_tag = '.media.episode-list-item.padding-15 div div span' 3 | video_container = '.media.episode-list-item.padding-15' 4 | first_video_title = 'h4[class="media-heading"] a' 5 | next_button_click = 'li[class="active"] + li a' 6 | video_download_link = 'div[class="video-description"] + iframe' 7 | -------------------------------------------------------------------------------- /automate your hack nyu form/hack_nyu.py: -------------------------------------------------------------------------------- 1 | class HackNNYU(object): 2 | first_name = 'input[ng-model="credentials.first_name"]' 3 | last_name = 'input[ng-model="credentials.last_name"]' 4 | email = '.col-sm-12>input[ng-model="credentials.email"]' 5 | password = '.col-sm-12>input[ng-model="credentials.password"]' 6 | 
agree_checkbox = '.ng-binding>input[ng-model="checkModel"]' 7 | sign_up_button = 'div>button[type="submit"]' 8 | accept_button = 'button[ng-click="positive()"]' 9 | -------------------------------------------------------------------------------- /scrape top tech news from hacker news website/extact_hacker_news.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from hacker_news import HackerNews 3 | import csv 4 | 5 | 6 | def extract_top_news(): 7 | """ scrape all top news from the hacker news website """ 8 | data = [] 9 | driver = webdriver.Firefox() 10 | driver.get('https://news.ycombinator.com/') 11 | element_list = driver.find_elements_by_css_selector(HackerNews.title_url) 12 | for element in element_list: 13 | try: 14 | title_url = (element.text.encode('ascii', 'replace'), element.get_attribute('href').encode('ascii', 'replace')) 15 | data.append(title_url) 16 | except Exception as e: 17 | print e 18 | headers = ('Title', 'Title_URL') 19 | with open('write_data_1.csv', 'w+') as data_file: 20 | writer = csv.writer(data_file) 21 | writer.writerow(headers) 22 | writer.writerows(data) 23 | driver.quit() 24 | 25 | if __name__ == '__main__': 26 | extract_top_news() 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | .idea 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other 
infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Created by .ignore support plugin (hsz.mobi)
--------------------------------------------------------------------------------
/selenium-installation-guide.md:
--------------------------------------------------------------------------------
Python, Selenium Installation Guide

Installing Python and Selenium

Installing Python:

Windows : http://python.org/download/.

Note: If you are using a Linux, Mac OS X, or Unix operating system, then Python is installed by default with the OS.

1. What is the PIP installer tool?
pip is a package management system used to install and manage software packages written in Python.
pip is a recursive acronym that can stand for either "Pip Installs Packages" or "Pip Installs Python".

2. Where do we get this PIP tool?
And how to configure it in our Local Machines 16 | 17 | 3.Installing Selenium 18 | Use Below command on PIP to install Selenium Package 19 | pip install selenium 20 | 21 | This command will set up the Selenium WebDriver client library on your machine with all modules and classes that we will need to create automated scripts using Python 22 | 23 | 4.pip install -U selenium 24 | The optional –U flag will upgrade the existing version of the installed package 25 | 26 | -------------------------------------------------------------------------------- /scrape all donald trump quotes/extract_donald_trump_quotes.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from brain_quote_page import BrainQuotePage 3 | 4 | 5 | def extract_donald_trump_quotes(): 6 | """ scrape all donald trump quotes """ 7 | browser = webdriver.Firefox() 8 | browser.get('http://www.brainyquote.com/quotes/authors/d/donald_trump.html') 9 | while True: 10 | try: 11 | all_quotes = browser.find_elements_by_css_selector(BrainQuotePage.donald_trump_quotes) 12 | all_quote_links = browser.find_elements_by_css_selector(BrainQuotePage.donald_trump_links) 13 | for quotes, quote_link in zip(all_quotes, all_quote_links): 14 | # donald trump quote and link 15 | print (quotes.text, quote_link.get_attribute('href')) 16 | next_page = browser.find_element_by_css_selector(BrainQuotePage.donald_trump_next_page) 17 | next_page.click() 18 | except: 19 | # we have reached the last page 20 | break 21 | browser.quit() 22 | 23 | if __name__ == '__main__': 24 | extract_donald_trump_quotes() 25 | -------------------------------------------------------------------------------- /short tutorial to handle csv files/read_write_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | with open('data1.csv', 'r+') as data_file: 4 | data = csv.DictReader(data_file) 5 | for row in data: 6 | print(row) 7 | 8 | with 
open('data1.csv', 'r+') as data_file: 9 | data = csv.reader(data_file, delimiter='|') 10 | for row in data: 11 | print(row) 12 | 13 | data = [ 14 | ('japanese', '5', '2001'), 15 | ('korean', '2', '1998'), 16 | ('german', '4', '2005'), 17 | ('english', '10', '1990'), 18 | ('tamil', '7', '2010'), 19 | ] 20 | 21 | headers = ('language', 'ability', 'started') 22 | 23 | with open('write_data_1.csv', 'w+') as data_file: 24 | writer = csv.writer(data_file) 25 | writer.writerow(headers) 26 | writer.writerows(data) 27 | 28 | data = [ 29 | {'language': 'japanese', 'ability': '5', 'started': '2001'}, 30 | {'language': 'korean', 'ability': '2', 'started': '1998'}, 31 | {'language': 'german', 'ability': '4', 'started': '2005'}, 32 | {'language': 'english', 'ability': '10', 'started': '1990'}, 33 | {'language':'tamil', 'ability': '7', 'started': '2010'} 34 | ] 35 | 36 | with open('write_data_2.csv', 'w+') as data_file: 37 | writer = csv.DictWriter(data_file, fieldnames=headers) 38 | writer.writeheader() 39 | writer.writerows(data) 40 | 41 | -------------------------------------------------------------------------------- /download all django videos/download_videos_godjango.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from go_django import GoDjango 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.common.by import By 6 | from time import sleep 7 | 8 | 9 | def download_videos(): 10 | """ download all videos from www.godjango.com """ 11 | driver = webdriver.Firefox() 12 | driver.get('https://godjango.com/browse/') 13 | WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, GoDjango.first_video_title))) 14 | while True: 15 | for element in range(0, 10): 16 | all_video_elements = driver.find_elements_by_css_selector(GoDjango.first_video_title) 17 | 
all_video_elements[element].click() 18 | try: 19 | print driver.find_element_by_css_selector(GoDjango.video_download_link).get_attribute('src') 20 | except: 21 | print "Video is private" 22 | driver.execute_script("window.history.go(-1)") 23 | driver.execute_script("window.scrollTo(0, 465);") 24 | driver.execute_script("window.scrollTo(0, 4650);") 25 | sleep(1) 26 | driver.find_element_by_css_selector(GoDjango.next_button_click).click() 27 | driver.quit() 28 | 29 | if __name__ == '__main__': 30 | download_videos() 31 | 32 | -------------------------------------------------------------------------------- /automate your hack nyu form/fill_up_nyu.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | import csv 7 | from time import sleep 8 | 9 | 10 | # set the scrolling behavior to down 11 | DesiredCapabilities.FIREFOX["elementScrollBehavior"] = 1 12 | 13 | 14 | def fill_up_hack_nyu(student): 15 | """ automate your hack nyu form using selenium""" 16 | driver = webdriver.Firefox() 17 | wait = WebDriverWait(driver, 10) 18 | # load the page 19 | driver.get("http://hacknyu.org/signup") 20 | # get the form element 21 | form = driver.find_element_by_css_selector("form[name='signupForm']") 22 | # fill the fields 23 | form.find_element_by_css_selector("input[name='firstName']").send_keys(student['first_name']) 24 | form.find_element_by_css_selector("input[name='lastName']").send_keys(student['last_name']) 25 | form.find_element_by_css_selector("input[name='email']").send_keys(student['email_id']) 26 | form.find_element_by_css_selector("input[name='password']").send_keys("technyu") 27 | # click and accept terms 28 | 
form.find_element_by_xpath("//input[@name='terms']/..").click() 29 | wait.until(EC.presence_of_element_located((By.XPATH, "//button[.='Accept']"))).click() 30 | wait.until_not(EC.presence_of_element_located((By.CSS_SELECTOR, ".modal"))) 31 | # click on submit 32 | form.find_element_by_css_selector("button[type='submit']").click() 33 | driver.quit() 34 | 35 | 36 | def read_csv_files(): 37 | with open('data1.csv', 'r+') as data_file: 38 | data = csv.DictReader(data_file) 39 | for row in data: 40 | fill_up_hack_nyu(row) 41 | sleep(1) 42 | 43 | read_csv_files() 44 | -------------------------------------------------------------------------------- /identify xpath.md: -------------------------------------------------------------------------------- 1 | XPath locator examples 2 | 3 | To find the link in this page: 4 | 5 | 6 |
<p class="content">
    The fox <a href="#">jumped</a> over the lazy brown dog.
</p>
7 | 8 | 9 | A raw XPath traverses the hierarchy from the root element of the document (page) to the desired element: 10 | 11 | /html/body/p/a 12 | 13 | 14 | Child of Element ID 15 | 16 | XPath can find an element by ID like this: 17 | 18 | //*[@id="element_id"] 19 | 20 | So if you need to find an element that is near another element with an ID, like the link in this example: 21 | 22 | 23 |
<p id="fox" class="content">
    The fox <a href="#">jumped</a> over the lazy brown dog.
</p>
24 | 25 | 26 | you could try an XPath like this to find the first link that is a child of the element with ID=”fox”: 27 | 28 | //*[@id="fox"]/a 29 | 30 | 31 | Button Text 32 | 33 | There are two ways to declare a standard button in HTML, discounting the many ways to make something that looks like a button, but is not. To determine how an element is declared in the HTML, see how to inspect an element in the browser. 34 | 35 | If the button is declared with the