├── README.md
├── instagram_bot.py
├── twitter_bot.py
├── twitter_twint_script.ipynb
├── web_scraping_beautifulsoup.ipynb
├── web_scraping_with_scrapy.py
├── web_scraping_with_selenium_part1.py
└── web_scraping_with_selenium_part2.py
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping
This is the code repository for the web scraping tutorials that I shared on my YouTube channel. You can find the videos in this playlist: https://youtube.com/playlist?list=PLE_KQmLMvwwOze_b-ZLkGJzCzbejz_Yaz
--------------------------------------------------------------------------------
/instagram_bot.py:
--------------------------------------------------------------------------------
'''
* @channel Morita DataLand
* @author Morita Tarvirdians
* @email tarvirdians.morita@gmail.com
* @desc Instagram Bot using Selenium and Python
'''

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import data  # local module that holds the account password
from time import sleep


DRIVER_PATH = r"C:\Program Files (x86)\chromedriver.exe"

# Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option('excludeSwitches', ['enable-logging'])


class Bot:
    def __init__(self, username, password) -> None:
        # open Instagram and log in
        self.driver = webdriver.Chrome(DRIVER_PATH, options=options)
        self.driver.get('https://www.instagram.com/')
        sleep(3)
        self.driver.find_element_by_name("username").send_keys(username)
        self.driver.find_element_by_name("password").send_keys(password)
        self.driver.find_element_by_xpath('//div[text() = "Log In"]').click()

    def user_info(self, user_id):
        # search for the user and open their profile
        search = self.driver.find_element_by_xpath('//span[text()="Search"]')
        search.click()
        sleep(5)
        search_box = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
        search_box.send_keys(user_id)
        sleep(10)
        search_box.send_keys(Keys.ENTER)
        sleep(1)
        search_box.send_keys(Keys.ENTER)
        sleep(10)
        # follower/following counts from the profile header
        followers = self.driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/a/span')
        followings = self.driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[3]/a/span')
        print("follower count: ", followers.text)
        print("following count: ", followings.text)
        self.driver.close()


insta_bot = Bot("morita_dataland", data.password)
sleep(15)
insta_bot.user_info("zuch")
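

# --- Hedged sketch (not part of the original tutorial) -----------------------
# Selenium 4 removed the find_element_by_* helpers used above. The function
# below sketches the same login flow with the Selenium 4 API (a Service
# object, By locators, and an explicit wait instead of a fixed sleep). It
# assumes Selenium 4.x and that Instagram's login form still uses the same
# field names and button text; treat it as a starting point, not a drop-in
# replacement.
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def login_selenium4(username, password):
    driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)
    driver.get('https://www.instagram.com/')
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.NAME, "username"))).send_keys(username)
    driver.find_element(By.NAME, "password").send_keys(password)
    driver.find_element(By.XPATH, '//div[text() = "Log In"]').click()
    return driver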
--------------------------------------------------------------------------------
/twitter_bot.py:
--------------------------------------------------------------------------------
# imports
import tweepy as twp
import keys  # local module that holds the API credentials
import json

# authenticate to Twitter (Tweepy v3 / API v1.1 style)
auth = twp.OAuthHandler(keys.consumer_key, keys.consumer_secret)
auth.set_access_token(keys.access_token, keys.access_token_secret)
api = twp.API(auth)

try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as e:
    print("Error during authentication:", e)


# info of a user
user_name = "elonmusk"
user = api.get_user(screen_name=user_name)
print("User details:")
print(user.name)
print(user.description)
print(user.location)
print(user.followers_count)
print("\n\n")

# get follower names
print("Names of followers:")
for follower in user.followers():
    print(follower.name)

# get tweets of a user
tweets = api.user_timeline(screen_name=user_name, count=10, include_rts=False)
for tw in tweets:
    print(tw.text)
    print(tw.created_at)


# get tweets by hashtag (retweets are excluded with the search operator;
# include_rts is not a parameter of the search endpoint)
hashtag = "#ai"
tweets = twp.Cursor(api.search, q=hashtag + " -filter:retweets", lang="en").items(1)
for tw in tweets:
    # extract the raw JSON of the tweet
    json_obj = tw._json

    # pretty print
    print(json.dumps(json_obj, indent=1))
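

# --- Hedged sketch (not part of the original tutorial) -----------------------
# Tweepy v4 moved search to the Twitter API v2 Client, and api.search above
# was renamed api.search_tweets. The function below sketches the hashtag
# search with tweepy.Client instead; it assumes Tweepy >= 4, a bearer token
# stored as keys.bearer_token (not defined in this repo), and an API access
# level that allows recent search.
def search_hashtag_v2(query="#ai -is:retweet", max_results=10):
    client = twp.Client(bearer_token=keys.bearer_token)
    response = client.search_recent_tweets(query=query, max_results=max_results)
    for tweet in response.data or []:
        print(tweet.id, tweet.text)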
| "Web Scraping with Python - BeautifulSoup" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Channel: Morita DataLand" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Author: Morita Tarvirdians" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import requests" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from bs4 import BeautifulSoup" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "page = requests.get(\"https://www.monster.ca/jobs/search/?q=data-scientist&where=Ontario\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "page" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "soup = BeautifulSoup(page.text, \"html.parser\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "soup.find_all(\"a\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "result = soup.find(id=\"SearchResults\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "print(result.prettify())" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "locations = result.find_all('div', class_=\"location\")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "locations" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "companies = result.find_all('div', class_=\"company\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "companies" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "for item in companies:\n", 139 | " print(item.text)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "job_sections = result.find_all('section', class_=\"card-content\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "for sec in job_sections:\n", 158 | " print(sec.text)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "import xlsxwriter" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "workbook = xlsxwriter.Workbook('Jobs.xlsx')\n", 177 | "worksheet = workbook.add_worksheet()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | 
"execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "row = 0\n", 187 | "col = 0" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "for sec in job_sections:\n", 197 | " title = sec.find(\"h2\", class_=\"title\")\n", 198 | " company = sec.find(\"div\", class_=\"company\")\n", 199 | " location = sec.find(\"div\", class_=\"location\")\n", 200 | " if None in (title, company, location):\n", 201 | " continue\n", 202 | " worksheet.write(row, col, title.text)\n", 203 | " worksheet.write(row, col+1, company.text)\n", 204 | " worksheet.write(row, col+2, location.text)\n", 205 | " \n", 206 | " row+=1\n", 207 | "workbook.close()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.8.5" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 4 239 | } 240 | -------------------------------------------------------------------------------- /web_scraping_with_scrapy.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes" 5 | start_urls=["http://quotes.toscrape.com/"] 6 | 7 | def parse(self, response): 8 | for item in response.css("div.quote"): 9 | yield{ 10 | "text" :item.css("span.text::text").get(), 11 | "author" : item.css("small.author::text").get(), 12 | "tags" : item.css("div.tags a.tag::text").getall() 13 | } 14 | 15 | next_page = response.css("li.next a::attr(href)").get() 16 | if next_page is not None: 17 | yield response.follow(next_page, callback=self.parse) -------------------------------------------------------------------------------- /web_scraping_with_selenium_part1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * @channel Morita DataLand 3 | * @author Morita Tarvirdians 4 | * @email tarvirdians.morita@gmail.com 5 | * @desc Web Scraping With Selenium 6 | ''' 7 | 8 | from selenium import webdriver 9 | from selenium.webdriver.common.keys import Keys 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | import time 14 | import pandas as pd 15 | 16 | driver_path = r"C:\\Program Files (x86)\\chromedriver.exe" 17 | 18 | # chrome options 19 | options = webdriver.ChromeOptions() 20 | options.add_argument('--ignore-certificate-errors') 21 | options.add_argument('--ignore-ssl-errors') 22 | options.add_experimental_option('excludeSwitches', ['enable-logging']) 23 | 24 | 25 | 26 | driver = webdriver.Chrome(driver_path, chrome_options=options) 27 | #driver.get("https://google.com") 28 | 29 | # get url 30 | url = "https://www.mastersportal.com/" 31 | driver.get(url) 32 | 33 | # locate HomeWhat element in the page and send keys to it 34 | what_search_box = driver.find_element_by_id("HomeWhat") 35 | what_search_box.send_keys("computer") 36 | 
--------------------------------------------------------------------------------
/web_scraping_with_selenium_part1.py:
--------------------------------------------------------------------------------
'''
* @channel Morita DataLand
* @author Morita Tarvirdians
* @email tarvirdians.morita@gmail.com
* @desc Web Scraping With Selenium
'''

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

driver_path = r"C:\Program Files (x86)\chromedriver.exe"

# Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option('excludeSwitches', ['enable-logging'])


driver = webdriver.Chrome(driver_path, options=options)

# open the site
url = "https://www.mastersportal.com/"
driver.get(url)

# locate the HomeWhat element on the page and send keys to it
what_search_box = driver.find_element_by_id("HomeWhat")
what_search_box.send_keys("computer")
what_search_box.send_keys(Keys.ENTER)

# locate the HomeWhere element on the page and send keys to it
where_search_box = driver.find_element_by_id("HomeWhere")
where_search_box.send_keys("Germany")
where_search_box.send_keys(Keys.ENTER)

time.sleep(5)

# get some fields and write them to a CSV file
lst = []
cols = ["title", "uni", "fee"]
titles = driver.find_elements_by_class_name("StudyTitle")
# these selectors combine a class with :nth-child, so use CSS selectors explicitly
unis = driver.find_elements_by_css_selector(".LocationFact:nth-child(1)")
fees = driver.find_elements_by_css_selector(".KeyFact:nth-child(1)")

for title, uni, fee in zip(titles, unis, fees):
    lst.append((title.text, uni.text, fee.text))

driver.quit()

df = pd.DataFrame(lst, columns=cols)
df.to_csv("out.csv")
--------------------------------------------------------------------------------
/web_scraping_with_selenium_part2.py:
--------------------------------------------------------------------------------
'''
* @channel Morita DataLand
* @author Morita Tarvirdians
* @email tarvirdians.morita@gmail.com
* @desc Web Scraping With Selenium
'''

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


driver_path = r"C:\Program Files (x86)\chromedriver.exe"

# Chrome options
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# set up the driver
driver = webdriver.Chrome(driver_path, options=options)

# open the site
url = "https://www.mastersportal.com/"
driver.get(url)

# element = driver.find_element_by_id("LoginButton")
# element.click()
try:
    driver.execute_script("window.scrollTo(0,1000)")
    # driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    # wait up to 10 seconds for the element to appear, then click it
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "TakeTest"))
    )
    element.click()
    time.sleep(3)
    driver.get_screenshot_as_file("screenshot.png")
    driver.back()
except Exception as e:
    print("something went wrong:", e)
finally:
    # quit the browser whether or not the click succeeded
    driver.quit()
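

# --- Hedged sketch (not part of the original tutorial) -----------------------
# Both Selenium scripts in this repo hard-code a chromedriver path and rely on
# fixed sleeps. The function below (a hypothetical helper, not used elsewhere)
# sketches the part-1 search flow with Selenium >= 4.6, where Selenium Manager
# can resolve the driver automatically, and with explicit waits in place of
# time.sleep. The element IDs and class names are taken from part 1 and may
# change on the site.
def scrape_titles_selenium4(what="computer", where="Germany"):
    opts = webdriver.ChromeOptions()
    opts.add_argument("--headless")
    drv = webdriver.Chrome(options=opts)  # no driver path: Selenium Manager finds one
    try:
        drv.get("https://www.mastersportal.com/")
        wait = WebDriverWait(drv, 15)
        wait.until(EC.presence_of_element_located((By.ID, "HomeWhat"))).send_keys(what, Keys.ENTER)
        drv.find_element(By.ID, "HomeWhere").send_keys(where, Keys.ENTER)
        # wait until result cards exist instead of sleeping a fixed time
        titles = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "StudyTitle")))
        return [t.text for t in titles]
    finally:
        drv.quit()
--------------------------------------------------------------------------------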