├── README.md
├── instagram_bot.py
├── twitter_bot.py
├── twitter_twint_script.ipynb
├── web_scraping_beautifulsoup.ipynb
├── web_scraping_with_scrapy.py
├── web_scraping_with_selenium_part1.py
└── web_scraping_with_selenium_part2.py


/README.md:
--------------------------------------------------------------------------------
1 | # Web-Scraping
2 | This is code repository of web scraping tutorials that I shared on my YouTube channel. You can find videos in this playlist: https://youtube.com/playlist?list=PLE_KQmLMvwwOze_b-ZLkGJzCzbejz_Yaz
3 | 


--------------------------------------------------------------------------------
/instagram_bot.py:
--------------------------------------------------------------------------------
 1 | '''
 2 |  * @channel Morita DataLand
 3 |  * @author Morita Tarvirdians
 4 |  * @email tarvirdians.morita@gmail.com
 5 |  * @desc Instagram Bot using Selenium and Python
 6 | '''
 7 | 
 8 | from selenium import webdriver
 9 | from selenium.webdriver.common.keys import Keys
10 | import data
11 | from time import sleep
12 | 
13 | 
14 | DRIVER_PATH = r"C:\\Program Files (x86)\\chromedriver.exe"
15 | 
16 | # chrome options
17 | options = webdriver.ChromeOptions()
18 | options.add_argument('--ignore-certificate-errors')
19 | options.add_argument('--ignore-ssl-errors')
20 | options.add_experimental_option('excludeSwitches', ['enable-logging'])
21 | 
22 | class Bot():
23 |     def __init__(self, username, password) -> None:
24 |         self.driver = webdriver.Chrome(DRIVER_PATH, chrome_options=options)
25 |         self.driver.get('https://www.instagram.com/')
26 |         sleep(3)
27 |         self.driver.find_element_by_name("username").send_keys(username)
28 |         self.driver.find_element_by_name("password").send_keys(password)
29 |         self.driver.find_element_by_xpath('//div[text() = "Log In"]').click()
30 | 
31 | 
32 |     def user_info(self, user_id):
33 |         search = self.driver.find_element_by_xpath('//span[text()="Search"]')
34 |         search.click()
35 |         sleep(5)
36 |         search1 = self.driver.find_element_by_xpath('//input[@placeholder="Search"]')
37 |         search1.send_keys(user_id)
38 |         sleep(10)
39 |         search1.send_keys(Keys.ENTER)
40 |         sleep(1)
41 |         search1.send_keys(Keys.ENTER)
42 |         sleep(10)
43 |         followers = self.driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/a/span')
44 |         followings = self.driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[3]/a/span')
45 |         print("follower count:  ", followers.text)
46 |         print("following count: ", followings.text)
47 |         self.driver.close()
48 | 
49 | 
50 | insta_bot = Bot("morita_dataland", data.password)
51 | sleep(15)
52 | insta_bot.user_info("zuch")


--------------------------------------------------------------------------------
/twitter_bot.py:
--------------------------------------------------------------------------------
 1 | # import
 2 | import tweepy as twp
 3 | import keys
 4 | import json
 5 | 
 6 | #Authenticate to twitter
 7 | auth = twp.OAuthHandler(keys.customer_key, keys.costumer_secret)
 8 | auth.set_access_token(keys.access_token, keys.access_token_secret)
 9 | api = twp.API(auth)
10 | 
11 | try:
12 |     api.verify_credentials()
13 |     print("Authentication ok")
14 | except:
15 |     print("error during authentication")
16 | 
17 | 
18 | #Info of users
19 | user_name="elonmusk"
20 | user = api.get_user(user_name)
21 | print("User details:")
22 | print(user.name)
23 | print(user.description)
24 | print(user.location)
25 | print(user.followers_count)
26 | print("\n\n")
27 | 
28 | #Get follower names
29 | print("name of followers: ")
30 | for follower in user.followers():
31 |     print(follower.name)
32 | 
33 | #Get tweets of a user
34 | tweets = api.user_timeline(screen_name= user_name, count = 10, include_rts= False)
35 | for tw in tweets:
36 |     print(tw.text)
37 |     print(tw.created_at)
38 | 
39 | 
40 | #Get tweets by hashtag
41 | hashtag = "#ai"
42 | tweets = twp.Cursor(api.search, q=hashtag, lang="en", include_rts=False).items(1)
43 | for tw in tweets:
44 |     # extract _json
45 |     json_obj = tw._json
46 | 
47 | # pretty print
48 | print(json.dumps(json_obj, indent=1))
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/twitter_twint_script.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "\n",
  8 |     "* @channel Morita DataLand\n",
  9 |     "* @author Morita Tarvirdians\n",
 10 |     "* @email tarvirdians.morita@gmail.com\n",
 11 |     "* @desc Scrape Tweets using Twint\n"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import twint"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "import nest_asyncio\n",
 30 |     "nest_asyncio.apply()"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": null,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "# Configure\n",
 40 |     "c = twint.Config()"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "c.Search = \"btc\"\n",
 50 |     "c.Limit = 10"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "c.Store_csv = True\n",
 60 |     "c.Output = \"Twint_output.csv\""
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "# Run\n",
 70 |     "twint.run.Search(c)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "import pandas as pd"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "df = pd.read_csv(\"Twint_output.csv\")"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "df"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": []
106 |   }
107 |  ],
108 |  "metadata": {
109 |   "kernelspec": {
110 |    "display_name": "Python 3",
111 |    "language": "python",
112 |    "name": "python3"
113 |   },
114 |   "language_info": {
115 |    "codemirror_mode": {
116 |     "name": "ipython",
117 |     "version": 3
118 |    },
119 |    "file_extension": ".py",
120 |    "mimetype": "text/x-python",
121 |    "name": "python",
122 |    "nbconvert_exporter": "python",
123 |    "pygments_lexer": "ipython3",
124 |    "version": "3.8.5"
125 |   }
126 |  },
127 |  "nbformat": 4,
128 |  "nbformat_minor": 4
129 | }
130 | 


--------------------------------------------------------------------------------
/web_scraping_beautifulsoup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Web Scraping with Python - BeautifulSoup"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Channel: Morita DataLand"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Author: Morita Tarvirdians"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import requests"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": null,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "from bs4 import BeautifulSoup"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "page = requests.get(\"https://www.monster.ca/jobs/search/?q=data-scientist&where=Ontario\")"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "page"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "soup = BeautifulSoup(page.text, \"html.parser\")"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "soup.find_all(\"a\")"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "result = soup.find(id=\"SearchResults\")"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "print(result.prettify())"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "locations = result.find_all('div', class_=\"location\")"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "locations"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "companies = result.find_all('div', class_=\"company\")"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "companies"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "for item in companies:\n",
139 |     "    print(item.text)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "job_sections = result.find_all('section', class_=\"card-content\")"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "for sec in job_sections:\n",
158 |     "    print(sec.text)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "import xlsxwriter"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "workbook = xlsxwriter.Workbook('Jobs.xlsx')\n",
177 |     "worksheet = workbook.add_worksheet()"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "row = 0\n",
187 |     "col = 0"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "for sec in job_sections:\n",
197 |     "    title = sec.find(\"h2\", class_=\"title\")\n",
198 |     "    company = sec.find(\"div\", class_=\"company\")\n",
199 |     "    location = sec.find(\"div\", class_=\"location\")\n",
200 |     "    if None in (title, company, location):\n",
201 |     "        continue\n",
202 |     "    worksheet.write(row, col, title.text)\n",
203 |     "    worksheet.write(row, col+1, company.text)\n",
204 |     "    worksheet.write(row, col+2, location.text)\n",
205 |     "    \n",
206 |     "    row+=1\n",
207 |     "workbook.close()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": []
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "kernelspec": {
220 |    "display_name": "Python 3",
221 |    "language": "python",
222 |    "name": "python3"
223 |   },
224 |   "language_info": {
225 |    "codemirror_mode": {
226 |     "name": "ipython",
227 |     "version": 3
228 |    },
229 |    "file_extension": ".py",
230 |    "mimetype": "text/x-python",
231 |    "name": "python",
232 |    "nbconvert_exporter": "python",
233 |    "pygments_lexer": "ipython3",
234 |    "version": "3.8.5"
235 |   }
236 |  },
237 |  "nbformat": 4,
238 |  "nbformat_minor": 4
239 | }
240 | 


--------------------------------------------------------------------------------
/web_scraping_with_scrapy.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | 
 3 | class QuotesSpider(scrapy.Spider):
 4 |     name = "quotes"
 5 |     start_urls=["http://quotes.toscrape.com/"]
 6 | 
 7 |     def parse(self, response):
 8 |         for item in response.css("div.quote"):
 9 |             yield{
10 |                 "text" :item.css("span.text::text").get(),
11 |                 "author" : item.css("small.author::text").get(),
12 |                 "tags" : item.css("div.tags a.tag::text").getall()
13 |             }
14 | 
15 |         next_page = response.css("li.next a::attr(href)").get()
16 |         if next_page is not None:
17 |             yield response.follow(next_page, callback=self.parse)


--------------------------------------------------------------------------------
/web_scraping_with_selenium_part1.py:
--------------------------------------------------------------------------------
 1 | '''
 2 |  * @channel Morita DataLand
 3 |  * @author Morita Tarvirdians
 4 |  * @email tarvirdians.morita@gmail.com
 5 |  * @desc Web Scraping With Selenium
 6 | '''
 7 | 
 8 | from selenium import webdriver
 9 | from selenium.webdriver.common.keys import Keys
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | import time
14 | import pandas as pd
15 | 
16 | driver_path = r"C:\\Program Files (x86)\\chromedriver.exe"
17 | 
18 | # chrome options
19 | options = webdriver.ChromeOptions()
20 | options.add_argument('--ignore-certificate-errors')
21 | options.add_argument('--ignore-ssl-errors')
22 | options.add_experimental_option('excludeSwitches', ['enable-logging'])
23 | 
24 | 
25 | 
26 | driver = webdriver.Chrome(driver_path, chrome_options=options)
27 | #driver.get("https://google.com")
28 | 
29 | # get url
30 | url = "https://www.mastersportal.com/"
31 | driver.get(url)
32 | 
33 | # locate HomeWhat element in the page and send keys to it
34 | what_search_box = driver.find_element_by_id("HomeWhat")
35 | what_search_box.send_keys("computer")
36 | what_search_box.send_keys(Keys.ENTER)
37 | 
38 | # locate HomeWhere element in the page and send keys to it
39 | where_search_box = driver.find_element_by_id("HomeWhere")
40 | where_search_box.send_keys("Germany")
41 | where_search_box.send_keys(Keys.ENTER)
42 | 
43 | time.sleep(5)
44 | 
45 | #get some fields and write them in the csv file
46 | lst = []
47 | cols = ["title", "uni", "fee"]
48 | titles = driver.find_elements_by_class_name("StudyTitle")
49 | unis = driver.find_elements_by_class_name("LocationFact:nth-child(1)")
50 | fees = driver.find_elements_by_class_name("KeyFact:nth-child(1)")
51 | 
52 | for (title, uni, fee) in (zip(titles, unis, fees)):
53 |     lst.append((title.text, uni.text, fee.text))
54 | 
55 | driver.quit()
56 | 
57 | df = pd.DataFrame(lst, columns=cols )
58 | 
59 | df.to_csv("out.csv")
60 |  
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/web_scraping_with_selenium_part2.py:
--------------------------------------------------------------------------------
 1 | '''
 2 |  * @channel Morita DataLand
 3 |  * @author Morita Tarvirdians
 4 |  * @email tarvirdians.morita@gmail.com
 5 |  * @desc Web Scraping With Selenium
 6 | '''
 7 | 
 8 | from selenium import webdriver
 9 | from selenium.webdriver.common.keys import Keys
10 | import time
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.ui import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 | 
15 | 
16 | driver_path = r"C:\\Program Files (x86)\\chromedriver.exe"
17 | 
18 | # chrome options
19 | options = webdriver.ChromeOptions()
20 | options.add_argument("--headless")
21 | options.add_argument('--ignore-certificate-errors')
22 | options.add_argument('--ignore-ssl-errors')
23 | options.add_experimental_option('excludeSwitches', ['enable-logging'])
24 | 
25 | # set driver
26 | driver = webdriver.Chrome(driver_path, chrome_options=options)
27 | 
28 | # get url
29 | url = "https://www.mastersportal.com/"
30 | driver.get(url)
31 | 
32 | # element = driver.find_element_by_id("LoginButton")
33 | # element.click()
34 | try:
35 |     driver.execute_script("window.scrollTo(0,1000)")
36 |     # driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
37 |     element = WebDriverWait(driver, 10).until(
38 |         EC.presence_of_element_located((By.CLASS_NAME, "TakeTest"))
39 |     )
40 |     element.click()
41 |     time.sleep(3)
42 |     driver.get_screenshot_as_file("screenshot.png")
43 |     driver.back()
44 |     
45 | except:
46 |     driver.quit()
47 | 


--------------------------------------------------------------------------------