├── src
│   ├── __init__.py
│   ├── swiggy.py
│   ├── zomato.py
│   ├── soundcloud.py
│   ├── spotify.py
│   ├── youtube.py
│   ├── linkedin.py
│   ├── wikipedia.py
│   ├── twitter.py
│   ├── news.py
│   ├── image.py
│   ├── snapdeal.py
│   ├── imdb.py
│   ├── alibaba.py
│   ├── amazon.py
│   ├── instagram.py
│   ├── flipkart.py
│   ├── stock.py
│   └── PyScrappy.py
├── PyScrappy.png
├── LICENSE
├── .github
│   └── workflows
│       └── python-publish.yml
├── setup.py
└── README.md
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/PyScrappy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mldsveda/PyScrappy/HEAD/PyScrappy.png
--------------------------------------------------------------------------------
/src/swiggy.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | def scrappi(city, n_pages):
6 | lst_of_urls = []
7 | for i in range(1, n_pages+1): lst_of_urls.append("https://www.swiggy.com/" + city + '?page=' + str(i))
8 |
9 | def swiggy(soup):
10 | main = soup.find_all('div', {'class': 'nDVxx'})[0]
11 | lst = []
12 | for details in main.find_all('div', {'class': '_3XX_A'}):
13 | dictionary = {}
14 | dictionary['Name'] = details.find('div', {"class": "nA6kb"}).text
15 | dictionary['Cuisine'] = details.find('div', {"class": "_1gURR"}).text
16 | dictionary['Price'] = details.find('div', {"class": "nVWSi"}).text
17 | dictionary['Rating'] = details.find('div', {"class": "_9uwBC"}).text
18 | lst.append(dictionary)
19 | return lst
20 |
21 | x = []
22 | for i in lst_of_urls:
23 | try: url = requests.get(i)
24 | except: raise ValueError("Invalid value passed for 'city'")
25 | soup = BeautifulSoup(url.text, "lxml")
26 | x.extend(swiggy(soup))
27 |
28 | return pd.DataFrame(x)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Vedant Tibrewal, Vedaant Singh.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | jobs:
16 | deploy:
17 |
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up Python
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: '3.x'
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install build
30 | - name: Build package
31 | run: python -m build
32 | - name: Publish package
33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
34 | with:
35 | user: __token__
36 | password: ${{ secrets.PYPI_API_TOKEN }}
37 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import setuptools
3 |
4 | with open("README.md", "r", encoding="utf-8") as fh:
5 | long_description = fh.read()
6 |
7 | setuptools.setup(
8 | name="PyScrappy",
9 | version="0.1.1",
10 | author="Vedant Tibrewal, Vedaant Singh",
11 | author_email="mlds93363@gmail.com",
12 | description="Powerful web scraping tool.",
13 | long_description=long_description,
14 | long_description_content_type="text/markdown",
15 | url="https://github.com/mldsveda/PyScrappy",
16 | keywords=['PyScrappy', 'Scraping', 'E-Commerce', 'Wikipedia', 'Image Scrapper', 'YouTube', 'Scrapy', 'Twitter', 'Social Media', 'Web Scraping', 'News', 'Stocks', 'Songs', 'Food', 'Instagram', 'Movies'],
17 | classifiers=[
18 | "Programming Language :: Python :: 3",
19 | "License :: OSI Approved :: MIT License",
20 | "Operating System :: OS Independent",
21 | ],
22 | python_requires=">=3.6",
23 | py_modules=["PyScrappy", "alibaba", "amazon", "flipkart", "image", "imdb", "instagram", "linkedin", "news", "snapdeal", "soundcloud", "spotify", "stock", "swiggy", "twitter", "wikipedia", "youtube", "zomato"],
24 | package_dir={"": "src"},
25 | install_requires=[
26 | 'selenium',
27 | 'webdriver-manager',
28 | 'beautifulsoup4',
29 | 'requests',
30 | 'pandas',
31 | ],
32 | packages=setuptools.find_packages(where="src")
33 | )
34 |
--------------------------------------------------------------------------------
/src/zomato.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | from selenium import webdriver
4 | import time
5 |
6 | def scrappi(city, n_pages):
7 | if n_pages == 0: raise ValueError("'n_pages' must be greater than 0")
8 | city = city.replace(' ', '-')
9 | driver = webdriver.Chrome(ChromeDriverManager().install())
10 | driver.get("https://www.zomato.com/"+city+"/restaurants")
11 |
12 | for _ in range(n_pages):
13 | driver.execute_script('window.scrollBy(0, window.innerHeight);')
14 | time.sleep(4)
15 |
16 | def zomato(card):
17 | ls = []
18 | try: name = card.find_element_by_xpath('.//div/a[2]/div/p').text
19 | except: name = None
20 |         try: cuisine = card.find_element_by_xpath('.//div/a[2]/p').text
21 |         except: cuisine = None
22 | try: rating = card.find_element_by_xpath('.//div/a[2]/div[2]/section').get_attribute('value')
23 | except: rating = None
24 | try: price, delivery_time = card.find_element_by_xpath('.//div/a[2]/p[2]').text.split('\n')
25 | except: price, delivery_time = None, None
26 | try: reviews_count = card.find_element_by_xpath('.//div/a[2]/div[2]/section/div[2]').text[1:-1]
27 | except: reviews_count = None
28 |         ls.extend([name, cuisine, price, rating, delivery_time, reviews_count])
29 | return ls
30 |
31 | new_ls = []
32 | try: cards = driver.find_elements_by_class_name('jumbo-tracker')
33 | except: raise KeyError("Invalid value for 'city'")
34 | for card in cards:
35 | new_ls.append(zomato(card))
36 |
37 | driver.close()
38 |     return pd.DataFrame(new_ls, columns = ["Name", "Cuisine", "Price", "Rating", "Delivery Time", "Review counts"])
--------------------------------------------------------------------------------
/src/soundcloud.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | from selenium.webdriver.common.keys import Keys
4 | from selenium import webdriver
5 | from bs4 import BeautifulSoup
6 | import time, re
7 |
8 | def soundcloud_tracks(track_name, n_pages):
9 | chrome_options = webdriver.ChromeOptions()
10 | chrome_options.add_argument('--headless')
11 | chrome_options.headless = True
12 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line = False).install(), options = chrome_options)
13 | driver.create_options()
14 | driver.get('https://soundcloud.com/search/sounds?q='+track_name)
15 |
16 | for _ in range(n_pages):
17 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
18 | time.sleep(3)
19 |
20 | html = driver.page_source
21 | soup = BeautifulSoup(html, 'html.parser')
22 | main = soup.find_all("div", {"class": "search"})[0]
23 | songs = main.find_all('div', {'class': 'sound__content'})
24 | lst = []
25 | for song in songs:
26 | dictionary = {}
27 | dictionary['Uploader'] = song.find('span', {"class": "soundTitle__usernameText"}).text
28 | dictionary['Uploader'] = re.sub('[^a-zA-Z]', '', dictionary['Uploader'])
29 | dictionary['Music Title'] = (song.find('a', {"class": "soundTitle__title sc-link-dark sc-link-secondary"}).text).replace('\n', '')
30 | dictionary['Time of Upload'] = (song.find('span', {"class": "sc-visuallyhidden"}).text).replace('\n', '')
31 | dictionary['Plays'] = song.find('span', {"class": "sc-ministats"}).text
32 | dictionary['Plays'] = re.sub('[^0-9,]', '', dictionary['Plays'])[:-3]
33 | lst.append(dictionary)
34 |
35 | driver.close()
36 | return pd.DataFrame(lst)
--------------------------------------------------------------------------------
/src/spotify.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from time import sleep
3 | from webdriver_manager.chrome import ChromeDriverManager
4 | from selenium import webdriver
5 |
6 | def func(last, tracks):
7 | data = []
8 | for track in tracks:
9 | try:
10 | temp = (track.find_element_by_xpath("./div[@data-testid='tracklist-row']").text).split("\n")
11 | if last < int(temp[0]):
12 | if 'E' in temp: temp.remove('E')
13 | data.append(temp)
14 | except: pass
15 | return data
16 |
17 | def scrappi(track_type, n_pages):
18 |     while n_pages <= 0:
19 | n_pages = int(input("Enter a valid 'n_pages', greater than 0: "))
20 |
21 | chrome_options = webdriver.ChromeOptions()
22 | chrome_options.add_argument('--headless')
23 | chrome_options.headless = True
24 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
25 | driver.create_options()
26 |
27 | driver.get(f"https://open.spotify.com/search/{track_type}/tracks")
28 | sleep(4)
29 |
30 | data = []
31 | last = 0
32 | for i in range(n_pages):
33 | main = driver.find_element_by_xpath(".//div[@data-testid='track-list']/div[2]/div[2]")
34 | data.extend(func(last, main.find_elements_by_xpath("./div")))
35 | last = int(data[-1][0])
36 | try:
37 | scroll = main.find_element_by_xpath("./div[last()]").location_once_scrolled_into_view
38 | sleep(4)
39 | except:
40 | try:
41 | scroll = main.find_element_by_xpath("./div[last()]").location_once_scrolled_into_view
42 | sleep(8)
43 | except:
44 | pass
45 |
46 | return pd.DataFrame(data, columns=["Id", "Title", "Singers", "Album", "Duration"])
--------------------------------------------------------------------------------
/src/youtube.py:
--------------------------------------------------------------------------------
1 | try:
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | except: raise ImportError("'webdriver-manager' package not installed")
4 | try:
5 | from selenium.webdriver.common.keys import Keys
6 | from selenium import webdriver
7 | except: raise ImportError("'selenium' package not installed")
8 | from bs4 import BeautifulSoup
9 | import pandas as pd
10 | import time
11 |
12 | usr_agent = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
16 | 'Accept-Encoding': 'none',
17 | 'Accept-Language': 'en-US,en;q=0.8',
18 | 'Connection': 'keep-alive',
19 | }
20 |
21 | def scrappi(url, n_pages):
22 |
23 | chrome_options = webdriver.ChromeOptions()
24 | chrome_options.add_argument('--headless')
25 | chrome_options.headless = True
26 | driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
27 | driver.create_options()
28 | driver.get(url)
29 |
30 | for _ in range(n_pages):
31 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
32 | time.sleep(3)
33 |
34 | html = driver.page_source
35 | soup = BeautifulSoup(html, 'html.parser')
36 | videos = soup.find_all("div", {"id": "dismissible"})
37 | lst = []
38 |
39 | for video in videos:
40 | dictionary = {}
41 | dictionary['Title'] = video.find("a", {"id": "video-title"}).text
42 |         dictionary['Video_url'] = "https://www.youtube.com" + video.find("a", {"id": "video-title"})['href']
43 | meta = video.find("div", {"id": "metadata-line"}).find_all('span')
44 | dictionary['Views'] = meta[0].text
45 | dictionary['Days'] = meta[1].text
46 |
47 | lst.append(dictionary)
48 |
49 | return pd.DataFrame(lst)
--------------------------------------------------------------------------------
/src/linkedin.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from time import sleep
3 | from webdriver_manager.chrome import ChromeDriverManager
4 | from selenium import webdriver
5 |
6 | def func(post):
7 | try: title = post.find_element_by_class_name("base-search-card__title").text
8 | except: title = None
9 | try: company = post.find_element_by_class_name("base-search-card__subtitle").text
10 | except: company = None
11 | try: location = post.find_element_by_class_name("job-search-card__location").text
12 | except: location = None
13 | try: salary = post.find_element_by_class_name("job-search-card__salary-info").text
14 | except: salary = "Not disclosed"
15 | try: benefits = post.find_element_by_class_name("job-search-card__benefits").text
16 | except: benefits = None
17 | try: date = post.find_element_by_class_name("job-search-card__listdate").text
18 | except: date = None
19 |
20 | return [title, company, location, salary, benefits, date]
21 |
22 | def scrappi(job_title, n_pages):
23 |     while n_pages <= 0:
24 |         print("'n_pages' must be greater than 0")
25 |         n_pages = int(input("Enter 'n_pages' greater than 0: "))
26 |
27 | chrome_options = webdriver.ChromeOptions()
28 | chrome_options.add_argument('--headless')
29 | chrome_options.headless = True
30 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
31 | driver.create_options()
32 |
33 | driver.get("https://www.linkedin.com/jobs/search/?keywords="+job_title)
34 | sleep(4)
35 |
36 | for i in range(n_pages-1):
37 | driver.execute_script("window.scrollBy(0, document.body.scrollHeight);")
38 | sleep(4)
39 | try: driver.find_element_by_class_name("infinite-scroller__show-more-button").click()
40 | except: pass
41 |
42 | data = []
43 | for post in driver.find_elements_by_xpath(".//ul[@class='jobs-search__results-list']/li"):
44 | data.append(func(post))
45 |
46 | driver.close()
47 | return pd.DataFrame(data, columns=["Job Title", "Company Name", "Location", "Salary", "Benefits", "Date"])
--------------------------------------------------------------------------------
/src/wikipedia.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen
2 | from bs4 import BeautifulSoup
3 | import re
4 |
5 | usr_agent = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
9 | 'Accept-Encoding': 'none',
10 | 'Accept-Language': 'en-US,en;q=0.8',
11 | 'Connection': 'keep-alive',
12 | }
13 |
14 | def para(word):
15 | word=word.replace(' ', '_')
16 | # Specify url of the web page
17 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read()
18 | # Make a soup
19 | soup = BeautifulSoup(source,'lxml')
20 |
21 | # Extract the plain text content from paragraphs
22 | paras = []
23 | for paragraph in soup.find_all('p'):
24 | paras.append(str(paragraph.text))
25 | return paras
26 |
27 | def header(word):
28 | word=word.replace(' ', '_')
29 | # Specify url of the web page
30 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read()
31 | soup = BeautifulSoup(source, "lxml")
32 |
33 | # Extract text from paragraph headers
34 | heads = []
35 | for head in soup.find_all('span', attrs={'mw-headline'}):
36 | heads.append(str(head.text))
37 | return heads
38 |
39 | def text(word):
40 | word=word.replace(' ', '_')
41 | # Specify url of the web page
42 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read()
43 | soup = BeautifulSoup(source, "lxml")
44 |
45 | paras = []
46 | for paragraph in soup.find_all('p'):
47 | paras.append(str(paragraph.text))
48 |
49 | heads = []
50 | for head in soup.find_all('span', attrs={'mw-headline'}):
51 | heads.append(str(head.text))
52 |
53 | # Interleave paragraphs & headers
54 | text = [val for pair in zip(paras, heads) for val in pair]
55 | text = ' '.join(text)
56 |
57 | # Drop footnote superscripts in brackets
58 | text = re.sub(r"\[.*?\]+", '', text)
59 |
60 |     # Remove newline characters and drop the last 11 characters of the string
61 | text = text.replace('\n', '')[:-11]
62 | return text
--------------------------------------------------------------------------------
/src/twitter.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | from selenium import webdriver
4 | from time import sleep
5 |
6 | def scrappi(hashtag, n_pages):
7 | chrome_options = webdriver.ChromeOptions()
8 | chrome_options.add_argument('--headless')
9 | chrome_options.headless = True
10 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
11 | driver.create_options()
12 | driver.get("https://twitter.com/search?q=%23"+hashtag.replace('#', ''))
13 | sleep(4)
14 |
15 | def twitter(card):
16 | data_lst = []
17 | try: name = card.find_element_by_xpath('.//span').text
18 | except: name = None
19 | try: twitter_handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text
20 | except: twitter_handle = None
21 | try: post_time = card.find_element_by_xpath('.//time').get_attribute('datetime')
22 | except: post_time = None
23 | try: tweet = (card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text)+(card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text)
24 | except: tweet = None
25 | try: reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text
26 | except: reply_count = None
27 | try: retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text
28 | except: retweet_count = None
29 | try: like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text
30 | except: like_count = None
31 |
32 | data_lst.extend([name, twitter_handle, post_time, tweet, reply_count, retweet_count, like_count])
33 | return data_lst
34 |
35 | new_ls = []
36 | temp_set = set()
37 | for _ in range(n_pages):
38 | for card in driver.find_elements_by_xpath('//article[@data-testid="tweet"]'):
39 | ls = twitter(card)
40 |             check = ''.join(str(item) for item in ls)  # cast to str so None fields don't break de-duplication
41 | if check not in temp_set:
42 | new_ls.append(ls)
43 | temp_set.add(check)
44 | driver.execute_script('window.scrollBy(0, window.innerHeight*3);')
45 | sleep(4)
46 |
47 | driver.close()
48 | return pd.DataFrame(new_ls, columns = ["Name", "Twitter handle", "Post Time", "Tweet", "Reply Count", "Retweet Count", "Like Count"])
49 |
--------------------------------------------------------------------------------
/src/news.py:
--------------------------------------------------------------------------------
1 | try:
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | except: raise ImportError("'webdriver-manager' package not installed")
4 | try:
5 | from selenium.webdriver.common.keys import Keys
6 | from selenium import webdriver
7 | except: raise ImportError("'selenium' package not installed")
8 | from bs4 import BeautifulSoup
9 | import pandas as pd
10 | import time
11 |
12 | usr_agent = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
16 | 'Accept-Encoding': 'none',
17 | 'Accept-Language': 'en-US,en;q=0.8',
18 | 'Connection': 'keep-alive',
19 | }
20 |
21 | def scrappi(n_pages, genre):
22 | if genre not in ['national', 'business', 'sports', 'world', 'politics', 'technology', 'startup', 'entertainment',
23 | 'miscellaneous', 'hatke', 'science', 'automobile']:
24 | raise ValueError("'genre' value not exists")
25 |
26 | chrome_options = webdriver.ChromeOptions()
27 | chrome_options.add_argument('--headless')
28 | chrome_options.headless = True
29 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
30 | driver.create_options()
31 | driver.get('https://inshorts.com/en/read/'+genre)
32 |
33 | for _ in range(n_pages):
34 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
35 | time.sleep(3)
36 | driver.find_element_by_id('load-more-btn').click()
37 | text_field = driver.find_element_by_id('load-more-btn')
38 |
39 | html = driver.page_source
40 | soup = BeautifulSoup(html, 'html.parser')
41 | main = soup.find_all('div', {"class": "news-card z-depth-1"})
42 |
43 | lst = []
44 | for details in main:
45 | dictionary={}
46 | dictionary['Headlines'] = (details.find('a', {"class": "clickable"}).text).replace('\n', '')
47 | dictionary['Time'] = details.find('span', {"class": "time"}).text
48 | date = details.find('div', {"class": "news-card-author-time news-card-author-time-in-title"}).find_all('span')
49 | dictionary['Date'] = date[3].text
50 | dictionary['News'] = details.find('div', {"itemprop": "articleBody"}).text
51 | lst.append(dictionary)
52 |
53 | return pd.DataFrame(lst)
--------------------------------------------------------------------------------
/src/image.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import os
4 |
5 | usr_agent = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
9 | 'Accept-Encoding': 'none',
10 | 'Accept-Language': 'en-US,en;q=0.8',
11 | 'Connection': 'keep-alive',
12 | }
13 |
14 | def scrappi(data, n_images, img_format, folder_name):
15 |
16 | URL = ['https://www.bing.com/images/search?q=', 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&q=',
17 | 'https://images.search.yahoo.com/search/images?p=']
18 |
19 | def check(folder_name):
20 | try:
21 | os.mkdir(folder_name)
22 | return folder_name
23 | except:
24 | print("Folder Exist with that name!")
25 | folder_name = input("Enter a new Folder name: \n")
26 | try:
27 | os.mkdir(folder_name)
28 | return folder_name
29 | except: return check(folder_name)
30 |
31 | folder_name = check(folder_name)
32 |
33 | print('Starting to Download...')
34 |
35 | for i in URL:
36 | searchurl = i + str(data)
37 | response = requests.get(searchurl, headers = usr_agent)
38 | html = response.text
39 | soup = BeautifulSoup(html, 'html.parser')
40 | results = soup.findAll('img', limit = n_images)
41 |
42 | if len(results) != 0:
43 | for i, image in enumerate(results):
44 | try: image_link = image["data-srcset"]
45 | except:
46 | try: image_link = image["data-src"]
47 | except:
48 | try: image_link = image["data-fallback-src"]
49 | except:
50 | try: image_link = image["src"]
51 | except: pass
52 | try:
53 | r = requests.get(image_link).content
54 | try: r = str(r, 'utf-8')
55 | except UnicodeDecodeError:
56 | with open(f"{folder_name}/images{i+1}.{img_format}", "wb+") as f: f.write(r)
57 | except: pass
58 |
59 | return 'Successfully Downloaded ' + str(n_images) + ' images'
--------------------------------------------------------------------------------
/src/snapdeal.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | usr_agent = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 7 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
9 | 'Accept-Encoding': 'none',
10 | 'Accept-Language': 'en-US,en;q=0.8',
11 | 'Connection': 'keep-alive',
12 | }
13 |
14 | def scrappi(product_name, n_pages):
15 |
16 | snap = "https://www.snapdeal.com/search?keyword="+product_name
17 | url = requests.get(snap)
18 | soup = BeautifulSoup(url.text,"lxml")
19 |
20 | if n_pages == 0:
21 | print("Enter a valid number of Pages")
22 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: ")))
23 |
24 | initial_url = str(snap)
25 | lst_of_urls = []
26 | for i in range(1, n_pages+1):
27 | x = initial_url + '&page=' + str(i)
28 | lst_of_urls.append(x)
29 |
30 | def Card_Style(soup):
31 |
32 | lst=[]
33 | cnt = soup.find_all("div", {"class": "product-tuple-description"})
34 | for i in range(len(cnt)):
35 |
36 | try: name = cnt[i].find("p", {"class": "product-title"}).text
37 | except: name = "None"
38 |
39 | try: Price = cnt[i].find("span", {"class": "lfloat product-price"}).text
40 | except: Price = "None"
41 |
42 | try: original = cnt[i].find("span", {"class": "lfloat product-desc-price strike"}).text
43 | except: original = "None"
44 |
45 | try: rating = cnt[i].find("p",{"class":"product-rating-count"}).text
46 | except: rating = "None"
47 |
48 | lst.append([name, Price, original, rating])
49 |
50 | return lst
51 |
52 | def snapdeal(soup):
53 | if len(soup.find_all("div",class_="product-tuple-description"))>=1:
54 | return Card_Style(soup)
55 |
56 | x = []
57 | for i in lst_of_urls:
58 | url = requests.get(i)
59 | soup = BeautifulSoup(url.text,"lxml")
60 | abc = snapdeal(soup)
61 |         if abc: x.extend(abc)  # snapdeal() returns None when no product cards are found
62 |
63 | return pd.DataFrame(x, columns =['Name', 'Price', 'Original Price', 'Number of Ratings'])
--------------------------------------------------------------------------------
/src/imdb.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from time import sleep
3 | from urllib.request import urlopen
4 | from bs4 import BeautifulSoup
5 |
6 | def scrappi(genre, n_pages):
7 | source = urlopen(f"https://www.imdb.com/search/title/?genres={genre}").read()
8 | data = {
9 | "title" : [],
10 | "year" : [],
11 | "certificate" : [],
12 | "runtime" : [],
13 | "genre" : [],
14 | "rating" : [],
15 | "description" : [],
16 | "stars" : [],
17 | "directors" : [],
18 | "votes" : []
19 | }
20 |
21 | for i in range(n_pages):
22 | soup = BeautifulSoup(source,'lxml')
23 | cards = soup.find_all("div", {"class":"lister-item-content"})
24 | for card in cards:
25 | try: data["title"].append(card.find("h3", {"class":"lister-item-header"}).find("a").text)
26 | except: data["title"].append(None)
27 | try: data["year"].append(card.find("h3", {"class":"lister-item-header"}).find_all("span")[-1].text[1:-1])
28 | except: data["year"].append(None)
29 | try: data["certificate"].append(card.find("span", {"class":"certificate"}).text)
30 | except: data["certificate"].append(None)
31 | try: data["runtime"].append(card.find("span", {"class":"runtime"}).text)
32 | except: data["runtime"].append(None)
33 | try: data["genre"].append((card.find("span", {"class":"genre"}).text).strip())
34 | except: data["genre"].append(None)
35 | try: data["rating"].append((card.find("div", {"class":"ratings-imdb-rating"}).text).strip())
36 | except: data["rating"].append(None)
37 | try: data["description"].append((card.find_all("p", {"class":"text-muted"})[-1].text).strip())
38 | except: data["description"].append(None)
39 | casts = card.find("p", {"class":""}).text.split("|")
40 | star, director = None, None
41 | for cast in casts:
42 | temp = cast.strip().replace("\n", "").replace(":", ",").split(",")
43 | if temp[0] in ["Star", "Stars"]: star = ', '.join(temp[1:])
44 | elif temp[0] in ["Director", "Directors"]: director = ', '.join(temp[1:])
45 | data["stars"].append(star)
46 | data["directors"].append(director)
47 | try: data["votes"].append(card.find("span", {"name":"nv"}).text)
48 | except: data["votes"].append(None)
49 | try:
50 | source = urlopen("https://www.imdb.com"+soup.find("a", {"class":"lister-page-next next-page"}).attrs['href']).read()
51 | except:
52 | break
53 |
54 | return pd.DataFrame(data)
--------------------------------------------------------------------------------
/src/alibaba.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | usr_agent = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 7 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
9 | 'Accept-Encoding': 'none',
10 | 'Accept-Language': 'en-US,en;q=0.8',
11 | 'Connection': 'keep-alive',
12 | }
13 |
14 | def scrappi(product_name, n_pages):
15 | ali = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText="+product_name
16 | url = requests.get(ali)
17 | soup = BeautifulSoup(url.text,"lxml")
18 |
19 | if n_pages == 0:
20 | print("Enter a valid number of Pages")
21 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: ")))
22 |
23 | initial_url = str(ali)
24 | lst_of_urls = []
25 | for i in range(1, n_pages+1):
26 | x = initial_url + '&page=' + str(i)
27 | lst_of_urls.append(x)
28 |
29 | def Card_Style(soup):
30 |
31 | lst=[]
32 | cnt = soup.find_all("div", {"class": "m-gallery-product-item-v2"})
33 | for i in range(len(cnt)):
34 |
35 | try: name = cnt[i].find("p", {"class": "elements-title-normal__content"}).text
36 | except: name = "None"
37 |
38 | try:
39 | Price = cnt[i].find("p", {"class": "elements-offer-price-normal medium"})['title']
40 | Price = '$' + str(Price).replace('$', '')
41 | except: Price = "None"
42 |
43 | try: n_item = cnt[i].find("span", {"class": "element-offer-minorder-normal__value"}).text
44 | except: n_item = "None"
45 |
46 | try: Description = cnt[i].find("div", {"class": "offer-tag-list"}).text
47 | except: Description = "None"
48 |
49 | try: rating = cnt[i].find("span",{"class":"seb-supplier-review__score"}).text
50 | except: rating = "None"
51 |
52 | lst.append([name, Price, n_item, Description, rating])
53 |
54 | return lst
55 |
56 | def alibaba(soup):
57 | if len(soup.find_all("div",class_="m-gallery-product-item-v2"))>=1: return Card_Style(soup)
58 |
59 | x = []
60 | for i in lst_of_urls:
61 | url = requests.get(i)
62 | soup = BeautifulSoup(url.text,"lxml")
63 | abc = alibaba(soup)
64 | if abc:
65 | for j in abc: x.append(j)
66 |
67 | return pd.DataFrame(x, columns =['Name', 'Price', 'Number of Items', 'Description', 'Ratings'])
--------------------------------------------------------------------------------
/src/amazon.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from time import sleep
3 | from webdriver_manager.chrome import ChromeDriverManager
4 | from selenium import webdriver
5 |
6 | def func(cards):
7 | data = []
8 | for card in cards:
9 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div[3]")
10 | except:
11 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div[2]")
12 | except:
13 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div/div[3]")
14 | except: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div/div[2]")
15 | try: description = info.find_element_by_xpath("./div[1]/h2").text
16 | except: description = None
17 | try: rating = info.find_element_by_xpath("./div[2]/div/span").get_attribute("aria-label")
18 | except: rating = None
19 | try: votes = info.find_elements_by_xpath("./div[2]/div/span")[1].text
20 | except: votes = None
21 | try: offer_price = info.find_element_by_class_name("a-price").text.replace("\n", ".")
22 | except: offer_price = None
23 | try: actual_price = info.find_element_by_class_name("a-price").find_element_by_xpath("..//span[@data-a-strike='true']").text
24 | except: actual_price = offer_price
25 |
26 | data.append([description, rating, votes, offer_price, actual_price])
27 |
28 | return data
29 |
30 | def scrappi(product_name, n_pages):
31 | chrome_options = webdriver.ChromeOptions()
32 | chrome_options.add_argument('--headless')
33 | chrome_options.headless = True
34 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
35 | driver.create_options()
36 |
37 | url = "https://www.amazon.com/s?k="+product_name
38 | driver.get(url)
39 | sleep(4)
40 |
41 | cards = driver.find_elements_by_xpath('//div[@data-component-type="s-search-result"]')
42 |     while len(cards) == 0:
43 |         driver.get(url); sleep(4)
44 |         cards = driver.find_elements_by_xpath('//div[@data-component-type="s-search-result"]')  # re-query so this retry loop can terminate
45 |
46 | max_pages = int(driver.find_element_by_xpath(".//span[@class='s-pagination-strip']/span[last()]").text)
47 | while n_pages > max_pages or n_pages == 0:
48 | print(f"Please Enter a Valid Number of Pages Between 1 to {max_pages}:")
49 | n_pages = int(input())
50 |
51 | data = []
52 |
53 | while n_pages > 0:
54 | n_pages -= 1
55 | data.extend(func(driver.find_elements_by_xpath('//div[@data-component-type="s-search-result"]')))
56 | driver.find_element_by_class_name("s-pagination-next").click()
57 | sleep(4)
58 |
59 | driver.close()
60 | return pd.DataFrame(data, columns=["Description", "Rating", "Votes", "Offer Price", "Actual Price"])
--------------------------------------------------------------------------------
/src/instagram.py:
--------------------------------------------------------------------------------
1 | try:
2 | from webdriver_manager.chrome import ChromeDriverManager
3 | except: raise ImportError("'webdriver-manager' package not installed")
4 | try:
5 | from selenium.webdriver.common.keys import Keys
6 | from selenium import webdriver
7 | except: raise ImportError("'selenium' package not installed")
8 | from bs4 import BeautifulSoup
9 | import pandas as pd
10 | import time
11 |
12 | usr_agent = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
15 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
16 | 'Accept-Encoding': 'none',
17 | 'Accept-Language': 'en-US,en;q=0.8',
18 | 'Connection': 'keep-alive',
19 | }
20 |
21 |
22 | def account(insta_handle, n_pages):
23 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install())
24 | driver.get("https://www.instagram.com/"+insta_handle+"/")
25 |
26 | for _ in range(n_pages):
27 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
28 | time.sleep(3)
29 |
30 | html = driver.page_source
31 | soup = BeautifulSoup(html, 'html.parser')
32 | main = soup.find_all('section', {"class": "zwlfE"})
33 | main2 = soup.find_all('div', {"class": "_2z6nI"})
34 |
35 | for details in main:
36 | title = details.find('h1', {"class": "rhpdm"})
37 | info = details.find_all('span', {"class": "g47SY"})
38 | data = details.find('div', {"class": "-vDIg"}).find_all('span')
39 | posts = info[0].text
40 | followers = info[1].text
41 | following = info[2].text
42 | if(data): bio = data[0].text
43 | print("Name: ", title.text)
44 | print("Number of Posts: ", posts)
45 | print("Followers: ", followers)
46 | print("Following: ", following)
47 | if(data): print("Bio: ", bio)
48 | else: print("Bio: None")
49 | print()
50 | break
51 |
52 | post_url = []
53 | for i in main2:
54 | url = i.find_all('div', {"class": "v1Nh3 kIKUG _bz0w"})
55 | for x in url:
56 |             u = 'https://www.instagram.com' + x.a['href']
57 | post_url.append(u)
58 |
59 | def func(a):
60 | driver.get(a)
61 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
62 | time.sleep(3)
63 | html = driver.page_source
64 | soup = BeautifulSoup(html, "html.parser")
65 | main = soup.find_all('div', {"class": "C4VMK"})
66 | for details in main:
67 | title = details.find('span', {"class": ""})
68 | return title.text
69 |
70 | caption =[]
71 | for i in post_url:
72 | caption.append(func(i))
73 |
74 | driver.close()
75 | return pd.DataFrame({'Captions': caption})
76 |
77 | def hashtag(hashtag, n_posts):
78 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install())
79 | hashtag = hashtag.replace('#', '')
80 | driver.get("https://www.instagram.com/explore/tags/"+hashtag+"/")
81 |
82 | for _ in range(int(n_posts//3)):
83 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
84 | time.sleep(3)
85 |
86 | html = driver.page_source
87 | soup = BeautifulSoup(html, 'html.parser')
88 | main = soup.find_all('div', {"class": "EZdmt"})
89 |
90 | post_url = []
91 | for i in main:
92 | url = i.find_all('div', {"class": "v1Nh3 kIKUG _bz0w"})
93 | for x in url:
94 | u = 'https://www.instagram.com'+x.a['href']
95 | post_url.append(u)
96 |
97 | def func(a):
98 | driver.get(a)
99 | driver.find_element_by_tag_name('body').send_keys(Keys.END)
100 | time.sleep(3)
101 | html = driver.page_source
102 | soup = BeautifulSoup(html, "html.parser")
103 | main = soup.find_all('div', {"class": "C4VMK"})
104 | for details in main:
105 | title = details.find('span', {"class": ""})
106 | return title.text
107 |
108 | caption =[]
109 | for i in post_url:
110 | caption.append(func(i))
111 |
112 | driver.close()
113 | return pd.DataFrame({'Captions': caption})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ![PyScrappy logo](PyScrappy.png)
3 |
4 |
5 |
6 |
7 | ## PyScrappy: powerful Python data scraping toolkit
8 |
9 | [Python](https://www.python.org/)
10 |
11 | [Python 3.6+](https://www.python.org/downloads/release/python-360/)
12 | [PyPI](https://pypi.org/project/PyScrappy/)
13 |
14 | [PyPI Downloads](https://pypi.org/project/PyScrappy/)
15 | [MIT License](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
16 |
17 |
18 |
19 |
20 |
21 |
22 | [Documentation](https://pyscrappy.netlify.app/)
23 |
24 | ## What is it?
25 |
26 | **PyScrappy** is a Python package that provides a fast, flexible, and exhaustive way to scrape data from a variety of sources. It is an
27 | easy and intuitive library that aims to be the fundamental high-level building block for scraping **data** in Python. Additionally, it has the broader goal of becoming **the most powerful and flexible open source data scraping tool available**.
28 |
29 | ## Main Features
30 |
31 | Here are just a few of the things that PyScrappy does well:
32 |
33 | - Easy scraping of [**Data**](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b) available on the internet
34 | - Returns a [**DataFrame**](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) for further analysis and research purposes.
35 | - Automatic [**Data Scraping**](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b): other than a few user-input parameters, the whole scraping process is automatic.
36 | - Powerful and flexible
37 |
38 | ## Where to get it
39 |
40 | The source code is currently hosted on GitHub at:
41 | https://github.com/mldsveda/PyScrappy
42 |
43 | Binary installers for the latest released version are available at the [Python
44 | Package Index (PyPI)](https://pypi.org/project/PyScrappy/).
45 |
46 | ```sh
47 | pip install PyScrappy
48 | ```
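
A minimal quick-start sketch (the product name, page count, and output filename are illustrative placeholders; the class and method are the ones documented in `src/PyScrappy.py`, network access is assumed, and some scrappers additionally require a local Chrome installation):

```python
import PyScrappy as ps

# Create the e-commerce scrapper object documented in PyScrappy.py.
obj = ps.ECommerceScrapper()

# Scrape a couple of result pages; every scrapper returns a pandas DataFrame.
df = obj.flipkart_scrapper("headphones", 2)

# Inspect or persist the result for further analysis.
print(df.head())
df.to_csv("flipkart_headphones.csv", index=False)
```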
49 |
50 | ## Dependencies
51 |
52 | - [selenium](https://www.selenium.dev/) - Selenium is a free (open-source) automated testing framework used to validate web applications across different browsers and platforms.
53 | - [webdriver-manager](https://github.com/bonigarcia/webdrivermanager) - WebDriverManager automates the management of the driver executables (chromedriver.exe, geckodriver.exe, etc.) required by the Selenium WebDriver API. A minimal setup sketch follows this list.
54 | - [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - Beautiful Soup is a Python library for getting data out of HTML, XML, and other markup languages.
55 | - [pandas](https://pandas.pydata.org/) - Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.
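
The Selenium-based scrappers in `src/` all bootstrap a headless Chrome driver through webdriver-manager. A minimal sketch of that pattern (illustrative only, mirroring the Selenium 3-style calls used in this repository rather than a public PyScrappy API; the target URL is a placeholder):

```python
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Run Chrome headless so scraping works without opening a browser window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

# webdriver-manager downloads a matching chromedriver and returns its path,
# which is passed straight to the Chrome driver constructor.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

driver.get("https://example.com")
print(driver.title)
driver.quit()
```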
56 |
57 | ## License
58 |
59 | [MIT](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
60 |
61 | ## Getting Help
62 |
63 | For usage questions, the best place to go is [StackOverflow](https://stackoverflow.com/questions/tagged/pyscrappy).
64 | Further, general questions and discussions can also take place on GitHub in this [repository](https://github.com/mldsveda/PyScrappy).
65 |
66 | ## Discussion and Development
67 |
68 | Most development discussions take place on GitHub in this [repository](https://github.com/mldsveda/PyScrappy).
69 |
70 | Also visit the official documentation of [PyScrappy](https://pyscrappy.netlify.app/) for more information.
71 |
72 | ## Contributing to PyScrappy
73 |
74 | All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome.
75 |
76 | If you are simply looking to start working with the PyScrappy codebase, navigate to the GitHub ["issues"](https://github.com/mldsveda/PyScrappy/issues) tab and start looking through interesting issues.
77 |
78 | ## End Notes
79 |
80 | _Learn More about this package on [Medium](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b)._
81 |
82 | ### **_This package is solely made for educational and research purposes._**
83 |
--------------------------------------------------------------------------------
/src/flipkart.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | usr_agent = {
6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
9 | 'Accept-Encoding': 'none',
10 | 'Accept-Language': 'en-US,en;q=0.8',
11 | 'Connection': 'keep-alive',
12 | }
13 |
14 | def scrappi(product_name, n_pages):
15 |
16 | flip = "https://www.flipkart.com/search?q="+product_name
17 | url = requests.get(flip)
18 | soup = BeautifulSoup(url.text,"lxml")
19 |
20 | if n_pages == 0:
21 | Page = soup.find("div",class_="_2MImiq").find("span", class_="").text
22 | c = Page.split()
23 | i=((c[3].replace(',','')))
24 | print("Enter valid number of Pages between 1 and {}".format(i))
25 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: ")))
26 |
27 | initial_url = str(flip)
28 | lst_of_urls = []
29 | for i in range(1, n_pages+1):
30 | x = initial_url + '&page=' + str(i)
31 | lst_of_urls.append(x)
32 |
33 | def rectangle(soup):
34 |
35 | lst = []
36 | cnt = soup.find_all("div", {"class": "_2kHMtA"})
37 | for i in range(len(cnt)):
38 |
39 | try: name = cnt[i].find("div", {"class": "_4rR01T"}).text
40 | except: name = "None"
41 |
42 | try: Price = cnt[i].find("div", {"class":"_30jeq3 _1_WHN1"}).text
43 | except: Price = "None"
44 |
45 | try:
46 | Priceo = cnt[i].find("div", {"class": "_3I9_wc _27UcVY"}).text.split()
47 | oprice = Priceo[0]
48 | except: oprice = "None"
49 |
50 | try: Description = cnt[i].find("li", {"class": "rgWa7D"}).text
51 | except: Description = "None"
52 |
53 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text
54 | except: rating="None"
55 |
56 | lst.append([name, Price, oprice, Description, rating])
57 |
58 | return lst
59 |
60 | def Card_Style(soup):
61 |
62 | lst=[]
63 | cnt = soup.find_all("div", {"class": "_4ddWXP"})
64 | for i in range(len(cnt)):
65 |
66 | try: name = cnt[i].find("a", {"class": "s1Q9rs"}).text
67 | except: name = "None"
68 |
69 | try: Price = cnt[i].find("div", {"class": "_30jeq3"}).text
70 | except: Price = "None"
71 |
72 | try:
73 | Priceo = cnt[i].find("div", {"class": "_3I9_wc"}).text.split()
74 | oprice = Priceo[0]
75 |
76 | except: oprice = "None"
77 |
78 | try: Description = cnt[i].find("div", {"class": "_3Djpdu"}).text
79 | except: Description = "None"
80 |
81 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text
82 | except: rating="None"
83 |
84 | lst.append([name, Price, oprice, Description, rating])
85 |
86 | return lst
87 |
88 | def OtherStyle(soup):
89 |
90 | lst=[]
91 | cnt = soup.find_all("div", {"class": "_1xHGtK _373qXS"})
92 | for i in range(len(cnt)):
93 |
94 | try: name = cnt[i].find("div", {"class": "_2WkVRV"}).text
95 | except: name = "None"
96 |
97 | try: Price = cnt[i].find("div", {"class": "_30jeq3"}).text
98 | except: Price = "None"
99 |
100 | try:
101 | Priceo = cnt[i].find("div", {"class": "_3I9_wc"}).text.split()
102 | oprice = Priceo[0]
103 | except: oprice = "None"
104 |
105 | try: Description = cnt[i].find("a", {"class": "IRpwTa"}).text
106 | except: Description = "None"
107 |
108 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text
109 | except: rating="None"
110 |
111 | lst.append([name, Price, oprice, Description, rating])
112 |
113 | return lst
114 |
115 | def flipkart(soup):
116 | if len(soup.find_all("div",class_="_4ddWXP"))>=1:
117 | return Card_Style(soup)
118 | elif len(soup.find_all("div",class_="_2kHMtA"))>=1:
119 | return rectangle(soup)
120 | elif len(soup.find_all("div", {"class": "_1xHGtK _373qXS"}))>=1:
121 | return OtherStyle(soup)
122 |
123 | x = []
124 | for i in lst_of_urls:
125 | url = requests.get(i)
126 | soup = BeautifulSoup(url.text,"lxml")
127 | abc = flipkart(soup)
128 | if abc:
129 | for j in abc: x.append(j)
130 |
131 | return pd.DataFrame(x, columns =['Name', 'Price', 'Original Price', 'Description', 'Rating'])
--------------------------------------------------------------------------------
/src/stock.py:
--------------------------------------------------------------------------------
1 | from webdriver_manager.chrome import ChromeDriverManager
2 | from selenium import webdriver
3 | from time import sleep
4 | import pandas as pd
5 |
6 | ########## Stock Analysis ##########
7 | def stock_analysis(stock_code, analysis_type):
8 | analysis = {
9 | "earning estimate" : 0,
10 | "revenue estimate" : 1,
11 | "earning history" : 2,
12 | "EPS trend" : 3,
13 | "EPS revision" : 4,
14 | "growth estimate" : 5
15 | }
16 | if analysis_type not in analysis.keys(): raise KeyError("Invalid value for 'analysis_type'")
17 |
18 | chrome_options = webdriver.ChromeOptions()
19 | chrome_options.add_argument('--headless')
20 | chrome_options.headless = True
21 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
22 | driver.create_options()
23 | driver.get("https://in.finance.yahoo.com/quote/"+stock_code+"/analysis?p="+stock_code)
24 |
25 | try: tables = driver.find_elements_by_xpath('.//section[@data-test="qsp-analyst"]/table')
26 | except: raise KeyError("Invalid value for 'stock_code'")
27 |
28 |
29 | try: df = pd.DataFrame([[data.text for data in row.find_elements_by_xpath('td')] for row in tables[analysis[analysis_type]].find_elements_by_xpath('.//tbody/tr')], columns = [head.text for head in tables[analysis[analysis_type]].find_elements_by_tag_name('th')])
30 | except:
31 | print("Analysis report not Available")
32 | return None
33 | return df
34 |
35 | ########## Stock Profile ##########
36 | def stock_profile(stock_code):
37 | chrome_options = webdriver.ChromeOptions()
38 | chrome_options.add_argument('--headless')
39 | chrome_options.headless = True
40 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
41 | driver.create_options()
42 | try:
43 | url = "https://in.finance.yahoo.com/quote/"+stock_code+"/profile?p="+stock_code
44 | driver.get(url)
45 | sleep(3)
46 |
47 | card = driver.find_element_by_xpath('.//div[@id="Main"]')
48 | executives = pd.DataFrame([[data.text for data in row.find_elements_by_xpath('td')] for row in card.find_elements_by_xpath('.//tbody/tr')], columns = [i.text for i in card.find_elements_by_tag_name('th')])
49 | description = card.find_element_by_xpath('.//section/section[2]/p').text
50 | pro_card = card.find_element_by_xpath('.//div[@data-test="qsp-profile"]')
51 | profile = {}
52 | profile["Company Name"] = pro_card.find_element_by_xpath('.//h3').text
53 | profile["Headquater"] = ', '.join(pro_card.find_element_by_xpath('.//p').text.split('\n')[:3])
54 | profile["Sector"], profile["Industry"], profile["Employees"] = [i.split(': ')[1] for i in pro_card.find_element_by_xpath('.//p[2]').text.split('\n')]
55 | except:
56 | print("Profile details not Available or the 'stock_code' is Invalid")
57 | return None, None, None
58 | return [profile, description, executives]
59 |
60 |
61 | ########## Historical Data ##########
62 | def stock_history(stock_code, stock_range, stock_frequency):
63 | chrome_options = webdriver.ChromeOptions()
64 | chrome_options.add_argument('--headless')
65 | chrome_options.headless = True
66 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options)
67 | driver.create_options()
68 |
69 | try:
70 | url = "https://in.finance.yahoo.com/quote/"+stock_code+"/history?p="+stock_code
71 | driver.get(url)
72 | sleep(3)
73 |
74 | card = driver.find_element_by_xpath('//section[@data-test="qsp-historical"]/div[1]')
75 | time_period = card.find_element_by_xpath('.//div[1]/div[@data-test="dropdown"]')
76 | time_period.click()
77 | sleep(3)
78 |
79 | if stock_range == "1d": time_period.find_element_by_xpath('.//button[@data-value="1_D"]').click()
80 | elif stock_range == "5d":time_period.find_element_by_xpath('.//button[@data-value="5_D"]').click()
81 | elif stock_range == "3m":time_period.find_element_by_xpath('.//button[@data-value="3_M"]').click()
82 | elif stock_range == "6m":time_period.find_element_by_xpath('.//button[@data-value="6_M"]').click()
83 | elif stock_range == "1y":time_period.find_element_by_xpath('.//button[@data-value="1_Y"]').click()
84 | elif stock_range == "5y":time_period.find_element_by_xpath('.//button[@data-value="5_Y"]').click()
85 | elif stock_range == "max":time_period.find_element_by_xpath('.//button[@data-value="MAX"]').click()
86 | else: raise KeyError("Invalid value passed for 'stock_range'")
87 | sleep(3)
88 |
89 | frequency = card.find_element_by_xpath('.//span[@data-test="historicalFrequency-selected"]')
90 | frequency.click()
91 | sleep(3)
92 |
93 | if stock_frequency == "Daily": card.find_element_by_xpath('.//div[@data-value="1d"]').click()
94 | elif stock_frequency == "Weekly": card.find_element_by_xpath('.//div[@data-value="1wk"]').click()
95 | elif stock_frequency == "Monthly": card.find_element_by_xpath('.//div[@data-value="1mo"]').click()
96 | else: raise KeyError("Invalid value passed for 'stock_frequency'")
97 | sleep(3)
98 |
99 | card.find_element_by_xpath('.//div[1]/button[@data-reactid="25"]').click()
100 | sleep(3)
101 | except:
102 | print("Historical Data Not Available or the 'stock_code' is Invalid")
103 | return
104 |
105 | try:
106 | link = driver.find_elements_by_link_text('Download')[0].get_attribute('href')
107 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install())
108 | driver.get(link)
109 | sleep(5)
110 | print("CSV Downloaded Successfully")
111 | driver.close()
112 | except:
113 | print("CSV file not Available or the 'stock_code' is Invalid")
114 | return
--------------------------------------------------------------------------------
/src/PyScrappy.py:
--------------------------------------------------------------------------------
1 | ############## ECommerceScrapper ##############
2 | class ECommerceScrapper():
3 |
4 | """
5 |
6 |     ECommerce Scrapper: Helps in scraping data from e-commerce websites
7 | 1. Alibaba
8 | 2. Amazon
9 | 3. Flipkart
10 | 4. Snapdeal
11 |
12 | Type: class
13 |
14 | Note
15 | ------
16 |     Create an object of this class to proceed further.
17 |
18 | Example
19 | ---------
20 | >>> obj = PyScrappy.ECommerceScrapper()
21 |
22 | """
23 |
24 | ############## Alibaba Scrapper ##############
25 | def alibaba_scrapper(self, product_name, n_pages):
26 |
27 | """
28 |
29 |         Alibaba Scrapper: Helps in scraping Alibaba data ('Name', 'Price', 'Number of Items', 'Description', 'Ratings').
30 | return type: DataFrame
31 |
32 | Parameters
33 | ------------
34 | product_name: Enter the name of desired product
35 | Type: str
36 |
37 | n_pages: Enter the number of pages that you want to scrape
38 | Type: int
39 |
40 | Note
41 | ------
42 |         Both arguments are required.
43 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run.
44 |
45 | Example
46 | ---------
47 |         >>> obj.alibaba_scrapper('product', 3)
48 |         out: Name    Price    Number of Items    Description    Ratings
49 |              abc     $3.50    440                product a      3.5
50 |              aec     $4.50    240                product b      4.5
51 |
52 | """
53 |
54 | import alibaba
55 | return alibaba.scrappi(product_name, n_pages)
56 |
57 |
58 | ############## Amazon Scrapper ##############
59 | def amazon_scrapper(self, product_name, n_pages):
60 |
61 | """
62 |
63 |         Amazon Scrapper: Helps in scraping Amazon data ('Description', 'Rating', 'Votes', 'Offer Price', 'Actual Price').
64 | return type: DataFrame
65 |
66 | Parameters
67 | ------------
68 | product_name: Enter the name of desired product
69 | Type: str
70 |
71 | n_pages: Enter the number of pages that you want to scrape
72 | Type: int
73 |
74 | Note
75 | ------
76 |         Both arguments are required.
77 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run.
78 |
79 | Example
80 | ---------
81 |         >>> obj.amazon_scrapper('product', 3)
82 |         out: Description    Rating    Votes    Offer Price    Actual Price
83 |              product a      4.5       440      $340           $440
84 |              product b      4.3       240      $140           $240
85 |
86 | """
87 |
88 | import amazon
89 | return amazon.scrappi(product_name, n_pages)
90 |
91 |
92 | ############## Flipkart Scrapper ##############
93 | def flipkart_scrapper(self, product_name, n_pages):
94 |
95 | """
96 |
97 |         Flipkart Scrapper: Helps in scraping Flipkart data ('Name', 'Price', 'Original Price', 'Description', 'Rating').
98 | return type: DataFrame
99 |
100 | Parameters
101 | ------------
102 | product_name: Enter the name of the desired product, which you want to scrape the data of
103 | Type: str
104 |
105 | n_pages: Enter the number of pages that you want to scrape
106 | Type: int
107 |
108 | Note
109 | ------
110 |         Both arguments are required.
111 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run.
112 |
113 | Example
114 | ---------
115 | >>> obj.flipkart_scrapper("Product Name", 3)
116 | out: Name Price Original Price Description Rating
117 | abc ₹340 ₹440 Product 4.2
118 | aec ₹140 ₹240 Product 4.7
119 |
120 | """
121 |
122 | import flipkart
123 | return flipkart.scrappi(product_name, n_pages)
124 |
125 |
126 | ############## Snapdeal Scrapper ##############
127 | def snapdeal_scrapper(self, product_name, n_pages):
128 |
129 | """
130 |
131 |         Snapdeal Scrapper: Helps in scraping Snapdeal data ('Name', 'Price', 'Original Price', 'Number of Ratings').
132 | return type: DataFrame
133 |
134 | Parameters
135 | ------------
136 | product_name: Enter the name of the desired product, which you want to scrape the data of
137 | Type: str
138 |
139 | n_pages: Enter the number of pages that you want to scrape
140 | Type: int
141 |
142 | Note
143 | ------
144 | Both arguments are compulsory.
145 | If n_pages == 0, a prompt will ask you to enter a valid page number and the scrapper will re-run.
146 |
147 | Example
148 | ---------
149 | >>> obj.snapdeal_scrapper('product', 3)
150 | out: Name Price Original Price Number of Ratings
151 | abc ₹340 ₹440 40
152 | aec ₹140 ₹240 34
153 |
154 | """
155 |
156 | import snapdeal
157 | return snapdeal.scrappi(product_name, n_pages)
158 |
159 | ########################################################################################################################
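# --- Hedged usage sketch (illustrative only, not part of the public API). ---
# It assumes the column layouts documented in the docstrings above and that
# pandas is installed; the product name "laptop" and the page counts are
# placeholders.
def _example_ecommerce_usage():
    """Combine results from two e-commerce scrappers into one DataFrame."""
    import pandas as pd
    obj = ECommerceScrapper()  # outside this module: PyScrappy.ECommerceScrapper()
    frames = [
        obj.flipkart_scrapper("laptop", 2),   # 'Name', 'Price', 'Original Price', 'Description', 'Rating'
        obj.snapdeal_scrapper("laptop", 2),   # 'Name', 'Price', 'Original Price', 'Number of Ratings'
    ]
    # Columns present on only one site become NaN for the other's rows.
    return pd.concat(frames, ignore_index=True, sort=False)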
160 |
161 | ############## FoodScrapper ##############
162 | class FoodScrapper():
163 |
164 | """
165 |
166 | Food Scrapper: Helps in scraping data from food delivery websites
167 | 1. Swiggy
168 | 2. Zomato
169 |
170 | Type: class
171 |
172 | Note
173 | ------
174 | Create an object of this class to proceed further.
175 |
176 | Example
177 | ---------
178 | >>> obj = PyScrappy.FoodScrapper()
179 |
180 | """
181 |
182 | ############## Swiggy Scrapper ##############
183 | def swiggy_scrapper(self, city, n_pages):
184 |
185 | """
186 |
187 | Swiggy Scrapper: Helps in scraping Swiggy data ('Name', 'Cuisine', 'Price', 'Rating').
188 | return type: DataFrame
189 |
190 | Parameters
191 | ------------
192 | city: Enter the name of the desired city where Swiggy delivers
193 | Type: str
194 |
195 | n_pages: Enter the number of pages that you want to scrape
196 | Type: int
197 |
198 | Note
199 | ------
200 | Both arguments are compulsory.
201 |
202 | Example
203 | ---------
204 | >>> obj.swiggy_scrapper('city', 3)
205 | out: Name Cuisine Price Rating
206 | abc indian ₹123 for two 4.5
207 | xyz indian ₹342 for two 4.3
208 |
209 | """
210 |
211 | import swiggy
212 | return swiggy.scrappi(city, n_pages)
213 |
214 |
215 | ############## Zomato Scrapper ##############
216 | def zomato_scrapper(self, city, n_pages):
217 |
218 | """
219 |
220 | Zomato Scrapper: Helps in scraping Zomato data ("Name", "Cusine", "Price", "Rating", "Delivery Time", "Review counts").
221 | return type: DataFrame
222 |
223 | Parameters
224 | ------------
225 | city: Enter the name of the desired city where Zomato delivers
226 | Type: str
227 |
228 | n_pages: Enter the number of pages that you want to scrape
229 | Type: int
230 |
231 | Note
232 | ------
233 | Both arguments are compulsory.
234 |
235 | Example
236 | ---------
237 | >>> obj.zomato_scrapper('city', 3)
238 | out: Name Cuisine Price Rating Delivery Time Review Count
239 | abc indian ₹123 for two 4.5 45min 3.4K
240 | xyz indian ₹342 for two 4.3 40min 1.2K
241 |
242 | """
243 |
244 | import zomato
245 | return zomato.scrappi(city, n_pages)
246 |
247 | ########################################################################################################################
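# --- Hedged usage sketch (illustrative only): filter Swiggy results by rating.
# Assumes the 'Rating' column documented above; 'Bangalore' is a placeholder city.
def _example_food_usage(min_rating=4.0):
    """Return Swiggy restaurants whose rating parses to >= min_rating."""
    import pandas as pd
    obj = FoodScrapper()
    df = obj.swiggy_scrapper("Bangalore", 2)
    ratings = pd.to_numeric(df["Rating"], errors="coerce")  # non-numeric ratings become NaN
    return df[ratings >= min_rating]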
248 |
249 | ############## Image Scrapper ##############
250 | def image_scrapper(data_name, n_images=10, img_format='jpg', folder_name='images'):
251 |
252 | """
253 |
254 | Image Scrapper: Helps in scraping images from "Google", "Yahoo", "Bing".
255 | Downloads it to the desired folder.
256 |
257 | Parameters
258 | ------------
259 | data_name: Enter the name of the object/item whose images you want to scrape/download
260 | Type: str
261 |
262 | n_images: Enter the number of images you want to scrape or download
263 | Type: int
264 | Default: 10
265 |
266 | img_format: Enter the format of the image file
267 | Type: str
268 | Default: 'jpg'
269 | Accepted Values: 'jpg', 'jpeg', 'png', 'gif'
270 |
271 | folder_name: Enter the path/folder name where you want to download the images
272 | Type: str
273 | Default: 'images'
274 |
275 | Note
276 | ------
277 | Make sure data_name is a valid name, and if you enter a directory, make sure it is a valid one.
278 | The scrapper will take some time to work. Wait for the images to get scraped and downloaded, as it scrapes from all three search engines: Google, Yahoo and Bing.
279 |
280 | Feel free to experiment with different image formats.
281 |
282 | Example
283 | ---------
284 | >>> image_scrapper('Apple', 100, 'png', 'Apples')
285 | out: Starting to download...
286 | Successfully downloaded 100 images
287 |
288 | """
289 |
290 | import image
291 | return image.scrappi(data_name, n_images, img_format, folder_name)
292 |
293 | ########################################################################################################################
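# --- Hedged usage sketch (illustrative only): download a few images and list
# what was written. Assumes the folder is created relative to the current
# working directory; 'Apples' and the count are placeholders.
def _example_image_usage():
    """Scrape a handful of images and return the file names that were saved."""
    import os
    image_scrapper("Apple", n_images=5, img_format="png", folder_name="Apples")
    return os.listdir("Apples")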
294 |
295 | ############## IMDB Scrapper ##############
296 | def imdb_scrapper(genre, n_pages):
297 |
298 | """
299 |
300 | IMDB Scrapper: Helps in scraping movie data from IMDB.
301 | return type: DataFrame
302 |
303 | Parameters
304 | ------------
305 | genre: Enter the genre of the movie
306 | Type: str
307 |
308 | n_pages: Enter the number of pages that it will scrape in a single run.
309 | Type: int
310 |
311 | Note
312 | ------
313 | Both parameters are compulsory.
314 |
315 | Example
316 | ---------
317 | >>> imdb_scrapper('action', 4)
318 | out: Title Year Certificate Runtime Genre Rating Description Stars Directors Votes
319 | asd 2022 UA 49min action 3.9 about the.. asd dfgv 23
320 | scr 2022 15+ 89min action 4.9 about the.. add dfgv 23
321 | """
322 |
323 | import imdb
324 | return imdb.scrappi(genre, n_pages)
325 |
326 | ########################################################################################################################
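# --- Hedged usage sketch (illustrative only): rank scraped movies by rating.
# Assumes the 'Rating' column documented above; genre and page count are placeholders.
def _example_imdb_usage():
    """Return two pages of action movies, highest rated first."""
    import pandas as pd
    df = imdb_scrapper("action", 2)
    df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")  # e.g. '3.9' -> 3.9
    return df.sort_values("Rating", ascending=False)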
327 |
328 | ############## LinkedIn Scrapper ##############
329 | def linkedin_scrapper(job_title, n_pages):
330 |
331 | """
332 |
333 | LinkedIn Scrapper: Helps in scraping job-related data from LinkedIn (Job Title, Company Name, Location, Salary, Benefits, Date)
334 | return type: DataFrame
335 |
336 | Parameters
337 | ------------
338 | job_title: Enter the job title or type.
339 | Type: str
340 |
341 | n_pages: Enter the number of pages that it will scrape in a single run.
342 | Type: int
343 |
344 | Note
345 | ------
346 | Both parameters are compulsory.
347 |
348 | Example
349 | ---------
350 | >>> linkedin_scrapper('python', 1)
351 | out: Job Title Company Name Location Salary Benefits Date
352 | abc PyScrappy US 2300 Actively Hiring +1 1 day ago
353 | abc PyScrappy US 2300 Actively Hiring +1 1 day ago
354 | ...
355 | ..
356 |
357 | """
358 |
359 | import linkedin
360 | return linkedin.scrappi(job_title, n_pages)
361 |
362 | ########################################################################################################################
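# --- Hedged usage sketch (illustrative only): keep postings from one location.
# Assumes the 'Location' column documented above; 'US' and 'python' are placeholders.
def _example_linkedin_usage(location="US"):
    """Return python job postings whose Location contains the given string."""
    df = linkedin_scrapper("python", 1)
    return df[df["Location"].str.contains(location, case=False, na=False)]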
363 |
364 | ############## News Scrapper ##############
365 | def news_scrapper(n_pages, genre = str()):
366 |
367 | """
368 |
369 | News Scrapper: Helps in scraping news data (Headlines, Time, Date, News)
370 | return type: DataFrame
371 |
372 | Parameters
373 | ------------
374 | n_pages: Enter the number of pages that it will scrape in a single run.
375 | Type: int
376 |
377 | genre: Enter the news genre
378 | Type: str
379 | Default: str() (an empty string, i.e. no specific genre)
380 | Values accepted:
381 | 'national', 'business', 'sports', 'world', 'politics', 'technology', 'startup', 'entertainment',
382 | 'miscellaneous', 'hatke', 'science', 'automobile'
383 |
384 | Note
385 | ------
386 | n_pages is compulsory.
387 |
388 | Example
389 | ---------
390 | >>> news_scrapper(3, 'hatke')
391 | out: Headlines Time Date News
392 | New Package 08:19 pm 25 Jun 2021,Sunday PyScrappy is a new package...
393 | New Scrapper 08:19 am 25 Jun 2020,Wednesday PyScrappy is a new Scrapper...
394 |
395 | """
396 |
397 | import news
398 | return news.scrappi(n_pages, genre = genre)
399 |
400 | ########################################################################################################################
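# --- Hedged usage sketch (illustrative only): keyword-filter the headlines.
# Assumes the 'Headlines' column documented above; the genre and keyword are placeholders.
def _example_news_usage(keyword="cricket"):
    """Return sports news whose headline mentions the keyword."""
    df = news_scrapper(2, genre="sports")
    return df[df["Headlines"].str.contains(keyword, case=False, na=False)]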
401 |
402 | ############## Social Media Scrapper ##############
403 | class SocialMediaScrapper():
404 |
405 | """
406 |
407 | Social Media Scrapper: Helps in scraping data from social media platforms
408 | 1. Instagram
409 | 2. Twitter
410 | 3. YouTube
411 |
412 | Type: class
413 |
414 | Note
415 | ------
416 | Create an object of this class to proceed further.
417 |
418 | Example
419 | ---------
420 | >>> obj = PyScrappy.SocialMediaScrapper()
421 |
422 | """
423 |
424 | ############## Instagram Scrapper ##############
425 | class InstagramScrapper():
426 |
427 | """
428 |
429 | Instagram Scrapper: Helps in scraping Instagram data (name, posts, followers, following, bio, captions)
430 | 1. Details and post captions based on Insta handle
431 | 2. Post captions based on #hashtags
432 |
433 | Type: class
434 |
435 | Note
436 | ------
437 | Create an object of this class to proceed further.
438 |
439 | Example
440 | ---------
441 | >>> obj2 = obj.InstagramScrapper()
442 |
443 | """
444 |
445 | ############## Instagram account Scrapper ##############
446 | def account_scrapper(self, insta_handle, n_pages):
447 |
448 | """
449 |
450 | Instagram account Scrapper: Helps in scraping Instagram data (name, posts, followers, following, bio, captions)
451 | return type: DataFrame (for captions)
452 |
453 | Parameters
454 | ------------
455 | insta_handle: Enter the desired Insta handle/username
456 | Type: str
457 |
458 | n_pages: Enter the number of pages that you want to scrape
459 | Type: int
460 |
461 | Note
462 | ------
463 | Make sure the Instagram account is public. After a certain number of runs, Instagram will ask for your Instagram ID and password; enter them to continue.
464 |
465 | Example
466 | ---------
467 | >>> obj2.account_scrapper('Public_account_name', 3)
468 | out: Name: abc
469 | Posts: 50
470 | Followers: 128
471 | Following: 150
472 | Bio: Hello World!!
473 |
474 | Captions
475 | Hello World !!! My first picture.
476 | Hello World !!! My first program....
477 |
478 | """
479 |
480 | import instagram
481 | return instagram.account(insta_handle, n_pages)
482 |
483 |
484 | ############## Instagram hashtag Scrapper ##############
485 | def hashtag_scrapper(self, hashtag, n_posts):
486 |
487 | """
488 |
489 | Instagram hashtag Scrapper: Helps in scraping Instagram data (captions)
490 | return type: DataFrame
491 |
492 | Parameters
493 | ------------
494 | hashtag: Enter the desired hashtag
495 | Type: str
496 |
497 | n_posts: Enter the number of posts that you want to scrape
498 | Type: int
499 |
500 | Note
501 | ------
502 | After a certain number of runs, Instagram will ask for your Instagram ID and password; enter them to continue.
503 |
504 | Example
505 | ---------
506 | >>> obj2.hashtag_scrapper('#python', 3)
507 | out: Captions
508 | Hello World !!! My first picture. #python
509 | Hello World !!! My first program. #python
510 | This is scrapping package. #python
511 |
512 | """
513 |
514 | import instagram
515 | return instagram.hashtag(hashtag, n_posts)
516 |
517 |
518 | ############## Twitter Scrapper ##############
519 | def twitter_scrapper(self, hashtag, n_pages):
520 |
521 | """
522 |
523 | Twitter Scrapper: Helps in scraping data from Twitter ("Name", "Twitter handle", "Post Time", "Tweet", "Reply Count", "Retweet Count", "Like Count")
524 | return type: DataFrame
525 |
526 | Parameters
527 | ------------
528 | hashtag: Enter the desired hashtag
529 | Type: str
530 |
531 | n_pages: Enter the number of pages that you want to scrape
532 | Type: int
533 |
534 | Note
535 | ------
536 | Both arguments are compulsory.
537 |
538 | Example
539 | ---------
540 | >>> obj.twitter_scrapper('#python', 3)
541 | out: Name Twitter handle Post Time Tweet Reply Count Retweet Count Like Count
542 | asd @ksnkj 3:49:36 this is ... 102 230 1.2k
543 | fsd @ksdtj 6:49:36 it is a ... 12 30 1k
544 |
545 | """
546 |
547 | import twitter
548 | return twitter.scrappi(hashtag, n_pages)
549 |
550 |
551 | ############## YouTube Scrapper ##############
552 | def youtube_scrapper(self, video_sec_url, n_pages):
553 |
554 | """
555 |
556 | YouTube Scrapper: Helps in scraping YouTube data ('Title', 'Video_url', 'Views', 'Days')
557 | return type: DataFrame
558 |
559 | Parameters
560 | ------------
561 | video_sec_url: Enter the desired YouTube URL (only video section)
562 | Type: str
563 |
564 | n_pages: The number of pages that it will scrape in a single run
565 | Type: int
566 |
567 | Note
568 | ------
569 | Make sure the URL is a valid YouTube URL ending with 'videos', i.e. only URLs from a channel's video section are accepted. There is no limit on the number of pages that can be scraped.
570 |
571 | Example
572 | ---------
573 | >>> obj.youtube_scrapper('https://www.youtube.com/user/youtuber_name/videos', 3)
574 | out: Title Video_url Views Days
575 | My video https://www.youtube.com/user/youtuber_name/my_video 1.2m 30 days
576 | My video2 https://www.youtube.com/user/youtuber_name/my_video2 1m 2 weeks
577 |
578 | """
579 |
580 | import youtube
581 | return youtube.scrappi(video_sec_url, n_pages)
582 |
583 | ########################################################################################################################
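# --- Hedged usage sketch (illustrative only): InstagramScrapper is a nested
# class, so it is instantiated from a SocialMediaScrapper object. The handle
# and channel URL below are placeholders.
def _example_social_media_usage():
    """Scrape one public Instagram account and one YouTube video section."""
    obj = SocialMediaScrapper()
    insta = obj.InstagramScrapper()
    captions = insta.account_scrapper("some_public_handle", 1)
    videos = obj.youtube_scrapper("https://www.youtube.com/user/some_channel/videos", 1)
    return captions, videos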
584 |
585 | ############## Song Scrapper ##############
586 | class SongScrapper():
587 |
588 | """
589 |
590 | Song Scrapper: Helps in scraping song-related data
591 | 1. Soundcloud
592 |
593 | Type: class
594 |
595 | Note
596 | ------
597 | Create an object of this class to proceed further.
598 |
599 | Example
600 | ---------
601 | >>> obj = PyScrappy.SongScrapper()
602 |
603 | """
604 |
605 | ############## Soundcloud Scrapper ##############
606 | def soundcloud_scrapper(self, track_name, n_pages):
607 |
608 | """
609 |
610 | Soundcloud Scrapper: Helps in scraping data from SoundCloud ('Uploader', 'Music Title', 'Time of Upload', 'Plays')
611 | return type: DataFrame
612 |
613 | Parameters
614 | ------------
615 | track_name: Enter the name of the desired track/song/music
616 | Type: str
617 |
618 | n_pages: The number of pages that it will scrape in a single run
619 | Type: int
620 |
621 | Note
622 | ------
623 | Make sure to enter a valid name
624 |
625 | Example
626 | ---------
627 | >>> obj.soundcloud_scrapper('music track', 3)
628 | out: Uploader Music Title Time of Upload Plays
629 | name1 music 3:34:76 234
630 | name2 music 5:6:34 445
631 |
632 | """
633 |
634 | import soundcloud
635 | return soundcloud.soundcloud_tracks(track_name, n_pages)
636 |
637 |
638 | ############## Spotify Scrapper ##############
639 | def spotify_scrapper(self, track_name, n_pages):
640 |
641 | """
642 |
643 | Spotify Scrapper: Helps in scraping data from Spotify ('Id', 'Title', 'Singers', 'Album', 'Duration')
644 | return type: DataFrame
645 |
646 | Parameters
647 | ------------
648 | track_name: Enter the name of the desired track/song/music/artist/podcast
649 | Type: str
650 |
651 | n_pages: The number of pages that it will scrape in a single run
652 | Type: int
653 |
654 | Note
655 | ------
656 | Make sure to enter a valid name
657 |
658 | Example
659 | ---------
660 | >>> obj.spotify_scrapper('pop', 3)
661 | out: Id Title Singers Album Duration
662 | 1 abc abc abc 2:30
663 | 2 def def def 2:30
664 |
665 | """
666 |
667 | import spotify
668 | return spotify.scrappi(track_name, n_pages)
669 |
670 | ########################################################################################################################
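# --- Hedged usage sketch (illustrative only): run the same search term through
# both supported sources. 'lofi' is a placeholder.
def _example_song_usage(term="lofi"):
    """Return (SoundCloud results, Spotify results) for one search term."""
    obj = SongScrapper()
    return obj.soundcloud_scrapper(term, 1), obj.spotify_scrapper(term, 1)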
671 |
672 | ############## stock Scrapper ##############
673 | class StockScrapper():
674 |
675 | """
676 |
677 | Stock Scrapper: Helps in scraping stock data
678 | 1. Analysis data of the stock
679 | 2. Historical data of the stock
680 | 3. Profile data of the stock
681 |
682 | Type: class
683 |
684 | Note
685 | ------
686 | Create an object of this class to proceed further.
687 |
688 | Example
689 | ---------
690 | >>> obj = PyScrappy.StockScrapper()
691 |
692 | """
693 |
694 | ############## Analysis data scrapper ##############
695 | def analysis_data_scrapper(self, stock_code, analysis_type):
696 |
697 | """
698 |
699 | Analysis data scrapper: Helps in scraping the analytical data of the stock
700 | return type: DataFrame
701 |
702 | Parameters
703 | ------------
704 | stock_code: Enter the desired stock code
705 | Type: str
706 |
707 | analysis_type: Enter the name of the analysis type of the stock
708 | Type: str
709 | Accepted values: "earning estimate", "revenue estimate", "earning history", "EPS trend", "EPS revision", "growth estimate"
710 |
711 | Note
712 | ------
713 | Make sure you enter a valid stock code.
714 |
715 | Example
716 | ---------
717 | >>> obj.analysis_data_scrapper('STOCK_CODE', 'earning estimate')
718 |
719 | """
720 |
721 | import stock
722 | return stock.stock_analysis(stock_code, analysis_type)
723 |
724 |
725 | ############## Historical data scrapper ##############
726 | def historical_data_scrapper(self, stock_code, time_period, frequency):
727 |
728 | """
729 |
730 | Historical data scrapper: Helps in scraping the historical data of the stock ('Date', 'Open', 'High', 'Low', 'Close', 'Adjusted Close', 'Volume')
731 | return type: CSV file
732 |
733 | Parameters
734 | ------------
735 | stock_code: Enter the desired stock code
736 | Type: str
737 |
738 | time_period: Enter the range/time period of the stock
739 | Type: str
740 | Accepted values: '1d', '5d', '3m', '6m', '1y', '5y', 'max'
741 |
742 | frequency: Enter the time period interval of the data
743 | Type: str
744 | Accepted values: 'Daily', 'Weekly', 'Monthly'
745 |
746 | Note
747 | ------
748 | Make sure you enter a valid stock code, time period and time interval
749 |
750 | Example
751 | ---------
752 | >>> obj.historical_data_scrapper('STOCK_CODE', '1y', 'Daily')
753 |
754 | """
755 |
756 | import stock
757 | return stock.stock_history(stock_code, time_period, frequency)
758 |
759 |
760 | ############## Profile details scrapper ##############
761 | def profile_data_scrapper(self, stock_code):
762 |
763 | """
764 |
765 | Profile data scrapper: Helps in scraping the profile data of the stock (profile, description, executives)
766 | return type: [dict, str, DataFrame]
767 |
768 | Parameters
769 | ------------
770 | stock_code: Enter the desired stock code
771 | Type: str
772 |
773 | Note
774 | ------
775 | Make sure you enter a valid stock code.
776 | Make sure to unpack the result into three variables, as shown in the example below (a usage sketch also follows this class).
777 |
778 | Example
779 | ---------
780 | >>> profile, description, executives = obj.profile_data_scrapper('STOCK_CODE')
781 |
782 | """
783 |
784 | import stock
785 | return stock.stock_profile(stock_code)
786 |
787 | ########################################################################################################################
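# --- Hedged usage sketch (illustrative only): note the three-way unpacking of
# profile_data_scrapper's return value. 'MSFT' is a placeholder stock code.
def _example_stock_usage():
    """Fetch analysis data and unpack the profile result into its three parts."""
    obj = StockScrapper()
    estimates = obj.analysis_data_scrapper("MSFT", "earning estimate")
    profile, description, executives = obj.profile_data_scrapper("MSFT")
    return estimates, profile, description, executives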
788 |
789 | ############## Wikipedia Scrapper ##############
790 | class WikipediaScrapper():
791 |
792 | """
793 |
794 | Wikipedia Scrapper: Helps in scraping Wikipedia data
795 | 1. Header
796 | 2. Paragraph
797 | 3. Text
798 |
799 | Type: class
800 |
801 | Note
802 | ------
803 | Create an object of this class to proceed further.
804 |
805 | Example
806 | ---------
807 | >>> obj = PyScrappy.WikipediaScrapper()
808 |
809 | """
810 |
811 | ############## Wikipedia Paragraph Scrapper ##############
812 | def para_scrapper(self, word):
813 |
814 | """
815 |
816 | Para Scrapper: Helps in scraping paragraphs from Wikipedia.
817 |
818 | Parameters
819 | ------------
820 | word: Enter the desired keyword
821 | Type: str
822 |
823 | Note
824 | ------
825 | Make sure that information about the keyword is available on Wikipedia.
826 |
827 | Example
828 | ---------
829 | >>> obj.para_scrapper("Python (programming language)")
830 | out: ['\n',
831 | "Python is an interpreted high-level general-purpose programming language.", .....]
832 |
833 | """
834 |
835 | import wikipedia
836 | return wikipedia.para(word)
837 |
838 | ############## Wikipedia Header Scrapper ##############
839 | def header_scrapper(self, word):
840 |
841 | """
842 |
843 | Header Scrapper: Helps in scraping headers from Wikipedia.
844 |
845 | Parameters
846 | ------------
847 | word: Enter the desired keyword
848 | Type: str
849 |
850 | Note
851 | ------
852 | Make sure that information about the keyword is available on Wikipedia.
853 |
854 | Example
855 | ---------
856 | >>> obj.header_scrapper("Python (programming language)")
857 | out: ['History',
858 | 'Design philosophy and features', ....]
859 |
860 | """
861 |
862 | import wikipedia
863 | return wikipedia.header(word)
864 |
865 | ############## Wikipedia Text Scrapper ##############
866 | def text_scrapper(self, word):
867 |
868 | """
869 |
870 | Text Scrapper: Helps in scraping text from Wikipedia.
871 |
872 | Parameters
873 | ------------
874 | word: Enter the desired keyword
875 | Type: str
876 |
877 | Note
878 | ------
879 | Make sure that information about the keyword is available on Wikipedia.
880 |
881 | Example
882 | ---------
883 | >>> obj.text_scrapper("Python (programming language)")
884 | out: ' History Python is an interpreted high-level general-purpose programming language..... '
885 |
886 | """
887 |
888 | import wikipedia
889 | return wikipedia.text(word)
890 |
891 | ########################################################################################################################
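# --- Hedged usage sketch (illustrative only): a quick manual check of the
# Wikipedia scrapper when this module is run directly.
if __name__ == "__main__":
    _wiki = WikipediaScrapper()
    # Print the section headers of one well-known article.
    print(_wiki.header_scrapper("Python (programming language)"))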
--------------------------------------------------------------------------------