├── src
│   ├── __init__.py
│   ├── swiggy.py
│   ├── zomato.py
│   ├── soundcloud.py
│   ├── spotify.py
│   ├── youtube.py
│   ├── linkedin.py
│   ├── wikipedia.py
│   ├── twitter.py
│   ├── news.py
│   ├── image.py
│   ├── snapdeal.py
│   ├── imdb.py
│   ├── alibaba.py
│   ├── amazon.py
│   ├── instagram.py
│   ├── flipkart.py
│   ├── stock.py
│   └── PyScrappy.py
├── PyScrappy.png
├── LICENSE
├── .github
│   └── workflows
│       └── python-publish.yml
├── setup.py
└── README.md
/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /PyScrappy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mldsveda/PyScrappy/HEAD/PyScrappy.png -------------------------------------------------------------------------------- /src/swiggy.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | def scrappi(city, n_pages): 6 | lst_of_urls = [] 7 | for i in range(1, n_pages+1): lst_of_urls.append("https://www.swiggy.com/" + city + '?page=' + str(i)) 8 | 9 | def swiggy(soup): 10 | main = soup.find_all('div', {'class': 'nDVxx'})[0] 11 | lst = [] 12 | for details in main.find_all('div', {'class': '_3XX_A'}): 13 | dictionary = {} 14 | dictionary['Name'] = details.find('div', {"class": "nA6kb"}).text 15 | dictionary['Cuisine'] = details.find('div', {"class": "_1gURR"}).text 16 | dictionary['Price'] = details.find('div', {"class": "nVWSi"}).text 17 | dictionary['Rating'] = details.find('div', {"class": "_9uwBC"}).text 18 | lst.append(dictionary) 19 | return lst 20 | 21 | x = [] 22 | for i in lst_of_urls: 23 | try: url = requests.get(i) 24 | except: raise ValueError("Invalid value passed for 'city'") 25 | soup = BeautifulSoup(url.text, "lxml") 26 | x.extend(swiggy(soup)) 27 | 28 | return pd.DataFrame(x) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vedant Tibrewal, Vedaant Singh. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import setuptools 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="PyScrappy", 9 | version="0.1.1", 10 | author="Vedant Tibrewal, Vedaant Singh", 11 | author_email="mlds93363@gmail.com", 12 | description="Powerful web scraping tool.", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/mldsveda/PyScrappy", 16 | keywords=['PyScrappy', 'Scraping', 'E-Commerce', 'Wikipedia', 'Image Scrapper', 'YouTube', 'Scrapy', 'Twitter', 'Social Media', 'Web Scraping', 'News', 'Stocks', 'Songs', 'Food', 'Instagram', 'Movies'], 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | python_requires=">=3.6", 23 | py_modules=["PyScrappy", "alibaba", "amazon", "flipkart", "image", "imdb", "instagram", "linkedin", "news", "snapdeal", "soundcloud", "spotify", "stock", "swiggy", "twitter", "wikipedia", "youtube", "zomato"], 24 | package_dir={"": "src"}, 25 | install_requires=[ 26 | 'selenium', 27 | 'webdriver-manager', 28 | 'beautifulsoup4', 29 | 'requests', 30 | 'pandas', 31 | ], 32 | packages=setuptools.find_packages(where="src") 33 | ) 34 | -------------------------------------------------------------------------------- /src/zomato.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | from selenium import webdriver 4 | import time 5 | 6 | def scrappi(city, n_pages): 7 | if n_pages == 0: raise ValueError("'n_pages' must be greater than 0") 8 | city = city.replace(' ', '-') 9 | driver = webdriver.Chrome(ChromeDriverManager().install()) 10 | driver.get("https://www.zomato.com/"+city+"/restaurants") 11 | 12 | for _ in range(n_pages): 13 | driver.execute_script('window.scrollBy(0, window.innerHeight);') 14 | time.sleep(4) 15 | 16 | def 
zomato(card): 17 | ls = [] 18 | try: name = card.find_element_by_xpath('.//div/a[2]/div/p').text 19 | except: name = None 20 | try: cusine = card.find_element_by_xpath('.//div/a[2]/p').text 21 | except: cusine = None 22 | try: rating = card.find_element_by_xpath('.//div/a[2]/div[2]/section').get_attribute('value') 23 | except: rating = None 24 | try: price, delivery_time = card.find_element_by_xpath('.//div/a[2]/p[2]').text.split('\n') 25 | except: price, delivery_time = None, None 26 | try: reviews_count = card.find_element_by_xpath('.//div/a[2]/div[2]/section/div[2]').text[1:-1] 27 | except: reviews_count = None 28 | ls.extend([name, cusine, price, rating, delivery_time, reviews_count]) 29 | return ls 30 | 31 | new_ls = [] 32 | try: cards = driver.find_elements_by_class_name('jumbo-tracker') 33 | except: raise KeyError("Invalid value for 'city'") 34 | for card in cards: 35 | new_ls.append(zomato(card)) 36 | 37 | driver.close() 38 | return pd.DataFrame(new_ls, columns = ["Name", "Cusine", "Price", "Rating", "Delivery Time", "Review counts"]) -------------------------------------------------------------------------------- /src/soundcloud.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium import webdriver 5 | from bs4 import BeautifulSoup 6 | import time, re 7 | 8 | def soundcloud_tracks(track_name, n_pages): 9 | chrome_options = webdriver.ChromeOptions() 10 | chrome_options.add_argument('--headless') 11 | chrome_options.headless = True 12 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line = False).install(), options = chrome_options) 13 | driver.create_options() 14 | driver.get('https://soundcloud.com/search/sounds?q='+track_name) 15 | 16 | for _ in range(n_pages): 17 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 18 | time.sleep(3) 19 | 20 | html = driver.page_source 21 | soup = BeautifulSoup(html, 'html.parser') 22 | main = soup.find_all("div", {"class": "search"})[0] 23 | songs = main.find_all('div', {'class': 'sound__content'}) 24 | lst = [] 25 | for song in songs: 26 | dictionary = {} 27 | dictionary['Uploader'] = song.find('span', {"class": "soundTitle__usernameText"}).text 28 | dictionary['Uploader'] = re.sub('[^a-zA-Z]', '', dictionary['Uploader']) 29 | dictionary['Music Title'] = (song.find('a', {"class": "soundTitle__title sc-link-dark sc-link-secondary"}).text).replace('\n', '') 30 | dictionary['Time of Upload'] = (song.find('span', {"class": "sc-visuallyhidden"}).text).replace('\n', '') 31 | dictionary['Plays'] = song.find('span', {"class": "sc-ministats"}).text 32 | dictionary['Plays'] = re.sub('[^0-9,]', '', dictionary['Plays'])[:-3] 33 | lst.append(dictionary) 34 | 35 | driver.close() 36 | return pd.DataFrame(lst) -------------------------------------------------------------------------------- /src/spotify.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import sleep 3 | from webdriver_manager.chrome import ChromeDriverManager 4 | from selenium import webdriver 5 | 6 | def func(last, tracks): 7 | data = [] 8 | for track in tracks: 9 | try: 10 | temp = (track.find_element_by_xpath("./div[@data-testid='tracklist-row']").text).split("\n") 11 | if last < int(temp[0]): 12 | if 'E' in temp: temp.remove('E') 13 | data.append(temp) 14 | except: pass 15 | return data 16 | 17 | def 
scrappi(track_type, n_pages): 18 | while n_pages<= 0: 19 | n_pages = int(input("Enter a valid 'n_pages', greater than 0: ")) 20 | 21 | chrome_options = webdriver.ChromeOptions() 22 | chrome_options.add_argument('--headless') 23 | chrome_options.headless = True 24 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 25 | driver.create_options() 26 | 27 | driver.get(f"https://open.spotify.com/search/{track_type}/tracks") 28 | sleep(4) 29 | 30 | data = [] 31 | last = 0 32 | for i in range(n_pages): 33 | main = driver.find_element_by_xpath(".//div[@data-testid='track-list']/div[2]/div[2]") 34 | data.extend(func(last, main.find_elements_by_xpath("./div"))) 35 | last = int(data[-1][0]) 36 | try: 37 | scroll = main.find_element_by_xpath("./div[last()]").location_once_scrolled_into_view 38 | sleep(4) 39 | except: 40 | try: 41 | scroll = main.find_element_by_xpath("./div[last()]").location_once_scrolled_into_view 42 | sleep(8) 43 | except: 44 | pass 45 | 46 | return pd.DataFrame(data, columns=["Id", "Title", "Singers", "Album", "Duration"]) -------------------------------------------------------------------------------- /src/youtube.py: -------------------------------------------------------------------------------- 1 | try: 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | except: raise ImportError("'webdriver-manager' package not installed") 4 | try: 5 | from selenium.webdriver.common.keys import Keys 6 | from selenium import webdriver 7 | except: raise ImportError("'selenium' package not installed") 8 | from bs4 import BeautifulSoup 9 | import pandas as pd 10 | import time 11 | 12 | usr_agent = { 13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 15 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 16 | 'Accept-Encoding': 'none', 17 | 'Accept-Language': 'en-US,en;q=0.8', 18 | 'Connection': 'keep-alive', 19 | } 20 | 21 | def scrappi(url, n_pages): 22 | 23 | chrome_options = webdriver.ChromeOptions() 24 | chrome_options.add_argument('--headless') 25 | chrome_options.headless = True 26 | driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options) 27 | driver.create_options() 28 | driver.get(url) 29 | 30 | for _ in range(n_pages): 31 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 32 | time.sleep(3) 33 | 34 | html = driver.page_source 35 | soup = BeautifulSoup(html, 'html.parser') 36 | videos = soup.find_all("div", {"id": "dismissible"}) 37 | lst = [] 38 | 39 | for video in videos: 40 | dictionary = {} 41 | dictionary['Title'] = video.find("a", {"id": "video-title"}).text 42 | dictionary['Video_url'] = "https://www.youtube.com/" + video.find("a", {"id": "video-title"})['href'] 43 | meta = video.find("div", {"id": "metadata-line"}).find_all('span') 44 | dictionary['Views'] = meta[0].text 45 | dictionary['Days'] = meta[1].text 46 | 47 | lst.append(dictionary) 48 | 49 | return pd.DataFrame(lst) -------------------------------------------------------------------------------- /src/linkedin.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import sleep 3 | from webdriver_manager.chrome import ChromeDriverManager 4 | from selenium import webdriver 5 | 6 | def func(post): 7 | try: title = post.find_element_by_class_name("base-search-card__title").text 8 | except: title 
= None 9 | try: company = post.find_element_by_class_name("base-search-card__subtitle").text 10 | except: company = None 11 | try: location = post.find_element_by_class_name("job-search-card__location").text 12 | except: location = None 13 | try: salary = post.find_element_by_class_name("job-search-card__salary-info").text 14 | except: salary = "Not disclosed" 15 | try: benefits = post.find_element_by_class_name("job-search-card__benefits").text 16 | except: benefits = None 17 | try: date = post.find_element_by_class_name("job-search-card__listdate").text 18 | except: date = None 19 | 20 | return [title, company, location, salary, benefits, date] 21 | 22 | def scrappi(job_title, n_pages): 23 | while n_pages <= 0: 24 | print("'n_pages' must be greater then 0") 25 | n_pages = int(input("Enter 'n_pages' greater then 0: ")) 26 | 27 | chrome_options = webdriver.ChromeOptions() 28 | chrome_options.add_argument('--headless') 29 | chrome_options.headless = True 30 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 31 | driver.create_options() 32 | 33 | driver.get("https://www.linkedin.com/jobs/search/?keywords="+job_title) 34 | sleep(4) 35 | 36 | for i in range(n_pages-1): 37 | driver.execute_script("window.scrollBy(0, document.body.scrollHeight);") 38 | sleep(4) 39 | try: driver.find_element_by_class_name("infinite-scroller__show-more-button").click() 40 | except: pass 41 | 42 | data = [] 43 | for post in driver.find_elements_by_xpath(".//ul[@class='jobs-search__results-list']/li"): 44 | data.append(func(post)) 45 | 46 | driver.close() 47 | return pd.DataFrame(data, columns=["Job Title", "Company Name", "Location", "Salary", "Benefits", "Date"]) -------------------------------------------------------------------------------- /src/wikipedia.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | usr_agent = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 9 | 'Accept-Encoding': 'none', 10 | 'Accept-Language': 'en-US,en;q=0.8', 11 | 'Connection': 'keep-alive', 12 | } 13 | 14 | def para(word): 15 | word=word.replace(' ', '_') 16 | # Specify url of the web page 17 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read() 18 | # Make a soup 19 | soup = BeautifulSoup(source,'lxml') 20 | 21 | # Extract the plain text content from paragraphs 22 | paras = [] 23 | for paragraph in soup.find_all('p'): 24 | paras.append(str(paragraph.text)) 25 | return paras 26 | 27 | def header(word): 28 | word=word.replace(' ', '_') 29 | # Specify url of the web page 30 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read() 31 | soup = BeautifulSoup(source, "lxml") 32 | 33 | # Extract text from paragraph headers 34 | heads = [] 35 | for head in soup.find_all('span', attrs={'mw-headline'}): 36 | heads.append(str(head.text)) 37 | return heads 38 | 39 | def text(word): 40 | word=word.replace(' ', '_') 41 | # Specify url of the web page 42 | source = urlopen('https://en.wikipedia.org/wiki/'+str(word)).read() 43 | soup = BeautifulSoup(source, "lxml") 44 | 45 | paras = [] 46 | for paragraph in soup.find_all('p'): 47 | paras.append(str(paragraph.text)) 48 | 49 | heads = [] 50 | for head in soup.find_all('span', 
attrs={'mw-headline'}): 51 | heads.append(str(head.text)) 52 | 53 | # Interleave paragraphs & headers 54 | text = [val for pair in zip(paras, heads) for val in pair] 55 | text = ' '.join(text) 56 | 57 | # Drop footnote superscripts in brackets 58 | text = re.sub(r"\[.*?\]+", '', text) 59 | 60 | # Replace '\n' (a new line) with '' and end the string at $1000. 61 | text = text.replace('\n', '')[:-11] 62 | return text -------------------------------------------------------------------------------- /src/twitter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | from selenium import webdriver 4 | from time import sleep 5 | 6 | def scrappi(hashtag, n_pages): 7 | chrome_options = webdriver.ChromeOptions() 8 | chrome_options.add_argument('--headless') 9 | chrome_options.headless = True 10 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 11 | driver.create_options() 12 | driver.get("https://twitter.com/search?q=%23"+hashtag.replace('#', '')) 13 | sleep(4) 14 | 15 | def twitter(card): 16 | data_lst = [] 17 | try: name = card.find_element_by_xpath('.//span').text 18 | except: name = None 19 | try: twitter_handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text 20 | except: twitter_handle = None 21 | try: post_time = card.find_element_by_xpath('.//time').get_attribute('datetime') 22 | except: post_time = None 23 | try: tweet = (card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text)+(card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text) 24 | except: tweet = None 25 | try: reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text 26 | except: reply_count = None 27 | try: retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text 28 | except: retweet_count = None 29 | try: like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text 30 | except: like_count = None 31 | 32 | data_lst.extend([name, twitter_handle, post_time, tweet, reply_count, retweet_count, like_count]) 33 | return data_lst 34 | 35 | new_ls = [] 36 | temp_set = set() 37 | for _ in range(n_pages): 38 | for card in driver.find_elements_by_xpath('//article[@data-testid="tweet"]'): 39 | ls = twitter(card) 40 | check = ''.join(ls) 41 | if check not in temp_set: 42 | new_ls.append(ls) 43 | temp_set.add(check) 44 | driver.execute_script('window.scrollBy(0, window.innerHeight*3);') 45 | sleep(4) 46 | 47 | driver.close() 48 | return pd.DataFrame(new_ls, columns = ["Name", "Twitter handle", "Post Time", "Tweet", "Reply Count", "Retweet Count", "Like Count"]) 49 | -------------------------------------------------------------------------------- /src/news.py: -------------------------------------------------------------------------------- 1 | try: 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | except: raise ImportError("'webdriver-manager' package not installed") 4 | try: 5 | from selenium.webdriver.common.keys import Keys 6 | from selenium import webdriver 7 | except: raise ImportError("'selenium' package not installed") 8 | from bs4 import BeautifulSoup 9 | import pandas as pd 10 | import time 11 | 12 | usr_agent = { 13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 15 | 'Accept-Charset': 
'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 16 | 'Accept-Encoding': 'none', 17 | 'Accept-Language': 'en-US,en;q=0.8', 18 | 'Connection': 'keep-alive', 19 | } 20 | 21 | def scrappi(n_pages, genre): 22 | if genre not in ['national', 'business', 'sports', 'world', 'politics', 'technology', 'startup', 'entertainment', 23 | 'miscellaneous', 'hatke', 'science', 'automobile']: 24 | raise ValueError("'genre' value not exists") 25 | 26 | chrome_options = webdriver.ChromeOptions() 27 | chrome_options.add_argument('--headless') 28 | chrome_options.headless = True 29 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 30 | driver.create_options() 31 | driver.get('https://inshorts.com/en/read/'+genre) 32 | 33 | for _ in range(n_pages): 34 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 35 | time.sleep(3) 36 | driver.find_element_by_id('load-more-btn').click() 37 | text_field = driver.find_element_by_id('load-more-btn') 38 | 39 | html = driver.page_source 40 | soup = BeautifulSoup(html, 'html.parser') 41 | main = soup.find_all('div', {"class": "news-card z-depth-1"}) 42 | 43 | lst = [] 44 | for details in main: 45 | dictionary={} 46 | dictionary['Headlines'] = (details.find('a', {"class": "clickable"}).text).replace('\n', '') 47 | dictionary['Time'] = details.find('span', {"class": "time"}).text 48 | date = details.find('div', {"class": "news-card-author-time news-card-author-time-in-title"}).find_all('span') 49 | dictionary['Date'] = date[3].text 50 | dictionary['News'] = details.find('div', {"itemprop": "articleBody"}).text 51 | lst.append(dictionary) 52 | 53 | return pd.DataFrame(lst) -------------------------------------------------------------------------------- /src/image.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import os 4 | 5 | usr_agent = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 9 | 'Accept-Encoding': 'none', 10 | 'Accept-Language': 'en-US,en;q=0.8', 11 | 'Connection': 'keep-alive', 12 | } 13 | 14 | def scrappi(data, n_images, img_format, folder_name): 15 | 16 | URL = ['https://www.bing.com/images/search?q=', 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&q=', 17 | 'https://images.search.yahoo.com/search/images?p='] 18 | 19 | def check(folder_name): 20 | try: 21 | os.mkdir(folder_name) 22 | return folder_name 23 | except: 24 | print("Folder Exist with that name!") 25 | folder_name = input("Enter a new Folder name: \n") 26 | try: 27 | os.mkdir(folder_name) 28 | return folder_name 29 | except: return check(folder_name) 30 | 31 | folder_name = check(folder_name) 32 | 33 | print('Starting to Download...') 34 | 35 | for i in URL: 36 | searchurl = i + str(data) 37 | response = requests.get(searchurl, headers = usr_agent) 38 | html = response.text 39 | soup = BeautifulSoup(html, 'html.parser') 40 | results = soup.findAll('img', limit = n_images) 41 | 42 | if len(results) != 0: 43 | for i, image in enumerate(results): 44 | try: image_link = image["data-srcset"] 45 | except: 46 | try: image_link = image["data-src"] 47 | except: 48 | try: image_link = image["data-fallback-src"] 49 | except: 50 | try: image_link = image["src"] 51 | except: pass 52 | try: 53 | r = requests.get(image_link).content 54 | 
try: r = str(r, 'utf-8') 55 | except UnicodeDecodeError: 56 | with open(f"{folder_name}/images{i+1}.{img_format}", "wb+") as f: f.write(r) 57 | except: pass 58 | 59 | return 'Successfully Downloaded ' + str(n_images) + ' images' -------------------------------------------------------------------------------- /src/snapdeal.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | usr_agent = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8', 8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 9 | 'Accept-Encoding': 'none', 10 | 'Accept-Language': 'en-US,en;q=0.8', 11 | 'Connection': 'keep-alive', 12 | } 13 | 14 | def scrappi(product_name, n_pages): 15 | 16 | snap = "https://www.snapdeal.com/search?keyword="+product_name 17 | url = requests.get(snap) 18 | soup = BeautifulSoup(url.text,"lxml") 19 | 20 | if n_pages == 0: 21 | print("Enter a valid number of Pages") 22 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: "))) 23 | 24 | initial_url = str(snap) 25 | lst_of_urls = [] 26 | for i in range(1, n_pages+1): 27 | x = initial_url + '&page=' + str(i) 28 | lst_of_urls.append(x) 29 | 30 | def Card_Style(soup): 31 | 32 | lst=[] 33 | cnt = soup.find_all("div", {"class": "product-tuple-description"}) 34 | for i in range(len(cnt)): 35 | 36 | try: name = cnt[i].find("p", {"class": "product-title"}).text 37 | except: name = "None" 38 | 39 | try: Price = cnt[i].find("span", {"class": "lfloat product-price"}).text 40 | except: Price = "None" 41 | 42 | try: original = cnt[i].find("span", {"class": "lfloat product-desc-price strike"}).text 43 | except: original = "None" 44 | 45 | try: rating = cnt[i].find("p",{"class":"product-rating-count"}).text 46 | except: rating = "None" 47 | 48 | lst.append([name, Price, original, rating]) 49 | 50 | return lst 51 | 52 | def snapdeal(soup): 53 | if len(soup.find_all("div",class_="product-tuple-description"))>=1: 54 | return Card_Style(soup) 55 | 56 | x = [] 57 | for i in lst_of_urls: 58 | url = requests.get(i) 59 | soup = BeautifulSoup(url.text,"lxml") 60 | abc = snapdeal(soup) 61 | for j in abc: x.append(j) 62 | 63 | return pd.DataFrame(x, columns =['Name', 'Price', 'Original Price', 'Number of Ratings']) -------------------------------------------------------------------------------- /src/imdb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import sleep 3 | from urllib.request import urlopen 4 | from bs4 import BeautifulSoup 5 | 6 | def scrappi(genre, n_pages): 7 | source = urlopen(f"https://www.imdb.com/search/title/?genres={genre}").read() 8 | data = { 9 | "title" : [], 10 | "year" : [], 11 | "certificate" : [], 12 | "runtime" : [], 13 | "genre" : [], 14 | "rating" : [], 15 | "description" : [], 16 | "stars" : [], 17 | "directors" : [], 18 | "votes" : [] 19 | } 20 | 21 | for i in range(n_pages): 22 | soup = BeautifulSoup(source,'lxml') 23 | cards = soup.find_all("div", {"class":"lister-item-content"}) 24 | for card in cards: 25 | try: data["title"].append(card.find("h3", {"class":"lister-item-header"}).find("a").text) 26 | except: data["title"].append(None) 27 | try: data["year"].append(card.find("h3", {"class":"lister-item-header"}).find_all("span")[-1].text[1:-1]) 28 | except: 
data["year"].append(None) 29 | try: data["certificate"].append(card.find("span", {"class":"certificate"}).text) 30 | except: data["certificate"].append(None) 31 | try: data["runtime"].append(card.find("span", {"class":"runtime"}).text) 32 | except: data["runtime"].append(None) 33 | try: data["genre"].append((card.find("span", {"class":"genre"}).text).strip()) 34 | except: data["genre"].append(None) 35 | try: data["rating"].append((card.find("div", {"class":"ratings-imdb-rating"}).text).strip()) 36 | except: data["rating"].append(None) 37 | try: data["description"].append((card.find_all("p", {"class":"text-muted"})[-1].text).strip()) 38 | except: data["description"].append(None) 39 | casts = card.find("p", {"class":""}).text.split("|") 40 | star, director = None, None 41 | for cast in casts: 42 | temp = cast.strip().replace("\n", "").replace(":", ",").split(",") 43 | if temp[0] in ["Star", "Stars"]: star = ', '.join(temp[1:]) 44 | elif temp[0] in ["Director", "Directors"]: director = ', '.join(temp[1:]) 45 | data["stars"].append(star) 46 | data["directors"].append(director) 47 | try: data["votes"].append(card.find("span", {"name":"nv"}).text) 48 | except: data["votes"].append(None) 49 | try: 50 | source = urlopen("https://www.imdb.com"+soup.find("a", {"class":"lister-page-next next-page"}).attrs['href']).read() 51 | except: 52 | break 53 | 54 | return pd.DataFrame(data) -------------------------------------------------------------------------------- /src/alibaba.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | usr_agent = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8', 8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 9 | 'Accept-Encoding': 'none', 10 | 'Accept-Language': 'en-US,en;q=0.8', 11 | 'Connection': 'keep-alive', 12 | } 13 | 14 | def scrappi(product_name, n_pages): 15 | ali = "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText="+product_name 16 | url = requests.get(ali) 17 | soup = BeautifulSoup(url.text,"lxml") 18 | 19 | if n_pages == 0: 20 | print("Enter a valid number of Pages") 21 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: "))) 22 | 23 | initial_url = str(ali) 24 | lst_of_urls = [] 25 | for i in range(1, n_pages+1): 26 | x = initial_url + '&page=' + str(i) 27 | lst_of_urls.append(x) 28 | 29 | def Card_Style(soup): 30 | 31 | lst=[] 32 | cnt = soup.find_all("div", {"class": "m-gallery-product-item-v2"}) 33 | for i in range(len(cnt)): 34 | 35 | try: name = cnt[i].find("p", {"class": "elements-title-normal__content"}).text 36 | except: name = "None" 37 | 38 | try: 39 | Price = cnt[i].find("p", {"class": "elements-offer-price-normal medium"})['title'] 40 | Price = '$' + str(Price).replace('$', '') 41 | except: Price = "None" 42 | 43 | try: n_item = cnt[i].find("span", {"class": "element-offer-minorder-normal__value"}).text 44 | except: n_item = "None" 45 | 46 | try: Description = cnt[i].find("div", {"class": "offer-tag-list"}).text 47 | except: Description = "None" 48 | 49 | try: rating = cnt[i].find("span",{"class":"seb-supplier-review__score"}).text 50 | except: rating = "None" 51 | 52 | lst.append([name, Price, n_item, Description, rating]) 53 | 54 | return lst 55 | 56 | def alibaba(soup): 57 | if 
len(soup.find_all("div",class_="m-gallery-product-item-v2"))>=1: return Card_Style(soup) 58 | 59 | x = [] 60 | for i in lst_of_urls: 61 | url = requests.get(i) 62 | soup = BeautifulSoup(url.text,"lxml") 63 | abc = alibaba(soup) 64 | if abc: 65 | for j in abc: x.append(j) 66 | 67 | return pd.DataFrame(x, columns =['Name', 'Price', 'Number of Items', 'Description', 'Ratings']) -------------------------------------------------------------------------------- /src/amazon.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import sleep 3 | from webdriver_manager.chrome import ChromeDriverManager 4 | from selenium import webdriver 5 | 6 | def func(cards): 7 | data = [] 8 | for card in cards: 9 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div[3]") 10 | except: 11 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div[2]") 12 | except: 13 | try: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div/div[3]") 14 | except: info = card.find_element_by_class_name("s-card-container").find_element_by_xpath("./div/div/div[2]") 15 | try: description = info.find_element_by_xpath("./div[1]/h2").text 16 | except: description = None 17 | try: rating = info.find_element_by_xpath("./div[2]/div/span").get_attribute("aria-label") 18 | except: rating = None 19 | try: votes = info.find_elements_by_xpath("./div[2]/div/span")[1].text 20 | except: votes = None 21 | try: offer_price = info.find_element_by_class_name("a-price").text.replace("\n", ".") 22 | except: offer_price = None 23 | try: actual_price = info.find_element_by_class_name("a-price").find_element_by_xpath("..//span[@data-a-strike='true']").text 24 | except: actual_price = offer_price 25 | 26 | data.append([description, rating, votes, offer_price, actual_price]) 27 | 28 | return data 29 | 30 | def scrappi(product_name, n_pages): 31 | chrome_options = webdriver.ChromeOptions() 32 | chrome_options.add_argument('--headless') 33 | chrome_options.headless = True 34 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 35 | driver.create_options() 36 | 37 | url = "https://www.amazon.com/s?k="+product_name 38 | driver.get(url) 39 | sleep(4) 40 | 41 | cards = driver.find_elements_by_xpath('//div[@data-component-type="s-search-result"]') 42 | while len(cards) == 0: 43 | driver.get(url) 44 | sleep(4) 45 | 46 | max_pages = int(driver.find_element_by_xpath(".//span[@class='s-pagination-strip']/span[last()]").text) 47 | while n_pages > max_pages or n_pages == 0: 48 | print(f"Please Enter a Valid Number of Pages Between 1 to {max_pages}:") 49 | n_pages = int(input()) 50 | 51 | data = [] 52 | 53 | while n_pages > 0: 54 | n_pages -= 1 55 | data.extend(func(driver.find_elements_by_xpath('//div[@data-component-type="s-search-result"]'))) 56 | driver.find_element_by_class_name("s-pagination-next").click() 57 | sleep(4) 58 | 59 | driver.close() 60 | return pd.DataFrame(data, columns=["Description", "Rating", "Votes", "Offer Price", "Actual Price"]) -------------------------------------------------------------------------------- /src/instagram.py: -------------------------------------------------------------------------------- 1 | try: 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | except: raise ImportError("'webdriver-manager' package not installed") 4 | try: 5 | from selenium.webdriver.common.keys import 
Keys 6 | from selenium import webdriver 7 | except: raise ImportError("'selenium' package not installed") 8 | from bs4 import BeautifulSoup 9 | import pandas as pd 10 | import time 11 | 12 | usr_agent = { 13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 15 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 16 | 'Accept-Encoding': 'none', 17 | 'Accept-Language': 'en-US,en;q=0.8', 18 | 'Connection': 'keep-alive', 19 | } 20 | 21 | 22 | def account(insta_handle, n_pages): 23 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install()) 24 | driver.get("https://www.instagram.com/"+insta_handle+"/") 25 | 26 | for _ in range(n_pages): 27 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 28 | time.sleep(3) 29 | 30 | html = driver.page_source 31 | soup = BeautifulSoup(html, 'html.parser') 32 | main = soup.find_all('section', {"class": "zwlfE"}) 33 | main2 = soup.find_all('div', {"class": "_2z6nI"}) 34 | 35 | for details in main: 36 | title = details.find('h1', {"class": "rhpdm"}) 37 | info = details.find_all('span', {"class": "g47SY"}) 38 | data = details.find('div', {"class": "-vDIg"}).find_all('span') 39 | posts = info[0].text 40 | followers = info[1].text 41 | following = info[2].text 42 | if(data): bio = data[0].text 43 | print("Name: ", title.text) 44 | print("Number of Posts: ", posts) 45 | print("Followers: ", followers) 46 | print("Following: ", following) 47 | if(data): print("Bio: ", bio) 48 | else: print("Bio: None") 49 | print() 50 | break 51 | 52 | post_url = [] 53 | for i in main2: 54 | url = i.find_all('div', {"class": "v1Nh3 kIKUG _bz0w"}) 55 | for x in url: 56 | u = 'https://www.instagram.com/'+x.a['href'] 57 | post_url.append(u) 58 | 59 | def func(a): 60 | driver.get(a) 61 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 62 | time.sleep(3) 63 | html = driver.page_source 64 | soup = BeautifulSoup(html, "html.parser") 65 | main = soup.find_all('div', {"class": "C4VMK"}) 66 | for details in main: 67 | title = details.find('span', {"class": ""}) 68 | return title.text 69 | 70 | caption =[] 71 | for i in post_url: 72 | caption.append(func(i)) 73 | 74 | driver.close() 75 | return pd.DataFrame({'Captions': caption}) 76 | 77 | def hashtag(hashtag, n_posts): 78 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install()) 79 | hashtag = hashtag.replace('#', '') 80 | driver.get("https://www.instagram.com/explore/tags/"+hashtag+"/") 81 | 82 | for _ in range(int(n_posts//3)): 83 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 84 | time.sleep(3) 85 | 86 | html = driver.page_source 87 | soup = BeautifulSoup(html, 'html.parser') 88 | main = soup.find_all('div', {"class": "EZdmt"}) 89 | 90 | post_url = [] 91 | for i in main: 92 | url = i.find_all('div', {"class": "v1Nh3 kIKUG _bz0w"}) 93 | for x in url: 94 | u = 'https://www.instagram.com'+x.a['href'] 95 | post_url.append(u) 96 | 97 | def func(a): 98 | driver.get(a) 99 | driver.find_element_by_tag_name('body').send_keys(Keys.END) 100 | time.sleep(3) 101 | html = driver.page_source 102 | soup = BeautifulSoup(html, "html.parser") 103 | main = soup.find_all('div', {"class": "C4VMK"}) 104 | for details in main: 105 | title = details.find('span', {"class": ""}) 106 | return title.text 107 | 108 | caption =[] 109 | for i in post_url: 110 | caption.append(func(i)) 111 | 112 | driver.close() 113 | return 
pd.DataFrame({'Captions': caption}) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
6 | 7 | ## PyScrappy: powerful Python data scraping toolkit 8 | 9 | [![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/) 10 | 11 | [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 12 | [![PyPI Latest Release](https://img.shields.io/pypi/v/PyScrappy.svg)](https://pypi.org/project/PyScrappy/) 13 | 14 | [![Package Status](https://img.shields.io/pypi/status/PyScrappy.svg)](https://pypi.org/project/PyScrappy/) 15 | [![License](https://img.shields.io/pypi/l/PyScrappy.svg)](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE) 16 | ![](https://img.shields.io/pypi/dm/PyScrappy) 17 | 18 | ![](https://komarev.com/ghpvc/?username=mldsveda&style=flat-square) 19 | ![stars](https://img.shields.io/github/stars/mldsveda/PyScrappy?style=social) 20 | ![forks](https://img.shields.io/github/forks/mldsveda/PyScrappy?style=social) 21 | 22 | [![](https://img.shields.io/badge/pyscrappy-official%20documentation-blue)](https://pyscrappy.netlify.app/) 23 | 24 | ## What is it? 25 | 26 | **PyScrappy** is a Python package that provides a fast, flexible, and exhaustive way to scrape data from various sources. Being an 27 | easy and intuitive library, it aims to be the fundamental high-level building block for scraping **data** in Python. Additionally, it has the broader goal of becoming **the most powerful and flexible open source data scraping tool available**. 28 | 29 | ## Main Features 30 | 31 | Here are just a few of the things that PyScrappy does well: 32 | 33 | - Easy scraping of [**Data**](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b) available on the internet 34 | - Returns a [**DataFrame**](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) for further analysis and research purposes. 35 | - Automatic [**Data Scraping**](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b): other than a few user input parameters, the whole process of scraping the data is automatic. 36 | - Powerful and flexible scraping with minimal configuration 37 | 38 | ## Where to get it 39 | 40 | The source code is currently hosted on GitHub at: 41 | https://github.com/mldsveda/PyScrappy 42 | 43 | Binary installers for the latest released version are available at the [Python 44 | Package Index (PyPI)](https://pypi.org/project/PyScrappy/). 45 | 46 | ```sh 47 | pip install PyScrappy 48 | ``` 49 | 50 | ## Dependencies 51 | 52 | - [selenium](https://www.selenium.dev/) - Selenium is a free (open-source) automated testing framework used to validate web applications across different browsers and platforms. 53 | - [webdriver-manager](https://github.com/bonigarcia/webdrivermanager) - WebDriverManager is an API that automates the handling of the driver executables (chromedriver.exe, geckodriver.exe, etc.) required by the Selenium WebDriver API. 54 | - [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - Beautiful Soup is a Python library for getting data out of HTML, XML, and other markup languages. 55 | - [pandas](https://pandas.pydata.org/) - Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. 
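
## Quick start

A minimal usage sketch, based on the scrapper docstrings in `src/PyScrappy.py` (the product name and page count below are placeholders, so swap in whatever you want to scrape):

```python
import PyScrappy

# The e-commerce scrappers live on the ECommerceScrapper class
obj = PyScrappy.ECommerceScrapper()

# Scrape the first two Flipkart result pages for a search term;
# the scrapper returns a pandas DataFrame
df = obj.flipkart_scrapper("laptop", 2)
print(df.head())
```

Most scrappers follow the same pattern: create the corresponding scrapper object (or call the module-level function), pass a search term plus the number of pages, and work with the returned DataFrame.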
56 | 57 | ## License 58 | 59 | [MIT](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE) 60 | 61 | ## Getting Help 62 | 63 | For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pyscrappy). 64 | Further, general questions and discussions can also take place on GitHub in this [repository](https://github.com/mldsveda/PyScrappy). 65 | 66 | ## Discussion and Development 67 | 68 | Most development discussions take place on GitHub in this [repository](https://github.com/mldsveda/PyScrappy). 69 | 70 | Also visit the official documentation of [PyScrappy](https://pyscrappy.netlify.app/) for more information. 71 | 72 | ## Contributing to PyScrappy 73 | 74 | All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. 75 | 76 | If you are simply looking to start working with the PyScrappy codebase, navigate to the GitHub ["issues"](https://github.com/mldsveda/PyScrappy/issues) tab and start looking through interesting issues. 77 | 78 | ## End Notes 79 | 80 | _Learn More about this package on [Medium](https://medium.com/analytics-vidhya/web-scraping-in-python-using-the-all-new-pyscrappy-5c136ed6906b)._ 81 | 82 | ### **_This package is solely made for educational and research purposes._** 83 | -------------------------------------------------------------------------------- /src/flipkart.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | usr_agent = { 6 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 7 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 8 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 9 | 'Accept-Encoding': 'none', 10 | 'Accept-Language': 'en-US,en;q=0.8', 11 | 'Connection': 'keep-alive', 12 | } 13 | 14 | def scrappi(product_name, n_pages): 15 | 16 | flip = "https://www.flipkart.com/search?q="+product_name 17 | url = requests.get(flip) 18 | soup = BeautifulSoup(url.text,"lxml") 19 | 20 | if n_pages == 0: 21 | Page = soup.find("div",class_="_2MImiq").find("span", class_="").text 22 | c = Page.split() 23 | i=((c[3].replace(',',''))) 24 | print("Enter valid number of Pages between 1 and {}".format(i)) 25 | return scrappi(product_name, n_pages=int(input("Enter a Page Number: "))) 26 | 27 | initial_url = str(flip) 28 | lst_of_urls = [] 29 | for i in range(1, n_pages+1): 30 | x = initial_url + '&page=' + str(i) 31 | lst_of_urls.append(x) 32 | 33 | def rectangle(soup): 34 | 35 | lst = [] 36 | cnt = soup.find_all("div", {"class": "_2kHMtA"}) 37 | for i in range(len(cnt)): 38 | 39 | try: name = cnt[i].find("div", {"class": "_4rR01T"}).text 40 | except: name = "None" 41 | 42 | try: Price = cnt[i].find("div", {"class":"_30jeq3 _1_WHN1"}).text 43 | except: Price = "None" 44 | 45 | try: 46 | Priceo = cnt[i].find("div", {"class": "_3I9_wc _27UcVY"}).text.split() 47 | oprice = Priceo[0] 48 | except: oprice = "None" 49 | 50 | try: Description = cnt[i].find("li", {"class": "rgWa7D"}).text 51 | except: Description = "None" 52 | 53 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text 54 | except: rating="None" 55 | 56 | lst.append([name, Price, oprice, Description, rating]) 57 | 58 | return lst 59 | 60 | def Card_Style(soup): 61 | 62 | lst=[] 63 | cnt = soup.find_all("div", {"class": "_4ddWXP"}) 64 | for i in range(len(cnt)): 65 | 66 | try: name = cnt[i].find("a", 
{"class": "s1Q9rs"}).text 67 | except: name = "None" 68 | 69 | try: Price = cnt[i].find("div", {"class": "_30jeq3"}).text 70 | except: Price = "None" 71 | 72 | try: 73 | Priceo = cnt[i].find("div", {"class": "_3I9_wc"}).text.split() 74 | oprice = Priceo[0] 75 | 76 | except: oprice = "None" 77 | 78 | try: Description = cnt[i].find("div", {"class": "_3Djpdu"}).text 79 | except: Description = "None" 80 | 81 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text 82 | except: rating="None" 83 | 84 | lst.append([name, Price, oprice, Description, rating]) 85 | 86 | return lst 87 | 88 | def OtherStyle(soup): 89 | 90 | lst=[] 91 | cnt = soup.find_all("div", {"class": "_1xHGtK _373qXS"}) 92 | for i in range(len(cnt)): 93 | 94 | try: name = cnt[i].find("div", {"class": "_2WkVRV"}).text 95 | except: name = "None" 96 | 97 | try: Price = cnt[i].find("div", {"class": "_30jeq3"}).text 98 | except: Price = "None" 99 | 100 | try: 101 | Priceo = cnt[i].find("div", {"class": "_3I9_wc"}).text.split() 102 | oprice = Priceo[0] 103 | except: oprice = "None" 104 | 105 | try: Description = cnt[i].find("a", {"class": "IRpwTa"}).text 106 | except: Description = "None" 107 | 108 | try: rating=cnt[i].find("div",{"class":"_3LWZlK"}).text 109 | except: rating="None" 110 | 111 | lst.append([name, Price, oprice, Description, rating]) 112 | 113 | return lst 114 | 115 | def flipkart(soup): 116 | if len(soup.find_all("div",class_="_4ddWXP"))>=1: 117 | return Card_Style(soup) 118 | elif len(soup.find_all("div",class_="_2kHMtA"))>=1: 119 | return rectangle(soup) 120 | elif len(soup.find_all("div", {"class": "_1xHGtK _373qXS"}))>=1: 121 | return OtherStyle(soup) 122 | 123 | x = [] 124 | for i in lst_of_urls: 125 | url = requests.get(i) 126 | soup = BeautifulSoup(url.text,"lxml") 127 | abc = flipkart(soup) 128 | if abc: 129 | for j in abc: x.append(j) 130 | 131 | return pd.DataFrame(x, columns =['Name', 'Price', 'Original Price', 'Description', 'Rating']) -------------------------------------------------------------------------------- /src/stock.py: -------------------------------------------------------------------------------- 1 | from webdriver_manager.chrome import ChromeDriverManager 2 | from selenium import webdriver 3 | from time import sleep 4 | import pandas as pd 5 | 6 | ########## Stock Analysis ########## 7 | def stock_analysis(stock_code, analysis_type): 8 | analysis = { 9 | "earning estimate" : 0, 10 | "revenue estimate" : 1, 11 | "earning history" : 2, 12 | "EPS trend" : 3, 13 | "EPS revision" : 4, 14 | "growth estimate" : 5 15 | } 16 | if analysis_type not in analysis.keys(): raise KeyError("Invalid value for 'analysis_type'") 17 | 18 | chrome_options = webdriver.ChromeOptions() 19 | chrome_options.add_argument('--headless') 20 | chrome_options.headless = True 21 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 22 | driver.create_options() 23 | driver.get("https://in.finance.yahoo.com/quote/"+stock_code+"/analysis?p="+stock_code) 24 | 25 | try: tables = driver.find_elements_by_xpath('.//section[@data-test="qsp-analyst"]/table') 26 | except: raise KeyError("Invalid value for 'stock_code'") 27 | 28 | 29 | try: df = pd.DataFrame([[data.text for data in row.find_elements_by_xpath('td')] for row in tables[analysis[analysis_type]].find_elements_by_xpath('.//tbody/tr')], columns = [head.text for head in tables[analysis[analysis_type]].find_elements_by_tag_name('th')]) 30 | except: 31 | print("Analysis report not Available") 32 | return None 33 | return df 34 
| 35 | ########## Stock Profile ########## 36 | def stock_profile(stock_code): 37 | chrome_options = webdriver.ChromeOptions() 38 | chrome_options.add_argument('--headless') 39 | chrome_options.headless = True 40 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 41 | driver.create_options() 42 | try: 43 | url = "https://in.finance.yahoo.com/quote/"+stock_code+"/profile?p="+stock_code 44 | driver.get(url) 45 | sleep(3) 46 | 47 | card = driver.find_element_by_xpath('.//div[@id="Main"]') 48 | executives = pd.DataFrame([[data.text for data in row.find_elements_by_xpath('td')] for row in card.find_elements_by_xpath('.//tbody/tr')], columns = [i.text for i in card.find_elements_by_tag_name('th')]) 49 | description = card.find_element_by_xpath('.//section/section[2]/p').text 50 | pro_card = card.find_element_by_xpath('.//div[@data-test="qsp-profile"]') 51 | profile = {} 52 | profile["Company Name"] = pro_card.find_element_by_xpath('.//h3').text 53 | profile["Headquater"] = ', '.join(pro_card.find_element_by_xpath('.//p').text.split('\n')[:3]) 54 | profile["Sector"], profile["Industry"], profile["Employees"] = [i.split(': ')[1] for i in pro_card.find_element_by_xpath('.//p[2]').text.split('\n')] 55 | except: 56 | print("Profile details not Available or the 'stock_code' is Invalid") 57 | return None, None, None 58 | return [profile, description, executives] 59 | 60 | 61 | ########## Historical Data ########## 62 | def stock_history(stock_code, stock_range, stock_frequency): 63 | chrome_options = webdriver.ChromeOptions() 64 | chrome_options.add_argument('--headless') 65 | chrome_options.headless = True 66 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options = chrome_options) 67 | driver.create_options() 68 | 69 | try: 70 | url = "https://in.finance.yahoo.com/quote/"+stock_code+"/history?p="+stock_code 71 | driver.get(url) 72 | sleep(3) 73 | 74 | card = driver.find_element_by_xpath('//section[@data-test="qsp-historical"]/div[1]') 75 | time_period = card.find_element_by_xpath('.//div[1]/div[@data-test="dropdown"]') 76 | time_period.click() 77 | sleep(3) 78 | 79 | if stock_range == "1d": time_period.find_element_by_xpath('.//button[@data-value="1_D"]').click() 80 | elif stock_range == "5d":time_period.find_element_by_xpath('.//button[@data-value="5_D"]').click() 81 | elif stock_range == "3m":time_period.find_element_by_xpath('.//button[@data-value="3_M"]').click() 82 | elif stock_range == "6m":time_period.find_element_by_xpath('.//button[@data-value="6_M"]').click() 83 | elif stock_range == "1y":time_period.find_element_by_xpath('.//button[@data-value="1_Y"]').click() 84 | elif stock_range == "5y":time_period.find_element_by_xpath('.//button[@data-value="5_Y"]').click() 85 | elif stock_range == "max":time_period.find_element_by_xpath('.//button[@data-value="MAX"]').click() 86 | else: raise KeyError("Invalid value passed for 'stock_range'") 87 | sleep(3) 88 | 89 | frequency = card.find_element_by_xpath('.//span[@data-test="historicalFrequency-selected"]') 90 | frequency.click() 91 | sleep(3) 92 | 93 | if stock_frequency == "Daily": card.find_element_by_xpath('.//div[@data-value="1d"]').click() 94 | elif stock_frequency == "Weekly": card.find_element_by_xpath('.//div[@data-value="1wk"]').click() 95 | elif stock_frequency == "Monthly": card.find_element_by_xpath('.//div[@data-value="1mo"]').click() 96 | else: raise KeyError("Invalid value passed for 'stock_frequency'") 97 | sleep(3) 98 | 99 | 
card.find_element_by_xpath('.//div[1]/button[@data-reactid="25"]').click() 100 | sleep(3) 101 | except: 102 | print("Historical Data Not Available or the 'stock_code' is Invalid") 103 | return 104 | 105 | try: 106 | link = driver.find_elements_by_link_text('Download')[0].get_attribute('href') 107 | driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install()) 108 | driver.get(link) 109 | sleep(5) 110 | print("CSV Downloaded Successfully") 111 | driver.close() 112 | except: 113 | print("CSV file not Available or the 'stock_code' is Invalid") 114 | return -------------------------------------------------------------------------------- /src/PyScrappy.py: -------------------------------------------------------------------------------- 1 | ############## ECommereceScrapper ############## 2 | class ECommerceScrapper(): 3 | 4 | """ 5 | 6 | ECommerece Scrapper: Helps in scrapping data from E-Comm websites 7 | 1. Alibaba 8 | 2. Amazon 9 | 3. Flipkart 10 | 4. Snapdeal 11 | 12 | Type: class 13 | 14 | Note 15 | ------ 16 | Create an object of this class to procced further. 17 | 18 | Example 19 | --------- 20 | >>> obj = PyScrappy.ECommerceScrapper() 21 | 22 | """ 23 | 24 | ############## Alibaba Scrapper ############## 25 | def alibaba_scrapper(self, product_name, n_pages): 26 | 27 | """ 28 | 29 | Alibaba Scrapper: Helps in scrapping Alibaba data ('Name', 'Number of Items', 'Description', 'Ratings'). 30 | return type: DataFrame 31 | 32 | Parameters 33 | ------------ 34 | product_name: Enter the name of desired product 35 | Type: str 36 | 37 | n_pages: Enter the number of pages that you want to scrape 38 | Type: int 39 | 40 | Note 41 | ------ 42 | Both the arguments are a compulsion. 43 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run. 44 | 45 | Example 46 | --------- 47 | >>> obj.alibabal_scrapper('product', 3) 48 | out: Name Number of Items Description Ratings 49 | abc 440 product a 3.5 50 | aec 240 product b 4.5 51 | 52 | """ 53 | 54 | import alibaba 55 | return alibaba.scrappi(product_name, n_pages) 56 | 57 | 58 | ############## Amazon Scrapper ############## 59 | def amazon_scrapper(self, product_name, n_pages): 60 | 61 | """ 62 | 63 | Amazon Scrapper: Helps in scrapping amazon data ('Description', 'Rating', 'Votes', 'Offer Price', 'Actual Price'). 64 | return type: DataFrame 65 | 66 | Parameters 67 | ------------ 68 | product_name: Enter the name of desired product 69 | Type: str 70 | 71 | n_pages: Enter the number of pages that you want to scrape 72 | Type: int 73 | 74 | Note 75 | ------ 76 | Both the arguments are a compulsion. 77 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run. 78 | 79 | Example 80 | --------- 81 | >>> obj.amazon_scrapper('product', 3) 82 | out: Name Number of Items Description Ratings 83 | abc 440 product a 3.5 84 | aec 240 product b 4.5 85 | 86 | """ 87 | 88 | import amazon 89 | return amazon.scrappi(product_name, n_pages) 90 | 91 | 92 | ############## Flipkart Scrapper ############## 93 | def flipkart_scrapper(self, product_name, n_pages): 94 | 95 | """ 96 | 97 | Flipkart Scrapper: Helps in scrapping Flikart data ('Name', 'Price', 'Original Price', 'Description', 'Rating'). 
98 | return type: DataFrame 99 | 100 | Parameters 101 | ------------ 102 | product_name: Enter the name of the desired product, which you want to scrape the data of 103 | Type: str 104 | 105 | n_pages: Enter the number of pages that you want to scrape 106 | Type: int 107 | 108 | Note 109 | ------ 110 | Both the arguments are a compulsion. 111 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run. 112 | 113 | Example 114 | --------- 115 | >>> obj.flipkart_scrapper("Product Name", 3) 116 | out: Name Price Original Price Description Rating 117 | abc ₹340 ₹440 Product 4.2 118 | aec ₹140 ₹240 Product 4.7 119 | 120 | """ 121 | 122 | import flipkart 123 | return flipkart.scrappi(product_name, n_pages) 124 | 125 | 126 | ############## Snapdeal Scrapper ############## 127 | def snapdeal_scrapper(self, product_name, n_pages): 128 | 129 | """ 130 | 131 | Snapdeal Scrapper: Helps in scrapping Snapdeal data ('Name', 'Price', 'Original Price', 'Number of Ratings'). 132 | return type: DataFrame 133 | 134 | Parameters 135 | ------------ 136 | product_name: Enter the name of the desired product, which you want to scrape the data of 137 | Type: str 138 | 139 | n_pages: Enter the number of pages that you want to scrape 140 | Type: int 141 | 142 | Note 143 | ------ 144 | Both the arguments are a compulsion. 145 | If n_pages == 0: A prompt will ask you to enter a valid page number and the scrapper will re-run. 146 | 147 | Example 148 | --------- 149 | >>> obj.snapdeal_scrapper('product', 3) 150 | out: Name Price Original Price Number of Ratings 151 | abc ₹340 ₹440 40 152 | aec ₹140 ₹240 34 153 | 154 | """ 155 | 156 | import snapdeal 157 | return snapdeal.scrappi(product_name, n_pages) 158 | 159 | ######################################################################################################################## 160 | 161 | ############## FoodScrapper ############## 162 | class FoodScrapper(): 163 | 164 | """ 165 | 166 | Food Scrapper: Helps in scrapping data from food delivery websites 167 | 1. Swiggy 168 | 2. Zomato 169 | 170 | Type: class 171 | 172 | Note 173 | ------ 174 | Create an object of this class to procced further. 175 | 176 | Example 177 | --------- 178 | >>> obj = PyScrappy.FoodScrapper() 179 | 180 | """ 181 | 182 | ############## Swiggy Scrapper ############## 183 | def swiggy_scrapper(self, city, n_pages): 184 | 185 | """ 186 | 187 | Swiggy Scrapper: Helps in scrapping swiggy data ('Name', 'Cuisine', 'Price', 'Rating'). 188 | return type: DataFrame 189 | 190 | Parameters 191 | ------------ 192 | city: Enter the name of desired city were swiggy delivers 193 | Type: str 194 | 195 | n_pages: Enter the number of pages that you want to scrape 196 | Type: int 197 | 198 | Note 199 | ------ 200 | Both the arguments are a compulsion. 201 | 202 | Example 203 | --------- 204 | >>> obj.swiggy_scrapper('city', 3) 205 | out: Name Cuisine Price Rating 206 | abc indian ₹123 for two 4.5 207 | xyz indian ₹342 for two 4.3 208 | 209 | """ 210 | 211 | import swiggy 212 | return swiggy.scrappi(city, n_pages) 213 | 214 | 215 | ############## Zomato Scrapper ############## 216 | def zomato_scrapper(self, city, n_pages): 217 | 218 | """ 219 | 220 | Zomato Scrapper: Helps in scrapping zomato data ("Name", "Cusine", "Price", "Rating", "Delivery Time", "Review counts"). 
221 |         return type: DataFrame
222 |
223 |         Parameters
224 |         ------------
225 |         city: Enter the name of the desired city where Zomato delivers
226 |              Type: str
227 |
228 |         n_pages: Enter the number of pages that you want to scrape
229 |              Type: int
230 |
231 |         Note
232 |         ------
233 |         Both arguments are compulsory.
234 |
235 |         Example
236 |         ---------
237 |         >>> obj.zomato_scrapper('city', 3)
238 |         out: Name   Cuisine   Price          Rating   Delivery Time   Review Count
239 |              abc    indian    ₹123 for two   4.5      45min           3.4K
240 |              xyz    indian    ₹342 for two   4.3      40min           1.2K
241 |
242 |         """
243 |
244 |         import zomato
245 |         return zomato.scrappi(city, n_pages)
246 |
247 | ########################################################################################################################
248 |
249 | ############## Image Scrapper ##############
250 | def image_scrapper(data_name, n_images=10, img_format='jpg', folder_name='images'):
251 |
252 |     """
253 |
254 |     Image Scrapper: Helps in scraping images from "Google", "Yahoo", "Bing".
255 |     Downloads them to the desired folder.
256 |
257 |     Parameters
258 |     ------------
259 |     data_name: Enter the name of the object/item whose images you want to scrape/download
260 |          Type: str
261 |
262 |     n_images: Enter the number of images you want to scrape or download
263 |          Type: int
264 |          Default: 10
265 |
266 |     img_format: Enter the format of the image file
267 |          Type: str
268 |          Default: 'jpg'
269 |          Accepted Values: 'jpg', 'jpeg', 'png', 'gif'
270 |
271 |     folder_name: Enter the path/folder name where you want to download the images
272 |          Type: str
273 |          Default: 'images'
274 |
275 |     Note
276 |     ------
277 |     Make sure the data_name is a valid name, and if you enter a directory make sure it is a valid one.
278 |     The scrapper will take some time to work. Wait for the images to get scraped and downloaded, as it scrapes from all three search engines: Google, Yahoo and Bing.
279 |
280 |     Feel free to experiment with different image formats.
281 |
282 |     Example
283 |     ---------
284 |     >>> image_scrapper('Apple', 100, 'png', 'Apples')
285 |     out: Starting to download...
286 |          Successfully downloaded 100 images
287 |
288 |     """
289 |
290 |     import image
291 |     return image.scrappi(data_name, n_images, img_format, folder_name)
292 |
293 | ########################################################################################################################
294 |
295 | ############## IMDB Scrapper ##############
296 | def imdb_scrapper(genre, n_pages):
297 |
298 |     """
299 |
300 |     IMDB Scrapper: Helps in scraping movies from IMDB.
301 |     return type: DataFrame
302 |
303 |     Parameters
304 |     ------------
305 |     genre: Enter the genre of the movie
306 |          Type: str
307 |
308 |     n_pages: Enter the number of pages that it will scrape at a single run.
309 |          Type: int
310 |
311 |     Note
312 |     ------
313 |     Both parameters are compulsory.
314 |
315 |     Example
316 |     ---------
317 |     >>> imdb_scrapper('action', 4)
318 |     out: Title   Year   Certificate   Runtime   Genre    Rating   Description   Stars   Directors   Votes
319 |          asd     2022   UA            49min     action   3.9      about the..   asd     dfgv        23
320 |          scr     2022   15+           89min     action   4.9      about the..   add     dfgv        23
321 |     """
322 |
323 |     import imdb
324 |     return imdb.scrappi(genre, n_pages)
325 |
326 | ########################################################################################################################
327 |
328 | ############## LinkedIn Scrapper ##############
329 | def linkedin_scrapper(job_title, n_pages):
330 |
331 |     """
332 |
333 |     LinkedIn Scrapper: Helps in scraping job-related data from LinkedIn (Job Title, Company Name, Location, Salary, Benefits, Date)
334 |     return type: DataFrame
335 |
336 |     Parameters
337 |     ------------
338 |     job_title: Enter the job title or type.
339 |          Type: str
340 |
341 |     n_pages: Enter the number of pages that it will scrape at a single run.
342 |          Type: int
343 |
344 |     Note
345 |     ------
346 |     Both parameters are compulsory.
347 |
348 |     Example
349 |     ---------
350 |     >>> linkedin_scrapper('python', 1)
351 |     out: Job Title   Company Name   Location   Salary   Benefits             Date
352 |          abc         PyScrappy      US         2300     Actively Hiring +1   1 day ago
353 |          abc         PyScrappy      US         2300     Actively Hiring +1   1 day ago
354 |          ...
355 |          ..
356 |
357 |     """
358 |
359 |     import linkedin
360 |     return linkedin.scrappi(job_title, n_pages)
361 |
362 | ########################################################################################################################
363 |
364 | ############## News Scrapper ##############
365 | def news_scrapper(n_pages, genre = str()):
366 |
367 |     """
368 |
369 |     News Scrapper: Helps in scraping news (Headlines, Time, Date, News)
370 |     return type: DataFrame
371 |
372 |     Parameters
373 |     ------------
374 |     n_pages: Enter the number of pages that it will scrape at a single run.
375 |          Type: int
376 |
377 |     genre: Enter the news genre
378 |          Type: str
379 |          Default: str() (None)
380 |          Values accepted:
381 |              'national', 'business', 'sports', 'world', 'politics', 'technology', 'startup', 'entertainment',
382 |              'miscellaneous', 'hatke', 'science', 'automobile'
383 |
384 |     Note
385 |     ------
386 |     n_pages is compulsory.
387 |
388 |     Example
389 |     ---------
390 |     >>> news_scrapper(3, 'hatke')
391 |     out: Headlines      Time       Date                    News
392 |          New Package    08:19 pm   25 Jun 2021,Sunday      PyScrappy is a new package...
393 |          New Scrapper   08:19 am   25 Jun 2020,Wednesday   PyScrappy is a new Scrapper...
394 |
395 |     """
396 |
397 |     import news
398 |     return news.scrappi(n_pages, genre = genre)
399 |
400 | ########################################################################################################################
401 |
402 | ############## Social Media Scrapper ##############
403 | class SocialMediaScrapper():
404 |
405 |     """
406 |
407 |     Social Media Scrapper: Helps in scraping data from social media platforms
408 |     1. Instagram
409 |     2. Twitter
410 |     3. YouTube
411 |
412 |     Type: class
413 |
414 |     Note
415 |     ------
416 |     Create an object of this class to proceed further.
417 |
418 |     Example
419 |     ---------
420 |     >>> obj = PyScrappy.SocialMediaScrapper()
421 |
422 |     """
423 |
424 |     ############## Instagram Scrapper ##############
425 |     class InstagramScrapper():
426 |
427 |         """
428 |
429 |         Instagram Scrapper: Helps in scraping Instagram data (name, posts, followers, following, bio, captions)
430 |         1. Details and post captions based on Insta handle
431 |         2. Post captions based on #hashtags
432 |
433 |         Type: class
434 |
435 |         Note
436 |         ------
437 |         Create an object of this class to proceed further.
438 |
439 |         Example
440 |         ---------
441 |         >>> obj2 = obj.InstagramScrapper()
442 |
443 |         """
444 |
445 |         ############## Instagram account Scrapper ##############
446 |         def account_scrapper(self, insta_handle, n_pages):
447 |
448 |             """
449 |
450 |             Instagram account Scrapper: Helps in scraping Instagram data (name, posts, followers, following, bio, captions)
451 |             return type: DataFrame (for captions)
452 |
453 |             Parameters
454 |             ------------
455 |             insta_handle: Enter the desired Insta handle/username
456 |                  Type: str
457 |
458 |             n_pages: Enter the number of pages that you want to scrape
459 |                  Type: int
460 |
461 |             Note
462 |             ------
463 |             Make sure the Instagram account is public. After a certain number of runs, Instagram will ask you for your Instagram ID and password; kindly enter them to continue.
464 |
465 |             Example
466 |             ---------
467 |             >>> obj2.account_scrapper('Public_account_name', 3)
468 |             out: Name: abc
469 |                  Posts: 50
470 |                  Followers: 128
471 |                  Following: 150
472 |                  Bio: Hello World!!
473 |
474 |                  Captions
475 |                  Hello World !!! My first picture.
476 |                  Hello World !!! My first program....
477 |
478 |             """
479 |
480 |             import instagram
481 |             return instagram.account(insta_handle, n_pages)
482 |
483 |
484 |         ############## Instagram hashtag Scrapper ##############
485 |         def hashtag_scrapper(self, hashtag, n_posts):
486 |
487 |             """
488 |
489 |             Instagram hashtag Scrapper: Helps in scraping Instagram data (captions)
490 |             return type: DataFrame
491 |
492 |             Parameters
493 |             ------------
494 |             hashtag: Enter the desired hashtag
495 |                  Type: str
496 |
497 |             n_posts: Enter the number of posts that you want to scrape
498 |                  Type: int
499 |
500 |             Note
501 |             ------
502 |             After a certain number of runs, Instagram will ask you for your Instagram ID and password; kindly enter them to continue.
503 |
504 |             Example
505 |             ---------
506 |             >>> obj2.hashtag_scrapper('#python', 3)
507 |             out: Captions
508 |                  Hello World !!! My first picture. #python
509 |                  Hello World !!! My first program. #python
510 |                  This is a scraping package. #python
511 |
512 |             """
513 |
514 |             import instagram
515 |             return instagram.hashtag(hashtag, n_posts)
516 |
517 |
518 |     ############## Twitter Scrapper ##############
519 |     def twitter_scrapper(self, hashtag, n_pages):
520 |
521 |         """
522 |
523 |         Twitter Scrapper: Helps in scraping data from Twitter ("Name", "Twitter handle", "Post Time", "Tweet", "Reply Count", "Retweet Count", "Like Count")
524 |         return type: DataFrame
525 |
526 |         Parameters
527 |         ------------
528 |         hashtag: Enter the desired hashtag
529 |              Type: str
530 |
531 |         n_pages: Enter the number of pages that you want to scrape
532 |              Type: int
533 |
534 |         Note
535 |         ------
536 |         Both arguments are compulsory.
537 |
538 |         Example
539 |         ---------
540 |         >>> obj.twitter_scrapper('#python', 3)
541 |         out: Name   Twitter handle   Post Time   Tweet         Reply Count   Retweet Count   Like Count
542 |              asd    @ksnkj           3:49:36     this is ...   102           230             1.2k
543 |              fsd    @ksdtj           6:49:36     it is a ...   12            30              1k
544 |
545 |         """
546 |
547 |         import twitter
548 |         return twitter.scrappi(hashtag, n_pages)
549 |
550 |
551 |     ############## YouTube Scrapper ##############
552 |     def youtube_scrapper(self, video_sec_url, n_pages):
553 |
554 |         """
555 |
556 |         YouTube Scrapper: Helps in scraping YouTube data ('Title', 'Video_url', 'Views', 'Days')
557 |         return type: DataFrame
558 |
559 |         Parameters
560 |         ------------
561 |         video_sec_url: Enter the desired YouTube URL (only video section)
562 |              Type: str
563 |
564 |         n_pages: The number of pages that it will scrape at a single run
565 |              Type: int
566 |
567 |         Note
568 |         ------
569 |         Make sure the URL is a valid YouTube URL and that it ends with 'videos', i.e. only URLs from the videos section are acceptable. There is no limit on how many pages can be scraped.
570 |
571 |         Example
572 |         ---------
573 |         >>> obj.youtube_scrapper('https://www.youtube.com/user/youtuber_name/videos', 3)
574 |         out: Title       Video_url                                               Views   Days
575 |              My video    https://www.youtube.com/user/youtuber_name/my_video    1.2m    30 days
576 |              My video2   https://www.youtube.com/user/youtuber_name/my_video2   1m      2 weeks
577 |
578 |         """
579 |
580 |         import youtube
581 |         return youtube.scrappi(video_sec_url, n_pages)
582 |
583 | ########################################################################################################################
584 |
585 | ############## Song Scrapper ##############
586 | class SongScrapper():
587 |
588 |     """
589 |
590 |     Song Scrapper: Helps in scraping song-related data
591 |     1. Soundcloud  2. Spotify
592 |
593 |     Type: class
594 |
595 |     Note
596 |     ------
597 |     Create an object of this class to proceed further.
598 |
599 |     Example
600 |     ---------
601 |     >>> obj = PyScrappy.SongScrapper()
602 |
603 |     """
604 |
605 |     ############## Soundcloud Scrapper ##############
606 |     def soundcloud_scrapper(self, track_name, n_pages):
607 |
608 |         """
609 |
610 |         Soundcloud Scrapper: Helps in scraping data from SoundCloud ('Uploader', 'Music Title', 'Time of Upload', 'Plays')
611 |         return type: DataFrame
612 |
613 |         Parameters
614 |         ------------
615 |         track_name: Enter the name of the desired track/song/music
616 |              Type: str
617 |
618 |         n_pages: The number of pages that it will scrape at a single run
619 |              Type: int
620 |
621 |         Note
622 |         ------
623 |         Make sure to enter a valid name.
624 |
625 |         Example
626 |         ---------
627 |         >>> obj.soundcloud_scrapper('music track', 3)
628 |         out: Uploader   Music Title   Time of Upload   Plays
629 |              name1      music         3:34:76          234
630 |              name2      music         5:6:34           445
631 |
632 |         """
633 |
634 |         import soundcloud
635 |         return soundcloud.soundcloud_tracks(track_name, n_pages)
636 |
637 |
638 |     ############## Spotify Scrapper ##############
639 |     def spotify_scrapper(self, track_name, n_pages):
640 |
641 |         """
642 |
643 |         Spotify Scrapper: Helps in scraping data from Spotify ('Id', 'Title', 'Singers', 'Album', 'Duration')
644 |         return type: DataFrame
645 |
646 |         Parameters
647 |         ------------
648 |         track_name: Enter the name of the desired track/song/music/artist/podcast
649 |              Type: str
650 |
651 |         n_pages: The number of pages that it will scrape at a single run
652 |              Type: int
653 |
654 |         Note
655 |         ------
656 |         Make sure to enter a valid name.
657 |
658 |         Example
659 |         ---------
660 |         >>> obj.spotify_scrapper('pop', 3)
661 |         out: Id   Title   Singers   Album   Duration
662 |              1    abc     abc       abc     2:30
663 |              2    def     def       def     2:30
664 |
665 |         """
666 |
667 |         import spotify
668 |         return spotify.scrappi(track_name, n_pages)
669 |
670 | ########################################################################################################################
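# --- Editor's note: a minimal usage sketch for the SongScrapper class documented above,
# not part of the packaged module. It assumes PyScrappy is installed from PyPI and that
# SoundCloud/Spotify still serve the markup these scrappers expect; 'lofi' and 'pop' are
# placeholder search terms. Column names follow the docstrings above.
#
# >>> import PyScrappy
# >>> songs = PyScrappy.SongScrapper()
# >>> soundcloud_df = songs.soundcloud_scrapper('lofi', 2)   # 'Uploader', 'Music Title', 'Time of Upload', 'Plays'
# >>> spotify_df = songs.spotify_scrapper('pop', 2)          # 'Id', 'Title', 'Singers', 'Album', 'Duration'
# >>> spotify_df.head()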
671 |
672 | ############## Stock Scrapper ##############
673 | class StockScrapper():
674 |
675 |     """
676 |
677 |     Stock Scrapper: Helps in scraping stock data
678 |     1. Analysis data of the stock
679 |     2. Historical data of the stock
680 |     3. Profile data of the stock
681 |
682 |     Type: class
683 |
684 |     Note
685 |     ------
686 |     Create an object of this class to proceed further.
687 |
688 |     Example
689 |     ---------
690 |     >>> obj = PyScrappy.StockScrapper()
691 |
692 |     """
693 |
694 |     ############## Analysis data scrapper ##############
695 |     def analysis_data_scrapper(self, stock_code, analysis_type):
696 |
697 |         """
698 |
699 |         Analysis data scrapper: Helps in scraping the analysis data of the stock
700 |         return type: DataFrame
701 |
702 |         Parameters
703 |         ------------
704 |         stock_code: Enter the desired stock code
705 |              Type: str
706 |
707 |         analysis_type: Enter the name of the analysis type of the stock
708 |              Type: str
709 |              Accepted values: "earning estimate", "revenue estimate", "earning history", "EPS trend", "EPS revision", "growth estimate"
710 |
711 |         Note
712 |         ------
713 |         Make sure you enter a valid stock code.
714 |
715 |         Example
716 |         ---------
717 |         >>> obj.analysis_data_scrapper('STOCK_CODE', 'earning estimate')
718 |
719 |         """
720 |
721 |         import stock
722 |         return stock.stock_analysis(stock_code, analysis_type)
723 |
724 |
725 |     ############## Historical data scrapper ##############
726 |     def historical_data_scrapper(self, stock_code, time_period, frequency):
727 |
728 |         """
729 |
730 |         Historical data scrapper: Helps in scraping the historical data of the stock ('Date', 'Open', 'High', 'Low', 'Close', 'Adjusted Close', 'Volume')
731 |         return type: CSV file
732 |
733 |         Parameters
734 |         ------------
735 |         stock_code: Enter the desired stock code
736 |              Type: str
737 |
738 |         time_period: Enter the range/time period of the stock
739 |              Type: str
740 |              Accepted values: '1d', '5d', '3m', '6m', '1y', '5y', 'max'
741 |
742 |         frequency: Enter the time period interval of the data
743 |              Type: str
744 |              Accepted values: 'Daily', 'Weekly', 'Monthly'
745 |
746 |         Note
747 |         ------
748 |         Make sure you enter a valid stock code, time period, and frequency.
749 |
750 |         Example
751 |         ---------
752 |         >>> obj.historical_data_scrapper('STOCK_CODE', '1y', 'Daily')
753 |
754 |         """
755 |
756 |         import stock
757 |         return stock.stock_history(stock_code, time_period, frequency)
758 |
759 |
760 |     ############## Profile details scrapper ##############
761 |     def profile_data_scrapper(self, stock_code):
762 |
763 |         """
764 |
765 |         Profile data scrapper: Helps in scraping the profile data of the stock (profile, description, executives)
766 |         return type: [dict, str, DataFrame]
767 |
768 |         Parameters
769 |         ------------
770 |         stock_code: Enter the desired stock code
771 |              Type: str
772 |
773 |         Note
774 |         ------
775 |         Make sure you enter a valid stock code.
776 |         Make sure to store the result in three variables using tuple unpacking, as in the Example below.
777 |
778 |         Example
779 |         ---------
780 |         >>> profile, description, executives = obj.profile_data_scrapper('STOCK_CODE')
781 |
782 |         """
783 |
784 |         import stock
785 |         return stock.stock_profile(stock_code)
786 |
787 | ########################################################################################################################
788 |
789 | ############## Wikipedia Scrapper ##############
790 | class WikipediaScrapper():
791 |
792 |     """
793 |
794 |     Wikipedia Scrapper: Helps in scraping Wikipedia data
795 |     1. Header
796 |     2. Paragraph
797 |     3. Text
798 |
799 |     Type: class
800 |
801 |     Note
802 |     ------
803 |     Create an object of this class to proceed further.
804 |
805 |     Example
806 |     ---------
807 |     >>> obj = PyScrappy.WikipediaScrapper()
808 |
809 |     """
810 |
811 |     ############## Wikipedia Paragraph Scrapper ##############
812 |     def para_scrapper(self, word):
813 |
814 |         """
815 |
816 |         Para Scrapper: Helps in scraping paragraphs from Wikipedia.
817 |
818 |         Parameters
819 |         ------------
820 |         word: Enter the desired keyword
821 |              Type: str
822 |
823 |         Note
824 |         ------
825 |         Make sure that information on the keyword is available on Wikipedia.
826 |
827 |         Example
828 |         ---------
829 |         >>> obj.para_scrapper("Python (programming language)")
830 |         out: ['\n',
831 |              "Python is an interpreted high-level general-purpose programming language.", .....]
832 |
833 |         """
834 |
835 |         import wikipedia
836 |         return wikipedia.para(word)
837 |
838 |     ############## Wikipedia Header Scrapper ##############
839 |     def header_scrapper(self, word):
840 |
841 |         """
842 |
843 |         Header Scrapper: Helps in scraping headers from Wikipedia.
844 |
845 |         Parameters
846 |         ------------
847 |         word: Enter the desired keyword
848 |              Type: str
849 |
850 |         Note
851 |         ------
852 |         Make sure that information on the keyword is available on Wikipedia.
853 |
854 |         Example
855 |         ---------
856 |         >>> obj.header_scrapper("Python (programming language)")
857 |         out: ['History',
858 |              'Design philosophy and features', ....]
859 |
860 |         """
861 |
862 |         import wikipedia
863 |         return wikipedia.header(word)
864 |
865 |     ############## Wikipedia Text Scrapper ##############
866 |     def text_scrapper(self, word):
867 |
868 |         """
869 |
870 |         Text Scrapper: Helps in scraping text from Wikipedia.
871 |
872 |         Parameters
873 |         ------------
874 |         word: Enter the desired keyword
875 |              Type: str
876 |
877 |         Note
878 |         ------
879 |         Make sure that information on the keyword is available on Wikipedia.
880 |
881 |         Example
882 |         ---------
883 |         >>> obj.text_scrapper("Python (programming language)")
884 |         out: ' History Python is an interpreted high-level general-purpose programming language..... '
885 |
886 |         """
887 |
888 |         import wikipedia
889 |         return wikipedia.text(word)
890 |
891 | ########################################################################################################################
--------------------------------------------------------------------------------
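A short end-to-end usage sketch (an editorial addition, not a file from the repository) tying together the public entry points documented in src/PyScrappy.py above. It assumes the package has been installed from PyPI (pip install PyScrappy), that Google Chrome is available for the Selenium-backed scrappers (webdriver-manager fetches the driver), and that the target sites still expose the markup the package expects; 'laptop', 'Bangalore', 'technology', and 'STOCK_CODE' are placeholder inputs.

>>> import PyScrappy
>>> ecom = PyScrappy.ECommerceScrapper()
>>> laptops = ecom.flipkart_scrapper('laptop', 2)          # DataFrame: Name, Price, Original Price, Description, Rating
>>> food = PyScrappy.FoodScrapper()
>>> restaurants = food.swiggy_scrapper('Bangalore', 1)     # DataFrame: Name, Cuisine, Price, Rating
>>> headlines = PyScrappy.news_scrapper(2, 'technology')   # DataFrame: Headlines, Time, Date, News
>>> stocks = PyScrappy.StockScrapper()
>>> profile, description, executives = stocks.profile_data_scrapper('STOCK_CODE')
>>> wiki = PyScrappy.WikipediaScrapper()
>>> headers = wiki.header_scrapper('Python (programming language)')

Each scrapper call above returns a pandas DataFrame (profile_data_scrapper returns a dict, a string, and a DataFrame), so results can be inspected with .head() or persisted with .to_csv().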