├── LICENSE ├── README.md └── webscrape.sh /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sandesh Yadav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WebScrape ![License](https://img.shields.io/badge/License-MIT-yellow.svg) ![Version](https://img.shields.io/badge/Version-1.0-red.svg) 2 | 3 | WebScrape is a web scraper written in Bash, with error handling, that scrapes email addresses and phone numbers from websites. 4 | 5 | **What is Web Scraping?**
6 | Web scraping, also called web data extraction or web harvesting, is a technique for extracting large amounts of data from websites and saving it to a local file on your computer for later use. 7 | 8 | ## Screenshot 9 | 10 | 11 | 12 | ## Installing and requirements 13 | 14 | - Linux or Unix-based system 15 | 16 | ### Installing 17 | 18 | ``` 19 | ~ ❯❯❯ git clone https://github.com/3xploitGuy/webscrape.git 20 | 21 | ~ ❯❯❯ cd webscrape 22 | 23 | ~/webscrape ❯❯❯ chmod +x webscrape.sh 24 | 25 | ~/webscrape ❯❯❯ ./webscrape.sh 26 | ``` 27 | 28 | ## Contact 29 | 30 | [Gmail](mailto:sandeshyadavm46@gmail.com)
31 | [Instagram](https://instagram.com/1n_only_sandy)
32 | [Blog](https://virtualprivacy.blogspot.com)
33 | [Website](https://sandeshyadav.000webhostapp.com)
34 | [YouTube](https://www.youtube.com/channel/UCAdDJn4yWzQMJyKyRWne3qg) 35 | 36 | ## Disclaimer 37 | 38 | Webscrape is created to help in data scraping from the Internet and it's not responsible for any misuse or illegal purposes. 39 | 40 | ## License 41 | 42 | This work by [3xplotGuy](https://github.com/3xploitGuy) is licensed under the terms of the [MIT License](https://www.tldrlegal.com/l/mit). 43 | -------------------------------------------------------------------------------- /webscrape.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# webscrape.sh - interactively scrape e-mail addresses and phone numbers
# from a single web page.
#
# Usage:    ./webscrape.sh    (takes no arguments; everything is prompted for)
# Requires: curl, grep, sort, mktemp

# ANSI colour escapes; they are expanded by `echo -e` / `printf %b`.
# shellcheck disable=SC2034  # Black, Blue and Purple are currently unused
Black="\e[1;90m"
Red="\e[1;91m"
Green="\e[1;92m"
Yellow="\e[1;93m"
Blue="\e[1;94m"
Purple="\e[1;95m"
White="\e[1;97m"
Reset="\e[0m"

# Scratch directory and the files kept inside it. scraper() creates them and
# cleanup() removes them, so files in the caller's working directory that
# happen to be named temp.txt / email.txt / phone.txt are never touched.
work_dir=""
page_file=""
email_file=""
phone_file=""

clear

#######################################
# Reset terminal colours and delete the scratch directory.
# Installed as the EXIT trap, so it runs on every exit path.
# Globals: Reset (read), work_dir (read)
#######################################
cleanup() {
  printf '%b' "$Reset"
  if [[ -n "$work_dir" && -d "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
}
trap cleanup EXIT

#######################################
# Succeed when the answer is exactly "y" or "Y".
# Arguments: $1 - the user's answer
#######################################
is_yes() {
  [[ "$1" == [Yy] ]]
}

#######################################
# Prompt the user and store the reply in a global variable.
# Exits with status 1 when stdin is closed, so a prompt loop cannot spin
# forever on end-of-file.
# Arguments: $1 - prompt text, $2 - name of the variable to fill
#######################################
ask() {
  if ! read -r -p "$1" "$2"; then
    printf '\n'
    echo -e "$White[${Red}!$White] ${Red}Exiting....\n"
    exit 1
  fi
}

#######################################
# Print the WebScrape logo and the credits.
# Globals: Red, Yellow (read)
#######################################
banner() {
  # Each row is the red "W" followed by the yellow "ebScrape". The art is
  # passed as %s data in single quotes so its backslashes stay literal;
  # the format string is reused for every group of four arguments.
  printf '\n'
  printf '%b%s%b%s\n' \
    "$Red" ' __      __ ' "$Yellow" '     ___.     _________' \
    "$Red" '/  \    /  \ ' "$Yellow" '____\_ |__  /   _____/ ________________  ______   ____' \
    "$Red" '\   \/\/   /' "$Yellow" '/ __ \| __ \ \_____  \_/ ___\_  __ \__  \ \____ \_/ __ \' \
    "$Red" ' \        /' "$Yellow" '\  ___/| \_\ \/        \  \___|  | \// __ \|  |_> >  ___/' \
    "$Red" '  \__/\  /' "$Yellow" '  \___  >___  /_______  /\___  >__|  (____  /   __/ \___  >' \
    "$Red" '       \/' "$Yellow" '       \/    \/        \/     \/           \/|__|        \/'
  printf "\n\e[1;77m A web scraper to get emails and phone numbers from websites \e[0m\n\n"
  echo -e "\e[0;96m Developed by: ${Red}Sandesh (3xploitGuy)\n\n\n"
  #echo -e "\e[0;96m Version: ${Red}1.0 Stable\n\n\n"
}

#######################################
# Ask for the target URL and what to scrape, then start scraping.
# Globals: url, email, phone (written)
# Never returns; the script exits from here or from scraper().
#######################################
scanner() {
  # Anchored at both ends: the whole answer has to be a URL, it is not
  # enough for the answer to contain one somewhere.
  local -r url_validity='^(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]$'

  while true; do
    sleep 0.5
    ask $'\e[1;97m[\e[1;92m*\e[1;97m]\e[1;92m Enter URL to begin : \e[1;97m' url
    if [[ $url =~ $url_validity ]]; then
      break
    fi
    echo -e "$White[$Red!$White] ${Red}Check your url (invalid)"
  done

  ask $'\e[1;97m[\e[1;92m*\e[1;97m]\e[1;92m Scrape emails from website (y/n) : \e[1;97m' email
  ask $'\e[1;97m[\e[1;92m*\e[1;97m]\e[1;92m Scrape phone numbers from website (y/n) : \e[1;97m' phone
  if is_yes "$email" || is_yes "$phone"; then
    echo -e "$White[${Red}!$White] ${Red}Scraping started"
    scraper
  fi
  sleep 0.4
  echo -e "$White[${Red}!$White] ${Red}Exiting....\n"
  exit
}

#######################################
# Download the page, run the requested extractors and offer to save results.
# Globals: url, email, phone (read); work_dir, page_file, email_file,
#          phone_file, save_output (written)
# Never returns; always ends the script.
#######################################
scraper() {
  if ! work_dir=$(mktemp -d); then
    echo -e "$White[${Red}!$White] ${Red}Could not create a temporary directory"
    exit 1
  fi
  page_file="$work_dir/temp.txt"
  email_file="$work_dir/email.txt"
  phone_file="$work_dir/phone.txt"

  # -L follows redirects (e.g. http -> https) instead of scraping the
  # redirect stub; a failed transfer is reported rather than being
  # presented as "nothing found".
  if ! curl -s -L "$url" > "$page_file"; then
    echo -e "$White[${Red}!$White] ${Red}Could not download the page"
    sleep 0.4
    echo -e "$White[${Red}!$White] ${Red}Exiting....\n"
    exit 1
  fi

  if is_yes "$email"; then
    email_scraping
  fi
  if is_yes "$phone"; then
    phone_scraping
  fi

  # The extractors delete their result file when it is empty, so a file
  # that still exists means there is something worth saving.
  if [[ -f "$email_file" || -f "$phone_file" ]]; then
    sleep 0.4
    ask $'\e[1;97m[\e[1;92m*\e[1;97m]\e[1;92m Do you want to save the output (y/n) : \e[1;97m' save_output
    if is_yes "$save_output"; then
      output
    fi
  fi
  sleep 0.4
  echo -e "$White[${Red}!$White] ${Red}Exiting....\n"
  exit
}

#######################################
# Extract unique e-mail addresses from the downloaded page and print them.
# Globals: page_file, email_file (read)
# Leaves email_file in place only when at least one address was found.
#######################################
email_scraping() {
  # -a: always treat the page as text, so grep never writes its
  #     "Binary file ... matches" notice into the result file.
  # \{2,\}: no upper bound on the TLD length, so longer TLDs are
  #     not cut short.
  grep -a -i -o '[A-Z0-9._%+-]\+@[A-Z0-9.-]\+\.[A-Z]\{2,\}' "$page_file" \
    | sort -u > "$email_file"
  if [[ -s "$email_file" ]]; then
    echo -e "$White[${Yellow}*$White] ${Yellow}Emails success${White}"
    cat -- "$email_file"
  else
    echo -e "$White[${Red}!$White] ${Red}No Emails found"
    rm -f -- "$email_file"
  fi
}

#######################################
# Extract unique phone numbers from the downloaded page and print them.
# Recognised shapes: 123-456-7890, (123)456-7890, 1234567890, 123 456 7890.
# Globals: page_file, phone_file (read)
# Leaves phone_file in place only when at least one number was found.
#######################################
phone_scraping() {
  # NOTE: the bare 10-digit alternative also matches inside longer digit
  # runs (ids, timestamps), so the list can contain false positives.
  grep -a -o '\([0-9]\{3\}\-[0-9]\{3\}\-[0-9]\{4\}\)\|\(([0-9]\{3\})[0-9]\{3\}\-[0-9]\{4\}\)\|\([0-9]\{10\}\)\|\([0-9]\{3\}\s[0-9]\{3\}\s[0-9]\{4\}\)' "$page_file" \
    | sort -u > "$phone_file"
  if [[ -s "$phone_file" ]]; then
    echo -e "$White[${Yellow}*$White] ${Yellow}Phone numbers success${White}"
    cat -- "$phone_file"
  else
    echo -e "$White[${Red}!$White] ${Red}No phone numbers found"
    rm -f -- "$phone_file"
  fi
}

#######################################
# Ask for a new folder name and move the result files into it.
# Globals: email_file, phone_file (read); folder_name (written)
# Never returns; ends the script once the results are saved.
#######################################
output() {
  local result

  sleep 0.4
  # Keep asking until a folder has actually been created.
  while true; do
    ask $'\e[1;97m[\e[1;92m*\e[1;97m]\e[1;92m Enter folder name : \e[1;97m' folder_name
    if [[ -z "$folder_name" ]]; then
      echo -e "$White[${Red}!$White] ${Red}Folder name cannot be empty"
    elif [[ -d "$folder_name" ]]; then
      echo -e "$White[${Red}!$White] ${Red}Folder already exists"
    elif mkdir -- "$folder_name" 2> /dev/null; then
      break
    else
      echo -e "$White[${Red}!$White] ${Red}Could not create folder"
    fi
  done

  # Only the lists that were produced exist; move whichever are present.
  for result in "$email_file" "$phone_file"; do
    if [[ -f "$result" ]]; then
      if ! mv -- "$result" "$folder_name/"; then
        echo -e "$White[${Red}!$White] ${Red}Could not save output"
        exit 1
      fi
    fi
  done

  sleep 0.3
  echo -e "$White[${Green}*$White] ${Yellow}Output saved"
  sleep 0.4
  echo -e "$White[${Red}!$White] ${Red}Exiting....\n"
  exit
}

#######################################
# Verify that curl is available and that the network is reachable.
# Exits with status 1 when either check fails.
#######################################
internet() {
  sleep 0.5
  echo -e "$White[$Red!$White] ${Red}Checking internet connection"
  # curl is the tool the scraper itself depends on, so it is used for the
  # connectivity probe too; a missing tool is reported as such rather than
  # as "No internet".
  if ! command -v curl > /dev/null 2>&1; then
    echo -e "$White[$Red!$White] ${Red}curl is required but not installed"
    exit 1
  fi
  if curl -s -I --max-time 10 -o /dev/null http://google.com; then
    echo -e "$White[$Yellow*$White] ${Yellow}Connected"
  else
    sleep 0.5
    echo -e "$White[$Red!$White] ${Red}No internet try later"
    exit 1
  fi
}

banner
internet
scanner
--------------------------------------------------------------------------------