├── requirements.txt
├── README.md
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.5.1
lxml==3.6.4
requests==2.11.1
tabulate==0.7.5
stop-words==2015.2.23.1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# web_scraper_live_demo
This is the code for the "Build a Web Scraper" live stream by @Sirajology on YouTube.

# Overview
This is the code for the live [YouTube](https://youtu.be/A0Ac_dKNmH0) session I hosted on how to build a web scraper. The script pulls the top 20 most frequently used words from a Wikipedia article. It uses regular expressions and stop word removal to produce a cleaned table of the results.

# Installation

The necessary dependencies are listed in the requirements.txt file, so run this before running the actual code to install them:

```
pip install -r requirements.txt
```

# Usage

There are two arguments. The first is the article you want to retrieve words from. The second is optional: pass any value (e.g. `yes`) as a second argument to remove stop words; omit it to keep them.

```
python main.py your_article_name_here yes
```
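# How it works

You don't need an exact article title: the script first queries the MediaWiki search API to resolve your query to the best-matching page title, then scrapes that page's paragraphs. Here is a minimal sketch of just the title-lookup step, using the same API parameters as main.py (the `lookup_title` helper and the example query are illustrative, not part of the script):

```
import requests

#resolve a free-text query to the best-matching Wikipedia page title
def lookup_title(query):
    api = "https://en.wikipedia.org/w/api.php"
    params = {"format": "json", "action": "query",
              "list": "search", "srsearch": query}
    data = requests.get(api, params=params).json()
    return data['query']['search'][0]['title']

print(lookup_title("web scraping"))
```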
# Credits

Thanks to [prabhakar267](https://github.com/prabhakar267) for the inspiration.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#let's write a simple script
#to get the 20 highest-frequency words, and their
#frequency percentages, from an English Wikipedia article.
#applications include recommender systems, chatbots and NLP,
#sentiment analysis, data visualization,
#and market research

#Beautiful Soup is a Python library
#for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup
#Requests is one of the most downloaded
#Python packages of all time,
#pulling in over 7,000,000 downloads every month.
#HTTP library for pulling, pushing, and authenticating
import requests
#lets you do regular expression operations:
#a special text string for describing a search pattern,
#used to find and replace
import re
#the operator module exports a
#set of efficient functions
#corresponding to the intrinsic operators of Python:
#comparison, addition, greater than, less than
import operator
#parses JSON and formats it
import json
#the module provides just one function,
#tabulate, which takes a list of lists or another
#tabular data type as the first argument,
#and outputs a nicely formatted plain-text table
from tabulate import tabulate
#system calls, deals with user arguments
import sys
#lists of common stop words in various languages, like "the"
from stop_words import get_stop_words

#get the words
def getWordList(url):
    word_list = []
    #raw data
    source_code = requests.get(url)
    #convert to text
    plain_text = source_code.text
    #parse with lxml
    soup = BeautifulSoup(plain_text, 'lxml')

    #find the words in paragraph tags
    for text in soup.findAll('p'):
        #skip empty paragraphs
        if not text.text:
            continue
        #content
        content = text.text
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-letter characters
            cleaned_word = clean_word(word)
            #if there is still something there
            if len(cleaned_word) > 0:
                #add it to our word list
                word_list.append(cleaned_word)

    return word_list


#clean a word with a regex
def clean_word(word):
    cleaned_word = re.sub('[^A-Za-z]+', '', word)
    return cleaned_word


#build a word -> count dictionary
def createFrequencyTable(word_list):
    #word count
    word_count = {}
    for word in word_list:
        #index is the word
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    return word_count

#remove stop words
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')

    temp_list = []
    for key, value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])

    return temp_list
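
#a quick illustrative example of the two helpers above (assumed input,
#not executed by the script): createFrequencyTable(['the', 'cat', 'the'])
#returns {'the': 2, 'cat': 1}, and remove_stop_words on those items
#would then drop 'the', since it is a common English stop word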

#access the wiki API in JSON format and query it for data;
#a search-type query shows a list of possible pages
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"

#if no search word was given, throw an error
if len(sys.argv) < 2:
    print("Enter a valid string")
    sys.exit()

#get the search word
string_query = sys.argv[1]

#whether to remove stop words or not
if len(sys.argv) > 2:
    search_mode = True
else:
    search_mode = False

#create our URL
url = wikipedia_api_link + string_query

#try-except block: a simple way to deal with exceptions,
#great for HTTP requests
try:
    #use requests to retrieve raw data from the wiki API URL we
    #just constructed
    response = requests.get(url)

    #parse that data as a JSON dictionary
    data = json.loads(response.content.decode("utf-8"))

    #page title of the first search result;
    #this is the page you would see in a web browser
    wikipedia_page_tag = data['query']['search'][0]['title']

    #get the actual wiki page based on the retrieved title
    url = wikipedia_link + wikipedia_page_tag
    #get the list of words from that page
    page_word_list = getWordList(url)
    #create a table of word counts (a dictionary)
    page_word_count = createFrequencyTable(page_word_list)
    #sort the table by the frequency count
    sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)
    #remove stop words if the user specified it
    if search_mode:
        sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)

    #sum the total words to calculate frequencies
    total_words_sum = 0
    for key, value in sorted_word_frequency_list:
        total_words_sum = total_words_sum + value

    #just keep the top 20 words
    if len(sorted_word_frequency_list) > 20:
        sorted_word_frequency_list = sorted_word_frequency_list[:20]

    #create our final list, which contains word, frequency (word count), and percentage
    final_list = []
    for key, value in sorted_word_frequency_list:
        percentage_value = float(value * 100) / total_words_sum
        final_list.append([key, value, round(percentage_value, 4)])

    #headers before the table
    print_headers = ['Word', 'Frequency', 'Frequency Percentage']

    #print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

#catch request errors (timeouts, connection failures) in case it breaks
except requests.exceptions.RequestException:
    print("The server didn't respond. Please try again later.")
--------------------------------------------------------------------------------