├── requirements.txt
├── README.md
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.5.1
lxml==3.6.4
requests==2.11.1
tabulate==0.7.5
stop-words==2015.2.23.1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# web_scraper_live_demo
This is the code for the "Build a Web Scraper" live stream by @Sirajology on YouTube.

# Overview
This is the code for the live [YouTube](https://youtu.be/A0Ac_dKNmH0) session I hosted on how to build a web scraper. The script pulls the top 20 most frequently used words from a Wikipedia article. It uses regular expressions and stop word removal to produce a cleaned table of the results.

# Installation

The necessary dependencies are listed in the requirements.txt file, so run this before running the actual code to install them:

```
pip install -r requirements.txt
```

# Usage

There are two arguments. The first is the article you want to retrieve words from. The second is optional: pass any value (e.g. `yes`) as a second argument to remove stop words; omit it to keep them.

```
python main.py your_article_name_here yes
```
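# How it works

You don't need an exact article title: the script first queries the MediaWiki search API to resolve your query to the best-matching page title, then scrapes that page's paragraphs. Here is a minimal sketch of just the title-lookup step, using the same API parameters as main.py (the `lookup_title` helper and the example query are illustrative, not part of the script):

```
import requests

#resolve a free-text query to the best-matching Wikipedia page title
def lookup_title(query):
    api = "https://en.wikipedia.org/w/api.php"
    params = {"format": "json", "action": "query",
              "list": "search", "srsearch": query}
    data = requests.get(api, params=params).json()
    return data['query']['search'][0]['title']

print(lookup_title("web scraping"))
```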
# Credits

Thanks to [prabhakar267](https://github.com/prabhakar267) for the inspiration.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#let's write a simple script
#to get the 20 highest-frequency words, and their
#frequency percentages, from an English Wikipedia article.
#applications include recommender systems, chatbots and NLP,
#sentiment analysis, data visualization,
#and market research

#Beautiful Soup is a Python library
#for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup
#Requests is one of the most downloaded
#Python packages of all time,
#pulling in over 7,000,000 downloads every month.
#HTTP library for pulling, pushing, and authenticating
import requests
#lets you do regular expression operations:
#a special text string for describing a search pattern,
#used to find and replace
import re
#the operator module exports a
#set of efficient functions
#corresponding to the intrinsic operators of Python:
#comparison, addition, greater than, less than
import operator
#parses JSON and formats it
import json
#the module provides just one function,
#tabulate, which takes a list of lists or another
#tabular data type as the first argument,
#and outputs a nicely formatted plain-text table
from tabulate import tabulate
#system calls, deals with user arguments
import sys
#lists of common stop words in various languages, like "the"
from stop_words import get_stop_words

#get the words
def getWordList(url):
    word_list = []
    #raw data
    source_code = requests.get(url)
    #convert to text
    plain_text = source_code.text
    #parse with lxml
    soup = BeautifulSoup(plain_text, 'lxml')

    #find the words in paragraph tags
    for text in soup.findAll('p'):
        #skip empty paragraphs
        if not text.text:
            continue
        #content
        content = text.text
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-letter characters
            cleaned_word = clean_word(word)
            #if there is still something there
            if len(cleaned_word) > 0:
                #add it to our word list
                word_list.append(cleaned_word)

    return word_list


#clean a word with a regex
def clean_word(word):
    cleaned_word = re.sub('[^A-Za-z]+', '', word)
    return cleaned_word


#build a word -> count dictionary
def createFrequencyTable(word_list):
    #word count
    word_count = {}
    for word in word_list:
        #index is the word
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    return word_count

#remove stop words
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')

    temp_list = []
    for key, value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])

    return temp_list
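
#a quick illustrative example of the two helpers above (assumed input,
#not executed by the script): createFrequencyTable(['the', 'cat', 'the'])
#returns {'the': 2, 'cat': 1}, and remove_stop_words on those items
#would then drop 'the', since it is a common English stop word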

#access the wiki API in JSON format and query it for data;
#a search-type query shows a list of possible pages
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"

#if no search word was given, throw an error
if len(sys.argv) < 2:
    print("Enter a valid string")
    sys.exit()

#get the search word
string_query = sys.argv[1]

#whether to remove stop words or not
if len(sys.argv) > 2:
    search_mode = True
else:
    search_mode = False

#create our URL
url = wikipedia_api_link + string_query

#try-except block: a simple way to deal with exceptions,
#great for HTTP requests
try:
    #use requests to retrieve raw data from the wiki API URL we
    #just constructed
    response = requests.get(url)

    #parse that data as a JSON dictionary
    data = json.loads(response.content.decode("utf-8"))

    #page title of the first search result;
    #this is the page you would see in a web browser
    wikipedia_page_tag = data['query']['search'][0]['title']

    #get the actual wiki page based on the retrieved title
    url = wikipedia_link + wikipedia_page_tag
    #get the list of words from that page
    page_word_list = getWordList(url)
    #create a table of word counts (a dictionary)
    page_word_count = createFrequencyTable(page_word_list)
    #sort the table by the frequency count
    sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)
    #remove stop words if the user specified it
    if search_mode:
        sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)

    #sum the total words to calculate frequencies
    total_words_sum = 0
    for key, value in sorted_word_frequency_list:
        total_words_sum = total_words_sum + value

    #just keep the top 20 words
    if len(sorted_word_frequency_list) > 20:
        sorted_word_frequency_list = sorted_word_frequency_list[:20]

    #create our final list, which contains word, frequency (word count), and percentage
    final_list = []
    for key, value in sorted_word_frequency_list:
        percentage_value = float(value * 100) / total_words_sum
        final_list.append([key, value, round(percentage_value, 4)])

    #headers before the table
    print_headers = ['Word', 'Frequency', 'Frequency Percentage']

    #print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

#catch request errors (timeouts, connection failures) in case it breaks
except requests.exceptions.RequestException:
    print("The server didn't respond. Please try again later.")
--------------------------------------------------------------------------------