├── README.md
├── Scraps.py
└── income.py


/README.md:
--------------------------------------------------------------------------------
1 | # Job-Parser
2 | A small web scraper that fetches the top job listings and visualizes the salaries offered for each position.
3 | 


--------------------------------------------------------------------------------
/Scraps.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import urllib2 as u
  3 | import pandas as pd
  4 | 
  5 | import plotly.plotly as py
  6 | import plotly.graph_objs as go
  7 | import plotly.tools as tls
  8 | 
  9 | import robotparser
 10 | from bs4 import BeautifulSoup
 11 | 
 12 | from income import SalaryEstimates
 13 | # The Required dictionaries to be used later on in the script.
 14 | Dict, profile, profile_company = {}, {}, {}
 15 | # Lists
 16 | location_list = []
 17 | company_list = []
 18 | job_title = []
 19 | 
 20 | 
 21 | def allow():
 22 | 
 23 |     '''
 24 |     The purpose of this module will be to go to the robots.txt file and ask for access for  the parsing process
 25 |     to begin. If the robot.txt denies the required access, then the program will be halted and a message will be
 26 |     displayed explaining that we do not have the rights to parse the site.
 27 |     '''
 28 | 
 29 |     rp = robotparser.RobotFileParser()
 30 |     rp.set_url("http://www.indeed.com/robots.txt")
 31 |     rp.read()
 32 |     access = rp.can_fetch("*","http://www.indeed.com/jobs?q=python+analyst&l=CA" )
 33 |     return access
 34 | 
 35 | 
 36 | class Parser:
 37 |     def __init__(self):
 38 |         pass
 39 | 
 40 |     def data_parse(self, x, soup, job):
 41 | 
 42 |         '''
 43 | 
 44 |         :param x: x is the boolean parameter that is passed from the previous query with the robots.txt file./
 45 |          Once passed        then the first process will be to validate the required access, displaying a message/
 46 |             if it does not agree.
 47 |         :param job: (str) a list of jobs will be passed to determine which one has the most amount of jobs possible.
 48 |         :param soup: bs4 object
 49 |         :return: The process will return a Pandas DataFrame table that displays the quantity of jobs available/
 50 |          per query. For instance, Python engineer- 10,000 jobs available, Python Analyst- 5000 jobs e.t.c.
 51 | 
 52 |         '''
 53 | 
 54 |         # To check for access of the scraping process.
 55 |         if not x:
 56 |             print "defies the site rules"
 57 |             quit()
 58 | 
 59 |         # Once access is granted then the process starts parsing the data by first comparing the number/
 60 |         # of jobs available and returning the facts and figures.
 61 |         for text in soup.find_all("div", id="searchCount"):
 62 |             self.data = str(text.get_text()[16:]).replace(",","")
 63 |             job2 = job.replace("+", " ")
 64 |             Dict[job2] = self.data
 65 | 
 66 |         self.df = pd.DataFrame(Dict.items(), columns=['jobs', 'number of openings'])
 67 |         self.df = self.df.apply(pd.to_numeric, errors='ignore')
 68 |         return self.df
 69 | 
 70 |     def graph_parsed_data(self, username, api_key):
 71 | 
 72 |         '''
 73 |          At this process the program will access the Plotly api and graph the features that/
 74 |         were given by the first parsing process. The attributes that it will take in will be Api Username,
 75 |         ApiPassword or api_key
 76 |         :param username: this accesses is the api Username that you initially added in the first process.
 77 |         :param api_key: this is the api key that you receive after registration.
 78 |         :return: Final graph
 79 |         '''
 80 | 
 81 |         tls.set_credentials_file(username=username, api_key=api_key)
 82 |         data = [
 83 |             go.Scatter(
 84 |                 x=self.df['jobs'], # assign x as the dataframe column 'x'
 85 |                 y=self.df['number of openings']
 86 | 
 87 |             )
 88 |         ]
 89 |         final_graph = py.plot(data, filename='pandas/basic-bar')
 90 |         return final_graph
 91 | 
 92 | 
 93 | class TextParser:
 94 | 
 95 |     def __init__(self):
 96 |         pass
 97 | 
 98 |     def listed_jobs(self, jobs, soup):
 99 | 
100 |         for post in soup.find_all("div", {"class":"  row  result"}):
101 |             # job title
102 |             jobs = post.find_all("a", {"class": "turnstileLink"})
103 | 
104 |             job_contents = (job.get_text(' ', strip=True) for job in jobs)
105 |             job_title.append(job_contents)
106 |             #           company Name
107 |             companies = post.find_all("span", {"itemprop":"name"})
108 |             company_content = (company.get_text(' ', strip=True) for company in companies)
109 |             company_list.append(company_content)
110 |             #           location
111 |             locations = post.find_all("span", {"itemprop":"addressLocality"})
112 |             locality = (location.get_text(' ', strip=True) for location in locations)
113 |             location_list.append(locality)
114 |         # return location_list
115 |         profile["Job Title"] =(list(itertools.chain.from_iterable(job_title)))
116 |         profile["Location"] = (list(itertools.chain.from_iterable(location_list)))
117 |         profile_company["Company"] = (list(itertools.chain.from_iterable(company_list)))
118 | 
119 |         # Turning the list into a panda DataFrame which will have 3 columns. These columns/
120 |         # include jobtitle, job location, and Company
121 |         df3 = pd.DataFrame(profile_company)
122 |         df4 = pd.DataFrame(profile).join(df3, how='left')
123 |         return df4
124 | 
125 | 
# main function that drives all the other parts of the script
def main(jobs):

    '''
    Scrape the search-result page of every requested job, print what was
    found and graph the results on Plotly.

    For each job the listed postings and the salary estimates are printed.
    After the last job the table of openings per job is printed and two
    Plotly graphs are produced: the openings per job, and the salary
    estimates of the last job processed.

    :param jobs: list of job queries (str), e.g. ["python analyst", "python"].
        Nothing is done when the list is empty.
    :return: None
    '''

    # Without any job there is nothing to fetch, print or graph.
    if not jobs:
        print("no jobs were requested")
        return

    # Consult robots.txt once, before any page is downloaded.
    allowance = allow()
    if not allowance:
        print("defies the site rules")
        raise SystemExit(1)

    username = raw_input("please enter your Plotly Username: \n")
    api_key = raw_input("please enter your Plotly Api Key: \n")
    state = "CA"
    for job in jobs:
        # Words of a query are joined with "+" so the URL contains no spaces.
        query = "+".join(str(job).split())
        url = "http://www.indeed.com/jobs?q=" + query + "&l=" + str(state) + "&rq=1&fromage=last"
        response = u.urlopen(url)
        try:
            page = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        soup = BeautifulSoup(page, "html.parser")

        # Declaring the classes that are used sequentially
        parser = Parser()
        text = TextParser()
        salary = SalaryEstimates()

        # functions that are present in these classes respectively
        items = parser.data_parse(allowance, soup, job)

        final = text.listed_jobs(job, soup)
        print("-~" * 50)
        print("-~" * 50)
        print("the requested job was %s" % job)
        print("-~" * 50)
        print(final)
        print("-~" * 50)
        print("-~" * 50)
        print("the requested job salary was " + job + " salary")
        print("-~" * 50)
        # salary_parser prints the salary table itself and keeps it on the
        # object for the graph drawn after the loop.
        salary.salary_parser(soup)
    print("\n")
    print("-~" * 50)
    print("-~" * 50)
    print("The total number of jobs in each field is")
    print("-~" * 50)
    print(items)
    # compares the total number of jobs visually on Plotly
    parser.graph_parsed_data(username, api_key)

    # runs the salary graph on Plotly (for the last job of the loop)
    salary.graphing_salary(username, api_key)
170 | 
# Run the scraper only when the file is executed as a script, so that
# importing the module does not start network requests or prompt for input.
if __name__ == "__main__":
    main(["python analyst", "civil engineer", "python"])
172 | 


--------------------------------------------------------------------------------
/income.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import pandas as pd
 3 | import plotly.plotly as py
 4 | import plotly.graph_objs as go
 5 | import plotly.tools as tls
 6 | 
 7 | 
 8 | class SalaryEstimates:
 9 |     def __init__(self):
10 |         pass
11 | 
12 |     def salary_parser(self, soup):
13 |         '''
14 |         :param soup: beautiful soup object defined at the main module.
15 |         :return: returns a pandas DataFrame
16 |         '''
17 | 
18 |         rx = re.compile('([+(),])')
19 |         for post in soup.find_all("ul", {"class":"rbList"}):
20 | 
21 |             figures = post.get_text(' ', strip=True)
22 |             figures = list(rx.sub(r'', figures).replace(' ', ', ').split())
23 | 
24 |             quantity = [elem.replace(',','') for elem in figures if '$' not in elem]
25 | 
26 |             salary = [elem.replace('$', '').replace(',','') for elem in figures if '$' in elem]
27 | 
28 |             d = {'Salary from jobs': pd.Series(salary, index=['a', 'b', 'c','d','e']),
29 |                  'Quantity': pd.Series(quantity, index=['a', 'b', 'c', 'd', 'e'])}
30 |             self.df5 = pd.DataFrame(d)
31 |             self.df5 = self.df5.apply(pd.to_numeric, errors='coerce')
32 |             print self.df5
33 |             df5_median =self.df5['Salary from jobs'].median()
34 |             df5_mean = self.df5['Salary from jobs'].mean()
35 | 
36 |             print "The median for this job is:", df5_median
37 |             print "The mean for this job is:", df5_mean
38 |             return self.df5
39 | 
40 |     def graphing_salary(self, username, api_key):
41 |         '''
42 | 
43 |         :param username: str. This is the Plotly api username that you gave beforehand
44 |         :param api_key: str. Plotly api_key
45 |         :return: graphical output of the job selected.
46 |         '''
47 | 
48 |         # authorizing the user Plotly credentials
49 |         tls.set_credentials_file(username=username, api_key=api_key)
50 | 
51 |         # creating a Plotly scatter object
52 |         data = [
53 |             go.Scatter(
54 |                 x=self.df5['Quantity'],
55 |                 y=self.df5['Salary from jobs']
56 | 
57 |             )
58 |         ]
59 |         final_graph = py.plot(data, filename='pandas/basic-bar')
60 |         return final_graph
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------