# -*- coding: utf-8 -*-
# Repository layout:
# ├── README.md
# ├── Scraps.py
# └── income.py
#
# /README.md:
# --------------------------------------------------------------------------------
# # Job-Parser
# small web parser that gets all the top jobs and visualizes the various salaries for each position
#
# /Scraps.py:
# --------------------------------------------------------------------------------
'''
Scraps.py: fetch Indeed search result pages for a list of job queries, print
the parsed postings / salary estimates / opening counts, and chart them on
Plotly.
'''
import contextlib
import itertools
import re
import sys

try:  # Python 2 module names (what this script was written against)
    import robotparser
    import urllib2 as u
    from urllib import quote_plus
except ImportError:  # Python 3 equivalents
    import urllib.robotparser as robotparser
    import urllib.request as u
    from urllib.parse import quote_plus

import pandas as pd

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls

from bs4 import BeautifulSoup

from income import SalaryEstimates

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# The Required dictionaries to be used later on in the script.
# Dict maps "job query" -> "number of openings" (as a digit string) and is
# shared by every Parser instance, so it accumulates across queries.
Dict, profile, profile_company = {}, {}, {}
# Lists. Kept only so that existing importers of these names keep working;
# listed_jobs() no longer accumulates into them.
location_list = []
company_list = []
job_title = []

_ROBOTS_URL = "http://www.indeed.com/robots.txt"
_DEFAULT_CHECKED_URL = "http://www.indeed.com/jobs?q=python+analyst&l=CA"
_DENIED_MESSAGE = "defies the site rules"


def _abort_if_denied(allowed):
    '''Print the refusal message and stop the program when access is denied.'''
    if not allowed:
        print(_DENIED_MESSAGE)
        # sys.exit raises SystemExit exactly like the interactive-only quit().
        sys.exit()


def allow(url=_DEFAULT_CHECKED_URL):
    '''
    Ask the site's robots.txt whether a generic crawler ("*") may fetch a URL.

    :param url: (str) the URL to check. Defaults to the sample search URL the
        script has always checked, so calling allow() with no argument behaves
        as before.
    :return: (bool) True when robots.txt permits fetching url.
    '''
    rp = robotparser.RobotFileParser()
    rp.set_url(_ROBOTS_URL)
    rp.read()
    return rp.can_fetch("*", url)


class Parser:
    '''Collects the number of openings per job query and charts the totals.'''

    def __init__(self):
        self.data = None  # digit string of the most recently parsed count
        self.df = None    # DataFrame built by data_parse()

    def data_parse(self, x, soup, job):
        '''
        Record how many openings the result page reports for job.

        :param x: (bool) result of the robots.txt check. When false the
            message "defies the site rules" is printed and the program exits.
        :param soup: bs4 object of the search result page.
        :param job: (str) the query that produced soup; "+" separators are
            shown as spaces in the table.
        :return: pandas DataFrame with the columns 'jobs' and
            'number of openings', one row per query recorded so far in the
            module-level Dict.
        '''
        _abort_if_denied(x)

        label = job.replace("+", " ")
        for node in soup.find_all("div", id="searchCount"):
            # The total is the last number in the element's text, e.g.
            # "Jobs 1 to 10 of 1,234". Taking the last number instead of a
            # fixed character offset keeps working when the leading part of
            # the text has a different length.
            numbers = re.findall(r"\d[\d,]*", node.get_text())
            if not numbers:
                continue
            self.data = numbers[-1].replace(",", "")
            Dict[label] = self.data

        frame = pd.DataFrame(list(Dict.items()),
                             columns=['jobs', 'number of openings'])
        frame['number of openings'] = pd.to_numeric(
            frame['number of openings'], errors='coerce')
        self.df = frame
        return self.df

    def graph_parsed_data(self, username, api_key, filename='pandas/basic-bar'):
        '''
        Upload a Plotly scatter chart of openings per job query.

        :param username: (str) Plotly API username.
        :param api_key: (str) Plotly API key.
        :param filename: (str) name the chart is stored under on Plotly.
        :return: whatever plotly.plotly.plot returns for the uploaded chart.
        :raises ValueError: when data_parse() has not been called yet.
        '''
        if self.df is None:
            raise ValueError("data_parse() must be called before graph_parsed_data()")

        tls.set_credentials_file(username=username, api_key=api_key)
        data = [
            go.Scatter(
                x=self.df['jobs'],
                y=self.df['number of openings']
            )
        ]
        final_graph = py.plot(data, filename=filename)
        return final_graph


class TextParser:
    '''Extracts the individual postings listed on a search result page.'''

    def __init__(self):
        pass

    @staticmethod
    def _first_text(post, name, attrs):
        '''Return the text of the first matching tag inside post, or None.'''
        node = post.find(name, attrs)
        if node is None:
            return None
        return node.get_text(' ', strip=True)

    def listed_jobs(self, jobs, soup):
        '''
        Build a table of the postings found in soup.

        :param jobs: unused; kept so existing callers keep working.
        :param soup: bs4 object of the search result page.
        :return: pandas DataFrame with the columns 'Job Title', 'Location'
            and 'Company', one row per posting. A field that is missing from
            a posting is None, so the three columns always stay aligned.
        '''
        rows = []
        # NOTE: the leading space in " row result" is deliberate; it is the
        # selector the script has always used.
        for post in soup.find_all("div", {"class": " row result"}):
            rows.append((
                self._first_text(post, "a", {"class": "turnstileLink"}),
                self._first_text(post, "span", {"itemprop": "addressLocality"}),
                self._first_text(post, "span", {"itemprop": "name"}),
            ))

        table = pd.DataFrame(rows, columns=["Job Title", "Location", "Company"])

        # Keep the module-level dictionaries describing the latest page.
        profile["Job Title"] = list(table["Job Title"])
        profile["Location"] = list(table["Location"])
        profile_company["Company"] = list(table["Company"])
        return table


# main module that manages all the other modules in the script
def main(jobs):
    '''
    Run the whole pipeline for every query in jobs.

    Prompts for Plotly credentials, then for each query: checks robots.txt,
    downloads the result page, prints the postings, the salary estimates and
    the running table of opening counts. Afterwards both charts are uploaded.

    :param jobs: iterable of (str) queries; words may be separated by spaces
        or by "+".
    '''
    username = _read_line("please enter your Plotly Username: \n")
    api_key = _read_line("please enter your Plotly Api Key: \n")
    state = "CA"
    banner = "-~" * 50
    parser = None
    salary = None

    for job in jobs:
        # Encode the query so that spaces (or any other reserved character)
        # cannot produce an invalid URL; "+" is treated as a word separator.
        query = quote_plus(str(job).replace("+", " "))
        url = "http://www.indeed.com/jobs?q=" + query + "&l=" + str(state) + "&rq=1&fromage=last"

        # Consult robots.txt for this very URL *before* requesting the page.
        allowance = allow(url)
        _abort_if_denied(allowance)

        with contextlib.closing(u.urlopen(url)) as response:
            soup = BeautifulSoup(response.read(), "html.parser")

        # Declaring the classes that have been used sequentially
        parser = Parser()
        text = TextParser()
        salary = SalaryEstimates()

        # functions that are present in these classes respectively
        items = parser.data_parse(allowance, soup, job)
        final = text.listed_jobs(job, soup)

        print(banner)
        print(banner)
        print("the requested job was %s" % job)
        print(banner)
        print(final)
        print(banner)
        print(banner)
        print("the requested job salary was " + job + " salary")
        print(banner)
        salary.salary_parser(soup)  # prints its own table and statistics
        print("\n")
        print(banner)
        print(banner)
        print("The total number of jobs in each field is")
        print(banner)
        print(items)

    if parser is None:
        # No queries were supplied, so there is nothing to chart.
        return

    # NOTE(review): the charts are uploaded once, after all queries; the
    # salary chart therefore covers the last query only. Confirm this matches
    # the intended flow.
    # compares the total number of jobs visually on Plotly. It gets its own
    # file name because the salary chart is stored as 'pandas/basic-bar' and
    # would otherwise replace it.
    parser.graph_parsed_data(username, api_key, filename='pandas/job-openings')

    # runs the salary graph on Plotly
    salary.graphing_salary(username, api_key)


if __name__ == "__main__":
    main(["python analyst", "civil engineer", "python"])

# /income.py:
# --------------------------------------------------------------------------------
import re
import pandas as pd
import plotly.plotly as py
try:
    import plotly.graph_objs as go
    import plotly.tools as tls
except ImportError:  # only graphing_salary() uses these two modules
    go = tls = None


class SalaryEstimates:
    '''Parses the salary estimate list of a result page and charts it.'''

    # Characters removed from the raw text before it is split into tokens.
    _NOISE = re.compile(r'[+(),]')
    _COLUMNS = ['Salary from jobs', 'Quantity']

    def __init__(self):
        self.df5 = None  # DataFrame built by salary_parser()

    def salary_parser(self, soup):
        '''
        Extract the salary figures and their job counts from soup.

        The first <ul class="rbList"> whose text contains "$" amounts is
        used; lists without any "$" token are skipped. The table, its median
        and its mean are printed.

        :param soup: beautiful soup object defined at the main module.
        :return: pandas DataFrame with the numeric columns
            'Salary from jobs' and 'Quantity', indexed 'a', 'b', 'c', ...
            It is empty when no salary figures are found.
        '''
        frame = pd.DataFrame(columns=self._COLUMNS)

        for post in soup.find_all("ul", {"class": "rbList"}):
            text = post.get_text(' ', strip=True)
            tokens = self._NOISE.sub('', text).split()

            salary = [tok.replace('$', '') for tok in tokens if '$' in tok]
            if not salary:
                continue
            quantity = [tok for tok in tokens if '$' not in tok]

            # Size the index to the data instead of assuming exactly five
            # entries; a shorter column is padded with NaN.
            size = max(len(salary), len(quantity))
            labels = [chr(ord('a') + i) for i in range(size)]
            frame = pd.DataFrame(
                {
                    'Salary from jobs': pd.Series(
                        salary, index=labels[:len(salary)], dtype=object),
                    'Quantity': pd.Series(
                        quantity, index=labels[:len(quantity)], dtype=object),
                },
                index=labels,
                columns=self._COLUMNS,
            )
            frame = frame.apply(pd.to_numeric, errors='coerce')
            break

        self.df5 = frame
        if frame.empty:
            print("No salary estimates were found for this job")
            return self.df5

        print(self.df5)
        print("The median for this job is: %s" % self.df5['Salary from jobs'].median())
        print("The mean for this job is: %s" % self.df5['Salary from jobs'].mean())
        return self.df5

    def graphing_salary(self, username, api_key, filename='pandas/basic-bar'):
        '''
        Upload a Plotly scatter chart of salary against number of jobs.

        :param username: str. This is the Plotly api username that you gave beforehand
        :param api_key: str. Plotly api_key
        :param filename: str. Name the chart is stored under on Plotly.
        :return: whatever plotly.plotly.plot returns for the uploaded chart.
        :raises ValueError: when salary_parser() has not been called yet.
        :raises ImportError: when the Plotly modules could not be imported.
        '''
        if self.df5 is None:
            raise ValueError("salary_parser() must be called before graphing_salary()")
        if go is None or tls is None:
            raise ImportError("plotly is required to graph the salary estimates")

        # authorizing the user Plotly credentials
        tls.set_credentials_file(username=username, api_key=api_key)

        # creating a Plotly scatter object
        data = [
            go.Scatter(
                x=self.df5['Quantity'],
                y=self.df5['Salary from jobs']
            )
        ]
        final_graph = py.plot(data, filename=filename)
        return final_graph