├── README.md └── job_scraper.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # job-scraping-python 2 | Python 3 and BeautifulSoup to process job listings on popular websites. 3 | -------------------------------------------------------------------------------- /job_scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A Simple Scraper which accesses Job Postings and checks for specific strings" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from bs4 import BeautifulSoup\n", 17 | "import urllib.request as ur\n", 18 | "import re\n", 19 | "import sys" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 5, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "False" 31 | ] 32 | }, 33 | "execution_count": 5, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "red_flags = [\"senior\", \"intern\", \"contract\", \"staff\"] #List of words to avoid in job title\n", 40 | "#required = [\"software\"] #Can also check for required words\n", 41 | "\n", 42 | "def qualifies(title):\n", 43 | " title = title.lower()\n", 44 | " #Define a function to check if a job title is worth checking out \n", 45 | " for word in red_flags:\n", 46 | " if word in title: return False\n", 47 | " return True\n", 48 | "\n", 49 | "#test:\n", 50 | "qualifies(\"Senior Software Engineer\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 15, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "<_sre.SRE_Match object; span=(0, 8), match='2+ Years'> \n", 63 | " None\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# Now define the Regex, \n", 69 | "# 1. Should not have the phrase 1+ years, 1-2 Years, so on..\n", 70 | "p1 = re.compile('[2-9]\\s*\\+?-?\\s*[1-9]?\\s*[yY]e?a?[rR][Ss]?')\n", 71 | "# 2. Should not have mention of \"Citizenship\", \"Citizens\", so on..\n", 72 | "p2 = re.compile('[Cc]itizens?(ship)?')\n", 73 | "\n", 74 | "t1 = p1.search(\"2+ Years of experiencce\")\n", 75 | "t2 = p1.search(\"0-1 Year\")\n", 76 | "print (t1, \"\\n\",t2)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 23, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "'Ready.'" 88 | ] 89 | }, 90 | "execution_count": 23, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "#The first page with search results\n", 97 | "url_base = \"base url here.. \"\n", 98 | "pgno = 0\n", 99 | "try:\n", 100 | " response = ur.urlopen(url_base+str(pgno))\n", 101 | " html_doc = response.read()\n", 102 | "except:\n", 103 | " print(\"URL not accesible\")\n", 104 | " exit();\n", 105 | "soup = BeautifulSoup(html_doc, 'html.parser')\n", 106 | "\"Ready.\"" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 30, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "370\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "try:\n", 124 | " total_results = soup.find(id=\"searchCount\").get_text()\n", 125 | " last_page = int(int(total_results[total_results.index(\"of\")+2: total_results.index(\"jobs\")].strip()) / 10) * 10\n", 126 | " print(last_page)\n", 127 | "except:\n", 128 | " print (\"No jobs found\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 34, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "Software Engineer, University New Graduate , Houzz : http://www.indeed.com/rc/clk?jk=b13f83b8083235b1&fccid=cd808c272e6d956a \n", 141 | "\n", 142 | "Software Engineer , LiveAction : http://www.indeed.com/rc/clk?jk=02ecac2491ae61cf&fccid=ace8b9c0b5f18b27 \n", 143 | "\n", 144 | "Java Developer (Full Time) , Ezen computer services : http://www.indeed.com/company/Ezen-computer-services-INC/jobs/Java-Developer-c84007a1deb5b12d?fccid=8db39aaae3efa27e \n", 145 | "\n", 146 | "Software Engineer, Cloud Platform - UI , Toyota Research Institute : http://www.indeed.com/rc/clk?jk=d16fb14301cdff9f&fccid=e490ccf806951166 \n", 147 | "\n", 148 | "Dev-Ops / Java Software Engineer , Paypal : http://www.indeed.com/rc/clk?jk=31fe013ea2b5fe01&fccid=978d9fd9799d55a8 \n", 149 | "\n", 150 | "Jr. Web Application Developer (AMRD2017) , Fortinet : http://www.indeed.com/rc/clk?jk=eb7835743224b1db&fccid=28d99c55a9ebe7b6 \n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "jobs_per_page = 10\n", 157 | "goodlinks = []\n", 158 | "for pgno in range(0,last_page,jobs_per_page):\n", 159 | " if pgno > 0:\n", 160 | " try:\n", 161 | " response = ur.urlopen(url_base+str(pgno))\n", 162 | " html_doc = response.read()\n", 163 | " except:\n", 164 | " break;\n", 165 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 166 | " for job in soup.find_all(class_='result'):\n", 167 | " link = job.find(class_=\"turnstileLink\")\n", 168 | " try:\n", 169 | " jt = link.get('title')\n", 170 | " except:\n", 171 | " jt = \"\"\n", 172 | " try:\n", 173 | " comp = job.find(class_='company').get_text().strip()\n", 174 | " except:\n", 175 | " comp = \"\"\n", 176 | "\n", 177 | " if(qualifies(jt.lower())):\n", 178 | " toVisit = \"http://www.indeed.com\"+link.get('href')\n", 179 | " try:\n", 180 | " html_doc = ur.urlopen(toVisit).read().decode('utf-8')\n", 181 | " except:\n", 182 | " continue;\n", 183 | " m = p1.search(html_doc)\n", 184 | " n = p2.search(html_doc)\n", 185 | " if not m and not n:\n", 186 | " print(jt,\",\",comp,\":\",toVisit,\"\\n\")\n", 187 | " goodlinks.append(toVisit)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "And that's all, very easy to customize for any kind of task." 195 | ] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.3" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 2 219 | } 220 | --------------------------------------------------------------------------------