├── README.md
└── job_scraper.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # job-scraping-python
2 | A Python 3 and BeautifulSoup notebook that scrapes job listings from popular job sites and filters them.
3 | 


--------------------------------------------------------------------------------
/job_scraper.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
   7 |     "# A simple scraper that fetches job postings and checks them for specific strings"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "from bs4 import BeautifulSoup\n",
 17 |     "import urllib.request as ur\n",
 18 |     "import re\n",
 19 |     "import sys"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 5,
 25 |    "metadata": {},
 26 |    "outputs": [
 27 |     {
 28 |      "data": {
 29 |       "text/plain": [
 30 |        "False"
 31 |       ]
 32 |      },
 33 |      "execution_count": 5,
 34 |      "metadata": {},
 35 |      "output_type": "execute_result"
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "red_flags = [\"senior\", \"intern\", \"contract\", \"staff\"] #List of words to avoid in job title\n",
 40 |     "#required = [\"software\"] #Can also check for required words\n",
 41 |     "\n",
 42 |     "def qualifies(title):\n",
 43 |     "    \"\"\"Return True when none of the red_flags words occurs in the job title (case-insensitive).\"\"\"\n",
 44 |     "    lowered = title.lower()\n",
 45 |     "    # Plain substring test, so \"intern\" also rejects a title such as \"Internship\".\n",
 46 |     "    hits = (flag in lowered for flag in red_flags)\n",
 47 |     "    return not any(hits)\n",
 48 |     "\n",
 49 |     "# Quick check: this title contains \"senior\", so the cell displays False.\n",
 50 |     "qualifies(\"Senior Software Engineer\")"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 15,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "<_sre.SRE_Match object; span=(0, 8), match='2+ Years'> \n",
 63 |       " None\n"
 64 |      ]
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "# Now define the Regex (raw strings, so the backslashes reach the regex engine untouched), \n",
 69 |     "# 1. Flags a requirement of 2 or more years: \"2+ years\", \"1-2 Years\", \"10 yrs\"; \"0-1 Year\" stays allowed.\n",
 70 |     "p1 = re.compile(r'(?:[1-9]\\d+|[2-9])\\s*\\+?-?\\s*[1-9]?\\s*[yY]e?a?[rR][Ss]?')\n",
 71 |     "# 2. Should not have mention of \"Citizenship\", \"Citizens\", so on..\n",
 72 |     "p2 = re.compile(r'[Cc]itizens?(ship)?')\n",
 73 |     "\n",
 74 |     "t1 = p1.search(\"2+ Years of experiencce\")\n",
 75 |     "t2 = p1.search(\"0-1 Year\")\n",
 76 |     "print (t1, \"\\n\",t2)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 23,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "data": {
 86 |       "text/plain": [
 87 |        "'Ready.'"
 88 |       ]
 89 |      },
 90 |      "execution_count": 23,
 91 |      "metadata": {},
 92 |      "output_type": "execute_result"
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "#The first page with search results\n",
 97 |     "url_base = \"base url here.. \"\n",
 98 |     "pgno = 0\n",
 99 |     "try:\n",
 100 |     "    with ur.urlopen(url_base+str(pgno)) as response:  # closes the connection once the body is read\n",
 101 |     "        html_doc = response.read()\n",
 102 |     "except Exception:  # unlike a bare except, this does not swallow a kernel interrupt\n",
 103 |     "    print(\"URL not accesible\")\n",
 104 |     "    raise  # abort this cell only; exit() acts on the whole kernel session instead\n",
 105 |     "soup = BeautifulSoup(html_doc, 'html.parser')\n",
 106 |     "\"Ready.\""
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 30,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "370\n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "try:\n",
124 |     "    total_results = soup.find(id=\"searchCount\").get_text()\n",
125 |     "    last_page = int(int(total_results[total_results.index(\"of\")+2: total_results.index(\"jobs\")].strip()) / 10) * 10\n",
126 |     "    print(last_page)\n",
127 |     "except:\n",
128 |     "    print (\"No jobs found\")"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 34,
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "name": "stdout",
138 |      "output_type": "stream",
139 |      "text": [
140 |       "Software Engineer, University New Graduate , Houzz : http://www.indeed.com/rc/clk?jk=b13f83b8083235b1&fccid=cd808c272e6d956a \n",
141 |       "\n",
142 |       "Software Engineer , LiveAction : http://www.indeed.com/rc/clk?jk=02ecac2491ae61cf&fccid=ace8b9c0b5f18b27 \n",
143 |       "\n",
144 |       "Java Developer (Full Time) , Ezen computer services : http://www.indeed.com/company/Ezen-computer-services-INC/jobs/Java-Developer-c84007a1deb5b12d?fccid=8db39aaae3efa27e \n",
145 |       "\n",
146 |       "Software Engineer, Cloud Platform - UI , Toyota Research Institute : http://www.indeed.com/rc/clk?jk=d16fb14301cdff9f&fccid=e490ccf806951166 \n",
147 |       "\n",
148 |       "Dev-Ops / Java Software Engineer , Paypal : http://www.indeed.com/rc/clk?jk=31fe013ea2b5fe01&fccid=978d9fd9799d55a8 \n",
149 |       "\n",
150 |       "Jr. Web Application Developer (AMRD2017) , Fortinet : http://www.indeed.com/rc/clk?jk=eb7835743224b1db&fccid=28d99c55a9ebe7b6 \n",
151 |       "\n"
152 |      ]
153 |     }
154 |    ],
155 |    "source": [
156 |     "jobs_per_page = 10\n",
157 |     "goodlinks = []\n",
158 |     "for pgno in range(0,last_page,jobs_per_page):\n",
159 |     "    if pgno > 0:\n",
160 |     "        try:\n",
161 |     "            response = ur.urlopen(url_base+str(pgno))\n",
162 |     "            html_doc = response.read()\n",
163 |     "        except:\n",
164 |     "            break;\n",
165 |     "        soup = BeautifulSoup(html_doc, 'html.parser')\n",
166 |     "    for job in soup.find_all(class_='result'):\n",
167 |     "        link = job.find(class_=\"turnstileLink\")\n",
168 |     "        try:\n",
169 |     "            jt = link.get('title')\n",
170 |     "        except:\n",
171 |     "            jt = \"\"\n",
172 |     "        try:\n",
173 |     "            comp = job.find(class_='company').get_text().strip()\n",
174 |     "        except:\n",
175 |     "            comp = \"\"\n",
176 |     "\n",
177 |     "        if(qualifies(jt.lower())):\n",
178 |     "            toVisit = \"http://www.indeed.com\"+link.get('href')\n",
179 |     "            try:\n",
180 |     "                html_doc = ur.urlopen(toVisit).read().decode('utf-8')\n",
181 |     "            except:\n",
182 |     "                continue;\n",
183 |     "            m = p1.search(html_doc)\n",
184 |     "            n = p2.search(html_doc)\n",
185 |     "            if not m and not n:\n",
186 |     "                print(jt,\",\",comp,\":\",toVisit,\"\\n\")\n",
187 |     "                goodlinks.append(toVisit)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {},
193 |    "source": [
 194 |     "And that's all! The notebook is easy to customize for any kind of task."
195 |    ]
196 |   }
197 |  ],
198 |  "metadata": {
199 |   "kernelspec": {
200 |    "display_name": "Python 3",
201 |    "language": "python",
202 |    "name": "python3"
203 |   },
204 |   "language_info": {
205 |    "codemirror_mode": {
206 |     "name": "ipython",
207 |     "version": 3
208 |    },
209 |    "file_extension": ".py",
210 |    "mimetype": "text/x-python",
211 |    "name": "python",
212 |    "nbconvert_exporter": "python",
213 |    "pygments_lexer": "ipython3",
214 |    "version": "3.6.3"
215 |   }
216 |  },
217 |  "nbformat": 4,
218 |  "nbformat_minor": 2
219 | }
220 | 


--------------------------------------------------------------------------------