├── Data Modelling ├── Doc2Vec_Model.ipynb ├── Pickle_testing.ipynb └── README.md ├── Data ├── Course_Data │ ├── Coursera_Catalog.csv │ └── Coursera_Catalog_Request.ipynb ├── Job_Data │ ├── Glassdoor_Joblist.csv │ ├── Glassdoor_Joblist_Integration.ipynb │ └── Raw │ │ ├── Data_Job_NY.csv │ │ ├── Data_Job_SF.csv │ │ ├── Data_Job_TX.csv │ │ └── Data_Job_WA.csv └── README.md ├── Exploratory Data Analysis ├── Create_Test_Set.ipynb ├── Job_Posts_EDA.ipynb ├── README.md ├── courses_test_sample.csv └── jobs_test_sample.csv ├── LICENSE ├── Other ├── Course_webpages.ipynb ├── Coursera_data_collection.ipynb ├── Coursera_review_data.ipynb ├── Coursetalk_data.ipynb ├── Indeed_data.ipynb └── coursera_description.ipynb ├── Procfile ├── README.md ├── app.py ├── model.p ├── requirements.txt ├── static └── css │ └── styles.css └── templates ├── form.html └── results.html /Data Modelling/Pickle_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook checks to make sure the pickled model is working properly." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 12, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pickle\n", 17 | "import gensim\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Load the pickled model from disk\n", 28 | "model = pickle.load(open('model.p', 'rb'))" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Select a sample job description\n", 38 | "js = \"'\\nData Scientist\\n\\nat Brightidea\\n\\nSan Francisco\\n\\nThe Role\\n\\nWe are seeking machine learning developers with natural language processing experience.\\n\\nIn general, we are looking for people who are self-motivated and passionate about the field of machine learning and the vast applications of it. These folks will have the ability to work with / understand / and build on top of an existing code base using their deep knowledge of various machine learning algorithms (e.g. neural networks, bayesian methods, etc).\\n\\nKey responsibilities include, but not limited to:\\n\\n\\nBuild on top of an existing text processing/classification system\\nWrite, maintain, and develop python machine learning modules & repos\\nRun hyperparameter optimizations + collect, analyze, visualize, and present results\\n\\nWhat You Need to Succeed\\n\\nBS or MS in computer science, mathematics, physics or other hard science/engineering discipline\\nProgramming in Python ~ 2+ years\\nNumpy, scipy, pandas, Jupyter, and scikit-learn background\\nData visualization (e.g. 
matplotlib, seaborn, bokeh, mpld3, etc)\\nAbility to implement machine learning algorithms from scratch\\nExperience with full machine learning pipeline: from data preprocessing, to building/training various models, to hyperparameter optimization, testing, and visualization of results.\\nBackground in deep learning preferred but not required\\n\\nIn Your Application Please Include\\n\\n\\n\\nA past machine learning project you worked on in which highlights your skills, including: What tools/models did you use? What were some problems you encountered along the way, and how did you solve them?\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Preprocess the job description\n", 48 | "doc = gensim.utils.simple_preprocess(js)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Vectorize the job description\n", 58 | "vector = model.infer_vector(doc)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 10, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "[(441, 0.5915085673332214),\n", 70 | " (1231, 0.5760758519172668),\n", 71 | " (2849, 0.5734542012214661),\n", 72 | " (3976, 0.5609011650085449),\n", 73 | " (1634, 0.5435008406639099),\n", 74 | " (1074, 0.5411732792854309),\n", 75 | " (3298, 0.5391528010368347),\n", 76 | " (18, 0.5345658659934998),\n", 77 | " (4269, 0.5208688378334045),\n", 78 | " (1656, 0.5193619728088379)]" 79 | ] 80 | }, 81 | "execution_count": 10, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "# Extract the most similar docs from the model\n", 88 | "sims = model.docvecs.most_similar([vector])\n", 89 | "sims" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 13, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/html": [ 100 
| "
\n", 101 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " courseType description \\\n", 163 | "0 v2.ondemand Gamification is the application of game elemen... \n", 164 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 165 | "\n", 166 | " domainTypes id \\\n", 167 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 168 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 169 | "\n", 170 | " slug specializations workload \\\n", 171 | "0 gamification [] 4-8 hours/week \n", 172 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 173 | "\n", 174 | " primaryLanguages certificates \\\n", 175 | "0 ['en'] ['VerifiedCert'] \n", 176 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 177 | "\n", 178 | " name \n", 179 | "0 Gamification \n", 180 | "1 Dealing With Missing Data " 181 | ] 182 | }, 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "# Read in the course data\n", 190 | "course_df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')\n", 191 | "course_df.head(2)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 14, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# Extract course ids from the similar doc list\n", 201 | "course_ids = [sim[0] for sim in sims]" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 15, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "441 Data Science Math Skills\n", 213 | "1231 Mathematics for economists\n", 214 | "2849 Scalable Machine Learning on Big Data using Ap...\n", 215 | "3976 Big Data Integration and Processing\n", 216 | "1634 Parallel Programming in Java\n", 217 | "1074 Tools for Data Science\n", 218 | "3298 Programming for Everybody (Getting Started wit...\n", 219 | "18 Computer Vision Basics\n", 220 | "4269 Disease Clusters\n", 221 | "1656 
業務効率や生産性向上につながる時間管理\n", 222 | "Name: name, dtype: object" 223 | ] 224 | }, 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "# Display the names of the most similar courses\n", 232 | "course_df.loc[course_ids, 'name']" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.7.3" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 4 264 | } 265 | -------------------------------------------------------------------------------- /Data Modelling/README.md: -------------------------------------------------------------------------------- 1 | # Data Modelling 2 | 3 | Data Modelling is implemented using a Doc2Vec model for matching course descriptions to job descriptions. It trains the model on a corpus of course descriptions (from the Coursera catalog). 4 | Then it evaluates the model by testing it out with a 5 | sample set of job descriptions for which relevant courses have already been pre-selected. Finally, the model is "pickled" for use by a Flask API. 
6 | 7 | The Libraries and Modules used for this purpose are: 8 | 9 | - [Numpy](https://numpy.org/) 10 | - [Pandas](https://pandas.pydata.org/) 11 | - [Gensim](https://pypi.org/project/gensim/) 12 | - [Scipy Spatial](https://docs.scipy.org/doc/scipy/reference/spatial.html) 13 | - [Pickle](https://docs.python.org/3/library/pickle.html) 14 | -------------------------------------------------------------------------------- /Data/Course_Data/Coursera_Catalog_Request.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Coursera Catalog Request\n", 8 | "\n", 9 | "This notebook requests the entire catalog of courses from the Coursera API, converts it into a dataframe, and exports it as a csv file." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 22, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import requests\n", 21 | "import time\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 35, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Request status for page 0 is 200.\n", 35 | "Request status for page 1 is 200.\n", 36 | "Request status for page 2 is 200.\n", 37 | "Request status for page 3 is 200.\n", 38 | "Request status for page 4 is 200.\n", 39 | "Request status for page 5 is 200.\n", 40 | "Request status for page 6 is 200.\n", 41 | "Request status for page 7 is 200.\n", 42 | "Request status for page 8 is 200.\n", 43 | "Request status for page 9 is 200.\n", 44 | "Request status for page 10 is 200.\n", 45 | "Request status for page 11 is 200.\n", 46 | "Request status for page 12 is 200.\n", 47 | "Request status for page 13 is 200.\n", 48 | "Request status for page 14 is 200.\n", 49 | "Request status for page 15 is 200.\n", 50 | "Request 
status for page 16 is 200.\n", 51 | "Request status for page 17 is 200.\n", 52 | "Request status for page 18 is 200.\n", 53 | "Request status for page 19 is 200.\n", 54 | "Request status for page 20 is 200.\n", 55 | "Request status for page 21 is 200.\n", 56 | "Request status for page 22 is 200.\n", 57 | "Request status for page 23 is 200.\n", 58 | "Request status for page 24 is 200.\n", 59 | "Request status for page 25 is 200.\n", 60 | "Request status for page 26 is 200.\n", 61 | "Request status for page 27 is 200.\n", 62 | "Request status for page 28 is 200.\n", 63 | "Request status for page 29 is 200.\n", 64 | "Request status for page 30 is 200.\n", 65 | "Request status for page 31 is 200.\n", 66 | "Request status for page 32 is 200.\n", 67 | "Request status for page 33 is 200.\n", 68 | "Request status for page 34 is 200.\n", 69 | "Request status for page 35 is 200.\n", 70 | "Request status for page 36 is 200.\n", 71 | "Request status for page 37 is 200.\n", 72 | "Request status for page 38 is 200.\n", 73 | "Request status for page 39 is 200.\n", 74 | "Request status for page 40 is 200.\n", 75 | "Request status for page 41 is 200.\n", 76 | "Request status for page 42 is 200.\n", 77 | "Request status for page 43 is 200.\n", 78 | "Request status for page 44 is 200.\n", 79 | "Finished. 
The number of courses gotten from the catalog is 4416\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# Get the entire Coursera catalog.\n", 85 | "\n", 86 | "# Instantiate a list to hold the courses\n", 87 | "courses = []\n", 88 | "\n", 89 | "# Set the base url for making get requests\n", 90 | "base_url = 'https://api.coursera.org/api/courses.v1'\n", 91 | "\n", 92 | "# Add the fields I want to include in my requests\n", 93 | "fields = \"&fields=description,primaryLanguages,certificates,workload,specializations,domainTypes\"\n", 94 | "\n", 95 | "# Loop through all 45 pages of the catalog\n", 96 | "for page in range(45):\n", 97 | " \n", 98 | " # set pagination\n", 99 | " pagination = f\"?start={page*100}&limit=100\"\n", 100 | "\n", 101 | " # make a request\n", 102 | " res = requests.get(base_url + pagination + fields)\n", 103 | " print(f'Request status for page {page} is {res.status_code}.')\n", 104 | " \n", 105 | " # convert from json\n", 106 | " dict = res.json()\n", 107 | " \n", 108 | " # add to the catalog dictionary\n", 109 | " for course in dict['elements']:\n", 110 | " courses.append(course)\n", 111 | " \n", 112 | " # delay time to next request\n", 113 | " time.sleep(2)\n", 114 | "\n", 115 | "print(f'Finished. The number of courses gotten from the catalog is {len(courses)}')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 36, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "(4416, 10)\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "text/html": [ 133 | "
\n", 134 | "\n", 147 | "\n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week[en][VerifiedCert]Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week[en][VerifiedCert, Specialization]Dealing With Missing Data
2v2.ondemandThe Unordered Data Structures course covers th...[{'domainId': 'computer-science', 'subdomainId...sI_-QEBiEemtDRLx7Ne8jgcs-fundamentals-3[][en][VerifiedCert, Specialization]Unordered Data Structures
3v2.ondemandThe vital signs – heart rate, blood pressure, ...[{'subdomainId': 'patient-care', 'domainId': '...5zjIsJq-EeW_wArffOXkOwvital-signs[]3-5 hours/week[en][VerifiedCert]Vital Signs: Understanding What the Body Is Te...
4v2.ondemandThis course “FinTech Disruptive Innovation: Im...[{'subdomainId': 'finance', 'domainId': 'busin...WFanvtoSEeedbRLwgi9a7Afintech-disruption[]Around 4 hours of videos in total, plus a fina...[en][VerifiedCert, Specialization]FinTech Disruptive Innovation: Implications fo...
\n", 231 | "
" 232 | ], 233 | "text/plain": [ 234 | " courseType description \\\n", 235 | "0 v2.ondemand Gamification is the application of game elemen... \n", 236 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 237 | "2 v2.ondemand The Unordered Data Structures course covers th... \n", 238 | "3 v2.ondemand The vital signs – heart rate, blood pressure, ... \n", 239 | "4 v2.ondemand This course “FinTech Disruptive Innovation: Im... \n", 240 | "\n", 241 | " domainTypes id \\\n", 242 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 243 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 244 | "2 [{'domainId': 'computer-science', 'subdomainId... sI_-QEBiEemtDRLx7Ne8jg \n", 245 | "3 [{'subdomainId': 'patient-care', 'domainId': '... 5zjIsJq-EeW_wArffOXkOw \n", 246 | "4 [{'subdomainId': 'finance', 'domainId': 'busin... WFanvtoSEeedbRLwgi9a7A \n", 247 | "\n", 248 | " slug specializations \\\n", 249 | "0 gamification [] \n", 250 | "1 missing-data [] \n", 251 | "2 cs-fundamentals-3 [] \n", 252 | "3 vital-signs [] \n", 253 | "4 fintech-disruption [] \n", 254 | "\n", 255 | " workload primaryLanguages \\\n", 256 | "0 4-8 hours/week [en] \n", 257 | "1 4 weeks of study, 1-2 hours/week [en] \n", 258 | "2 [en] \n", 259 | "3 3-5 hours/week [en] \n", 260 | "4 Around 4 hours of videos in total, plus a fina... [en] \n", 261 | "\n", 262 | " certificates \\\n", 263 | "0 [VerifiedCert] \n", 264 | "1 [VerifiedCert, Specialization] \n", 265 | "2 [VerifiedCert, Specialization] \n", 266 | "3 [VerifiedCert] \n", 267 | "4 [VerifiedCert, Specialization] \n", 268 | "\n", 269 | " name \n", 270 | "0 Gamification \n", 271 | "1 Dealing With Missing Data \n", 272 | "2 Unordered Data Structures \n", 273 | "3 Vital Signs: Understanding What the Body Is Te... \n", 274 | "4 FinTech Disruptive Innovation: Implications fo... 
" 275 | ] 276 | }, 277 | "execution_count": 36, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# Convert the dictionary to DataFrame\n", 284 | "\n", 285 | "catalog_df = pd.DataFrame(courses)\n", 286 | "print(catalog_df.shape)\n", 287 | "catalog_df.head()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 38, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "# Export the DataFrame as a csv file.\n", 297 | "\n", 298 | "catalog_df.to_csv('./Coursera_Catalog.csv', index=False)" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.7.3" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 4 323 | } 324 | -------------------------------------------------------------------------------- /Data/Job_Data/Glassdoor_Joblist_Integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Glassdoor Joblist Integration\n", 8 | "\n", 9 | "This notebook provides a dataset of job listings for testing the recommender model. It uses public data from Glassdoor in May 2020 (https://www.kaggle.com/atharvap329/glassdoor-data-science-job-data). The notebook integrates the four datasets from this collection into one and generates a new csv file." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 13, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 14, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "NY:900, SF:889, TX:643, WA:892\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Job_title \\\n", 152 | "0 Chief Marketing Officer (CMO) \n", 153 | "1 Registered Nurse \n", 154 | "2 Dental Hygienist \n", 155 | "3 Senior Salesforce Developer \n", 156 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 157 | "\n", 158 | " Company State City Min_Salary \\\n", 159 | "0 National Debt Relief NY New York -1 \n", 160 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 161 | "2 Batista Dental NJ West New York -1 \n", 162 | "3 National Debt Relief NY New York 44587 \n", 163 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 164 | "\n", 165 | " Max_Salary Job_Desc Industry \\\n", 166 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 167 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 168 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 169 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 170 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 171 | "\n", 172 | " Rating Date_Posted Valid_until Job_Type \n", 173 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 174 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 175 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 176 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 177 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 178 | ] 179 | }, 180 | "execution_count": 14, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# Read in csv files with glassdoor job postings.\n", 187 | "\n", 188 | "ny = pd.read_csv('./Raw/Data_Job_NY.csv')\n", 189 | "sf = pd.read_csv('./Raw/Data_Job_SF.csv')\n", 190 | "tx = pd.read_csv('./Raw/Data_Job_TX.csv')\n", 191 | "wa = pd.read_csv('./Raw/Data_Job_WA.csv')\n", 192 | "\n", 193 | "print(f'NY:{ny.shape[0]}, SF:{sf.shape[0]}, TX:{tx.shape[0]}, WA:{wa.shape[0]}')\n", 194 | "ny.head() " 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 15, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "(3324, 12)\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "text/html": [ 212 | "
\n", 213 | "\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 322 | "
" 323 | ], 324 | "text/plain": [ 325 | " Job_title \\\n", 326 | "0 Chief Marketing Officer (CMO) \n", 327 | "1 Registered Nurse \n", 328 | "2 Dental Hygienist \n", 329 | "3 Senior Salesforce Developer \n", 330 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 331 | "\n", 332 | " Company State City Min_Salary \\\n", 333 | "0 National Debt Relief NY New York -1 \n", 334 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 335 | "2 Batista Dental NJ West New York -1 \n", 336 | "3 National Debt Relief NY New York 44587 \n", 337 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 338 | "\n", 339 | " Max_Salary Job_Desc Industry \\\n", 340 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 341 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 342 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 343 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 344 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 345 | "\n", 346 | " Rating Date_Posted Valid_until Job_Type \n", 347 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 348 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 349 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 350 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 351 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 352 | ] 353 | }, 354 | "execution_count": 15, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "# Merge all data into a single dataframe\n", 361 | "\n", 362 | "jobs_df = pd.concat([ny, sf, tx, wa])\n", 363 | "print(jobs_df.shape)\n", 364 | "jobs_df.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 16, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# Export the integrated dataset to a new csv file.\n", 374 | "\n", 375 | "jobs_df.to_csv('Glassdoor_Joblist.csv', index=False)" 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.7.3" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 4 400 | } 401 | -------------------------------------------------------------------------------- /Data/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | To develop the Dataset for Course Data, the Coursera Catalog API was utilized which downloaded the entire Coursera Catalog and 4 | the Dataframes were stacked to develop the Dataset. The Job List Dataset was generated using public data from Glassdoor in form of a [Kaggle](https://www.kaggle.com/atharvap329/glassdoor-data-science-job-data) 5 | Dataset. 
The notebooks given in the required directories integrates the four datasets from this collection into one and generates a new Dataset that can be centrally used. 6 | 7 | The Technologies used for generating the Datasets are: 8 | 9 | - [Request](https://requests.readthedocs.io/en/master/) 10 | - [Pandas](https://pandas.pydata.org/) 11 | - [Time](https://docs.python.org/3/library/time.html) 12 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/Create_Test_Set.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Test Data Selection\n", 8 | "This notebook creates a small set of labeled data that can be used to test the Doc2Vec model. Specifically, in selects 10 sample job descriptions under 2 job titles (data scientist and data engineer). It matches each of these 2 job titles with 5 courses each that I believe the model should recommend. This sample data will then be used to test the accuracy of the model." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 130, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Job test data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 131, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "(3324, 12)\n" 37 | ] 38 | }, 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " Job_title Company State \\\n", 111 | "0 Chief Marketing Officer (CMO) National Debt Relief NY \n", 112 | "1 Registered Nurse Queens Boulevard Endoscopy Center NY \n", 113 | "\n", 114 | " City Min_Salary Max_Salary \\\n", 115 | "0 New York -1 -1 \n", 116 | "1 Rego Park -1 -1 \n", 117 | "\n", 118 | " Job_Desc Industry Rating \\\n", 119 | "0 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance 4.0 \n", 120 | "1 Queens Boulevard Endoscopy Center, an endoscop... NaN 3.0 \n", 121 | "\n", 122 | " Date_Posted Valid_until Job_Type \n", 123 | "0 2020-05-08 2020-06-07 FULL_TIME \n", 124 | "1 2020-04-25 2020-06-07 FULL_TIME " 125 | ] 126 | }, 127 | "execution_count": 131, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Read in the jobs data\n", 134 | "jobs_df = pd.read_csv('../Data/Job_Data/Glassdoor_Joblist.csv')\n", 135 | "print(jobs_df.shape)\n", 136 | "jobs_df.head(2)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 132, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "Data Scientist 186\n", 148 | "Data Engineer 129\n", 149 | "Data Analyst 69\n", 150 | "Senior Data Engineer 44\n", 151 | "Senior Data Scientist 39\n", 152 | " ... \n", 153 | "Support Scientist-Ocean Data Assimilation 1\n", 154 | "Insights and Analytics Manager 1\n", 155 | "DHS-NTC Senior Scientist 1\n", 156 | "Document Security Scientist 1\n", 157 | "Sr. 
Healthcare Data Analyst 1\n", 158 | "Name: Job_title, Length: 1619, dtype: int64" 159 | ] 160 | }, 161 | "execution_count": 132, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "# Check the major job titles in the dataset\n", 168 | "jobs_df['Job_title'].value_counts()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 133, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Select 5 job descriptions for data scientist\n", 178 | "ds_jobs = [901, 910, 916, 920, 938]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 134, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Select 5 job descriptions for data engineer\n", 188 | "de_jobs = [935, 1068, 1089, 1100, 1105]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 135, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | "
Job_titleJob_Desc
901Data ScientistWe are looking for Data Scientists who are int...
910Data ScientistThe world's largest and fastest-growing compan...
916Data Scientist\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...
920Data ScientistUpstart is the leading AI lending platform par...
938Data ScientistWhy Divvy?Over the past decade, millions of Am...
935Data EngineerAbout Rocket LawyerWe believe everyone deserve...
1068Data EngineerOur mission is to create a world where mental ...
1089Data EngineerData Engineer \\nIf you are a Data Engineer wit...
1100Data EngineerPrabhav Services Inc. is one of the premier pr...
1105Data EngineerAbout Skupos\\nSkupos is the data platform for ...
\n", 274 | "
" 275 | ], 276 | "text/plain": [ 277 | " Job_title Job_Desc\n", 278 | "901 Data Scientist We are looking for Data Scientists who are int...\n", 279 | "910 Data Scientist The world's largest and fastest-growing compan...\n", 280 | "916 Data Scientist \\nRole: Data Scientist.\\n\\nLocation: Foster Ci...\n", 281 | "920 Data Scientist Upstart is the leading AI lending platform par...\n", 282 | "938 Data Scientist Why Divvy?Over the past decade, millions of Am...\n", 283 | "935 Data Engineer About Rocket LawyerWe believe everyone deserve...\n", 284 | "1068 Data Engineer Our mission is to create a world where mental ...\n", 285 | "1089 Data Engineer Data Engineer \\nIf you are a Data Engineer wit...\n", 286 | "1100 Data Engineer Prabhav Services Inc. is one of the premier pr...\n", 287 | "1105 Data Engineer About Skupos\\nSkupos is the data platform for ..." 288 | ] 289 | }, 290 | "execution_count": 135, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "sample_jobs = jobs_df.loc[[901, 910, 916, 920, 938, 935, 1068, 1089, 1100, 1105], ['Job_title', 'Job_Desc']]\n", 297 | "sample_jobs" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 145, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/html": [ 308 | "
\n", 309 | "\n", 322 | "\n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
Job_titleJob_DescJob_id
901Data ScientistWe are looking for Data Scientists who are int...901
910Data ScientistThe world's largest and fastest-growing compan...910
916Data Scientist\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...916
920Data ScientistUpstart is the leading AI lending platform par...920
938Data ScientistWhy Divvy?Over the past decade, millions of Am...938
935Data EngineerAbout Rocket LawyerWe believe everyone deserve...935
1068Data EngineerOur mission is to create a world where mental ...1068
1089Data EngineerData Engineer \\nIf you are a Data Engineer wit...1089
1100Data EngineerPrabhav Services Inc. is one of the premier pr...1100
1105Data EngineerAbout Skupos\\nSkupos is the data platform for ...1105
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " Job_title Job_Desc \\\n", 398 | "901 Data Scientist We are looking for Data Scientists who are int... \n", 399 | "910 Data Scientist The world's largest and fastest-growing compan... \n", 400 | "916 Data Scientist \\nRole: Data Scientist.\\n\\nLocation: Foster Ci... \n", 401 | "920 Data Scientist Upstart is the leading AI lending platform par... \n", 402 | "938 Data Scientist Why Divvy?Over the past decade, millions of Am... \n", 403 | "935 Data Engineer About Rocket LawyerWe believe everyone deserve... \n", 404 | "1068 Data Engineer Our mission is to create a world where mental ... \n", 405 | "1089 Data Engineer Data Engineer \\nIf you are a Data Engineer wit... \n", 406 | "1100 Data Engineer Prabhav Services Inc. is one of the premier pr... \n", 407 | "1105 Data Engineer About Skupos\\nSkupos is the data platform for ... \n", 408 | "\n", 409 | " Job_id \n", 410 | "901 901 \n", 411 | "910 910 \n", 412 | "916 916 \n", 413 | "920 920 \n", 414 | "938 938 \n", 415 | "935 935 \n", 416 | "1068 1068 \n", 417 | "1089 1089 \n", 418 | "1100 1100 \n", 419 | "1105 1105 " 420 | ] 421 | }, 422 | "execution_count": 145, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "sample_jobs['Job_id'] = sample_jobs.index\n", 429 | "sample_jobs" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 147, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "sample_jobs.to_csv('jobs_test_sample.csv', index=False)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "## Course test data" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 137, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "(4416, 10)\n" 458 | ] 459 | }, 460 | { 461 | "data": { 462 | "text/html": [ 463 | "
\n", 464 | "\n", 477 | "\n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 522 | "
" 523 | ], 524 | "text/plain": [ 525 | " courseType description \\\n", 526 | "0 v2.ondemand Gamification is the application of game elemen... \n", 527 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 528 | "\n", 529 | " domainTypes id \\\n", 530 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 531 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 532 | "\n", 533 | " slug specializations workload \\\n", 534 | "0 gamification [] 4-8 hours/week \n", 535 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 536 | "\n", 537 | " primaryLanguages certificates \\\n", 538 | "0 ['en'] ['VerifiedCert'] \n", 539 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 540 | "\n", 541 | " name \n", 542 | "0 Gamification \n", 543 | "1 Dealing With Missing Data " 544 | ] 545 | }, 546 | "execution_count": 137, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "# Read in the course data\n", 553 | "courses_df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')\n", 554 | "print(courses_df.shape)\n", 555 | "courses_df.head(2)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 138, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "ds_courses = [3823, 143, 3165, 3588, 2517]" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 139, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "de_courses = [545, 1015, 4233, 3763, 1311]" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 140, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/html": [ 584 | "
\n", 585 | "\n", 598 | "\n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
namedescription
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...
143Machine LearningMachine learning is the science of getting com...
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...
2517Machine Learning with PythonThis course dives into the basics of machine l...
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...
3763Database Management EssentialsDatabase Management Essentials provides the fo...
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " name \\\n", 663 | "3823 The Data Scientist’s Toolbox \n", 664 | "143 Machine Learning \n", 665 | "3165 Applied Machine Learning in Python \n", 666 | "3588 Data Visualization with Python \n", 667 | "2517 Machine Learning with Python \n", 668 | "545 Databases and SQL for Data Science \n", 669 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 670 | "4233 Big Data Modeling and Management Systems \n", 671 | "3763 Database Management Essentials \n", 672 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 673 | "\n", 674 | " description \n", 675 | "3823 In this course you will get an introduction to... \n", 676 | "143 Machine learning is the science of getting com... \n", 677 | "3165 This course will introduce the learner to appl... \n", 678 | "3588 \"A picture is worth a thousand words\". We are ... \n", 679 | "2517 This course dives into the basics of machine l... \n", 680 | "545 Much of the world's data resides in databases.... \n", 681 | "1015 This 2-week accelerated on-demand course intro... \n", 682 | "4233 Once you’ve identified a big data issue to ana... \n", 683 | "3763 Database Management Essentials provides the fo... \n", 684 | "1311 This is the second course in the Data Warehous... " 685 | ] 686 | }, 687 | "execution_count": 140, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "sample_courses = courses_df.loc[[3823, 143, 3165, 3588, 2517, 545, 1015, 4233, 3763, 1311], ['name', 'description']]\n", 694 | "sample_courses" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 141, 700 | "metadata": {}, 701 | "outputs": [ 702 | { 703 | "data": { 704 | "text/html": [ 705 | "
\n", 706 | "\n", 719 | "\n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | "
namedescriptionjob_title
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...None
143Machine LearningMachine learning is the science of getting com...None
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...None
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...None
2517Machine Learning with PythonThis course dives into the basics of machine l...None
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....None
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...None
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...None
3763Database Management EssentialsDatabase Management Essentials provides the fo...None
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...None
\n", 791 | "
" 792 | ], 793 | "text/plain": [ 794 | " name \\\n", 795 | "3823 The Data Scientist’s Toolbox \n", 796 | "143 Machine Learning \n", 797 | "3165 Applied Machine Learning in Python \n", 798 | "3588 Data Visualization with Python \n", 799 | "2517 Machine Learning with Python \n", 800 | "545 Databases and SQL for Data Science \n", 801 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 802 | "4233 Big Data Modeling and Management Systems \n", 803 | "3763 Database Management Essentials \n", 804 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 805 | "\n", 806 | " description job_title \n", 807 | "3823 In this course you will get an introduction to... None \n", 808 | "143 Machine learning is the science of getting com... None \n", 809 | "3165 This course will introduce the learner to appl... None \n", 810 | "3588 \"A picture is worth a thousand words\". We are ... None \n", 811 | "2517 This course dives into the basics of machine l... None \n", 812 | "545 Much of the world's data resides in databases.... None \n", 813 | "1015 This 2-week accelerated on-demand course intro... None \n", 814 | "4233 Once you’ve identified a big data issue to ana... None \n", 815 | "3763 Database Management Essentials provides the fo... None \n", 816 | "1311 This is the second course in the Data Warehous... None " 817 | ] 818 | }, 819 | "execution_count": 141, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | } 823 | ], 824 | "source": [ 825 | "sample_courses['job_title'] = None\n", 826 | "sample_courses" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 142, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/html": [ 837 | "
\n", 838 | "\n", 851 | "\n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | "
namedescriptionjob_title
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...Data Scientist
143Machine LearningMachine learning is the science of getting com...Data Scientist
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...Data Scientist
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...Data Scientist
2517Machine Learning with PythonThis course dives into the basics of machine l...Data Scientist
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....Data Engineer
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...Data Engineer
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...Data Engineer
3763Database Management EssentialsDatabase Management Essentials provides the fo...Data Engineer
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...Data Engineer
\n", 923 | "
" 924 | ], 925 | "text/plain": [ 926 | " name \\\n", 927 | "3823 The Data Scientist’s Toolbox \n", 928 | "143 Machine Learning \n", 929 | "3165 Applied Machine Learning in Python \n", 930 | "3588 Data Visualization with Python \n", 931 | "2517 Machine Learning with Python \n", 932 | "545 Databases and SQL for Data Science \n", 933 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 934 | "4233 Big Data Modeling and Management Systems \n", 935 | "3763 Database Management Essentials \n", 936 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 937 | "\n", 938 | " description job_title \n", 939 | "3823 In this course you will get an introduction to... Data Scientist \n", 940 | "143 Machine learning is the science of getting com... Data Scientist \n", 941 | "3165 This course will introduce the learner to appl... Data Scientist \n", 942 | "3588 \"A picture is worth a thousand words\". We are ... Data Scientist \n", 943 | "2517 This course dives into the basics of machine l... Data Scientist \n", 944 | "545 Much of the world's data resides in databases.... Data Engineer \n", 945 | "1015 This 2-week accelerated on-demand course intro... Data Engineer \n", 946 | "4233 Once you’ve identified a big data issue to ana... Data Engineer \n", 947 | "3763 Database Management Essentials provides the fo... Data Engineer \n", 948 | "1311 This is the second course in the Data Warehous... Data Engineer " 949 | ] 950 | }, 951 | "execution_count": 142, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "sample_courses.loc[ds_courses, 'job_title'] = 'Data Scientist'\n", 958 | "sample_courses.loc[de_courses, 'job_title'] = 'Data Engineer'\n", 959 | "sample_courses" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 148, 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/html": [ 970 | "
\n", 971 | "\n", 984 | "\n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | "
namedescriptionjob_titlecourse_id
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...Data Scientist3823
143Machine LearningMachine learning is the science of getting com...Data Scientist143
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...Data Scientist3165
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...Data Scientist3588
2517Machine Learning with PythonThis course dives into the basics of machine l...Data Scientist2517
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....Data Engineer545
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...Data Engineer1015
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...Data Engineer4233
3763Database Management EssentialsDatabase Management Essentials provides the fo...Data Engineer3763
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...Data Engineer1311
\n", 1067 | "
" 1068 | ], 1069 | "text/plain": [ 1070 | " name \\\n", 1071 | "3823 The Data Scientist’s Toolbox \n", 1072 | "143 Machine Learning \n", 1073 | "3165 Applied Machine Learning in Python \n", 1074 | "3588 Data Visualization with Python \n", 1075 | "2517 Machine Learning with Python \n", 1076 | "545 Databases and SQL for Data Science \n", 1077 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 1078 | "4233 Big Data Modeling and Management Systems \n", 1079 | "3763 Database Management Essentials \n", 1080 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 1081 | "\n", 1082 | " description job_title \\\n", 1083 | "3823 In this course you will get an introduction to... Data Scientist \n", 1084 | "143 Machine learning is the science of getting com... Data Scientist \n", 1085 | "3165 This course will introduce the learner to appl... Data Scientist \n", 1086 | "3588 \"A picture is worth a thousand words\". We are ... Data Scientist \n", 1087 | "2517 This course dives into the basics of machine l... Data Scientist \n", 1088 | "545 Much of the world's data resides in databases.... Data Engineer \n", 1089 | "1015 This 2-week accelerated on-demand course intro... Data Engineer \n", 1090 | "4233 Once you’ve identified a big data issue to ana... Data Engineer \n", 1091 | "3763 Database Management Essentials provides the fo... Data Engineer \n", 1092 | "1311 This is the second course in the Data Warehous... 
Data Engineer \n", 1093 | "\n", 1094 | " course_id \n", 1095 | "3823 3823 \n", 1096 | "143 143 \n", 1097 | "3165 3165 \n", 1098 | "3588 3588 \n", 1099 | "2517 2517 \n", 1100 | "545 545 \n", 1101 | "1015 1015 \n", 1102 | "4233 4233 \n", 1103 | "3763 3763 \n", 1104 | "1311 1311 " 1105 | ] 1106 | }, 1107 | "execution_count": 148, 1108 | "metadata": {}, 1109 | "output_type": "execute_result" 1110 | } 1111 | ], 1112 | "source": [ 1113 | "sample_courses['course_id'] = sample_courses.index\n", 1114 | "sample_courses" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": 149, 1120 | "metadata": {}, 1121 | "outputs": [], 1122 | "source": [ 1123 | "sample_courses.to_csv('courses_test_sample.csv', index=False)" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [] 1132 | } 1133 | ], 1134 | "metadata": { 1135 | "kernelspec": { 1136 | "display_name": "Python 3", 1137 | "language": "python", 1138 | "name": "python3" 1139 | }, 1140 | "language_info": { 1141 | "codemirror_mode": { 1142 | "name": "ipython", 1143 | "version": 3 1144 | }, 1145 | "file_extension": ".py", 1146 | "mimetype": "text/x-python", 1147 | "name": "python", 1148 | "nbconvert_exporter": "python", 1149 | "pygments_lexer": "ipython3", 1150 | "version": "3.7.6" 1151 | } 1152 | }, 1153 | "nbformat": 4, 1154 | "nbformat_minor": 4 1155 | } 1156 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/Job_Posts_EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Job Listings EDA\n", 8 | "\n", 9 | "This notebook examines the dataset of job posts from Glassdoor. It finds that there are issues in this dataset, such as duplicate rows and adverstisements mixed in. 
However, because I have decided only to use a small portion of this data for testing out the recommender model, these issues will not affect the project and do not need to be fixed here." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "(3324, 12)\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Job_title \\\n", 152 | "0 Chief Marketing Officer (CMO) \n", 153 | "1 Registered Nurse \n", 154 | "2 Dental Hygienist \n", 155 | "3 Senior Salesforce Developer \n", 156 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 157 | "\n", 158 | " Company State City Min_Salary \\\n", 159 | "0 National Debt Relief NY New York -1 \n", 160 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 161 | "2 Batista Dental NJ West New York -1 \n", 162 | "3 National Debt Relief NY New York 44587 \n", 163 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 164 | "\n", 165 | " Max_Salary Job_Desc Industry \\\n", 166 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 167 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 168 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 169 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 170 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 171 | "\n", 172 | " Rating Date_Posted Valid_until Job_Type \n", 173 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 174 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 175 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 176 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 177 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 178 | ] 179 | }, 180 | "execution_count": 2, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# Read in the dataset\n", 187 | "\n", 188 | "df = pd.read_csv('../Data/Job_Data/Glassdoor_Joblist.csv')\n", 189 | "print(df.shape)\n", 190 | "df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 3, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "Job_title 0\n", 202 | "Company 0\n", 203 | "State 2\n", 204 | "City 6\n", 205 | "Min_Salary 0\n", 206 | "Max_Salary 0\n", 207 | "Job_Desc 0\n", 208 | "Industry 624\n", 209 | "Rating 475\n", 210 | "Date_Posted 0\n", 211 | "Valid_until 0\n", 212 | "Job_Type 0\n", 213 | "dtype: int64" 214 | ] 215 | }, 216 | "execution_count": 3, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "# Check for missing values:\n", 223 | "# No missing values in key columns of job title and description.\n", 224 | "\n", 225 | "df.isna().sum()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 4, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "Data Scientist 186\n", 237 | "Data Engineer 129\n", 238 | "Data Analyst 69\n", 239 | "Senior Data Engineer 44\n", 240 | "Senior Data Scientist 39\n", 241 | " ... 
\n", 242 | "Spectral Research Scientist with Security Clearance 1\n", 243 | "Senior Medical Scientist 1\n", 244 | "Senior Scientist, Oncology BioMarker Development 1\n", 245 | "Data Scientist, AMP Commerce/ Payments/ Subscription Analytics 1\n", 246 | "Innovation - Data Science Manager 1\n", 247 | "Name: Job_title, Length: 1619, dtype: int64" 248 | ] 249 | }, 250 | "execution_count": 4, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# Examine the key category of job title:\n", 257 | "# might need to consolidate these; leave it for now...\n", 258 | "\n", 259 | "df['Job_title'].value_counts()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 5, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "'Queens Boulevard Endoscopy Center, an endoscopy ASC located in Rego Park, has an exciting opportunity for Full-Time Registered Nurse! Successful candidates will provide quality nursing care in all areas of the Center including pre-assessment, pre-op and pacu Qualified candidates must possess the following:\\n\\nCurrent NY state RN license\\nBLS Certification, ACLS preferred\\nMust be a team-player with excellent multi-tasking and interpersonal skills\\nCompassion for patient needs and a high degree of professionalism\\nChinese Speaking and Spanish Preferred\\n\\nQueens Boulevard Endoscopy Center offers a pleasant professional work environment and no evening or holiday work hours. 
Drug-free work environment and EOE.'" 271 | ] 272 | }, 273 | "execution_count": 5, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "# Examine an example of a job description:\n", 280 | "# Other than \\n line breaks, the text is pretty clean.\n", 281 | "\n", 282 | "df['Job_Desc'][1]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 18, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "
\n", 294 | "\n", 307 | "\n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " 
\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
901Data ScientistGovTechCASan Francisco78594147225We are looking for Data Scientists who are int...Government3.62020-05-012020-06-05FULL_TIME
902Data ScientistTriplebyteCASan Francisco145000225000This company is in a hiring surge in response ...Information Technology3.62020-04-282020-06-05FULL_TIME
903Data ScientistNotion LabsCASan Francisco105765142959So, what will you do as a Data Scientist at No...Information Technology5.02020-05-042020-06-05FULL_TIME
904Data ScientistSeen by IndeedCASan Francisco110377143329With one application you can be considered for...NaNNaN2020-04-252020-06-05FULL_TIME
907Data ScientistFormationCASan Francisco119642135250Formation provides personalization for the lar...Information Technology3.12020-04-292020-06-05FULL_TIME
909Data ScientistDuettoCASan Francisco108809173353We are an ambitious, well-funded, high-growth ...Information Technology4.42020-04-242020-06-05FULL_TIME
910Data ScientistDemandbaseCASan Francisco148171160387The world's largest and fastest-growing compan...Information Technology4.52020-04-292020-06-05FULL_TIME
916Data ScientistCentrapriseCAFoster City116415143186\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...Information Technology4.22020-05-022020-06-05FULL_TIME
918Data ScientistCyberCodersCASan Francisco-1-1Data Scientist \\nJob Title: Data ScientistLoca...Business Services4.12020-05-062020-06-05FULL_TIME
920Data ScientistUpstartCASan Mateo124204139717Upstart is the leading AI lending platform par...Finance4.02020-04-242020-06-05FULL_TIME
\n", 478 | "
" 479 | ], 480 | "text/plain": [ 481 | " Job_title Company State City Min_Salary \\\n", 482 | "901 Data Scientist GovTech CA San Francisco 78594 \n", 483 | "902 Data Scientist Triplebyte CA San Francisco 145000 \n", 484 | "903 Data Scientist Notion Labs CA San Francisco 105765 \n", 485 | "904 Data Scientist Seen by Indeed CA San Francisco 110377 \n", 486 | "907 Data Scientist Formation CA San Francisco 119642 \n", 487 | "909 Data Scientist Duetto CA San Francisco 108809 \n", 488 | "910 Data Scientist Demandbase CA San Francisco 148171 \n", 489 | "916 Data Scientist Centraprise CA Foster City 116415 \n", 490 | "918 Data Scientist CyberCoders CA San Francisco -1 \n", 491 | "920 Data Scientist Upstart CA San Mateo 124204 \n", 492 | "\n", 493 | " Max_Salary Job_Desc \\\n", 494 | "901 147225 We are looking for Data Scientists who are int... \n", 495 | "902 225000 This company is in a hiring surge in response ... \n", 496 | "903 142959 So, what will you do as a Data Scientist at No... \n", 497 | "904 143329 With one application you can be considered for... \n", 498 | "907 135250 Formation provides personalization for the lar... \n", 499 | "909 173353 We are an ambitious, well-funded, high-growth ... \n", 500 | "910 160387 The world's largest and fastest-growing compan... \n", 501 | "916 143186 \\nRole: Data Scientist.\\n\\nLocation: Foster Ci... \n", 502 | "918 -1 Data Scientist \\nJob Title: Data ScientistLoca... \n", 503 | "920 139717 Upstart is the leading AI lending platform par... 
\n", 504 | "\n", 505 | " Industry Rating Date_Posted Valid_until Job_Type \n", 506 | "901 Government 3.6 2020-05-01 2020-06-05 FULL_TIME \n", 507 | "902 Information Technology 3.6 2020-04-28 2020-06-05 FULL_TIME \n", 508 | "903 Information Technology 5.0 2020-05-04 2020-06-05 FULL_TIME \n", 509 | "904 NaN NaN 2020-04-25 2020-06-05 FULL_TIME \n", 510 | "907 Information Technology 3.1 2020-04-29 2020-06-05 FULL_TIME \n", 511 | "909 Information Technology 4.4 2020-04-24 2020-06-05 FULL_TIME \n", 512 | "910 Information Technology 4.5 2020-04-29 2020-06-05 FULL_TIME \n", 513 | "916 Information Technology 4.2 2020-05-02 2020-06-05 FULL_TIME \n", 514 | "918 Business Services 4.1 2020-05-06 2020-06-05 FULL_TIME \n", 515 | "920 Finance 4.0 2020-04-24 2020-06-05 FULL_TIME " 516 | ] 517 | }, 518 | "execution_count": 18, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "df.loc[df['Job_title'] == 'Data Scientist'].head(10)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 22, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/plain": [ 535 | "'The world\\'s largest and fastest-growing companies such as Accenture, Adobe, DocuSign and Salesforce rely on Demandbase to drive their Account-Based Marketing strategy and maximize their B2B marketing performance. We pioneered the ABM category nearly a decade ago, and today we lead the category as an indispensable part of the B2B MarTech stack. Our achievements and innovation would not be possible without the driven and collaborative teams here at Demandbase. As a company, we\\'re as committed to growing careers as we are to building word-class technology. 
We invest heavily in people, our culture and the community around us, and have continuously been recognized as one of the best places to work in the Bay Area.\\n\\nDemandbase is currently looking for a Staff Data Scientist to develop ground-breaking insights from our data sets and create a completely new way of data-driven thinking in B2B marketing —providing Sales and Marketing users with unique approaches for account-based advertising and web engagement.\\n\\nAs a Staff Data Scientist, you\\'ll be responsible for developing and testing hypotheses on behavioral responses in B2B marketing, creating models that extract data from, among others, website, digital advertising, and CRM solutions into actionable insights, and defining leading edge thinking on how analytical frameworks can be applied to predictive marketing. You\\'ll engage closely with product managers, engineers, customers and others to turn your models into products that delight customers and create \"A-HA\" moments. You will engage with industry peers and experts and showcase your findings (of course while maintaining company and client confidentiality!). You are both hands-on and strategic—with both a broad ecosystem-level understanding of our market space and the ability to work closely with engineering and product teams to deliver software in an iterative, continual-release environment. 
This is a high-visibility position involving close collaboration across functional groups and with executive stakeholders at customers like the above.\\n\\nWhat you\\'ll be doing:\\n\\nOwn: Be the functional owner of the Data Science role\\nFrame: Use data and insights to explore questions our customers and product team can and should be asking but never asked before.\\nDefine: Work with customers and internal stakeholders to define hypotheses and models, and with engineering teams to define productionalization of data science system\\nDocument: Write clear, concise descriptions of how insights can be converted into repeatable actions.\\nBuild: Write robust machine learning pipelines and data science systems that interface with production infrastructure and APIs\\nTest: Continually test your models and refine assumptions, data sources and more.\\nDrive: Work to spread understanding and buy-in among all stakeholders at all levels.\\nOther duties as assigned\\n\\nWhat we\\'re looking for:\\n\\n2-4 years of data science experience—you have driven more than one greenfield project from concept to production release\\nStrong quantitative and data analysis abilities (statistics, engineering, or financial academic background preferred)—making data actionable must be your thing!\\nGood working knowledge of Spark is a must (we use Scala heavily)\\nAny experience with Google Cloud (especially BQML) and AWS is a huge plus.\\nExperience defining products & solutions containing large data sets from diverse sources— preferably in sales and/or marketing situations.\\nPrior experience in the marketing or sales analytics/data science space desired\\nKnowledge of web site, digital marketing, and CRM technologies and companies a big plus\\n\\nOther important qualities:\\n\\nYou are perfectly comfortable working in a fast paced, market making environment\\nYou love data and data visualization—you love making data actionable for customers\\nYou are a driver and a doer\\nYou are truly 
passionate about asking and answering questions – some never asked before\\nYou have a strong sense of ownership for the products you help build\\n\\nBenefits:\\n\\nOur benefits include 100% paid for Medical, Dental and Vision for you and your entire family, 100% paid for short-term and long-term disability, 100% paid for life insurance, 401k, flexible vacation\\n\\nAbout Demandbase:\\n\\nDemandbase is the leader in Account-Based Marketing (ABM) and an indispensable part of the B2B tech stack. The company offers the only end-to-end ABM platform that helps B2B marketers identify, engage, close and measure progress against best-fit accounts. The biggest and fastest growing companies in the world, such as Accenture, Adobe, DocuSign, GE, Salesforce and others rely on Demandbase to drive their ABM strategy and maximize their marketing performance. The company has been named to the JMP Securities list \"The Hot 100: The Best Privately Held Software Companies,\" the Deloitte Fast 500 and named a Gartner Cool Vendor for Tech Go-To Market. In 2019, Demandbase executives authored the definitive book on ABM, Account-Based Marketing: How to Target and Engage the Companies That Will Grow Your Revenue. For more information, please visit www.demandbase.com or follow the company on Twitter @Demandbase.'" 536 | ] 537 | }, 538 | "execution_count": 22, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "df['Job_Desc'][910]" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 23, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/html": [ 555 | "
\n", 556 | "\n", 569 | "\n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " 
\n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
935Data EngineerRocket LawyerCASan Francisco116784118008About Rocket LawyerWe believe everyone deserve...Information Technology3.52020-04-232020-06-05FULL_TIME
1015Data EngineerSeen by IndeedCASan Francisco100959124595With one application you can be considered for...NaNNaN2020-04-252020-06-05FULL_TIME
1068Data EngineerGingerCASan Francisco102913155464Our mission is to create a world where mental ...Health Care4.12020-04-292020-06-05FULL_TIME
1081Data EngineerAllstateCASan Francisco97656112456Avail is a new car sharing platform focused on...Insurance3.42020-05-062020-06-05FULL_TIME
1089Data EngineerCyberCodersCASan Francisco-1-1Data Engineer \\nIf you are a Data Engineer wit...Business Services4.12020-05-022020-06-05FULL_TIME
1100Data EngineerPrabhav Services IncCASan Francisco-1-1Prabhav Services Inc. is one of the premier pr...Information Technology4.62020-05-062020-06-05FULL_TIME
1105Data EngineerSkuposCASan Francisco8306899451About Skupos\\nSkupos is the data platform for ...Information Technology5.02020-04-242020-06-05FULL_TIME
1140Data EngineerModern HealthCASan Francisco100959124595Modern Health-Modern Health is a mental health...Information Technology5.02020-04-302020-06-05FULL_TIME
1144Data EngineerZypmediaCASan Francisco99278122333Data Engineer\\n\\nZypMedia has built an enterpr...Business Services4.22020-05-012020-06-05FULL_TIME
1165Data EngineerDotSolved Systems, Inc.CASan Francisco-1-1Data Engineer Minimum 7- 8 years experience No...Information Technology4.92020-05-052020-06-05FULL_TIME
\n", 740 | "
" 741 | ], 742 | "text/plain": [ 743 | " Job_title Company State City Min_Salary \\\n", 744 | "935 Data Engineer Rocket Lawyer CA San Francisco 116784 \n", 745 | "1015 Data Engineer Seen by Indeed CA San Francisco 100959 \n", 746 | "1068 Data Engineer Ginger CA San Francisco 102913 \n", 747 | "1081 Data Engineer Allstate CA San Francisco 97656 \n", 748 | "1089 Data Engineer CyberCoders CA San Francisco -1 \n", 749 | "1100 Data Engineer Prabhav Services Inc CA San Francisco -1 \n", 750 | "1105 Data Engineer Skupos CA San Francisco 83068 \n", 751 | "1140 Data Engineer Modern Health CA San Francisco 100959 \n", 752 | "1144 Data Engineer Zypmedia CA San Francisco 99278 \n", 753 | "1165 Data Engineer DotSolved Systems, Inc. CA San Francisco -1 \n", 754 | "\n", 755 | " Max_Salary Job_Desc \\\n", 756 | "935 118008 About Rocket LawyerWe believe everyone deserve... \n", 757 | "1015 124595 With one application you can be considered for... \n", 758 | "1068 155464 Our mission is to create a world where mental ... \n", 759 | "1081 112456 Avail is a new car sharing platform focused on... \n", 760 | "1089 -1 Data Engineer \\nIf you are a Data Engineer wit... \n", 761 | "1100 -1 Prabhav Services Inc. is one of the premier pr... \n", 762 | "1105 99451 About Skupos\\nSkupos is the data platform for ... \n", 763 | "1140 124595 Modern Health-Modern Health is a mental health... \n", 764 | "1144 122333 Data Engineer\\n\\nZypMedia has built an enterpr... \n", 765 | "1165 -1 Data Engineer Minimum 7- 8 years experience No... 
\n", 766 | "\n", 767 | " Industry Rating Date_Posted Valid_until Job_Type \n", 768 | "935 Information Technology 3.5 2020-04-23 2020-06-05 FULL_TIME \n", 769 | "1015 NaN NaN 2020-04-25 2020-06-05 FULL_TIME \n", 770 | "1068 Health Care 4.1 2020-04-29 2020-06-05 FULL_TIME \n", 771 | "1081 Insurance 3.4 2020-05-06 2020-06-05 FULL_TIME \n", 772 | "1089 Business Services 4.1 2020-05-02 2020-06-05 FULL_TIME \n", 773 | "1100 Information Technology 4.6 2020-05-06 2020-06-05 FULL_TIME \n", 774 | "1105 Information Technology 5.0 2020-04-24 2020-06-05 FULL_TIME \n", 775 | "1140 Information Technology 5.0 2020-04-30 2020-06-05 FULL_TIME \n", 776 | "1144 Business Services 4.2 2020-05-01 2020-06-05 FULL_TIME \n", 777 | "1165 Information Technology 4.9 2020-05-05 2020-06-05 FULL_TIME " 778 | ] 779 | }, 780 | "execution_count": 23, 781 | "metadata": {}, 782 | "output_type": "execute_result" 783 | } 784 | ], 785 | "source": [ 786 | "df.loc[df['Job_title'] == 'Data Engineer'].head(10)" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 24, 792 | "metadata": {}, 793 | "outputs": [ 794 | { 795 | "data": { 796 | "text/plain": [ 797 | "\"Our mission is to create a world where mental health is never an obstacle.\\n\\nGinger is transforming how behavioral healthcare is delivered by making it easy for people to get the support they need, when they need it, through on-demand coaching, teletherapy, telepsychiatry, and guided self-care.\\n\\nBusinesses purchase Ginger for their employee's or member's benefit. Through Ginger's secure mobile app, people around the world receive immediate emotional support that's personalized to their needs, and completely confidential. Ginger's high-quality, team-based care works — 70% of people show significant improvement after 12 weeks of using Ginger.\\n\\nAt Ginger, people are at the heart of what we do. We believe that diverse and inclusive teams make our company better. 
Teams with individuals that bring different perspectives to challenges are more innovative, collaborative, and create better solutions. We're building a workplace that actively embraces a diversity of people, ideas, talents, and experiences. Come join us!\\n\\nFast Company called Ginger one of The World's Top 10 Most Innovative Companies in Healthcare and the World Economic Forum named us a Technology Pioneer.\\n\\nAbout the Role:\\n\\nAt Ginger, we aim to provide better mental health care to humanity at a scale larger than has ever been possible before. This is no small task and as an expanding team we are working on a number of initiatives to achieve this, including aggressively building tools to simultaneously grow our reach and improve quality of care.\\n\\nWhat You'll Do:\\n\\nStanding at the center of multiple teams (data science, engineering) and core systems, you'll..\\n\\n\\nOpen up our data to uncover important patterns at the level of individuals and sub-populations.\\nSurface, serve and persist key actionable insights in mental health, healthy habit formation, goal pursuit, and care efficacy.\\nHelp us scale our services using modern distributed processing tools and GPUs in the cloud (AWS)\\nCollaborate with product to ideate and unlock features which derive as much actionable information from our data (text, media, activity etc) as possible.\\nHelp architect systems for near-real-time delivery of recommendations, care insights and other time-critical information to coaches and members.\\nDesign lightweight data schemas appropriate for storing, organizing and joining processed communication and care analytics.\\nDevise the tooling that takes us from algorithm prototype to production and can track data/model lineage and statistical drift through time.\\nDevelop pipelines that efficiently and reliably route output of machine learning algorithms to consumer processes and persistence mechanisms.\\nOwn operational scalability of our algorithms, systems and 
data models.\\nStand up infrastructure for optimal extraction, transformation, and loading of data from a wide variety of data sources using SQL, Python and AWS tools.\\nWork with a variety of stakeholders including the Data, Product, Engineering, Security and Executive teams to support their data accessibility needs.\\n\\nNecessary Skills:\\n\\nDatabases SQL/NoSQL 4+ years\\nCloud platform experience 3+ years\\nSQL 4+ years\\nSchema design 2+ years\\nAmazon Web Services (AWS) 2+ years\\nDeployment pipelines 2+ years\\nPython 2+ years\\nDeploying to production systems with active customers 2+ years\\nDistributed computing (e.g Spark, Hadoop etc.) 3+ years\\nInfrastructure monitoring 1+ years\\nWide variety of data warehouse, data lake (s3) etc familiarity\\nAnalytics experience working with structured and unstructured data\\nProject lead (self-managing) 1+ years\\nBachelors in technical field or experiential equivalent\\n\\nIdeal Skills:\\n\\nAmazon Web Services (AWS) 3+ years\\nAWS Lambda, Sagemaker\\nDocker / Kubernetes\\nDB performance engineering\\nMachine Learning (ML) 1+ years\\nRunning ML on GPUs 1+ years\\nPython 3+ years\\nStrong analytics intuition grounded in significant experience\\nExperience in the healthcare space\\nMasters in technical field or experiential equivalent\\n\"" 798 | ] 799 | }, 800 | "execution_count": 24, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "df['Job_Desc'][1068]" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [] 815 | } 816 | ], 817 | "metadata": { 818 | "kernelspec": { 819 | "display_name": "Python 3", 820 | "language": "python", 821 | "name": "python3" 822 | }, 823 | "language_info": { 824 | "codemirror_mode": { 825 | "name": "ipython", 826 | "version": 3 827 | }, 828 | "file_extension": ".py", 829 | "mimetype": "text/x-python", 830 | "name": "python", 831 | 
"nbconvert_exporter": "python", 832 | "pygments_lexer": "ipython3", 833 | "version": "3.7.3" 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 4 838 | } 839 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/README.md: -------------------------------------------------------------------------------- 1 | # Exploratory Data Analysis 2 | 3 | In this Module, Test Datasets have been created to check out the efficiency of our Model. This notebook `Job_Posts_EDA.ipynb` examines the dataset of 4 | job posts from Glassdoor. It finds that there are issues in this dataset, such as duplicate rows and advertisements mixed in. However, because 5 | only a small portion of this data is used for testing out the recommender model, these issues will not affect the project and do not need to be fixed here. 6 | 7 | Later a small set of labeled data is taken in `Create_Test_Set.ipynb` that can be used to test the Doc2Vec model. Specifically, 10 sample job descriptions under 2 job titles (data scientist and data 8 | engineer) are selected. It matches each of these 2 job titles with 5 courses that I believe the model should recommend. 9 | This sample data will then be used to test the accuracy of the model. 10 | 11 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/courses_test_sample.csv: -------------------------------------------------------------------------------- 1 | name,description,job_title,course_id 2 | The Data Scientist’s Toolbox,"In this course you will get an introduction to the main tools and ideas in the data scientist's toolbox. The course gives an overview of the data, questions, and tools that data analysts and data scientists work with. There are two components to this course. The first is a conceptual introduction to the ideas behind turning data into actionable knowledge. 
The second is a practical introduction to the tools that will be used in the program like version control, markdown, git, GitHub, R, and RStudio.",Data Scientist,3823 3 | Machine Learning,"Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI. 4 | 5 | This course provides a broad introduction to machine learning, datamining, and statistical pattern recognition. Topics include: (i) Supervised learning (parametric/non-parametric algorithms, support vector machines, kernels, neural networks). (ii) Unsupervised learning (clustering, dimensionality reduction, recommender systems, deep learning). (iii) Best practices in machine learning (bias/variance theory; innovation process in machine learning and AI). 
The course will also draw from numerous case studies and applications, so that you'll also learn how to apply learning algorithms to building smart robots (perception, control), text understanding (web search, anti-spam), computer vision, medical informatics, audio, database mining, and other areas.",Data Scientist,143 6 | Applied Machine Learning in Python,"This course will introduce the learner to applied machine learning, focusing more on the techniques and methods than on the statistics behind these methods. The course will start with a discussion of how machine learning is different than descriptive statistics, and introduce the scikit learn toolkit through a tutorial. The issue of dimensionality of data will be discussed, and the task of clustering data, as well as evaluating those clusters, will be tackled. Supervised approaches for creating predictive models will be described, and learners will be able to apply the scikit learn predictive modelling methods while understanding process issues related to data generalizability (e.g. cross validation, overfitting). The course will end with a look at more advanced techniques, such as building ensembles, and practical limitations of predictive models. By the end of this course, students will be able to identify the difference between a supervised (classification) and unsupervised (clustering) technique, identify which technique they need to apply for a particular dataset and need, engineer features to meet that need, and write python code to carry out an analysis. 7 | 8 | This course should be taken after Introduction to Data Science in Python and Applied Plotting, Charting & Data Representation in Python and before Applied Text Mining in Python and Applied Social Analysis in Python.",Data Scientist,3165 9 | Data Visualization with Python,"""A picture is worth a thousand words"". We are all familiar with this expression. 
It especially applies when trying to explain the insight obtained from the analysis of increasingly large datasets. Data visualization plays an essential role in the representation of both small and large-scale data. 10 | 11 | One of the key skills of a data scientist is the ability to tell a compelling story, visualizing data and findings in an approachable and stimulating way. Learning how to leverage a software tool to visualize data will also enable you to extract information, better understand the data, and make more effective decisions. 12 | 13 | The main goal of this Data Visualization with Python course is to teach you how to take data that at first glance has little meaning and present that data in a form that makes sense to people. Various techniques have been developed for presenting data visually but in this course, we will be using several data visualization libraries in Python, namely Matplotlib, Seaborn, and Folium. 14 | 15 | LIMITED TIME OFFER: Subscription is only $39 USD per month for access to graded materials and a certificate.",Data Scientist,3588 16 | Machine Learning with Python,"This course dives into the basics of machine learning using an approachable, and well-known programming language, Python. 17 | In this course, we will be reviewing two main components: 18 | First, you will be learning about the purpose of Machine Learning and where it applies to the real world. 19 | Second, you will get a general overview of Machine Learning topics such as supervised vs unsupervised learning, model evaluation, and Machine Learning algorithms. 20 | 21 | In this course, you practice with real-life examples of Machine learning and see how it affects society in ways you may not have guessed! 22 | 23 | By just putting in a few hours a week for the next few weeks, this is what you’ll get. 
24 | 1) New skills to add to your resume, such as regression, classification, clustering, sci-kit learn and SciPy 25 | 2) New projects that you can add to your portfolio, including cancer detection, predicting economic trends, predicting customer churn, recommendation engines, and many more. 26 | 3) And a certificate in machine learning to prove your competency, and share it anywhere you like online or offline, such as LinkedIn profiles and social media. 27 | 28 | If you choose to take this course and earn the Coursera course certificate, you will also earn an IBM digital badge upon successful completion of the course.",Data Scientist,2517 29 | Databases and SQL for Data Science,"Much of the world's data resides in databases. SQL (or Structured Query Language) is a powerful language which is used for communicating with and extracting data from databases. A working knowledge of databases and SQL is a must if you want to become a data scientist. 30 | 31 | The purpose of this course is to introduce relational database concepts and help you learn and apply foundational knowledge of the SQL language. It is also intended to get you started with performing SQL access in a data science environment. 32 | 33 | The emphasis in this course is on hands-on and practical learning . As such, you will work with real databases, real data science tools, and real-world datasets. You will create a database instance in the cloud. Through a series of hands-on labs you will practice building and running SQL queries. You will also learn how to access databases from Jupyter notebooks using SQL and Python. 34 | 35 | No prior knowledge of databases, SQL, Python, or programming is required. 36 | 37 | Anyone can audit this course at no-charge. If you choose to take this course and earn the Coursera course certificate, you can also earn an IBM digital badge upon successful completion of the course. 
38 | 39 | LIMITED TIME OFFER: Subscription is only $39 USD per month for access to graded materials and a certificate.",Data Engineer,545 40 | Google Cloud Platform Big Data and Machine Learning Fundamentals,"This 2-week accelerated on-demand course introduces participants to the Big Data and Machine Learning capabilities of Google Cloud Platform (GCP). It provides a quick overview of the Google Cloud Platform and a deeper dive of the data processing capabilities. 41 | 42 | At the end of this course, participants will be able to: 43 | • Identify the purpose and value of the key Big Data and Machine Learning products in the Google Cloud Platform 44 | • Use CloudSQL and Cloud Dataproc to migrate existing MySQL and Hadoop/Pig/Spark/Hive workloads to Google Cloud Platform 45 | • Employ BigQuery and Cloud Datalab to carry out interactive data analysis 46 | • Choose between Cloud SQL, BigTable and Datastore 47 | • Train and use a neural network using TensorFlow 48 | • Choose between different data processing products on the Google Cloud Platform 49 | 50 | Before enrolling in this course, participants should have roughly one (1) year of experience with one or more of the following: 51 | • A common query language such as SQL 52 | • Extract, transform, load activities 53 | • Data modeling 54 | • Machine learning and/or statistics 55 | • Programming in Python 56 | 57 | Google Account Notes: 58 | • Google services are currently unavailable in China.",Data Engineer,1015 59 | Big Data Modeling and Management Systems,"Once you’ve identified a big data issue to analyze, how do you collect, store and organize your data using Big Data solutions? In this course, you will experience various data genres and management tools appropriate for each. You will be able to describe the reasons behind the evolving plethora of new big data platforms from the perspective of big data management systems and analytical tools. 
Through guided hands-on tutorials, you will become familiar with techniques using real-time and semi-structured data examples. Systems and tools discussed include: AsterixDB, HP Vertica, Impala, Neo4j, Redis, SparkSQL. This course provides techniques to extract value from existing untapped data sources and discovering new data sources. 60 | 61 | At the end of this course, you will be able to: 62 | * Recognize different data elements in your own work and in everyday life problems 63 | * Explain why your team needs to design a Big Data Infrastructure Plan and Information System Design 64 | * Identify the frequent data operations required for various types of data 65 | * Select a data model to suit the characteristics of your data 66 | * Apply techniques to handle streaming data 67 | * Differentiate between a traditional Database Management System and a Big Data Management System 68 | * Appreciate why there are so many data management systems 69 | * Design a big data information system for an online game company 70 | 71 | This course is for those new to data science. Completion of Intro to Big Data is recommended. No prior programming experience is needed, although the ability to install applications and utilize a virtual machine is necessary to complete the hands-on assignments. Refer to the specialization technical requirements for complete hardware and software specifications. 72 | 73 | Hardware Requirements: 74 | (A) Quad Core Processor (VT-x or AMD-V support recommended), 64-bit; (B) 8 GB RAM; (C) 20 GB disk free. How to find your hardware information: (Windows): Open System by clicking the Start button, right-clicking Computer, and then clicking Properties; (Mac): Open Overview by clicking on the Apple menu and clicking “About This Mac.” Most computers with 8 GB RAM purchased in the last 3 years will meet the minimum requirements.You will need a high speed internet connection because you will be downloading files up to 4 Gb in size. 
75 | 76 | Software Requirements: 77 | This course relies on several open-source software tools, including Apache Hadoop. All required software can be downloaded and installed free of charge (except for data charges from your internet provider). Software requirements include: Windows 7+, Mac OS X 10.10+, Ubuntu 14.04+ or CentOS 6+ VirtualBox 5+.",Data Engineer,4233 78 | Database Management Essentials,"Database Management Essentials provides the foundation you need for a career in database development, data warehousing, or business intelligence, as well as for the entire Data Warehousing for Business Intelligence specialization. In this course, you will create relational databases, write SQL statements to extract information to satisfy business reporting requests, create entity relationship diagrams (ERDs) to design databases, and analyze table designs for excessive redundancy. As you develop these skills, you will use either Oracle, MySQL, or PostgreSQL to execute SQL statements and a database diagramming tool such as the ER Assistant or Visual Paradigm to create ERDs. We’ve designed this course to ensure a common foundation for specialization learners. Everyone taking the course can jump right in with writing SQL statements in Oracle, MySQL, or PostgreSQL.",Data Engineer,3763 79 | "Data Warehouse Concepts, Design, and Data Integration","This is the second course in the Data Warehousing for Business Intelligence specialization. Ideally, the courses should be taken in sequence. 80 | 81 | In this course, you will learn exciting concepts and skills for designing data warehouses and creating data integration workflows. These are fundamental skills for data warehouse developers and administrators. You will have hands-on experience for data warehouse design and use open source products for manipulating pivot tables and creating data integration workflows. In the data integration assignment, you can use either Oracle, MySQL, or PostgreSQL databases. 
You will also gain conceptual background about maturity models, architectures, multidimensional models, and management practices, providing an organizational perspective about data warehouse development. If you are currently a business or information technology professional and want to become a data warehouse designer or administrator, this course will give you the knowledge and skills to do that. By the end of the course, you will have the design experience, software background, and organizational context that prepares you to succeed with data warehouse development projects. 82 | 83 | In this course, you will create data warehouse designs and data integration workflows that satisfy the business intelligence needs of organizations. When you’re done with this course, you’ll be able to: 84 | * Evaluate an organization for data warehouse maturity and business architecture alignment; 85 | * Create a data warehouse design and reflect on alternative design methodologies and design goals; 86 | * Create data integration workflows using prominent open source software; 87 | * Reflect on the role of change data, refresh constraints, refresh frequency trade-offs, and data quality goals in data integration process design; and 88 | * Perform operations on pivot tables to satisfy typical business analysis requests using prominent open source software",Data Engineer,1311 89 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/jobs_test_sample.csv: -------------------------------------------------------------------------------- 1 | Job_title,Job_Desc,Job_id 2 | Data Scientist,"We are looking for Data Scientists who are interested in using data to draw insights that will result in policy changes or business process optimisation, benefiting the public. 
The applicant will be scoping projects with stakeholders, using data sets across Government Agencies, applying business acumen to tease out relevant impactful insights, and presenting insights in a clear, concise manner by using appropriate visualisations. 3 | 4 | He/she should have some training and working experiences on data analytics, and should be comfortable with hands-on data manipulation, data modelling and data visualisation. He/she should also be comfortable with engaging stakeholders on sharpening their business problems. 5 | 6 | The analytics work that we do are typically action oriented and cross-cutting across various domains such as social, economic and infrastructure sectors. Over time, he/she will gain exposure to various policy and ops domains and become more adept in bridging between business users and technical expertise. 7 | 8 | What to Expect: 9 | 10 | Work closely with stakeholders to understand their business challenges, scope the problem and develop business case on how to turn data into critical information and knowledge that are actionable and impactful,. Perform data cleaning, pre-processing, feature engineering and build relevant models to conduct meaningful analysis. Apply appropriate visualisation techniques to communicate the insight effectively. Iterate with the stakeholders to perform subsequent deep dives based on the initial insights.Depending on the use case, design of dashboards and interactive visualisations as tools for data exploration and storytelling may be expected. Potentially deployed to other Government Agencies to be their resident Data Scientist. This will involve formulating and implementing strategies to build strong pipeline of impactful projects at the Agency and executing these projects. 11 | 12 | How to Succeed: 13 | 14 | 15 | 16 | Bachelor Degree in Computer Science, Statistics, Economics, Quantitative Social Science, or related degrees. Advanced degrees preferred. 
We will also factor in relevant certifications (e.g., Coursera)Minimum 2 years of relevant working experience, preferably in public sector or data science fieldAbility to take a broad, strategic perspective as well as drill deep to understand business needs and challengesUnderstand key concepts, techniques and considerations in machine learning and data analyticsTraining and relevant experience in one or more of the following areas: 17 | 18 | 19 | Data science tools such as R, PythonVisual analytics technologies like Tableau, Qlik 20 | Excellent communication skills, both oral and written, with ability to pitch ideas and influence stakeholdersStrong analytical, conceptualisation and problem solving skillsTeam player with strong organization and people handling skillsPassion for the use of analytics and data to improve Public Service 21 | ",901 22 | Data Scientist,"The world's largest and fastest-growing companies such as Accenture, Adobe, DocuSign and Salesforce rely on Demandbase to drive their Account-Based Marketing strategy and maximize their B2B marketing performance. We pioneered the ABM category nearly a decade ago, and today we lead the category as an indispensable part of the B2B MarTech stack. Our achievements and innovation would not be possible without the driven and collaborative teams here at Demandbase. As a company, we're as committed to growing careers as we are to building word-class technology. We invest heavily in people, our culture and the community around us, and have continuously been recognized as one of the best places to work in the Bay Area. 23 | 24 | Demandbase is currently looking for a Staff Data Scientist to develop ground-breaking insights from our data sets and create a completely new way of data-driven thinking in B2B marketing —providing Sales and Marketing users with unique approaches for account-based advertising and web engagement. 
25 | 26 | As a Staff Data Scientist, you'll be responsible for developing and testing hypotheses on behavioral responses in B2B marketing, creating models that extract data from, among others, website, digital advertising, and CRM solutions into actionable insights, and defining leading edge thinking on how analytical frameworks can be applied to predictive marketing. You'll engage closely with product managers, engineers, customers and others to turn your models into products that delight customers and create ""A-HA"" moments. You will engage with industry peers and experts and showcase your findings (of course while maintaining company and client confidentiality!). You are both hands-on and strategic—with both a broad ecosystem-level understanding of our market space and the ability to work closely with engineering and product teams to deliver software in an iterative, continual-release environment. This is a high-visibility position involving close collaboration across functional groups and with executive stakeholders at customers like the above. 27 | 28 | What you'll be doing: 29 | 30 | Own: Be the functional owner of the Data Science role 31 | Frame: Use data and insights to explore questions our customers and product team can and should be asking but never asked before. 32 | Define: Work with customers and internal stakeholders to define hypotheses and models, and with engineering teams to define productionalization of data science system 33 | Document: Write clear, concise descriptions of how insights can be converted into repeatable actions. 34 | Build: Write robust machine learning pipelines and data science systems that interface with production infrastructure and APIs 35 | Test: Continually test your models and refine assumptions, data sources and more. 36 | Drive: Work to spread understanding and buy-in among all stakeholders at all levels. 
37 | Other duties as assigned 38 | 39 | What we're looking for: 40 | 41 | 2-4 years of data science experience—you have driven more than one greenfield project from concept to production release 42 | Strong quantitative and data analysis abilities (statistics, engineering, or financial academic background preferred)—making data actionable must be your thing! 43 | Good working knowledge of Spark is a must (we use Scala heavily) 44 | Any experience with Google Cloud (especially BQML) and AWS is a huge plus. 45 | Experience defining products & solutions containing large data sets from diverse sources— preferably in sales and/or marketing situations. 46 | Prior experience in the marketing or sales analytics/data science space desired 47 | Knowledge of web site, digital marketing, and CRM technologies and companies a big plus 48 | 49 | Other important qualities: 50 | 51 | You are perfectly comfortable working in a fast paced, market making environment 52 | You love data and data visualization—you love making data actionable for customers 53 | You are a driver and a doer 54 | You are truly passionate about asking and answering questions – some never asked before 55 | You have a strong sense of ownership for the products you help build 56 | 57 | Benefits: 58 | 59 | Our benefits include 100% paid for Medical, Dental and Vision for you and your entire family, 100% paid for short-term and long-term disability, 100% paid for life insurance, 401k, flexible vacation 60 | 61 | About Demandbase: 62 | 63 | Demandbase is the leader in Account-Based Marketing (ABM) and an indispensable part of the B2B tech stack. The company offers the only end-to-end ABM platform that helps B2B marketers identify, engage, close and measure progress against best-fit accounts. The biggest and fastest growing companies in the world, such as Accenture, Adobe, DocuSign, GE, Salesforce and others rely on Demandbase to drive their ABM strategy and maximize their marketing performance. 
The company has been named to the JMP Securities list ""The Hot 100: The Best Privately Held Software Companies,"" the Deloitte Fast 500 and named a Gartner Cool Vendor for Tech Go-To Market. In 2019, Demandbase executives authored the definitive book on ABM, Account-Based Marketing: How to Target and Engage the Companies That Will Grow Your Revenue. For more information, please visit www.demandbase.com or follow the company on Twitter @Demandbase.",910 64 | Data Scientist," 65 | Role: Data Scientist. 66 | 67 | Location: Foster City, CA 68 | 69 | Hire Type: 12 Months Contract 70 | 71 | Job Description: 72 | 73 | Advanced degree in Data Science, Statistics, Computer Science, or similar. 74 | 75 | Extensive experience as a Data Scientist. 76 | 77 | Proficiency in R or Python, where the former is preferred. 78 | 79 | In-depth understanding of SQL. 80 | 81 | Competent in machine learning principles and techniques. 82 | 83 | Demonstrable history of devising and overseeing data-centered projects. 84 | 85 | Ability to relay insights in layman's terms, such that these can be used to inform business decisions. 86 | 87 | Outstanding supervision and mentorship abilities. 88 | 89 | Capacity to foster a healthy, stimulating work environment that frequently harnesses teamwork. 90 | 91 | ",916 92 | Data Scientist,"Upstart is the leading AI lending platform partnering with banks to expand access to affordable credit. Forbes recently ranked Upstart #12 on its list of ""most promising AI companies in America."" By leveraging Upstart's AI platform, Upstart-powered banks can have higher approval rates and lower loss rates, while simultaneously delivering the exceptional digital-first lending experience their customers demand. Upstart's patent-pending platform is the first to receive a no-action letter from the Consumer Financial Protection Bureau related to fair lending. Upstart is based in San Mateo, California and Columbus, Ohio. 
93 | 94 | The Role 95 | 96 | Our data science team consists of full-stack generalists as well as specialists in statistical modeling or machine learning. Because our challenges are so new, data scientists at Upstart need strong creative problem-solving skills and the technical background to implement solutions. Our research environment affords team members the opportunity to utilize a variety of statistical and machine learning methods with the freedom and encouragement to pursue alternative approaches to solving problems. Whether developing new products or identifying novel approaches to core models, we are continuously seeking the next big ideas to move our business forward. 97 | 98 | Our current Data Scientists summarize some of their favorite aspects of our team as: 99 | 100 | 101 | Having a direct impact on the company's success 102 | Collaborative, intelligent and open team 103 | Mentorship, growth and friendship 104 | Leaders committed to challenging and growing team members 105 | Feeling safe asking for help when it's necessary; feeling trusted to get the job done when it's not 106 | 107 | Hiring Profile 108 | 109 | Strong academic credentials with a M.S. in Computer Science, Statistics, Data Science or a related field of study with a preference for Ph.D. 110 | Comfort with programming (ideally in Python and R) 111 | Rigorous quantitative background 112 | Predictive modeling experience is preferred 113 | Enthusiasm for and alignment with Upstart's mission and values 114 | Strong sense of intellectual curiosity balanced with humility 115 | Numerically-savvy with ability to operate at a speedy pace 116 | 117 | Most Upstarters join us because they connect with our mission of enabling access to effortless credit based on true risk. 
If you are energized by the impact you can make at Upstart, we would love to hear from you!",920 118 | Data Scientist,"Why Divvy?Over the past decade, millions of Americans have been forced to put their dreams of homeownership on hold. Home prices have outpaced wage growth while mortgage requirements continue to tighten. As a result, renters are missing out on a critical wealth-building opportunity: owning a home.At Divvy, we're building an on-ramp to homeownership – one that's more affordable, more flexible, and an overall better fit for the modern American family – and it’s working.We’re looking for a Data Scientist to join our growing company. In this role, you’ll ensure the financial viability of our business by developing our underwriting and/or pricing models. Developing this model will also mean simulating new financial product offerings which match customer needs to Divvy’s capacities. Day to day, this will include a mix of dataset acquisition, statistical modeling, exploratory data analysis, and software engineering. 
You’ll report directly to Divvy’s Head of Data Science and work alongside a team of 8-10 software engineers and data scientists.ResponsibilitiesBuild and refine our default and/or pricing models using structured dataIdentify, analyze, and acquire new data sources to improve model accuracyInfluence Divvy’s product offerings based on quantitative insightsBecome a domain expert in risk and/or pricingWork ExperienceYou have 3+ years of experience in machine learning, data science or analyticsYou have experience in either R or PythonYou have a strong understanding of statistical modeling techniquesYou demonstrate the ability to clearly communicate analysisBonus points for previous credit default modeling experience, risk management experience, and/or real estate pricing (AVM) experiencePerksCompetitive salary + equity Full benefits (medical, dental, vision, 401k, commuter) A beautiful dog-friendly office Diverse, smart, and witty co-workersCommitment to Diversity & InclusionWe prioritize a commitment to diversity in our team building process. We enthusiastically encourage individuals from a variety of lived experiences to reach out.",938 119 | Data Engineer,"About Rocket LawyerWe believe everyone deserves access to simple and affordable legal services. 120 | Founded in 2008, Rocket Lawyer is the largest and most widely used online legal service platform in the world. With offices in North America and Europe, Rocket Lawyer has helped over 20 million people create over 50 million legal documents, and get their legal questions answered. 121 | We are in a unique position to enhance and expand the Rocket Lawyer platform to a scale never seen before in the company’s history, to capture audiences worldwide. We are expanding our team to take on this challenge! 122 | About the RoleRocket Lawyer is looking for a Data Engineer that will contribute in all aspects of creating an analytical data driven environment. 
The core data engineering team is responsible for the building out the data pipeline, gathering internal and external data, generating metrics, managing and monitoring batch and streaming jobs, and implementing analytical tools to drive strategic decision making.A Day in the Life 123 | 124 | 125 | Evangelize Modern Big Data Practices Design warehouse schemas that accurately represent our business, and facilitate analysis and building of reportsHelp build batch and streaming data ingestion pipeline using Hadoop, Hive, Pig, Storm, and Kafka StreamsWrite ETL jobs to transform raw data into business information to drive decision makingDevelop analytical environment using internal and external reporting toolsIntegrate internal and external data with warehouse and external tools 126 | 127 | 128 | Experience 129 | 130 | 131 | Excellent technical skills including expert knowledge of the Hadoop ecosystemExperience of the analysis, design and development of Data Warehouse and Big Data solutions, including analyzing source systems, developing ETL design patterns and templates, ETL development, data profiling and data quality issues resolution.Project and team management experience Excellent communication skills and presentation skillsStrong SQL, Java, and Python skillsDatabase (relational & NoSQL), Data Warehouse knowledgeStream processing experience (Storm, Kafka Streams)Passion and enthusiasm for learning new technologies and techniqueComfortable with LinuxBS or MS in computer scienceDetail oriented and organizedDesire to learn broad set of technologies 132 | 133 | 134 | Benefits and Perks 135 | 136 | 137 | Comprehensive health plans (including Medical, Dental and Vision insurance for full-time employees)Unlimited PTOCompetitive salary packages401k programLife insuranceDisability benefitsFlexible Spending AccountsCommuter/Transit ProgramYour choice of a MAC or PCMonthly onsite masseuse sessionsWeekly Friday catered lunchesCompany sponsored events, both on- and off-site 138 
| 139 | 140 | ",935 141 | Data Engineer,"Our mission is to create a world where mental health is never an obstacle. 142 | 143 | Ginger is transforming how behavioral healthcare is delivered by making it easy for people to get the support they need, when they need it, through on-demand coaching, teletherapy, telepsychiatry, and guided self-care. 144 | 145 | Businesses purchase Ginger for their employee's or member's benefit. Through Ginger's secure mobile app, people around the world receive immediate emotional support that's personalized to their needs, and completely confidential. Ginger's high-quality, team-based care works — 70% of people show significant improvement after 12 weeks of using Ginger. 146 | 147 | At Ginger, people are at the heart of what we do. We believe that diverse and inclusive teams make our company better. Teams with individuals that bring different perspectives to challenges are more innovative, collaborative, and create better solutions. We're building a workplace that actively embraces a diversity of people, ideas, talents, and experiences. Come join us! 148 | 149 | Fast Company called Ginger one of The World's Top 10 Most Innovative Companies in Healthcare and the World Economic Forum named us a Technology Pioneer. 150 | 151 | About the Role: 152 | 153 | At Ginger, we aim to provide better mental health care to humanity at a scale larger than has ever been possible before. This is no small task and as an expanding team we are working on a number of initiatives to achieve this, including aggressively building tools to simultaneously grow our reach and improve quality of care. 154 | 155 | What You'll Do: 156 | 157 | Standing at the center of multiple teams (data science, engineering) and core systems, you'll.. 158 | 159 | 160 | Open up our data to uncover important patterns at the level of individuals and sub-populations. 
161 | Surface, serve and persist key actionable insights in mental health, healthy habit formation, goal pursuit, and care efficacy. 162 | Help us scale our services using modern distributed processing tools and GPUs in the cloud (AWS) 163 | Collaborate with product to ideate and unlock features which derive as much actionable information from our data (text, media, activity etc) as possible. 164 | Help architect systems for near-real-time delivery of recommendations, care insights and other time-critical information to coaches and members. 165 | Design lightweight data schemas appropriate for storing, organizing and joining processed communication and care analytics. 166 | Devise the tooling that takes us from algorithm prototype to production and can track data/model lineage and statistical drift through time. 167 | Develop pipelines that efficiently and reliably route output of machine learning algorithms to consumer processes and persistence mechanisms. 168 | Own operational scalability of our algorithms, systems and data models. 169 | Stand up infrastructure for optimal extraction, transformation, and loading of data from a wide variety of data sources using SQL, Python and AWS tools. 170 | Work with a variety of stakeholders including the Data, Product, Engineering, Security and Executive teams to support their data accessibility needs. 171 | 172 | Necessary Skills: 173 | 174 | Databases SQL/NoSQL 4+ years 175 | Cloud platform experience 3+ years 176 | SQL 4+ years 177 | Schema design 2+ years 178 | Amazon Web Services (AWS) 2+ years 179 | Deployment pipelines 2+ years 180 | Python 2+ years 181 | Deploying to production systems with active customers 2+ years 182 | Distributed computing (e.g Spark, Hadoop etc.) 
3+ years 183 | Infrastructure monitoring 1+ years 184 | Wide variety of data warehouse, data lake (s3) etc familiarity 185 | Analytics experience working with structured and unstructured data 186 | Project lead (self-managing) 1+ years 187 | Bachelors in technical field or experiential equivalent 188 | 189 | Ideal Skills: 190 | 191 | Amazon Web Services (AWS) 3+ years 192 | AWS Lambda, Sagemaker 193 | Docker / Kubernetes 194 | DB performance engineering 195 | Machine Learning (ML) 1+ years 196 | Running ML on GPUs 1+ years 197 | Python 3+ years 198 | Strong analytics intuition grounded in significant experience 199 | Experience in the healthcare space 200 | Masters in technical field or experiential equivalent 201 | ",1068 202 | Data Engineer,"Data Engineer 203 | If you are a Data Engineer with several years of relevant experience, please read on!We are poised to triple our customer base AGAIN in 2020 and we need a Data Engineer to help us manage the growth! Our tech stack: AWS, Aptible, Postgres, Redis, Rails, Python, Airflow, Mode Analytics, Android, React, and React Native. 204 | 205 | What You Will Be Doing 206 | - Maintain our current ETL-lite while scaling it for the future- Create and maintain views and expand use of rollup tables- Identify opportunities to improve the integrity of our datasets and implement the fixes- Assist in building out our payments platform for managing medical claims- Help explore options for delivering data to clients, including possible API access- Inform our 2020 objectives and key results around scaling and data needs 207 | What You Need for this Position 208 | Requirements: - Bachelors degree in C.S. 
or comparable degree preferred- Minimum of 3 years relevant experience in data engineering- Ability to collaborate and problem solve across teams- Excellent communication skills, both written and verbal- Python: using community-standards, linting, and testing at all appropriate levels.- SQL: comfort with joins, unions, views, rollups, windowing functions, testing- JSON parsing and fluency with RESTful APIs- Operational competency with cloud-hosted systems such as AWS, Aptible, or Heroku- Ability to correlate data across multiple sources: RDBs, csv, json- Understands how to write efficient code and can optimize existing software and queriesNice to Have: - Prior experience with healthcare data (PHI/PII/HIPAA requirements)- Experience developing software in Ruby on Rails- Understanding of user experience principles- History of technical writing 209 | What's In It for You 210 | - Competitive compensation with meaningful stock options- Medical, dental, vision- 401K match - 3 months paid parental leave- Daily lunch- Professional development budget - Monthly fitness/gym reimbursement - Annual mental wellness benefit - Noise-cancelling headphones - Work from home policy- Opportunity to join a fantastically talented, diverse, and passionate team at a pivotal time in the companys lifecycle 211 | 212 | So, if you are a Data Engineer with the required experience, please apply today! 213 | - Applicants must be authorized to work in the U.S. 214 | CyberCoders, Inc is proud to be an Equal Opportunity Employer 215 | 216 | All qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, disability, protected veteran status, or any other characteristic protected by law. 
217 | 218 | Your Right to Work In compliance with federal law, all persons hired will be required to verify identity and eligibility to work in the United States and to complete the required employment eligibility verification document form upon hire.",1089 219 | Data Engineer,"Prabhav Services Inc. is one of the premier preferred vendor with many end clients, we have offices in USA, Canada and India, we do sponsor H1B for right candidate and do the Greencard immediately as required, we are looking for candidates for next year as well so if you or any of your friends are looking for job feel free to refer to me on parinprabhavonline.com Currently we are seeking the candidates for Data Engineer with excellent in implement automation, data modeling, data wrangling, data analysis, and data vision solutions to complex problems, processes, and scenarios. Familiarity with common data structures and languages. Responsibilities Perform data stream design, integration engineering including a full understanding and support for a typical Capture-Ingestion-Storage-Validation-Analysis-Visualization process Process mapping and automation Wrangling structured, unstructured and poorly structured data into appropriate data structures Develop data architectures that improve automation, processes, data flow and analyses (including recommendations in systems owned by other organizations) Establish objectives, formulate methodologies, and help coordinate fusion of data science, data architecture, data visualization, and data management streams and teams Identify opportunities to further build out our IoT strategy ExperienceSkillsAbilities At least 3 years of experience working as a Systems Integrator, Data Engineer, Software Engineer or similar position demonstrating the ability to design and implement automation, data modeling, data wrangling, data analysis, and data vision solutions to complex problems, processes, and scenarios. Familiarity with common data structures and languages. 
BS Computer Science, ComputerElectrical Engineering, or Math Degree or relevant experience. Experience with IoT, cloud computing, distributed data systems Experience working with statistical teams andor data scientists ToolsProgramming Experience SQL, Python, R, JS, HTML, CSS, BI, Tableau, AWS (to work and reorder this), excel, R, DB conceptsprogramming, object-orientated languages(e.g. Java, C++), other scripting languages, programming skills Friendly and approachable, with strong communication and presentation skills Desire to keep current with a challenging and evolving environment Team focused and self-motivated. Able to work as part of a coordinated team, yet independently when necessary Proven abilities to take initiative and to be innovative have an analytical mind with a problem-solving aptitude",1100 220 | Data Engineer,"About Skupos 221 | Skupos is the data platform for the convenience retail industry. Retailers, distributors, and brands connect to the Skupos network to create value from disparate data. Convenience retail is a long-standing industry with limited technology adoption, but is responsible for more revenue annually than all of e-commerce in the United States. Skupos leverages our massive datasets to build tools that help the industry succeed. 222 | Skupos software integrates at a retailer’s point of sale, generates analytical insights, and automates the inventory and ordering process. For distributors and CPG brands, we provide real-time visibility into consumer purchasing decisions and enable automated promotional discounts at the point of sale. We view our company as revolutionizing a brick-and-mortar industry by bringing cutting-edge technology to physical stores, and helping harness data to create a frictionless connection between millions of people and the products they need. 223 | What You'll Do 224 | Skupos is seeking a Data Engineer to help build the foundation of our big data platform. 
As we pave the way for our data product offerings, you will architect, develop and deploy data solutions at scale using modern data technologies. You will have an opportunity to drive the tech stack for this platform. Come, join us and be in charge of your career trajectory and leverage coaching/mentorship opportunities with your manager to write your own success story at Skupos and beyond. 225 | 226 | Build data pipelines for the end to end data ecosystem:Data integration and Ingestion from multiple external data providers/partners.Data processing in accordance with product requirements, ensuring data security and compliance throughout the pipeline. Data Storage layer - Maintain a Single Source of Truth. Data Access layer - Make data available for reporting, dashboards, analytics, business intelligence needs.Data Science and Machine Learning modeling. Collaborate with cross-functional technical, product and business teams to take ownership of data projects to ensure a complete end to end customer experience.Research and recommend technologies to build data solutions at scale with near real-time processing of data using service oriented architecture. Improve project delivery and decrease process redundancy and overhead.Foster a lean agile development culture within the team with emphasis on code quality and software best practices. Join the foundational core data engineering team and play an instrumental role in hiring your future teammates. 227 | 228 | 229 | What You Should Have 230 | 231 | 232 | BA/BS in business, computer science; or similar degree in a related field or equivalent experience with demonstrated proficiency3+ years of hands-on experience building big data pipelines using streaming technologies (Kafka, Spark, or similar) in cloud environments. 
5+ years experience in database technologies, including RDBMS, NoSQL, Document storage, graphs, and distributed file systems.Advanced skills with functional programming languages (Scala, Python, R, Java, or similar)Experience with data warehouse architecture and data modeling for Business Intelligence.Working knowledge of BI tools (Tableau, Looker, Snowflake, or similar)Excellent attention to detail and focused on execution through rapid iterations. Self Motivated individual with strong ethics who brings the best version of themselves to raise the bar for the entire team. 233 | 234 | 235 | What Makes You A Great Fit 236 | 237 | 238 | Startup experienceSubject matter knowledge on retail industryExperience building SAAS software 239 | 240 | 241 | What We Offer 242 | • Competitive salary• Healthcare benefits• 401K• Commuter benefits• Major role in a strong, small and growing development team• Be a part of a key platform of product offerings to the retail convenience store industry 243 | What are your goals and aspirations? Build your technical skills, business acumen, and leadership with Skupos.",1105 244 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Harsh Bardhan Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Other/Course_webpages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The purpose of this notebook is to try to retrieve the web addresses for the courses. They do not seem to be available through the API." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import requests\n", 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Request status for page 0 is 200.\n", 31 | "Request status for page 1 is 200.\n", 32 | "Finished. 
The number of courses gotten from the catalog is 200\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Get the entire Coursera catalog.\n", 38 | "\n", 39 | "# Instantiate a list to hold the courses\n", 40 | "courses = []\n", 41 | "\n", 42 | "# Set the base url for making get requests\n", 43 | "base_url = 'https://api.coursera.org/api/courses.v1'\n", 44 | "\n", 45 | "# Add the fields I want to include in my requests\n", 46 | "fields = \"&fields=previewLink,photoURL\"\n", 47 | "\n", 48 | "# Loop through all 45 pages of the catalog\n", 49 | "for page in range(2):\n", 50 | " \n", 51 | " # set pagination\n", 52 | " pagination = f\"?start={page*100}&limit=100\"\n", 53 | "\n", 54 | " # make a request\n", 55 | " res = requests.get(base_url + pagination + fields)\n", 56 | " print(f'Request status for page {page} is {res.status_code}.')\n", 57 | " \n", 58 | " # convert from json\n", 59 | " dict = res.json()\n", 60 | " \n", 61 | " # add to the catalog dictionary\n", 62 | " for course in dict['elements']:\n", 63 | " courses.append(course)\n", 64 | " \n", 65 | " # delay time to next request\n", 66 | " time.sleep(2)\n", 67 | "\n", 68 | "print(f'Finished. The number of courses gotten from the catalog is {len(courses)}')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 9, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "(200, 4)\n" 81 | ] 82 | }, 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
courseTypeidslugname
0v2.ondemand69Bku0KoEeWZtA4u62x6lQgamificationGamification
1v2.ondemand0HiU7Oe4EeWTAQ4yevf_oQmissing-dataDealing With Missing Data
2v2.ondemandsI_-QEBiEemtDRLx7Ne8jgcs-fundamentals-3Unordered Data Structures
3v2.ondemand5zjIsJq-EeW_wArffOXkOwvital-signsVital Signs: Understanding What the Body Is Te...
4v2.ondemandWFanvtoSEeedbRLwgi9a7Afintech-disruptionFinTech Disruptive Innovation: Implications fo...
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " courseType id slug \\\n", 152 | "0 v2.ondemand 69Bku0KoEeWZtA4u62x6lQ gamification \n", 153 | "1 v2.ondemand 0HiU7Oe4EeWTAQ4yevf_oQ missing-data \n", 154 | "2 v2.ondemand sI_-QEBiEemtDRLx7Ne8jg cs-fundamentals-3 \n", 155 | "3 v2.ondemand 5zjIsJq-EeW_wArffOXkOw vital-signs \n", 156 | "4 v2.ondemand WFanvtoSEeedbRLwgi9a7A fintech-disruption \n", 157 | "\n", 158 | " name \n", 159 | "0 Gamification \n", 160 | "1 Dealing With Missing Data \n", 161 | "2 Unordered Data Structures \n", 162 | "3 Vital Signs: Understanding What the Body Is Te... \n", 163 | "4 FinTech Disruptive Innovation: Implications fo... " 164 | ] 165 | }, 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# Convert the dictionary to DataFrame\n", 173 | "\n", 174 | "catalog_df = pd.DataFrame(courses)\n", 175 | "print(catalog_df.shape)\n", 176 | "catalog_df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.7.6" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 4 208 | } 209 | -------------------------------------------------------------------------------- /Other/Coursera_data_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook makes a preliminary attempt to pull data from the Coursera 
API." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "base_url = 'https://api.coursera.org/api/courses.v1'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "200" 37 | ] 38 | }, 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Search for courses on machine learning\n", 46 | "res = requests.get(base_url + '?q=search&query=machine+learning')\n", 47 | "res.status_code" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 6, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Convert JSON code to Python dictionary\n", 57 | "ml = res.json()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "dict_keys(['elements', 'paging', 'linked'])" 69 | ] 70 | }, 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# Check the keys of the dictionary\n", 78 | "ml.keys()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 19, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "{'courseType': 'v2.ondemand',\n", 90 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 91 | " 'slug': 'machine-learning',\n", 92 | " 'name': 'Machine Learning'}" 93 | ] 94 | }, 95 | "execution_count": 19, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "# Check the first element (a course)\n", 102 | "ml['elements'][0]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | 
"execution_count": 21, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "'/Gtv4Xb1-EeS-ViIACwYKVQ'" 114 | ] 115 | }, 116 | "execution_count": 21, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "# Get the course id for this course\n", 123 | "ml_id = ml['elements'][0]['id']\n", 124 | "ml_id = '/' + ml_id # need to add the backslash for the request\n", 125 | "ml_id" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 22, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "200" 137 | ] 138 | }, 139 | "execution_count": 22, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "# Now make another request for this specific course\n", 146 | "res2 = requests.get(base_url + ml_id)\n", 147 | "res2.status_code" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 25, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "dict_keys(['elements', 'paging', 'linked'])" 159 | ] 160 | }, 161 | "execution_count": 25, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "ml_course = res2.json()\n", 168 | "ml_course.keys()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 28, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "[{'courseType': 'v2.ondemand',\n", 180 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 181 | " 'slug': 'machine-learning',\n", 182 | " 'name': 'Machine Learning'}]" 183 | ] 184 | }, 185 | "execution_count": 28, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "ml_course['elements']" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 37, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | 
"text/plain": [ 202 | "200" 203 | ] 204 | }, 205 | "execution_count": 37, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "# Now try requesting the course with additional fields\n", 212 | "\n", 213 | "fields = \"?ids=Gtv4Xb1-EeS-ViIACwYKVQ&fields=description\"\n", 214 | "\n", 215 | "res3 = requests.get(base_url + fields)\n", 216 | "res3.status_code" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 38, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "dict_keys(['elements', 'paging', 'linked'])" 228 | ] 229 | }, 230 | "execution_count": 38, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "ml_fields = res3.json()\n", 237 | "ml_fields.keys()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 39, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "[{'courseType': 'v2.ondemand',\n", 249 | " 'description': \"Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. 
Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI.\\n\\nThis course provides a broad introduction to machine learning, datamining, and statistical pattern recognition. Topics include: (i) Supervised learning (parametric/non-parametric algorithms, support vector machines, kernels, neural networks). (ii) Unsupervised learning (clustering, dimensionality reduction, recommender systems, deep learning). (iii) Best practices in machine learning (bias/variance theory; innovation process in machine learning and AI). The course will also draw from numerous case studies and applications, so that you'll also learn how to apply learning algorithms to building smart robots (perception, control), text understanding (web search, anti-spam), computer vision, medical informatics, audio, database mining, and other areas.\",\n", 250 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 251 | " 'slug': 'machine-learning',\n", 252 | " 'name': 'Machine Learning'}]" 253 | ] 254 | }, 255 | "execution_count": 39, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "ml_fields['elements']" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 40, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "{'total': 1}" 273 | ] 274 | }, 275 | "execution_count": 40, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "ml_fields['paging']" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 41, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "{}" 293 | ] 294 | }, 295 | "execution_count": 41, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "ml_fields['linked']" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 
308 | "outputs": [], 309 | "source": [] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 3", 315 | "language": "python", 316 | "name": "python3" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.7.6" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 4 333 | } 334 | -------------------------------------------------------------------------------- /Other/Coursera_review_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook pulls course reviews from Coursera." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 5, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "200" 30 | ] 31 | }, 32 | "execution_count": 5, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "url = \"https://www.coursera.org/learn/machine-learning/reviews?page=1&sort=recent\"\n", 39 | "res = requests.get(url)\n", 40 | "res.status_code" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 8, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "soup = BeautifulSoup(res.content, 'lxml')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "reviews_list = soup.find('div', {'data-e2e': 'reviews-list'})" 59 | ] 60 | }, 61 | { 62 | 
"cell_type": "code", 63 | "execution_count": 14, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "review_text = reviews_list.find_all('div', {'class': \"reviewText\"})" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "reviews = []\n", 77 | "for review in review_text:\n", 78 | " reviews.append(review.text)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 21, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "\"Thank you so much Prof. Andrew. I am very thankful to coursera for providing such a valuable course. For me math is love and Andrew's mathematical explanation behind every concept was a great plus point for me and the course wasn't easy or too hard either. Again thanks to coursera and Andrew sir for growing my interest in Machine Learning. Thanks.\"" 90 | ] 91 | }, 92 | "execution_count": 21, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "reviews[0]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.7.6" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /Other/Coursetalk_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook scrapes reviews from coursetalk.com" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "200" 30 | ] 31 | }, 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "url = \"https://www.coursetalk.com/providers/coursera/courses/machine-learning\"\n", 39 | "res = requests.get(url)\n", 40 | "res.status_code" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "soup = BeautifulSoup(res.content, 'lxml')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 20, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "review_list = soup.find('div', {'class': 'reviews-list'})" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 34, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "rating_list = review_list.find_all('meta', {'itemprop': 'ratingValue'})" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 37, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ratings = []\n", 77 | "for rating in rating_list:\n", 78 | " ratings.append(rating['content'])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 38, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "['10',\n", 90 | " '10',\n", 91 | " '8',\n", 92 | " '10',\n", 93 | " '10',\n", 94 | " '8',\n", 95 | " '8',\n", 96 | " '10',\n", 97 | " '10',\n", 98 | " '10',\n", 99 | " '10',\n", 100 | " '10',\n", 101 | " '8',\n", 102 | " 
'10',\n", 103 | " '10',\n", 104 | " '10',\n", 105 | " '8',\n", 106 | " '10',\n", 107 | " '10',\n", 108 | " '10',\n", 109 | " '10',\n", 110 | " '10',\n", 111 | " '10',\n", 112 | " '10',\n", 113 | " '8',\n", 114 | " '10',\n", 115 | " '8',\n", 116 | " '10',\n", 117 | " '8',\n", 118 | " '8']" 119 | ] 120 | }, 121 | "execution_count": 38, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "ratings" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.7.6" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 4 159 | } 160 | -------------------------------------------------------------------------------- /Other/Indeed_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The purpose of this notebook is to pull job descriptions off Indeed (using RapidAPI)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 4, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "{\"context\":{\"results_count\":519,\"current_page\":1,\"page_count\":52,\"search_url\":\"https://www.indeed.com/jobs?q=data+scientist&l=san+francisco&start=00\"},\"jobs\":[{\"job_id\":\"54ab161b978a26b3\",\"company\":\"University of California San Francisco\",\"company_url\":\"https://www.indeed.com/cmp/University-of-California---SAN-Francisco\",\"company_rating\":4.2,\"location\":\"San Francisco, CA 94143 (Haight-Ashbury area)\",\"description\":\"Additionally, this position requires strong multitasking skills as the Research Data Scientist may also support data management and analysis needs of…\",\"publication_date\":\"2020-05-28T12:00:00.000Z\"},{\"job_id\":\"281c22f20aaf3dc9\",\"company\":\"Twitter\",\"company_url\":\"https://www.indeed.com/cmp/Twitter\",\"company_rating\":4.1,\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"You’re passionate to work on large datasets to generate knowledge on behaviors and trends and have a diverse interest and skill set covering data analysis,…\",\"publication_date\":\"2020-05-06T12:00:00.000Z\"},{\"job_id\":\"7e5b1dd0315dd25a\",\"company\":\"Pinterest\",\"company_url\":\"https://www.indeed.com/cmp/Pinterest\",\"company_rating\":4.2,\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"6+ years of industry experience with proven ability to apply scientific methods to solve real-world problems on web-scale data.\",\"publication_date\":\"2020-05-31T12:00:00.000Z\"},{\"job_id\":\"544e3faafc2bf1d7\",\"company\":\"Blue Owl\",\"location\":\"San Francisco, CA\",\"salary\":\"$200,000 - $350,000 a 
year\",\"description\":\"Demonstrable expertise building and supporting machine learning models deployed to production. You have built time series models using econometric approaches as…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"45811e1c376e78ca\",\"company\":\"Adobe\",\"company_url\":\"https://www.indeed.com/cmp/Adobe\",\"company_rating\":4.3,\"location\":\"San Francisco, CA 94107 (South of Market area)\",\"description\":\"Exposure to applied machine learning in an industrial setting. This will involve thinking hard about product quality, the role of machine learning in those…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"ecacf58f2f41884b\",\"company\":\"Notion\",\"location\":\"San Francisco, CA 94110 (Mission area)\",\"description\":\"You have experience building predictive statistical and machine learning models, and you can build reproducible backtests for proposed models to demonstrate…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"7ce9e0a3bb536d7a\",\"company\":\"SentiLink\",\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"A graduate degree in a technical field and 1+ years relevant work experience OR 3+ years relevant work experience (e.g. data scientist, machine learning…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"69f3b39791a4d24d\",\"company\":\"Eaze\",\"location\":\"San Francisco, CA\",\"description\":\"Building production data science models utilized by many departments at Eaze as well as our core product, including predictive, vehicle routing, monte-carlo,…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"19559f2a996703a4\",\"company\":\"Y Combinator\",\"location\":\"San Francisco, CA 94108 (Chinatown area)\",\"description\":\"Build machine learning models to support admissions processes. 
You will be the point person for data pipeline, analysis and modeling efforts, primarily focusing…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"b575ede49cdd6689\",\"company\":\"The Climate Corporation\",\"company_url\":\"https://www.indeed.com/cmp/The-Climate-Corporation\",\"company_rating\":3.6,\"location\":\"San Francisco, CA\",\"description\":\"Working with engineering and scientific leaders, you will set the strategic direction of productizing large scale scientific problems that inform our products…\",\"publication_date\":\"30+ days ago\"}]}\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "# Use the code copied from Rapid API.\n", 34 | "\n", 35 | "url = \"https://indeed9.p.rapidapi.com/search\"\n", 36 | "\n", 37 | "payload = \"page=1&position=data%20scientist&city=san%20francisco\"\n", 38 | "headers = {\n", 39 | " 'x-rapidapi-host': \"indeed9.p.rapidapi.com\",\n", 40 | " 'x-rapidapi-key': \"\",\n", 41 | " 'content-type': \"application/x-www-form-urlencoded\"\n", 42 | " }\n", 43 | "\n", 44 | "response = requests.request(\"POST\", url, data=payload, headers=headers)\n", 45 | "\n", 46 | "print(response.text)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "jobs = response.json()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "dict_keys(['context', 'jobs'])" 67 | ] 68 | }, 69 | "execution_count": 6, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "jobs.keys()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "{'results_count': 519,\n", 87 | " 'current_page': 1,\n", 88 | " 'page_count': 52,\n", 89 | " 'search_url': 'https://www.indeed.com/jobs?q=data+scientist&l=san+francisco&start=00'}" 90 | ] 
91 | }, 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "jobs['context']" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "{'job_id': '54ab161b978a26b3',\n", 110 | " 'company': 'University of California San Francisco',\n", 111 | " 'company_url': 'https://www.indeed.com/cmp/University-of-California---SAN-Francisco',\n", 112 | " 'company_rating': 4.2,\n", 113 | " 'location': 'San Francisco, CA 94143 (Haight-Ashbury area)',\n", 114 | " 'description': 'Additionally, this position requires strong multitasking skills as the Research Data Scientist may also support data management and analysis needs of…',\n", 115 | " 'publication_date': '2020-05-28T12:00:00.000Z'}" 116 | ] 117 | }, 118 | "execution_count": 8, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "# What does one job entry look like?\n", 125 | "# Does not include the full description!\n", 126 | "\n", 127 | "jobs['jobs'][0]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "10" 139 | ] 140 | }, 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "# Only getting ten jobs per request.\n", 148 | "len(jobs['jobs'])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": 
"text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.7.3" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 4 180 | } 181 | -------------------------------------------------------------------------------- /Other/coursera_description.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " courseType description \\\n", 92 | "0 v2.ondemand Gamification is the application of game elemen... \n", 93 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 94 | "\n", 95 | " domainTypes id \\\n", 96 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 97 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 98 | "\n", 99 | " slug specializations workload \\\n", 100 | "0 gamification [] 4-8 hours/week \n", 101 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 102 | "\n", 103 | " primaryLanguages certificates \\\n", 104 | "0 ['en'] ['VerifiedCert'] \n", 105 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 106 | "\n", 107 | " name \n", 108 | "0 Gamification \n", 109 | "1 Dealing With Missing Data " 110 | ] 111 | }, 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "df.head(2)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'By enrolling in this specialization you agree to the Qwiklabs Terms of Service as set out in the FAQ and located at: htt...'" 130 | ] 131 | }, 132 | "execution_count": 10, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "df.iloc[920]['description'][0:120] + \"...\"" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | 
"nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.7.3" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn app:app 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MargSetu 2 | 3 |

4 | 5 | Logo 6 | 7 | 8 | ## 📌 Introduction 9 | 10 | This Web Application powered by Machine Learning and Flask API is a Recommender System which can be used to recommend Massive Open Online Courses (MOOCs) to Students and 11 | Professionals according to their needs and proficiency. The [Dataset](https://github.com/HarshCasper/MargSetu/tree/master/Data) used to process the Data Model and power the Application has been scraped from Coursera's Public Catalog, which consists of more than 4,000 Courses for various roles such as Data Scientist, DevOps Engineer and Cloud Developer. Using Gensim for Natural Language Processing, a Doc2Vec Model was used to generate predictions for a given role. 12 | 13 | ## 🎯 Purpose of the Project 14 | 15 | Massive Open Online Courses (MOOCs) are increasingly being relied on by Students and Professionals to learn new skills and get the know-how of various technologies and toolkits. While this has raised awareness among people, it has also led them to take multiple unreliable course materials that simply don't do them justice. This Machine Learning Application tries to recommend the appropriate courses to Students and Professionals according to the Job-Profile they are aiming for. 16 | 17 | Our Model performs fairly well when it comes to recommending the appropriate courses and hence allows the right recommendations to be generated as per the technology or tooling that someone is aiming to learn. 18 | 19 | ## 🏁 Technology Stack 20 | 21 | * [Flask](https://github.com/pallets/flask) 22 | * [HTML](https://www.w3.org/TR/html52/) 23 | * [CSS](https://developer.mozilla.org/en-US/docs/Web/CSS) 24 | * [Gensim](https://pypi.org/project/gensim/) 25 | * [Pandas](https://pandas.pydata.org/) 26 | 27 | ## 🏃‍♂️ Local Installation 28 | 29 | 1. Drop a ⭐ on the Github Repository. 30 | 2. 
Clone the Repo by going to your local Git Client and running the command: 31 | 32 | ```sh 33 | git clone https://github.com/HarshCasper/MargSetu.git 34 | ``` 35 | 36 | 3. Install the Packages: 37 | ```sh 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | 4. At last, run the command: 42 | ```sh 43 | python app.py 44 | ``` 45 | 46 | 5. Go to `http://127.0.0.1:5000/` and enjoy the application. 47 | 48 | ## 📋 Further Changes to be Done 49 | 50 | - [ ] Deploying the Web Application on Cloud. 51 | - [ ] Development of the Model using Tensorflow/PyTorch. 52 | - [ ] Enhance the User-Interface using HTML/CSS. 53 | - [ ] Set the Application on Docker. 54 | - [ ] Improve the Quality of Predictions. 55 | - [ ] Add a more interactive User-Interface and integrate various other parameters. 56 | 57 | ## 📜 LICENSE 58 | 59 | [MIT](https://github.com/HarshCasper/MargSetu/blob/master/LICENSE) 60 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from flask import Flask, request, Response, render_template, jsonify 4 | import gensim 5 | import pandas as pd 6 | 7 | # Initializing the Flask Application 8 | app = Flask('cr_app') 9 | 10 | # Route 1: Shows a Form to the User to fill in 11 | @app.route('/') 12 | def home(): 13 | return render_template('form.html') 14 | 15 | # Route 2: Accept the Form Submission and process it 16 | @app.route('/submit') 17 | def submit(): 18 | jd = request.args["JobDesc"] 19 | doc = gensim.utils.simple_preprocess(jd) 20 | model = pickle.load(open('./model.p', 'rb')) 21 | vector = model.infer_vector(doc) 22 | top = 5 23 | sims = model.docvecs.most_similar([vector], topn=top) 24 | course_ids = [sim[0] for sim in sims] 25 | df = pd.read_csv('./Data/Course_Data/Coursera_Catalog.csv') 26 | course_names = [df.iloc[id]['name'] for id in course_ids] 27 | course_descriptions = 
[df.iloc[id]['description'][0:150] + "..." for id in course_ids] 28 | return render_template('results.html', len=top, names=course_names, descriptions=course_descriptions) 29 | 30 | if __name__ == '__main__': 31 | app.run(debug=True) 32 | -------------------------------------------------------------------------------- /model.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarshCasper/MargSetu/fce01bddb1672d33cf74cdd7338894e83013ec0e/model.p -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==19.3.0 2 | autopep8==1.4.4 3 | Babel==2.8.0 4 | backcall==0.1.0 5 | backports.functools-lru-cache==1.6.1 6 | backports.shutil-get-terminal-size==1.0.0 7 | backports.tempfile==1.0 8 | backports.weakref==1.0.post1 9 | beautifulsoup4==4.8.2 10 | bitarray==1.2.1 11 | bkcharts==0.2 12 | bleach==3.1.4 13 | bokeh==2.0.0 14 | boto==2.49.0 15 | boto3==1.13.5 16 | botocore==1.16.5 17 | Bottleneck==1.3.2 18 | branca==0.4.1 19 | cachetools==4.1.0 20 | certifi==2019.11.28 21 | cffi==1.14.0 22 | cftime==1.1.3 23 | chardet==3.0.4 24 | Click==7.0 25 | click-plugins==1.1.1 26 | cligj==0.5.0 27 | cloudpickle==1.3.0 28 | clyent==1.2.2 29 | colorama==0.4.3 30 | conda==4.8.3 31 | conda-build==3.18.11 32 | conda-package-handling==1.6.0 33 | conda-verify==3.4.2 34 | gast==0.3.3 35 | gensim==3.8.3 36 | geopandas==0.7.0 37 | gevent==1.4.0 38 | glob2==0.7 39 | gmpy2==2.0.8 40 | greenlet==0.4.15 41 | grpcio==1.29.0 42 | gunicorn==20.0.4 43 | h5py==2.10.0 44 | HeapDict==1.0.1 45 | html5lib==1.0.1 46 | hypothesis==5.5.4 47 | inflection==0.4.0 48 | isort==4.3.21 49 | itsdangerous==1.1.0 50 | jdcal==1.4.1 51 | jedi==0.14.1 52 | jellyfish==0.6.1 53 | Jinja2==2.11.1 54 | jmespath==0.9.5 55 | joblib==0.14.1 56 | json5==0.9.1 57 | jsonschema==3.2.0 58 | jupyter==1.0.0 59 | jupyter-client==5.3.4 60 | 
jupyter-console==6.1.0 61 | jupyter-core==4.6.1 62 | jupyterlab==1.2.6 63 | jupyterlab-server==1.0.6 64 | keyring==21.1.0 65 | kiwisolver==1.1.0 66 | lazy-object-proxy==1.4.3 67 | libarchive-c==2.8 68 | lief==0.9.0 69 | llvmlite==0.31.0 70 | locket==0.2.0 71 | lxml==4.5.0 72 | Markdown==3.2.2 73 | MarkupSafe==1.1.1 74 | matplotlib==3.2.1 75 | mccabe==0.6.1 76 | mistune==0.8.4 77 | mkl-fft==1.0.15 78 | mkl-random==1.1.0 79 | mkl-service==2.3.0 80 | mock==4.0.1 81 | more-itertools==8.2.0 82 | mpmath==1.1.0 83 | msgpack==0.6.1 84 | multipledispatch==0.6.0 85 | multitasking==0.0.9 86 | munch==2.5.0 87 | navigator-updater==0.2.1 88 | nbconvert==5.6.1 89 | nbformat==5.0.4 90 | netCDF4==1.5.3 91 | networkx==2.4 92 | nltk==3.4.5 93 | nose==1.3.7 94 | notebook==6.0.3 95 | numba==0.48.0 96 | numexpr==2.7.1 97 | numpy==1.18.2 98 | numpydoc==0.9.2 99 | olefile==0.46 100 | openpyxl==3.0.3 101 | opt-einsum==3.2.1 102 | packaging==20.1 103 | pandas==1.0.3 104 | pandocfilters==1.4.2 105 | parso==0.5.2 106 | partd==1.1.0 107 | path==13.1.0 108 | pathlib2==2.3.5 109 | pathtools==0.1.2 110 | patsy==0.5.1 111 | pep8==1.7.1 112 | pexpect==4.8.0 113 | pickleshare==0.7.5 114 | pipenv==2020.6.2 115 | pkginfo==1.5.0.1 116 | requests==2.22.0 117 | rope==0.16.0 118 | Rtree==0.9.3 119 | ruamel-yaml==0.15.87 120 | scikit-learn==0.22.2.post1 121 | scipy==1.4.1 122 | seaborn==0.10.0 123 | spyder-kernels==1.8.1 124 | SQLAlchemy==1.3.13 125 | sqlparse==0.3.1 126 | statsmodels==0.11.0 127 | sympy==1.5.1 128 | tables==3.6.1 129 | tblib==1.6.0 130 | urllib3==1.25.8 131 | virtualenv==20.0.21 132 | virtualenv-clone==0.5.4 133 | webencodings==0.5.1 134 | Werkzeug==1.0.0 135 | -------------------------------------------------------------------------------- /static/css/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: sans-serif; 3 | margin: 0; 4 | padding: 0; 5 | } 6 | 7 | header { 8 | text-align: center; 9 | color: black; 10 | 
font-family: Arial Black, sans-serif; 11 | /* font-variant: small-caps; */ 12 | font-size: 23px; 13 | font-weight: 700; 14 | font-style: normal; 15 | padding-top: 2rem; 16 | /* padding-bottom: 0rem; */ 17 | } 18 | 19 | footer { 20 | text-align: center; 21 | padding-top: 1rem; 22 | padding-bottom: 2rem; 23 | } 24 | 25 | .submission-form { 26 | text-align: center; 27 | padding-left: 1rem; 28 | padding-right: 1rem; 29 | } 30 | 31 | textarea { 32 | margin: auto; 33 | outline: none; 34 | resize: none; 35 | text-align: left; 36 | font-family: sans-serif; 37 | } 38 | 39 | button { 40 | padding: 10px; 41 | font-weight: 600; 42 | } 43 | 44 | .results-list { 45 | padding-top: 2rem; 46 | padding-bottom: 0rem; 47 | padding-left: 2rem; 48 | padding-right: 2rem; 49 | } 50 | 51 | dt { 52 | font-weight: bold; 53 | } 54 | 55 | .button-container { 56 | text-align: center; 57 | } 58 | 59 | .info-container { 60 | text-align: center; 61 | display: inline-block; 62 | } 63 | 64 | .info { 65 | color: gray; 66 | } 67 | -------------------------------------------------------------------------------- /templates/form.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Career Course Recommender 8 | 9 | 10 | 11 | 12 | 13 | 14 |

15 |

Career Course Recommender

16 |
17 |
18 |
19 |
20 |

21 | 22 |
23 |
24 |
25 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /templates/results.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Career Course Recommender 8 | 9 | 10 | 11 | 12 | 13 |
14 |
15 |
16 | {%for i in range(len)%} 17 |
{{names[i]}}
18 |
{{descriptions[i]}}
19 |
20 | {%endfor%} 21 |
22 |
23 |
24 | 25 |
26 |
27 |
28 |
29 | 34 | 35 | 36 | 37 | --------------------------------------------------------------------------------